From 4b4c5f463c188cb8fece81a3a9d74e90de3ffa00 Mon Sep 17 00:00:00 2001
From: Oksana Shadura <oksana.shadura@cern.ch>
Date: Wed, 28 Mar 2018 18:16:46 +0200
Subject: [PATCH] Revert the CRC32-SSE4.2 implementation; ROOT uses Adler32,
 not CRC32

The problem was that crc32-pclmul_asm.S is licensed under the GPL, while
ROOT is distributed under the LGPL. ROOT compression performance stays the
same.
---
 builtins/zlib/CMakeLists.txt                  |   9 -
 .../zlib/contrib/amd64/crc32-pclmul_asm.S     | 258 ------------------
 builtins/zlib/crc32.c                         |  67 -----
 3 files changed, 334 deletions(-)
 delete mode 100644 builtins/zlib/contrib/amd64/crc32-pclmul_asm.S

diff --git a/builtins/zlib/CMakeLists.txt b/builtins/zlib/CMakeLists.txt
index ac424bfd08f..95429540499 100644
--- a/builtins/zlib/CMakeLists.txt
+++ b/builtins/zlib/CMakeLists.txt
@@ -1,14 +1,5 @@
 project(ZLIB C)
 
-set(CMAKE_ASM_OUTPUT_EXTENSION ".o")
-set(CMAKE_ASM_OUTPUT_EXTENSION_REPLACE 1)
-set(CMAKE_ASM_COMPILE_OBJECT "<CMAKE_ASM_COMPILER> <FLAGS> -o <OBJECT> <SOURCE>")
-set(CMAKE_ASM_SOURCE_FILE_EXTENSIONS s;asm;msa)
-if((CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64|X86_64") AND (CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  set(ZLIB_ASMS contrib/amd64/crc32-pclmul_asm.S)
-  set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C)
-endif()
-
 root_check_assembler()
 
 set(ZLIB_PUBLIC_HEADERS
diff --git a/builtins/zlib/contrib/amd64/crc32-pclmul_asm.S b/builtins/zlib/contrib/amd64/crc32-pclmul_asm.S
deleted file mode 100644
index ed23699a052..00000000000
--- a/builtins/zlib/contrib/amd64/crc32-pclmul_asm.S
+++ /dev/null
@@ -1,258 +0,0 @@
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- *          Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
-
-/* This file is "stolen" from linux kernel 3.14 with following minor changes to
- * make it self-contained:
- *  - remove all header files it includes
- *  - define ENTRY and ENDPROC macros
- *  - prepend '$' to some immediate operands to make assembler happy.
- */
-
-#define ENTRY(name) \
-.globl name; \
-.hidden name; \
-.type name, @function; \
-name:
-
-#define ENDPROC(name) \
-.size name, .-name
-
-.align 16
-/*
- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
- * #define CONSTANT_R1 0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
- * #define CONSTANT_R2 0x1c6e41596LL
- */
-.Lconstant_R2R1:
-        .octa 0x00000001c6e415960000000154442bd4
-/*
- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
- * #define CONSTANT_R3 0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
- * #define CONSTANT_R4 0x0ccaa009eLL
- */
-.Lconstant_R4R3:
-        .octa 0x00000000ccaa009e00000001751997d0
-/*
- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
- * #define CONSTANT_R5 0x163cd6124LL
- */
-.Lconstant_R5:
-        .octa 0x00000000000000000000000163cd6124
-.Lconstant_mask32:
-        .octa 0x000000000000000000000000FFFFFFFF
-/*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
- * #define CONSTANT_RU 0x1F7011641LL
- */
-.Lconstant_RUpoly:
-        .octa 0x00000001F701164100000001DB710641
-
-#define CONSTANT %xmm0
-
-#ifdef __x86_64__
-#define BUF %rdi
-#define LEN %rsi
-#define CRC %edx
-#else
-#define BUF %eax
-#define LEN %edx
-#define CRC %ecx
-#endif
-
-
-
-.text
-/**
- *      Calculate crc32
- *      BUF - buffer (16 bytes aligned)
- *      LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
- *      CRC - initial crc32
- *      return %eax crc32
- *      uint crc32_pclmul_le_16(unsigned char const *buffer,
- *                              size_t len, uint crc32)
- */
-
-ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
-        movdqa (BUF), %xmm1
-        movdqa 0x10(BUF), %xmm2
-        movdqa 0x20(BUF), %xmm3
-        movdqa 0x30(BUF), %xmm4
-        movd CRC, CONSTANT
-        pxor CONSTANT, %xmm1
-        sub $0x40, LEN
-        add $0x40, BUF
-#ifndef __x86_64__
-        /* This is for position independent code(-fPIC) support for 32bit */
-        call delta
-delta:
-        pop %ecx
-#endif
-        cmp $0x40, LEN
-        jb less_64
-
-#ifdef __x86_64__
-        movdqa .Lconstant_R2R1(%rip), CONSTANT
-#else
-        movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
-#endif
-
-loop_64:/* 64 bytes Full cache line folding */
-        prefetchnta 0x40(BUF)
-        movdqa %xmm1, %xmm5
-        movdqa %xmm2, %xmm6
-        movdqa %xmm3, %xmm7
-#ifdef __x86_64__
-        movdqa %xmm4, %xmm8
-#endif
-        PCLMULQDQ $00, CONSTANT, %xmm1
-        PCLMULQDQ $00, CONSTANT, %xmm2
-        PCLMULQDQ $00, CONSTANT, %xmm3
-#ifdef __x86_64__
-        PCLMULQDQ $00, CONSTANT, %xmm4
-#endif
-        PCLMULQDQ $0x11, CONSTANT, %xmm5
-        PCLMULQDQ $0x11, CONSTANT, %xmm6
-        PCLMULQDQ $0x11, CONSTANT, %xmm7
-#ifdef __x86_64__
-        PCLMULQDQ $0x11, CONSTANT, %xmm8
-#endif
-        pxor %xmm5, %xmm1
-        pxor %xmm6, %xmm2
-        pxor %xmm7, %xmm3
-#ifdef __x86_64__
-        pxor %xmm8, %xmm4
-#else
-        /* xmm8 unsupported for x32 */
-        movdqa %xmm4, %xmm5
-        PCLMULQDQ $00, CONSTANT, %xmm4
-        PCLMULQDQ $0x11, CONSTANT, %xmm5
-        pxor %xmm5, %xmm4
-#endif
-
-        pxor (BUF), %xmm1
-        pxor 0x10(BUF), %xmm2
-        pxor 0x20(BUF), %xmm3
-        pxor 0x30(BUF), %xmm4
-
-        sub $0x40, LEN
-        add $0x40, BUF
-        cmp $0x40, LEN
-        jge loop_64
-less_64:/* Folding cache line into 128bit */
-#ifdef __x86_64__
-        movdqa .Lconstant_R4R3(%rip), CONSTANT
-#else
-        movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT
-#endif
-        prefetchnta (BUF)
-
-        movdqa %xmm1, %xmm5
-        PCLMULQDQ $0x00, CONSTANT, %xmm1
-        PCLMULQDQ $0x11, CONSTANT, %xmm5
-        pxor %xmm5, %xmm1
-        pxor %xmm2, %xmm1
-
-        movdqa %xmm1, %xmm5
-        PCLMULQDQ $0x00, CONSTANT, %xmm1
-        PCLMULQDQ $0x11, CONSTANT, %xmm5
-        pxor %xmm5, %xmm1
-        pxor %xmm3, %xmm1
-
-        movdqa %xmm1, %xmm5
-        PCLMULQDQ $0x00, CONSTANT, %xmm1
-        PCLMULQDQ $0x11, CONSTANT, %xmm5
-        pxor %xmm5, %xmm1
-        pxor %xmm4, %xmm1
-
-        cmp $0x10, LEN
-        jb fold_64
-loop_16:/* Folding rest buffer into 128bit */
-        movdqa %xmm1, %xmm5
-        PCLMULQDQ $0x00, CONSTANT, %xmm1
-        PCLMULQDQ $0x11, CONSTANT, %xmm5
-        pxor %xmm5, %xmm1
-        pxor (BUF), %xmm1
-        sub $0x10, LEN
-        add $0x10, BUF
-        cmp $0x10, LEN
-        jge loop_16
-
-fold_64:
-        /* perform the last 64 bit fold, also adds 32 zeroes
-         * to the input stream */
-        PCLMULQDQ $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
-        psrldq $0x08, %xmm1
-        pxor CONSTANT, %xmm1
-
-        /* final 32-bit fold */
-        movdqa %xmm1, %xmm2
-#ifdef __x86_64__
-        movdqa .Lconstant_R5(%rip), CONSTANT
-        movdqa .Lconstant_mask32(%rip), %xmm3
-#else
-        movdqa .Lconstant_R5 - delta(%ecx), CONSTANT
-        movdqa .Lconstant_mask32 - delta(%ecx), %xmm3
-#endif
-        psrldq $0x04, %xmm2
-        pand %xmm3, %xmm1
-        PCLMULQDQ $0x00, CONSTANT, %xmm1
-        pxor %xmm2, %xmm1
-
-        /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
-#ifdef __x86_64__
-        movdqa .Lconstant_RUpoly(%rip), CONSTANT
-#else
-        movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT
-#endif
-        movdqa %xmm1, %xmm2
-        pand %xmm3, %xmm1
-        PCLMULQDQ $0x10, CONSTANT, %xmm1
-        pand %xmm3, %xmm1
-        PCLMULQDQ $0x00, CONSTANT, %xmm1
-        pxor %xmm2, %xmm1
-        PEXTRD $0x01, %xmm1, %eax
-
-        ret
-ENDPROC(crc32_pclmul_le_16)
diff --git a/builtins/zlib/crc32.c b/builtins/zlib/crc32.c
index 254d8aba544..acb6c50845b 100644
--- a/builtins/zlib/crc32.c
+++ b/builtins/zlib/crc32.c
@@ -273,72 +273,6 @@ local unsigned long crc32_generic(crc, buf, len)
     return crc ^ 0xffffffffUL;
 }
 
-#if defined (__x86_64__) && defined (__linux__)
-
-/* Function stolen from linux kernel 3.14. It computes the CRC over the given
- * buffer with initial CRC value <crc32>. The buffer is <len> byte in length,
- * and must be 16-byte aligned.
- */
-extern uint crc32_pclmul_le_16(unsigned char const *buffer,
-                               size_t len, uInt crc32);
-
-uLong crc32_pclmul(uLong, const Bytef *, uInt) __attribute__ ((__target__ ("sse4.2,pclmul")));
-
-uLong crc32_pclmul(crc, buf, len)
-    uLong crc;
-    const Bytef *buf;
-    uInt len;
-{
-#define PCLMUL_MIN_LEN 64
-#define PCLMUL_ALIGN 16
-#define PCLMUL_ALIGN_MASK 15
-
-    if (len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1)
-        return crc32_generic(crc, buf, len);
-
-    /* Handle the leading patial chunk */
-    uInt misalign = PCLMUL_ALIGN_MASK & ((unsigned long)buf);
-    uInt sz = (PCLMUL_ALIGN - misalign) % PCLMUL_ALIGN;
-    if (sz) {
-        crc = crc32_generic(crc, buf, sz);
-        buf += sz;
-        len -= sz;
-    }
-
-    /* Go over 16-byte chunks */
-    crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK),
-                             crc ^ 0xffffffffUL);
-    crc = crc ^ 0xffffffffUL;
-
-    /* Handle the trailing partial chunk */
-    sz = len & PCLMUL_ALIGN_MASK;
-    if (sz) {
-        crc = crc32_generic(crc, buf + len - sz, sz);
-    }
-
-    return crc;
-
-#undef PCLMUL_MIN_LEN
-#undef PCLMUL_ALIGN
-#undef PCLMUL_ALIGN_MASK
-}
-
-void *resolve_crc32(void)
-{
-    unsigned int eax, ebx, ecx, edx;
-    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
-        return crc32_generic;
-    /* We need SSE4.2 and PCLMUL ISA support */
-    if (!((ecx & bit_SSE4_2) && (ecx & bit_PCLMUL)))
-        return crc32_generic;
-    return crc32_pclmul;
-}
-
-/* This function needs to be resolved at load time */
-uLong crc32(unsigned long, const unsigned char FAR *, unsigned) __attribute__ ((ifunc ("resolve_crc32")));
-
-#else // if not x86_64
-
 uLong crc32(crc, buf, len)
     uLong crc;
     const Bytef *buf;
@@ -346,7 +280,6 @@ uLong crc32(crc, buf, len)
 {
     return crc32_generic(crc, buf, len);
 }
-#endif
 
 #ifdef BYFOUR
 
-- 
GitLab
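
For reference, not part of the patch itself: the commit message notes that ROOT's checksumming relies on zlib's Adler32 rather than crc32(), which is why dropping the PCLMUL-accelerated crc32() path leaves compression performance unchanged. Below is a minimal sketch of driving the standard zlib adler32() API over a buffer; the buffer contents and variable names are illustrative assumptions, not taken from ROOT.

/* Minimal sketch (assumed example, not ROOT code): checksumming a buffer
 * with zlib's adler32(), the algorithm the commit message says ROOT uses
 * instead of crc32(). */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
    const char *payload = "example payload";   /* hypothetical data */
    uLong adler = adler32(0L, Z_NULL, 0);      /* standard initial seed */

    adler = adler32(adler, (const Bytef *)payload, (uInt)strlen(payload));
    printf("adler32 = 0x%08lx\n", adler);
    return 0;
}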