]> granicus.if.org Git - php/commitdiff
X86: Fast CRC32 computation using PCLMULQDQ instruction
authorFrank Du <frank.du@intel.com>
Wed, 2 Sep 2020 10:53:29 +0000 (10:53 +0000)
committerGeorge Peter Banyard <girgias@php.net>
Wed, 2 Sep 2020 13:10:41 +0000 (15:10 +0200)
Based on:
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0

Signed-off-by: Frank Du <frank.du@intel.com>
Closes GH-6018

12 files changed:
Zend/zend_cpuinfo.h
Zend/zend_portability.h
configure.ac
ext/hash/hash_crc32.c
ext/hash/tests/crc32.phpt
ext/standard/basic_functions.c
ext/standard/config.m4
ext/standard/config.w32
ext/standard/crc32.c
ext/standard/crc32_x86.c [new file with mode: 0644]
ext/standard/crc32_x86.h [new file with mode: 0644]
ext/standard/tests/strings/crc32.phpt

index b8b08e7ad06ef2c22160e15f26925678c944d076..0baec57c23a885cb8a7e65041bd2a9d28937b2be 100644 (file)
@@ -159,6 +159,17 @@ static zend_always_inline int zend_cpu_supports_sse42() {
        return __builtin_cpu_supports("sse4.2");
 }
 
+/* __builtin_cpu_supports has pclmul from gcc9 */
+#if (!defined(__GNUC__) || (ZEND_GCC_VERSION >= 9000))
+ZEND_NO_SANITIZE_ADDRESS
+static zend_always_inline int zend_cpu_supports_pclmul() {
+#if PHP_HAVE_BUILTIN_CPU_INIT
+        __builtin_cpu_init();
+#endif
+        return __builtin_cpu_supports("pclmul");
+}
+#endif
+
 ZEND_NO_SANITIZE_ADDRESS
 static zend_always_inline int zend_cpu_supports_avx() {
 #if PHP_HAVE_BUILTIN_CPU_INIT
@@ -196,6 +207,10 @@ static zend_always_inline int zend_cpu_supports_sse42() {
        return zend_cpu_supports(ZEND_CPU_FEATURE_SSE42);
 }
 
+static zend_always_inline int zend_cpu_supports_pclmul() {
+        return zend_cpu_supports(ZEND_CPU_FEATURE_PCLMULQDQ);
+}
+
 static zend_always_inline int zend_cpu_supports_avx() {
        return zend_cpu_supports(ZEND_CPU_FEATURE_AVX);
 }
index e94560d18f0ce2570978b959e8bde2948f323157..f4acc9930e8df0ea4874add925f21bfcccf202d6 100644 (file)
@@ -487,6 +487,10 @@ extern "C++" {
 #  define PHP_HAVE_SSE4_2
 # endif
 
+# if defined(HAVE_WMMINTRIN_H)
+#  define PHP_HAVE_PCLMUL
+# endif
+
 /*
  * AVX2 support was added in gcc 4.7, but AVX2 intrinsics don't work in
  * __attribute__((target("avx2"))) functions until gcc 4.9.
@@ -547,6 +551,58 @@ extern "C++" {
 # define ZEND_INTRIN_SSE4_2_FUNC_DECL(func)
 #endif
 
+#ifdef __PCLMUL__
+/* Instructions compiled directly. */
+# define ZEND_INTRIN_PCLMUL_NATIVE 1
+#elif (defined(HAVE_FUNC_ATTRIBUTE_TARGET) && defined(PHP_HAVE_PCLMUL)) || defined(ZEND_WIN32)
+/* Function resolved by ifunc or MINIT. */
+# define ZEND_INTRIN_PCLMUL_RESOLVER 1
+#endif
+
+/* Do not use for conditional declaration of API functions! */
+#if defined(ZEND_INTRIN_PCLMUL_RESOLVER) && defined(ZEND_INTRIN_HAVE_IFUNC_TARGET) && (!defined(__GNUC__) || (ZEND_GCC_VERSION >= 9000))
+/* __builtin_cpu_supports has pclmul from gcc9 */
+# define ZEND_INTRIN_PCLMUL_FUNC_PROTO 1
+#elif defined(ZEND_INTRIN_PCLMUL_RESOLVER)
+# define ZEND_INTRIN_PCLMUL_FUNC_PTR 1
+#endif
+
+#ifdef ZEND_INTRIN_PCLMUL_RESOLVER
+# ifdef HAVE_FUNC_ATTRIBUTE_TARGET
+#  define ZEND_INTRIN_PCLMUL_FUNC_DECL(func) ZEND_API func __attribute__((target("pclmul")))
+# else
+#  define ZEND_INTRIN_PCLMUL_FUNC_DECL(func) func
+# endif
+#else
+# define ZEND_INTRIN_PCLMUL_FUNC_DECL(func)
+#endif
+
+#if defined(ZEND_INTRIN_SSE4_2_NATIVE) && defined(ZEND_INTRIN_PCLMUL_NATIVE)
+/* Instructions compiled directly. */
+# define ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE 1
+#elif (defined(HAVE_FUNC_ATTRIBUTE_TARGET) && defined(PHP_HAVE_SSE4_2) && defined(PHP_HAVE_PCLMUL)) || defined(ZEND_WIN32)
+/* Function resolved by ifunc or MINIT. */
+# define ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER 1
+#endif
+
+/* Do not use for conditional declaration of API functions! */
+#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER) && defined(ZEND_INTRIN_HAVE_IFUNC_TARGET) && (!defined(__GNUC__) || (ZEND_GCC_VERSION >= 9000))
+/* __builtin_cpu_supports has pclmul from gcc9 */
+# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO 1
+#elif defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)
+# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR 1
+#endif
+
+#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+# ifdef HAVE_FUNC_ATTRIBUTE_TARGET
+#  define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(func) ZEND_API func __attribute__((target("sse4.2,pclmul")))
+# else
+#  define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(func) func
+# endif
+#else
+# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(func)
+#endif
+
 #ifdef __AVX2__
 # define ZEND_INTRIN_AVX2_NATIVE 1
 #elif (defined(HAVE_FUNC_ATTRIBUTE_TARGET) && defined(PHP_HAVE_AVX2)) || defined(ZEND_WIN32)
index 7e7f3edec48d16466542b1060acf65a789faa3e8..3ff9a91bf9fb5c3fa015b6af07d766d08a98d7b1 100644 (file)
@@ -409,6 +409,7 @@ sys/ipc.h \
 dlfcn.h \
 tmmintrin.h \
 nmmintrin.h \
+wmmintrin.h \
 immintrin.h
 ],[],[],[
 #ifdef HAVE_SYS_PARAM_H
index de270522d7e27bcd962f1a9c1685e6c4c20e85e9..ade32a3b35c1c93316f94691d6ebd49ec776e0f7 100644 (file)
@@ -18,6 +18,7 @@
 #include "php_hash.h"
 #include "php_hash_crc32.h"
 #include "php_hash_crc32_tables.h"
+#include "ext/standard/crc32_x86.h"
 
 PHP_HASH_API void PHP_CRC32Init(PHP_CRC32_CTX *context)
 {
@@ -26,27 +27,39 @@ PHP_HASH_API void PHP_CRC32Init(PHP_CRC32_CTX *context)
 
 PHP_HASH_API void PHP_CRC32Update(PHP_CRC32_CTX *context, const unsigned char *input, size_t len)
 {
-       size_t i;
+       size_t i = 0;
 
-       for (i = 0; i < len; ++i) {
+#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+       i += crc32_x86_simd_update(X86_CRC32, &context->state, input, len);
+#endif
+
+       for (; i < len; ++i) {
                context->state = (context->state << 8) ^ crc32_table[(context->state >> 24) ^ (input[i] & 0xff)];
        }
 }
 
 PHP_HASH_API void PHP_CRC32BUpdate(PHP_CRC32_CTX *context, const unsigned char *input, size_t len)
 {
-       size_t i;
+       size_t i = 0;
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+       i += crc32_x86_simd_update(X86_CRC32B, &context->state, input, len);
+#endif
 
-       for (i = 0; i < len; ++i) {
+       for (; i < len; ++i) {
                context->state = (context->state >> 8) ^ crc32b_table[(context->state ^ input[i]) & 0xff];
        }
 }
 
 PHP_HASH_API void PHP_CRC32CUpdate(PHP_CRC32_CTX *context, const unsigned char *input, size_t len)
 {
-       size_t i;
+       size_t i = 0;
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+       i += crc32_x86_simd_update(X86_CRC32C, &context->state, input, len);
+#endif
 
-       for (i = 0; i < len; ++i) {
+       for (; i < len; ++i) {
                context->state = (context->state >> 8) ^ crc32c_table[(context->state ^ input[i]) & 0xff];
        }
 }
index f99152b49e03aa2721bc5369cf6d9420854e2bc6..fda63c71360b14a87cbc57a7b36a70c18904614f 100644 (file)
@@ -10,6 +10,20 @@ echo hash('crc32', 'message digest'), "\n";
 echo hash('crc32', 'abcdefghijklmnopqrstuvwxyz'), "\n";
 echo hash('crc32', 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'), "\n";
 echo hash('crc32', '12345678901234567890123456789012345678901234567890123456789012345678901234567890'), "\n";
+echo hash('crc32', '1234567890123456'), "\n";
+echo hash('crc32', '1234567890123456abc'), "\n";
+echo hash('crc32', '12345678901234561234567890123456'), "\n";
+echo hash('crc32', '12345678901234561234567890123456abc'), "\n";
+echo hash('crc32', '123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32', '123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32', '1234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32', '1234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
 
 echo "crc32b\n";
 echo hash('crc32b', ''), "\n";
@@ -19,6 +33,20 @@ echo hash('crc32b', 'message digest'), "\n";
 echo hash('crc32b', 'abcdefghijklmnopqrstuvwxyz'), "\n";
 echo hash('crc32b', 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'), "\n";
 echo hash('crc32b', '12345678901234567890123456789012345678901234567890123456789012345678901234567890'), "\n";
+echo hash('crc32b', '1234567890123456'), "\n";
+echo hash('crc32b', '1234567890123456abc'), "\n";
+echo hash('crc32b', '12345678901234561234567890123456'), "\n";
+echo hash('crc32b', '12345678901234561234567890123456abc'), "\n";
+echo hash('crc32b', '123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32b', '123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32b', '1234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32b', '1234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32b', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32b', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
 
 echo "crc32c\n";
 echo hash('crc32c', ''), "\n";
@@ -59,6 +87,20 @@ echo hash('crc32c', "Even if I could be Shakespeare, I think I should still choo
 echo hash('crc32c', "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction.  Lewis-Randall Rule"), "\n";
 echo hash('crc32c', "How can you write a big system without C++?  -Paul Glick"), "\n";
 echo hash('crc32c', "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#\$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"), "\n";
+echo hash('crc32c', '1234567890123456'), "\n";
+echo hash('crc32c', '1234567890123456abc'), "\n";
+echo hash('crc32c', '12345678901234561234567890123456'), "\n";
+echo hash('crc32c', '12345678901234561234567890123456abc'), "\n";
+echo hash('crc32c', '123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32c', '123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32c', '1234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32c', '1234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
+echo hash('crc32c', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n";
+echo hash('crc32c', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n";
 
 ?>
 --EXPECT--
@@ -70,6 +112,20 @@ crc32
 9693bf77
 882174a0
 96790816
+98b0e78d
+a6f33d71
+900a1d38
+396978fe
+adfc6afe
+d3ef9388
+c53911dc
+37006f1b
+4a54af3a
+98d05c71
+5a26f5b4
+b9108715
+cc684112
+b2ac45af
 crc32b
 00000000
 e8b7be43
@@ -78,6 +134,20 @@ e8b7be43
 4c2750bd
 1fc2e6d2
 7ca94a72
+1e5fcdb7
+70b54c2f
+094fb11e
+38210c49
+7399c6ef
+83e98d04
+1f26a94e
+e2e8634a
+0642542d
+43b42c9b
+262e1ded
+b7a463c4
+dfa1bbae
+4022d57a
 crc32c
 00000000
 c1d04330
@@ -116,4 +186,18 @@ de2e65c5
 297a88ed
 66ed1d8b
 dcded527
-9c44184b
\ No newline at end of file
+9c44184b
+9aa4287f
+ab2761c5
+cd486b4b
+c19c4a41
+1ea5b441
+36d20512
+31d11ffa
+65d5bb9e
+a0e3e317
+8dc10a7c
+7ab04135
+c292a38d
+e3e558ec
+b6c5e13e
index 18e4985da2b16ba9cf3c888891008d4f20e8d451..7051e4a456b8e61505a3e15848c2ff60da945b6d 100755 (executable)
@@ -33,6 +33,7 @@
 #include "ext/standard/php_dns.h"
 #include "ext/standard/php_uuencode.h"
 #include "ext/standard/php_mt_rand.h"
+#include "ext/standard/crc32_x86.h"
 
 #ifdef PHP_WIN32
 #include "win32/php_win32_globals.h"
@@ -363,6 +364,10 @@ PHP_MINIT_FUNCTION(basic) /* {{{ */
        BASIC_MINIT_SUBMODULE(string_intrin)
 #endif
 
+#if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR
+       BASIC_MINIT_SUBMODULE(crc32_x86_intrin)
+#endif
+
 #if ZEND_INTRIN_AVX2_FUNC_PTR || ZEND_INTRIN_SSSE3_FUNC_PTR
        BASIC_MINIT_SUBMODULE(base64_intrin)
 #endif
index 04b9d0aea1d6a2ae1f65d725451302651bc017dd..1cb0af79f68e012eb03fe329660d55cc46e8dce2 100644 (file)
@@ -449,7 +449,7 @@ PHP_NEW_EXTENSION(standard, array.c base64.c basic_functions.c browscap.c crc32.
                             http_fopen_wrapper.c php_fopen_wrapper.c credits.c css.c \
                             var_unserializer.c ftok.c sha1.c user_filters.c uuencode.c \
                             filters.c proc_open.c streamsfuncs.c http.c password.c \
-                            random.c net.c hrtime.c,,,
+                            random.c net.c hrtime.c crc32_x86.c,,,
                            -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1)
 
 PHP_ADD_MAKEFILE_FRAGMENT
index a5737ea8538b8a1dedb0ee47459d3e6d09b6212f..d23bfd9db26172a19f5bca8632e5a0d6a26ae6ab 100644 (file)
@@ -25,7 +25,7 @@ ADD_FLAG("LIBS_STANDARD", "iphlpapi.lib");
 
 EXTENSION("standard", "array.c base64.c basic_functions.c browscap.c \
        crc32.c crypt.c crypt_freesec.c crypt_blowfish.c crypt_sha256.c \
-       crypt_sha512.c  php_crypt_r.c \
+       crypt_sha512.c  php_crypt_r.c crc32_x86.c \
        datetime.c dir.c dl.c dns.c dns_win32.c exec.c \
        file.c filestat.c formatted_print.c fsock.c head.c html.c image.c \
        info.c iptc.c lcg.c link.c mail.c math.c md5.c metaphone.c microtime.c \
index 6ab5c317bfc72fae2898ee78202b6b6a7e3f99c7..40d9404053e285aab1ba6357f3af4979a029701d 100644 (file)
@@ -17,6 +17,7 @@
 #include "php.h"
 #include "basic_functions.h"
 #include "crc32.h"
+#include "crc32_x86.h"
 
 #if HAVE_AARCH64_CRC32
 # include <arm_acle.h>
@@ -74,7 +75,7 @@ PHP_FUNCTION(crc32)
        char *p;
        size_t nr;
        uint32_t crcinit = 0;
-       register uint32_t crc;
+       uint32_t crc;
 
        ZEND_PARSE_PARAMETERS_START(1, 1)
                Z_PARAM_STRING(p, nr)
@@ -89,6 +90,12 @@ PHP_FUNCTION(crc32)
        }
 #endif
 
+#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+       size_t nr_simd = crc32_x86_simd_update(X86_CRC32B, &crc, (const unsigned char *)p, nr);
+       nr -= nr_simd;
+       p += nr_simd;
+#endif
+
        for (; nr--; ++p) {
                crc = ((crc >> 8) & 0x00FFFFFF) ^ crc32tab[(crc ^ (*p)) & 0xFF ];
        }
diff --git a/ext/standard/crc32_x86.c b/ext/standard/crc32_x86.c
new file mode 100644 (file)
index 0000000..296eade
--- /dev/null
@@ -0,0 +1,349 @@
+/*
+  +----------------------------------------------------------------------+
+  | Copyright (c) The PHP Group                                          |
+  +----------------------------------------------------------------------+
+  | This source file is subject to version 3.01 of the PHP license,      |
+  | that is bundled with this package in the file LICENSE, and is        |
+  | available through the world-wide-web at the following url:           |
+  | https://www.php.net/license/3_01.txt                                 |
+  | If you did not receive a copy of the PHP license and are unable to   |
+  | obtain it through the world-wide-web, please send a note to          |
+  | license@php.net so we can mail you a copy immediately.               |
+  +----------------------------------------------------------------------+
+  | Author: Frank Du <frank.du@intel.com>                                |
+  +----------------------------------------------------------------------+
+  | Compute the crc32 of the buffer. Based on:                           |
+  | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"       |
+  |  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0          |
+*/
+
+#include "crc32_x86.h"
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+# include <nmmintrin.h>
+# include <wmmintrin.h>
+#endif
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+# include "Zend/zend_cpuinfo.h"
+#endif
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+
+typedef struct _crc32_pclmul_bit_consts {
+       uint64_t k1k2[2];
+       uint64_t k3k4[2];
+       uint64_t k5k6[2];
+       uint64_t uPx[2];
+} crc32_pclmul_consts;
+
+static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
+       { /* X86_CRC32, polynomial: 0x04C11DB7 */
+               {0x00e6228b11, 0x008833794c}, /* endianness swap */
+               {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */
+               {0x00490d678d, 0x00f200aa66}, /* endianness swap */
+               {0x0104d101df, 0x0104c11db7}
+       },
+       { /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */
+               {0x0154442bd4, 0x01c6e41596},
+               {0x01751997d0, 0x00ccaa009e},
+               {0x0163cd6124, 0x01db710640},
+               {0x01f7011641, 0x01db710641},
+       },
+       { /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */
+               {0x00740eef02, 0x009e4addf8},
+               {0x00f20c0dfe, 0x014cd00bd6},
+               {0x00dd45aab8, 0x0000000000},
+               {0x00dea713f1, 0x0105ec76f0}
+       }
+};
+
+static uint8_t pclmul_shuf_mask_table[16] = {
+       0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
+       0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+};
+
+/* Folding of 128-bit data chunks */
+#define CRC32_FOLDING_BLOCK_SIZE (16)
+
+/* PCLMUL version of non-relfected crc32 */
+ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
+size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
+{
+       size_t nr_in = nr;
+       __m128i x0, x1, x2, k, shuf_mask;
+
+       if (nr < CRC32_FOLDING_BLOCK_SIZE) {
+               return 0;
+       }
+
+       shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));
+       x0 = _mm_cvtsi32_si128(*crc);
+       x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
+       x0 = _mm_slli_si128(x0, 12);
+       x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
+       x0 = _mm_xor_si128(x1, x0);
+       p += CRC32_FOLDING_BLOCK_SIZE;
+       nr -= CRC32_FOLDING_BLOCK_SIZE;
+
+       if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
+               __m128i x3, x4;
+
+               x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
+               x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
+               x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
+               x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
+               x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
+               x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */
+               p += CRC32_FOLDING_BLOCK_SIZE * 3;
+               nr -= CRC32_FOLDING_BLOCK_SIZE * 3;
+
+               k = _mm_loadu_si128((__m128i *)consts->k1k2);
+               /* parallel folding by 4 */
+               while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
+                       __m128i x5, x6, x7, x8, x9, x10, x11;
+                       x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+                       x5 = _mm_clmulepi64_si128(x1, k, 0x00);
+                       x6 = _mm_clmulepi64_si128(x2, k, 0x00);
+                       x7 = _mm_clmulepi64_si128(x3, k, 0x00);
+                       x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+                       x1 = _mm_clmulepi64_si128(x1, k, 0x11);
+                       x2 = _mm_clmulepi64_si128(x2, k, 0x11);
+                       x3 = _mm_clmulepi64_si128(x3, k, 0x11);
+                       x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
+                       x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */
+                       x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
+                       x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */
+                       x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
+                       x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */
+                       x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
+                       x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */
+                       x0 = _mm_xor_si128(x0, x4);
+                       x1 = _mm_xor_si128(x1, x5);
+                       x2 = _mm_xor_si128(x2, x6);
+                       x3 = _mm_xor_si128(x3, x7);
+                       x0 = _mm_xor_si128(x0, x8);
+                       x1 = _mm_xor_si128(x1, x9);
+                       x2 = _mm_xor_si128(x2, x10);
+                       x3 = _mm_xor_si128(x3, x11);
+
+                       p += CRC32_FOLDING_BLOCK_SIZE * 4;
+                       nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
+               }
+
+               k = _mm_loadu_si128((__m128i *)consts->k3k4);
+               /* fold 4 to 1, [x1, x2, x3] -> x0 */
+               x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x1);
+               x0 = _mm_xor_si128(x0, x4);
+               x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x2);
+               x0 = _mm_xor_si128(x0, x4);
+               x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x3);
+               x0 = _mm_xor_si128(x0, x4);
+       }
+
+       k = _mm_loadu_si128((__m128i *)consts->k3k4);
+       /* folding by 1 */
+       while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
+               /* load next to x2, fold to x0, x1 */
+               x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
+               x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
+               x1 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x2);
+               x0 = _mm_xor_si128(x0, x1);
+               p += CRC32_FOLDING_BLOCK_SIZE;
+               nr -= CRC32_FOLDING_BLOCK_SIZE;
+       }
+
+       /* reduce 128-bits(final fold) to 96-bits */
+       k = _mm_loadu_si128((__m128i*)consts->k5k6);
+       x1 = _mm_clmulepi64_si128(x0, k, 0x11);
+       x0 = _mm_slli_si128(x0, 8);
+       x0 = _mm_srli_si128(x0, 4);
+       x0 = _mm_xor_si128(x0, x1);
+       /* reduce 96-bits to 64-bits */
+       x1 = _mm_clmulepi64_si128(x0, k, 0x01);
+       x0 = _mm_xor_si128(x0, x1);
+
+       /* barrett reduction */
+       k = _mm_loadu_si128((__m128i*)consts->uPx);
+       x1 = _mm_move_epi64(x0);
+       x1 = _mm_srli_si128(x1, 4);
+       x1 = _mm_clmulepi64_si128(x1, k, 0x00);
+       x1 = _mm_srli_si128(x1, 4);
+       x1 = _mm_clmulepi64_si128(x1, k, 0x10);
+       x0 = _mm_xor_si128(x1, x0);
+       *crc =  _mm_extract_epi32(x0, 0);
+       return (nr_in - nr); /* the nr processed */
+}
+
+/* PCLMUL version of relfected crc32 */
+ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
+size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
+{
+       size_t nr_in = nr;
+       __m128i x0, x1, x2, k;
+
+       if (nr < CRC32_FOLDING_BLOCK_SIZE) {
+               return 0;
+       }
+
+       x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
+       x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
+       p += CRC32_FOLDING_BLOCK_SIZE;
+       nr -= CRC32_FOLDING_BLOCK_SIZE;
+       if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
+               __m128i x3, x4;
+
+               x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
+               x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
+               x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
+               p += CRC32_FOLDING_BLOCK_SIZE * 3;
+               nr -= CRC32_FOLDING_BLOCK_SIZE * 3;
+
+               k = _mm_loadu_si128((__m128i *)consts->k1k2);
+               /* parallel folding by 4 */
+               while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
+                       __m128i x5, x6, x7, x8, x9, x10, x11;
+                       x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+                       x5 = _mm_clmulepi64_si128(x1, k, 0x00);
+                       x6 = _mm_clmulepi64_si128(x2, k, 0x00);
+                       x7 = _mm_clmulepi64_si128(x3, k, 0x00);
+                       x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+                       x1 = _mm_clmulepi64_si128(x1, k, 0x11);
+                       x2 = _mm_clmulepi64_si128(x2, k, 0x11);
+                       x3 = _mm_clmulepi64_si128(x3, k, 0x11);
+                       x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
+                       x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
+                       x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
+                       x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
+                       x0 = _mm_xor_si128(x0, x4);
+                       x1 = _mm_xor_si128(x1, x5);
+                       x2 = _mm_xor_si128(x2, x6);
+                       x3 = _mm_xor_si128(x3, x7);
+                       x0 = _mm_xor_si128(x0, x8);
+                       x1 = _mm_xor_si128(x1, x9);
+                       x2 = _mm_xor_si128(x2, x10);
+                       x3 = _mm_xor_si128(x3, x11);
+
+                       p += CRC32_FOLDING_BLOCK_SIZE * 4;
+                       nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
+               }
+
+               k = _mm_loadu_si128((__m128i *)consts->k3k4);
+               /* fold 4 to 1, [x1, x2, x3] -> x0 */
+               x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x1);
+               x0 = _mm_xor_si128(x0, x4);
+               x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x2);
+               x0 = _mm_xor_si128(x0, x4);
+               x4 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x3);
+               x0 = _mm_xor_si128(x0, x4);
+       }
+
+       k = _mm_loadu_si128((__m128i *)consts->k3k4);
+       /* folding by 1 */
+       while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
+               /* load next to x2, fold to x0, x1 */
+               x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
+               x1 = _mm_clmulepi64_si128(x0, k, 0x00);
+               x0 = _mm_clmulepi64_si128(x0, k, 0x11);
+               x0 = _mm_xor_si128(x0, x2);
+               x0 = _mm_xor_si128(x0, x1);
+               p += CRC32_FOLDING_BLOCK_SIZE;
+               nr -= CRC32_FOLDING_BLOCK_SIZE;
+       }
+
+       /* reduce 128-bits(final fold) to 96-bits */
+       x1 = _mm_clmulepi64_si128(x0, k, 0x10);
+       x0 = _mm_srli_si128(x0, 8);
+       x0 = _mm_xor_si128(x0, x1);
+       /* reduce 96-bits to 64-bits */
+       x1 = _mm_shuffle_epi32(x0, 0xfc);
+       x0 = _mm_shuffle_epi32(x0, 0xf9);
+       k = _mm_loadu_si128((__m128i*)consts->k5k6);
+       x1 = _mm_clmulepi64_si128(x1, k, 0x00);
+       x0 = _mm_xor_si128(x0, x1);
+
+       /* barrett reduction */
+       x1 = _mm_shuffle_epi32(x0, 0xf3);
+       x0 = _mm_slli_si128(x0, 4);
+       k = _mm_loadu_si128((__m128i*)consts->uPx);
+       x1 = _mm_clmulepi64_si128(x1, k, 0x00);
+       x1 = _mm_clmulepi64_si128(x1, k, 0x10);
+       x0 = _mm_xor_si128(x1, x0);
+       *crc =  _mm_extract_epi32(x0, 2);
+       return (nr_in - nr); /* the nr processed */
+}
+
+# if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE
+size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
+# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */
+size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
+# endif
+{
+       if (type > X86_CRC32_MAX) {
+               return 0;
+       }
+       const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];
+
+       switch (type) {
+       case X86_CRC32:
+               return crc32_pclmul_batch(crc, p, nr, consts);
+       case X86_CRC32B:
+       case X86_CRC32C:
+               return crc32_pclmul_reflected_batch(crc, p, nr, consts);
+       default:
+               return 0;
+       }
+}
+#endif
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
+{
+       return 0;
+}
+
+# if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
+size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update")));
+
+typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);
+
+ZEND_NO_SANITIZE_ADDRESS
+ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
+static crc32_x86_simd_func_t resolve_crc32_x86_simd_update() {
+       if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
+               return crc32_sse42_pclmul_update;
+       }
+       return crc32_x86_simd_update_default;
+}
+# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
+static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;
+
+size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) {
+       return crc32_x86_simd_ptr(type, crc, p, nr);
+}
+
+/* {{{ PHP_MINIT_FUNCTION */
+PHP_MINIT_FUNCTION(crc32_x86_intrin)
+{
+       if (zend_cpu_supports(ZEND_CPU_FEATURE_SSE42) && zend_cpu_supports(ZEND_CPU_FEATURE_PCLMULQDQ)) {
+               crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
+       }
+       return SUCCESS;
+}
+/* }}} */
+# endif
+#endif
diff --git a/ext/standard/crc32_x86.h b/ext/standard/crc32_x86.h
new file mode 100644 (file)
index 0000000..6420030
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+  +----------------------------------------------------------------------+
+  | Copyright (c) The PHP Group                                          |
+  +----------------------------------------------------------------------+
+  | This source file is subject to version 3.01 of the PHP license,      |
+  | that is bundled with this package in the file LICENSE, and is        |
+  | available through the world-wide-web at the following url:           |
+  | https://www.php.net/license/3_01.txt                                 |
+  | If you did not receive a copy of the PHP license and are unable to   |
+  | obtain it through the world-wide-web, please send a note to          |
+  | license@php.net so we can mail you a copy immediately.               |
+  +----------------------------------------------------------------------+
+  | Author: Frank Du <frank.du@intel.com>                                |
+  +----------------------------------------------------------------------+
+*/
+
+#ifndef _CRC32_X86_HEADER_H_
+#define _CRC32_X86_HEADER_H_
+
+#include "php.h"
+
+typedef enum {
+       /* polynomial: 0x04C11DB7, used by bzip */
+       X86_CRC32 = 0,
+       /*
+        polynomial: 0x04C11DB7 with reversed ordering,
+        used by ethernet (IEEE 802.3), gzip, zip, png, etc
+       */
+       X86_CRC32B,
+       /*
+        polynomial: 0x1EDC6F41 with reversed ordering,
+        used by iSCSI, SCTP, Btrfs, ext4, etc
+       */
+       X86_CRC32C,
+       X86_CRC32_MAX,
+} X86_CRC32_TYPE;
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR
+PHP_MINIT_FUNCTION(crc32_x86_intrin);
+#endif
+
+#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
+/* Return the size processed by SIMD routine */
+size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);
+#else
+static inline size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
+{
+       return 0;
+}
+#endif
+
+#endif
index 2fa8be59d8746669c21e7917e25e313d67f2ea82..da86d067bf66d5f76ab4986ff032e74b7140f110 100644 (file)
@@ -6,9 +6,37 @@ $input = array("foo", "bar", "baz", "grldsajkopallkjasd");
 foreach($input AS $i) {
     printf("%u\n", crc32($i));
 }
+printf("%u\n", crc32("1234567890123456"));
+printf("%u\n", crc32("1234567890123456abc"));
+printf("%u\n", crc32("12345678901234561234567890123456"));
+printf("%u\n", crc32("12345678901234561234567890123456abc"));
+printf("%u\n", crc32("123456789012345612345678901234561234567890123456"));
+printf("%u\n", crc32("123456789012345612345678901234561234567890123456abc"));
+printf("%u\n", crc32("1234567890123456123456789012345612345678901234561234567890123456"));
+printf("%u\n", crc32("1234567890123456123456789012345612345678901234561234567890123456abc"));
+printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456"));
+printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456abc"));
+printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456"));
+printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc"));
+printf("%u\n", crc32("123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456"));
+printf("%u\n", crc32("123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc"));
 ?>
 --EXPECT--
 2356372769
 1996459178
 2015626392
 824412087
+509595063
+1890929711
+156217630
+941689929
+1939457775
+2213121284
+522627406
+3806880586
+105010221
+1135881371
+640556525
+3081003972
+3751918510
+1076024698