From: Heikki Linnakangas Date: Tue, 14 Apr 2015 20:58:16 +0000 (+0300) Subject: Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86. X-Git-Tag: REL9_5_ALPHA1~441 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=936546dcbc24ad1f2b3d33e73aa5c5fde4d2be84;p=postgresql Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86. Eliminate the separate 'len' variable from the loops, and also use the 4 byte instruction. This shaves off a few more cycles. Even though this routine that uses the special SSE 4.2 instructions is much faster than a generic routine, it's still a hot spot, so let's make it as fast as possible. Change the configure test to not test _mm_crc32_u64. That variant is only available in the 64-bit x86-64 architecture, not in 32-bit x86. Modify pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With these changes, the SSE accelerated CRC-32C implementation can also be used on 32-bit x86 systems. This also fixes the 32-bit MSVC build. --- diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index f81e7d6139..4ef0de65a8 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -476,12 +476,16 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS # PGAC_SSE42_CRC32_INTRINSICS # ----------------------- -# Check if the compiler supports _mm_crc32_u8 and _mm_crc32_u64 intrinsics. +# Check if the compiler supports the x86 CRC instructions added in SSE 4.2, +# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't +# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if +# the other ones are, on x86-64 platforms) +# # An optional compiler flag can be passed as argument (e.g. -msse4.2). If the # intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42. 
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS], [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl -AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=$1], [Ac_cachevar], +AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar], [pgac_save_CFLAGS=$CFLAGS CFLAGS="$pgac_save_CFLAGS $1" ac_save_c_werror_flag=$ac_c_werror_flag @@ -489,7 +493,7 @@ ac_c_werror_flag=yes AC_TRY_LINK([#include <nmmintrin.h>], [unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); - crc = (unsigned int) _mm_crc32_u64(crc, 0);], + crc = _mm_crc32_u32(crc, 0);], [Ac_cachevar=yes], [Ac_cachevar=no]) ac_c_werror_flag=$ac_save_c_werror_flag diff --git a/configure b/configure index 6403141163..7c0bd0c696 100755 --- a/configure +++ b/configure @@ -14172,8 +14172,8 @@ fi # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used # with the default compiler flags. If not, check if adding the -msse4.2 # flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=" >&5 -$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5 +$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; } if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then : $as_echo_n "(cached) " >&6 else @@ -14189,7 +14189,7 @@ main () { unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); - crc = (unsigned int) _mm_crc32_u64(crc, 0); + crc = _mm_crc32_u32(crc, 0); ; return 0; } @@ -14212,8 +14212,8 @@ if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then fi if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2" >&5 -$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2... 
" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5 +$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; } if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then : $as_echo_n "(cached) " >&6 else @@ -14229,7 +14229,7 @@ main () { unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); - crc = (unsigned int) _mm_crc32_u64(crc, 0); + crc = _mm_crc32_u32(crc, 0); ; return 0; } diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index b6107103be..a22a9dd78b 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -22,30 +22,45 @@ pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; - const uint64 *p8; + const unsigned char *pend = p + len; /* * Process eight bytes of data at a time. * - * NB: We do unaligned 8-byte accesses here. The Intel architecture - * allows that, and performance testing didn't show any performance - * gain from aligning the beginning address. + * NB: We do unaligned accesses here. The Intel architecture allows that, + * and performance testing didn't show any performance gain from aligning + * the begin address. */ - p8 = (const uint64 *) p; - while (len >= 8) +#ifdef __x86_64__ + while (p + 8 <= pend) { - crc = (uint32) _mm_crc32_u64(crc, *p8++); - len -= 8; + crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p)); + p += 8; } + /* Process remaining full four bytes if any */ + if (p + 4 <= pend) + { + crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); + p += 4; + } +#else /* - * Handle any remaining bytes one at a time. + * Process four bytes at a time. (The eight byte instruction is not + * available on the 32-bit x86 architecture). 
*/ - p = (const unsigned char *) p8; - while (len > 0) + while (p + 4 <= pend) + { + crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); + p += 4; + } +#endif /* __x86_64__ */ + + /* Process any remaining bytes one at a time. */ + while (p < pend) { - crc = _mm_crc32_u8(crc, *p++); - len--; + crc = _mm_crc32_u8(crc, *p); + p++; } return crc;