From: Xinchen Hui Date: Wed, 14 Feb 2018 16:18:47 +0000 (+0800) Subject: Optimized fast_memcpy with AVX instructions X-Git-Tag: php-7.3.0alpha1~431 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5c05870ef65834f5a2c5cb5baddee07cba9399dc;p=php Optimized fast_memcpy with AVX instructions Great thanks to welting for helping :) --- diff --git a/ext/opcache/ZendAccelerator.c b/ext/opcache/ZendAccelerator.c index 1c870e908d..d9d2cdb4d1 100644 --- a/ext/opcache/ZendAccelerator.c +++ b/ext/opcache/ZendAccelerator.c @@ -1358,7 +1358,7 @@ static zend_persistent_script *store_script_in_file_cache(zend_persistent_script memory_used = zend_accel_script_persist_calc(new_persistent_script, NULL, 0, 0); /* Allocate memory block */ -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align to 64-byte boundary */ ZCG(mem) = zend_arena_alloc(&CG(arena), memory_used + 64); ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + 63L) & ~63L); @@ -1465,7 +1465,7 @@ static zend_persistent_script *cache_script_in_shared_memory(zend_persistent_scr memory_used = zend_accel_script_persist_calc(new_persistent_script, key, key_length, 1); /* Allocate shared memory */ -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align to 64-byte boundary */ ZCG(mem) = zend_shared_alloc(memory_used + 64); if (ZCG(mem)) { diff --git a/ext/opcache/zend_accelerator_util_funcs.c b/ext/opcache/zend_accelerator_util_funcs.c index bd46a86b27..bd77a139a2 100644 --- a/ext/opcache/zend_accelerator_util_funcs.c +++ b/ext/opcache/zend_accelerator_util_funcs.c @@ -596,8 +596,74 @@ static void zend_accel_class_hash_copy(HashTable *target, HashTable *source, uni return; } -#ifdef __SSE2__ -# include +#if defined(__AVX__) +# include +# if defined(__GNUC__) && defined(__i386__) +static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size) +{ + size_t delta = (char*)dest - (char*)src; + + __asm__ volatile ( + ".align 16\n\t" + ".LL0%=:\n\t" + "prefetchnta 0x40(%1)\n\t" + "vmovaps (%1), %%ymm0\n\t" + "vmovaps 0x20(%1), %%ymm1\n\t" + "vmovaps %%ymm0, (%1,%2)\n\t" + "vmovaps %%ymm1, 0x20(%1,%2)\n\t" + "addl $0x40, %1\n\t" + "subl $0x40, %0\n\t" + "ja .LL0%=" + : "+r"(size), + "+r"(src) + : "r"(delta) + : "cc", "memory", "%ymm0", "%ymm1"); +} +# elif defined(__GNUC__) && defined(__x86_64__) +static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size) +{ + size_t delta = (char*)dest - (char*)src; + + __asm__ volatile ( + ".align 16\n\t" + ".LL0%=:\n\t" + "prefetchnta 0x40(%1)\n\t" + "vmovaps (%1), %%ymm0\n\t" + "vmovaps 0x20(%1), %%ymm1\n\t" + "vmovaps %%ymm0, (%1,%2)\n\t" + "vmovaps %%ymm1, 0x20(%1,%2)\n\t" + "addq $0x40, %1\n\t" + "subq $0x40, %0\n\t" + "ja .LL0%=" + : "+r"(size), + "+r"(src) + : "r"(delta) + : "cc", "memory", "%ymm0", "%ymm1"); +} +# else +static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size) +{ + __m256i *dqdest = (__m256i*)dest; + const __m256i *dqsrc = (const __m256i*)src; + const __m256i *end = (const __m256i*)((const char*)src + size); + + do { +#ifdef PHP_WIN32 + _mm_prefetch((const char *)(dqsrc + 2), _MM_HINT_NTA); +#else + _mm_prefetch(dqsrc + 2, _MM_HINT_NTA); +#endif + + __m256i ymm0 = _mm256_load_ps((const float *)(dqsrc + 0)); + __m256i ymm1 = _mm256_load_ps((const float *)(dqsrc + 1)); + dqsrc += 2; + _mm256_store_ps((float *)(dqdest + 0), ymm0); + _mm256_store_ps((float *)(dqdest + 1), ymm1); + dqdest += 2; + } while (dqsrc != end); +} +# endif +#elif defined(__SSE2__) # include # if defined(__GNUC__) && defined(__i386__) static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size) @@ -691,7 +757,7 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script, ZCG(current_persistent_script) = persistent_script; ZCG(arena_mem) = NULL; if (EXPECTED(persistent_script->arena_size)) { -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Target address must be aligned to 64-byte boundary */ _mm_prefetch(persistent_script->arena_mem, _MM_HINT_NTA); ZCG(arena_mem) = zend_arena_alloc(&CG(arena), persistent_script->arena_size + 64); diff --git a/ext/opcache/zend_file_cache.c b/ext/opcache/zend_file_cache.c index de54949901..e48ccc22e8 100644 --- a/ext/opcache/zend_file_cache.c +++ b/ext/opcache/zend_file_cache.c @@ -809,7 +809,7 @@ int zend_file_cache_script_store(zend_persistent_script *script, int in_shm) return FAILURE; } -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align to 64-byte boundary */ mem = emalloc(script->size + 64); buf = (void*)(((zend_uintptr_t)mem + 63L) & ~63L); @@ -1391,7 +1391,7 @@ zend_persistent_script *zend_file_cache_script_load(zend_file_handle *file_handl } checkpoint = zend_arena_checkpoint(CG(arena)); -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align to 64-byte boundary */ mem = zend_arena_alloc(&CG(arena), info.mem_size + info.str_size + 64); mem = (void*)(((zend_uintptr_t)mem + 63L) & ~63L); @@ -1452,7 +1452,7 @@ zend_persistent_script *zend_file_cache_script_load(zend_file_handle *file_handl goto use_process_mem; } -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align to 64-byte boundary */ buf = zend_shared_alloc(info.mem_size + 64); buf = (void*)(((zend_uintptr_t)buf + 63L) & ~63L); diff --git a/ext/opcache/zend_persist.c b/ext/opcache/zend_persist.c index 8c16538207..9fc6da744f 100644 --- a/ext/opcache/zend_persist.c +++ b/ext/opcache/zend_persist.c @@ -870,7 +870,7 @@ zend_persistent_script *zend_accel_script_persist(zend_persistent_script *script } zend_accel_store_interned_string(script->script.filename); -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align to 64-byte boundary */ ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + 63L) & ~63L); #else diff --git a/ext/opcache/zend_persist_calc.c b/ext/opcache/zend_persist_calc.c index eb802325b0..27d9fefb3c 100644 --- a/ext/opcache/zend_persist_calc.c +++ b/ext/opcache/zend_persist_calc.c @@ -415,7 +415,7 @@ uint32_t zend_accel_script_persist_calc(zend_persistent_script *new_persistent_s } ADD_STRING(new_persistent_script->script.filename); -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align size to 64-byte boundary */ new_persistent_script->size = (new_persistent_script->size + 63) & ~63; #endif @@ -430,7 +430,7 @@ uint32_t zend_accel_script_persist_calc(zend_persistent_script *new_persistent_s zend_hash_persist_calc(&new_persistent_script->script.function_table, zend_persist_op_array_calc); zend_persist_op_array_calc_ex(&new_persistent_script->script.main_op_array); -#ifdef __SSE2__ +#if defined(__AVX__) || defined(__SSE2__) /* Align size to 64-byte boundary */ new_persistent_script->arena_size = (new_persistent_script->arena_size + 63) & ~63; #endif