memory_used = zend_accel_script_persist_calc(new_persistent_script, NULL, 0, 0);
/* Allocate memory block */
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
ZCG(mem) = zend_arena_alloc(&CG(arena), memory_used + 64);
ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + 63L) & ~63L);
memory_used = zend_accel_script_persist_calc(new_persistent_script, key, key_length, 1);
/* Allocate shared memory */
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
ZCG(mem) = zend_shared_alloc(memory_used + 64);
if (ZCG(mem)) {
return;
}
-#ifdef __SSE2__
-# include <mmintrin.h>
+#if defined(__AVX__)
+# include <nmmintrin.h>
+# include <immintrin.h>
+# if defined(__GNUC__) && defined(__i386__)
+static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
+{ /* AVX block copy for 32-bit x86, written as GCC-style inline asm.
+   * Each loop pass moves 64 bytes from src to dest through ymm0/ymm1.
+   * The copy happens before the remaining size is tested, so at least one
+   * 64-byte chunk is always written and a size that is not a multiple of
+   * 64 is effectively rounded up to the next multiple.
+   * vmovaps is the aligned move form, so src and dest are expected to be
+   * at least 32-byte aligned.
+   * NOTE(review): no vzeroupper is issued after the ymm registers are
+   * used - confirm whether one is wanted here.
+   */
+ size_t delta = (char*)dest - (char*)src; /* byte distance from src to dest */
+
+ __asm__ volatile (
+ ".align 16\n\t" /* align the loop head */
+ ".LL0%=:\n\t" /* %= yields a number unique to this asm instance, keeping the label unique */
+ "prefetchnta 0x40(%1)\n\t" /* non-temporal prefetch 64 bytes ahead of src */
+ "vmovaps (%1), %%ymm0\n\t" /* load bytes 0..31 of the current chunk */
+ "vmovaps 0x20(%1), %%ymm1\n\t" /* load bytes 32..63 of the current chunk */
+ "vmovaps %%ymm0, (%1,%2)\n\t" /* store to src + delta, i.e. the matching dest address */
+ "vmovaps %%ymm1, 0x20(%1,%2)\n\t"
+ "addl $0x40, %1\n\t" /* src += 64 */
+ "subl $0x40, %0\n\t" /* size -= 64 */
+ "ja .LL0%=" /* loop while the subtraction left a value above zero (unsigned) */
+ : "+r"(size), /* %0: remaining byte count, read and written */
+ "+r"(src) /* %1: running source pointer, read and written */
+ : "r"(delta) /* %2: dest - src, read only */
+ : "cc", "memory", "%ymm0", "%ymm1"); /* flags, memory and both vector registers are modified */
+}
+# elif defined(__GNUC__) && defined(__x86_64__)
+static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
+{ /* AVX block copy for x86-64, written as GCC-style inline asm.
+   * Each loop pass moves 64 bytes from src to dest through ymm0/ymm1.
+   * The copy happens before the remaining size is tested, so at least one
+   * 64-byte chunk is always written and a size that is not a multiple of
+   * 64 is effectively rounded up to the next multiple.
+   * vmovaps is the aligned move form, so src and dest are expected to be
+   * at least 32-byte aligned.
+   * NOTE(review): no vzeroupper is issued after the ymm registers are
+   * used - confirm whether one is wanted here.
+   */
+ size_t delta = (char*)dest - (char*)src; /* byte distance from src to dest */
+
+ __asm__ volatile (
+ ".align 16\n\t" /* align the loop head */
+ ".LL0%=:\n\t" /* %= yields a number unique to this asm instance, keeping the label unique */
+ "prefetchnta 0x40(%1)\n\t" /* non-temporal prefetch 64 bytes ahead of src */
+ "vmovaps (%1), %%ymm0\n\t" /* load bytes 0..31 of the current chunk */
+ "vmovaps 0x20(%1), %%ymm1\n\t" /* load bytes 32..63 of the current chunk */
+ "vmovaps %%ymm0, (%1,%2)\n\t" /* store to src + delta, i.e. the matching dest address */
+ "vmovaps %%ymm1, 0x20(%1,%2)\n\t"
+ "addq $0x40, %1\n\t" /* src += 64 (64-bit add) */
+ "subq $0x40, %0\n\t" /* size -= 64 (64-bit subtract) */
+ "ja .LL0%=" /* loop while the subtraction left a value above zero (unsigned) */
+ : "+r"(size), /* %0: remaining byte count, read and written */
+ "+r"(src) /* %1: running source pointer, read and written */
+ : "r"(delta) /* %2: dest - src, read only */
+ : "cc", "memory", "%ymm0", "%ymm1"); /* flags, memory and both vector registers are modified */
+}
+# else
+/*
+ * AVX block copy built on compiler intrinsics; used when neither of the
+ * GCC inline-asm variants applies.
+ *
+ * Copies `size` bytes from `src` to `dest` in 64-byte chunks (two 32-byte
+ * aligned loads followed by two 32-byte aligned stores per pass).
+ *
+ * dest, src: must be 32-byte aligned, because the aligned load/store
+ *            intrinsics are used.
+ * size:      expected to be a non-zero multiple of 64. The loop body runs
+ *            before the bound is checked, so at least 64 bytes are always
+ *            copied and any other size is rounded up to the next multiple
+ *            of 64 - the same behaviour as the inline-asm variants.
+ */
+static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
+{
+	/* __m256 is the type _mm256_load_ps() returns and _mm256_store_ps()
+	 * accepts; __m256i is a distinct type and is rejected by strict
+	 * compilers. Both are 32 bytes wide, so pointer stepping is unchanged. */
+	__m256 *dqdest = (__m256*)dest;
+	const __m256 *dqsrc = (const __m256*)src;
+	const __m256 *end = (const __m256*)((const char*)src + size);
+
+	do {
+		__m256 ymm0, ymm1;
+
+		/* Prefetch the next 64-byte chunk with a non-temporal hint. The
+		 * (const char *) cast satisfies both the `const char *` and the
+		 * `const void *` prototypes of _mm_prefetch(). */
+		_mm_prefetch((const char *)(dqsrc + 2), _MM_HINT_NTA);
+
+		ymm0 = _mm256_load_ps((const float *)(dqsrc + 0));
+		ymm1 = _mm256_load_ps((const float *)(dqsrc + 1));
+		dqsrc += 2;
+		_mm256_store_ps((float *)(dqdest + 0), ymm0);
+		_mm256_store_ps((float *)(dqdest + 1), ymm1);
+		dqdest += 2;
+		/* `<` rather than `!=`: still terminates if size is 0 or not a
+		 * multiple of 64, instead of running past the end forever. */
+	} while (dqsrc < end);
+}
+# endif
+#elif defined(__SSE2__)
# include <emmintrin.h>
# if defined(__GNUC__) && defined(__i386__)
static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
ZCG(current_persistent_script) = persistent_script;
ZCG(arena_mem) = NULL;
if (EXPECTED(persistent_script->arena_size)) {
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Target address must be aligned to 64-byte boundary */
_mm_prefetch(persistent_script->arena_mem, _MM_HINT_NTA);
ZCG(arena_mem) = zend_arena_alloc(&CG(arena), persistent_script->arena_size + 64);
return FAILURE;
}
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
mem = emalloc(script->size + 64);
buf = (void*)(((zend_uintptr_t)mem + 63L) & ~63L);
}
checkpoint = zend_arena_checkpoint(CG(arena));
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
mem = zend_arena_alloc(&CG(arena), info.mem_size + info.str_size + 64);
mem = (void*)(((zend_uintptr_t)mem + 63L) & ~63L);
goto use_process_mem;
}
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
buf = zend_shared_alloc(info.mem_size + 64);
buf = (void*)(((zend_uintptr_t)buf + 63L) & ~63L);
}
ADD_STRING(new_persistent_script->script.filename);
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align size to 64-byte boundary */
new_persistent_script->size = (new_persistent_script->size + 63) & ~63;
#endif
zend_hash_persist_calc(&new_persistent_script->script.function_table, zend_persist_op_array_calc);
zend_persist_op_array_calc_ex(&new_persistent_script->script.main_op_array);
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align size to 64-byte boundary */
new_persistent_script->arena_size = (new_persistent_script->arena_size + 63) & ~63;
#endif