granicus.if.org Git - libx264/commitdiff
x86: AVX-512 pixel_avg_weight_w8
author: Henrik Gramner <henrik@gramner.com>
Sat, 24 Jun 2017 13:12:57 +0000 (15:12 +0200)
committer: Henrik Gramner <henrik@gramner.com>
Mon, 26 Jun 2017 19:07:29 +0000 (21:07 +0200)
common/x86/mc-a.asm
common/x86/mc-c.c

index 2dbdee5d8534ac0a348ab1e5401610ed3ed42480..3c1d214547279177455730a452f32abd294492be 100644 (file)
@@ -276,6 +276,38 @@ cglobal pixel_avg_weight_w16
     vextracti128 [t0+t1], m0, 1
     AVG_END
 
+INIT_YMM avx512                        ; build the following with 256-bit (ymm) m# registers for the avx512 target
+cglobal pixel_avg_weight_w8            ; weighted average of two 8-pixel-wide sources; each loop pass writes 4 rows to [t0] (row stride t1)
+    BIWEIGHT_START                     ; macro defined outside this view; NOTE(review): presumably sets up m3/m4, which are only read below - confirm
+    kxnorb         k1, k1, k1          ; k1 = 0xFF (x XNOR x is all ones)
+    kaddb          k1, k1, k1          ; k1 = 0xFF + 0xFF = 0xFE in 8 bits: every qword lane enabled except lane 0
+    AVG_START 5                        ; macro defined outside this view; NOTE(review): presumably the shared loop prologue - confirm
+.height_loop:
+    movq          xm0, [t2]            ; low qword of m0 = row 0 of the first source (pointer t2, row stride t3)
+    movq          xm2, [t4]            ; low qword of m2 = row 0 of the second source (pointer t4, row stride t5)
+    movq          xm1, [t2+t3]         ; low qword of m1 = row 1 of the first source
+    movq          xm5, [t4+t5]         ; low qword of m5 = row 1 of the second source
+    lea            t2, [t2+t3*2]       ; step the first source down two rows (now at row 2)
+    lea            t4, [t4+t5*2]       ; step the second source down two rows (now at row 2)
+    vpbroadcastq   m0 {k1}, [t2]       ; merge-masked: qword 0 keeps row 0, qwords 1-3 receive row 2 (first source)
+    vpbroadcastq   m2 {k1}, [t4]       ; same layout for the second source: row 0 | row 2 x3
+    vpbroadcastq   m1 {k1}, [t2+t3]    ; qword 0 keeps row 1, qwords 1-3 receive row 3 (first source)
+    vpbroadcastq   m5 {k1}, [t4+t5]    ; same layout for the second source: row 1 | row 3 x3
+    punpcklbw      m0, m2              ; per 128-bit lane, interleave bytes of both sources: lane 0 = row 0, lane 1 = row 2
+    punpcklbw      m1, m5              ; likewise: lane 0 = row 1, lane 1 = row 3
+    pmaddubsw      m0, m3              ; unsigned pixel pair x signed byte pair from m3, products summed into 16-bit words
+    pmaddubsw      m1, m3
+    pmulhrsw       m0, m4              ; rounded high-half multiply by m4 scales the 16-bit sums back down
+    pmulhrsw       m1, m4
+    packuswb       m0, m1              ; saturate to bytes per lane: lane 0 = row 0 | row 1, lane 1 = row 2 | row 3
+    vextracti128 xmm1, m0, 1           ; upper lane (rows 2 and 3) into literal xmm1; NOTE(review): xmm1 rather than xm1 is assumed deliberate - confirm against x86inc's avx512 register mapping
+    movq         [t0], xm0             ; store row 0
+    movhps    [t0+t1], xm0             ; store row 1 (high qword of the low lane)
+    lea            t0, [t0+t1*2]       ; step the destination down two rows
+    movq         [t0], xmm1            ; store row 2
+    movhps    [t0+t1], xmm1            ; store row 3
+    AVG_END 4                          ; macro defined outside this view; NOTE(review): presumably finishes a 4-row step and branches back to .height_loop - confirm
+
 INIT_ZMM avx512
 cglobal pixel_avg_weight_w16
     BIWEIGHT_START
@@ -776,6 +808,9 @@ AVGH 16,  8
 INIT_XMM avx512                        ; the AVGH instantiations below are built for the avx512 target with xmm-sized m# registers
 AVGH 16, 16                            ; AVGH is defined outside this view; NOTE(review): arguments look like block width, height - confirm
 AVGH 16,  8
+AVGH  8, 16                            ; new 8-wide entries; mc-c.c in this commit hooks up x264_pixel_avg_8x16_avx512,
+AVGH  8,  8                            ;   x264_pixel_avg_8x8_avx512 and
+AVGH  8,  4                            ;   x264_pixel_avg_8x4_avx512 - presumably the symbols these three lines emit
 
 %endif ;HIGH_BIT_DEPTH
 
index 0a7e414c5c40058950f2ca486d924867c299e6f9..c06691c9526230fe8dd8f7fd77de894b0b386f4c 100644 (file)
@@ -871,6 +871,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     {
         pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
         pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_avx512;
+        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_avx512;
+        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_avx512;
+        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_avx512;
     }
 #endif // HIGH_BIT_DEPTH