From: Henrik Gramner Date: Fri, 30 Mar 2018 23:31:57 +0000 (+0200) Subject: x86: Correctly use v-prefix for instructions with opmasks X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5f7f950c80e330728ecb07bc133e17456870121a;p=libx264 x86: Correctly use v-prefix for instructions with opmasks This was always required, but accidentally happened to work correctly in a few cases. --- diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index ca842ee9..01311c49 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -621,8 +621,8 @@ cglobal sub16x16_dct, 3,3,6 SBUTTERFLY wd, 1, 0, 2 paddw m2, m1, m0 psubw m3, m1, m0 - paddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1 - psubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3 + vpaddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1 + vpsubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3 shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2 punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1 SUMSUB_BA w, 1, 2, 3 @@ -630,8 +630,8 @@ cglobal sub16x16_dct, 3,3,6 shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3 paddw m2, m1, m3 psubw m0, m1, m3 - paddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1 - psubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3' + vpaddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1 + vpsubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3' %endmacro INIT_XMM avx512 @@ -743,7 +743,7 @@ cglobal sub8x8_dct_dc, 3,3 paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3 punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3 punpcklqdq xmm1, xmm0, xmm0 - psubw xmm0 {k1}, xm3, xmm0 + vpsubw xmm0 {k1}, xm3, xmm0 paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3 movhps [r0], xmm0 RET diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index e3658f23..3c031313 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -2518,8 +2518,8 @@ cglobal mbtree_propagate_list_internal, 5,7,21 paddd m6, m7 ; i_mb_x += 8 pand m3, m8 ; {x, y} vprold m1, m3, 20 ; {y, x} << 4 - psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y} - psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4 + vpsubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y} + vpsubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4 pmullw m3, m1 paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000) pmulhrsw m2, m3, m4 ; idx01weight idx23weightp @@ -2530,11 +2530,11 @@ cglobal mbtree_propagate_list_internal, 5,7,21 vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width kunpckwd k2, k2, k2 psrad m1, m0, 16 - paddd m1 {k6}, m11 + vpaddd m1 {k6}, m11 vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height pmaddwd m0, m15 - paddd m0 {k6}, m14 ; idx0 | idx2 + vpaddd m0 {k6}, m14 ; idx0 | idx2 vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 4271c921..04fe7099 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -4744,7 +4744,7 @@ cglobal intra_sad_x9_8x8, 5,7,8 %endmacro %macro SATD_AVX512_END 0-1 0 ; sa8d - paddw m0 {k1}{z}, m1 ; zero-extend to dwords + vpaddw m0 {k1}{z}, m1 ; zero-extend to dwords %if ARCH_X86_64 %if mmsize == 64 vextracti32x8 ym1, m0, 1