From: Fritz Koenig
Date: Tue, 14 Sep 2010 01:34:34 +0000 (-0700)
Subject: Removed unnecessary pxor.
X-Git-Tag: v0.9.5~105
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=769f2424ccce47c491913c38b06581aa777a53c0;p=libvpx

Removed unnecessary pxor.

There is no need to make sure that the lower byte of the register is 0
because the downshift by 11 overwrites that byte.

Change-Id: I89cbf004b2ff532a2c68e0dc399c45a49cdad5a1
---

diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 985d5a09d..57276b661 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -196,12 +196,12 @@
         pxor        xmm7, [t80 GLOBAL]      ; q1 offset to convert to signed values
         psubsb      xmm2, xmm7              ; p1 - q1
-        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
 
         pxor        xmm6, [t80 GLOBAL]      ; offset to convert to signed values
+        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
 
         pxor        xmm0, [t80 GLOBAL]      ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
 
+        movdqa      xmm3, xmm0              ; q0
         psubsb      xmm0, xmm6              ; q0 - p0
         paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
         paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
@@ -211,29 +211,28 @@
         paddsb      xmm1, [t4 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 4
         paddsb      xmm2, [t3 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 3
 
-        pxor        xmm0, xmm0
-        pxor        xmm5, xmm5
-        punpcklbw   xmm0, xmm2
-        punpckhbw   xmm5, xmm2
-        psraw       xmm0, 11
-        psraw       xmm5, 11
-        packsswb    xmm0, xmm5
-        movdqa      xmm2, xmm0              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        xmm0, xmm0              ; 0
-        movdqa      xmm5, xmm1              ; abcdefgh
-        punpcklbw   xmm0, xmm1              ; e0f0g0h0
+        punpckhbw   xmm5, xmm2              ; axbxcxdx
+        punpcklbw   xmm2, xmm2              ; exfxgxhx
+
+        psraw       xmm5, 11                ; sign extended shift right by 3
+        psraw       xmm2, 11                ; sign extended shift right by 3
+        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        punpcklbw   xmm0, xmm1              ; exfxgxhx
+        punpckhbw   xmm1, xmm1              ; axbxcxdx
+
         psraw       xmm0, 11                ; sign extended shift right by 3
-        pxor        xmm1, xmm1              ; 0
-        punpckhbw   xmm1, xmm5              ; a0b0c0d0
         psraw       xmm1, 11                ; sign extended shift right by 3
-        movdqa      xmm5, xmm0              ; save results
 
+        movdqa      xmm5, xmm0              ; save results
         packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
 
+        paddsw      xmm5, [ones GLOBAL]
         paddsw      xmm1, [ones GLOBAL]
+        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap
         psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap
 
+        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
         pandn       xmm4, xmm5              ; high edge variance additive
 
 %endmacro
@@ -433,29 +432,27 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
         pand        xmm2, xmm4;             ; Filter2 = vp8_filter & hev
 
         movdqa      xmm5, xmm2
-        paddsb      xmm5, [t3 GLOBAL]
+        paddsb      xmm5, [t3 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 3)
+
+        punpckhbw   xmm7, xmm5              ; axbxcxdx
+        punpcklbw   xmm5, xmm5              ; exfxgxhx
 
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
-        punpcklbw   xmm0, xmm5              ; e0f0g0h0
-        psraw       xmm0, 11                ; sign extended shift right by 3
-        punpckhbw   xmm7, xmm5              ; a0b0c0d0
         psraw       xmm7, 11                ; sign extended shift right by 3
-        packsswb    xmm0, xmm7              ; Filter2 >>=3;
-        movdqa      xmm5, xmm0              ; Filter2
-        paddsb      xmm2, [t4 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 4)
+        psraw       xmm5, 11                ; sign extended shift right by 3
+
+        packsswb    xmm5, xmm7              ; Filter2 >>=3;
+        paddsb      xmm2, [t4 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 4)
+
+        punpckhbw   xmm7, xmm2              ; axbxcxdx
+        punpcklbw   xmm0, xmm2              ; exfxgxhx
 
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
-        punpcklbw   xmm0, xmm2              ; e0f0g0h0
-        psraw       xmm0, 11                ; sign extended shift right by 3
-        punpckhbw   xmm7, xmm2              ; a0b0c0d0
         psraw       xmm7, 11                ; sign extended shift right by 3
-        packsswb    xmm0, xmm7              ; Filter2 >>=3;
+        psraw       xmm0, 11                ; sign extended shift right by 3
 
-        psubsb      xmm3, xmm0              ; qs0 =qs0 - filter1
+        packsswb    xmm0, xmm7              ; Filter2 >>=3;
         paddsb      xmm6, xmm5              ; ps0 =ps0 + Fitler2
+        psubsb      xmm3, xmm0              ; qs0 =qs0 - filter1
 
         pandn       xmm4, xmm1              ; vp8_filter&=~hev
 
 %endmacro
@@ -465,7 +462,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
 ;        *oq0 = s^0x80;
 ;        s = vp8_signed_char_clamp(ps0 + u);
 ;        *op0 = s^0x80;
-        pxor        xmm0, xmm0
         pxor        xmm1, xmm1
         pxor        xmm2, xmm2
 
@@ -1022,28 +1018,19 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
         paddsb      xmm1, [t4 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 4
         paddsb      xmm2, [t3 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 3
 
-        pxor        xmm0, xmm0
-
-        pxor        xmm5, xmm5
-        punpcklbw   xmm0, xmm2
         punpckhbw   xmm5, xmm2
-        psraw       xmm0, 11
+        punpcklbw   xmm2, xmm2
 
         psraw       xmm5, 11
-        packsswb    xmm0, xmm5
+        psraw       xmm2, 11
 
-        movdqa      xmm2, xmm0              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        punpcklbw   xmm0, xmm1              ; exfxgxhx
 
-        pxor        xmm0, xmm0              ; 0
-        movdqa      xmm5, xmm1              ; abcdefgh
-
-        punpcklbw   xmm0, xmm1              ; e0f0g0h0
+        punpckhbw   xmm1, xmm1              ; axbxcxdx
         psraw       xmm0, 11                ; sign extended shift right by 3
 
-        pxor        xmm1, xmm1              ; 0
-        punpckhbw   xmm1, xmm5              ; a0b0c0d0
-
         psraw       xmm1, 11                ; sign extended shift right by 3
 
         movdqa      xmm5, xmm0              ; save results
 
@@ -1308,28 +1295,22 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
         movdqa      xmm5, xmm2
         paddsb      xmm5, [t3 GLOBAL]
 
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
-
-        punpcklbw   xmm0, xmm5              ; e0f0g0h0
-        psraw       xmm0, 11                ; sign extended shift right by 3
+        punpckhbw   xmm7, xmm5              ; axbxcxdx
+        punpcklbw   xmm5, xmm5              ; exfxgxhx
 
-        punpckhbw   xmm7, xmm5              ; a0b0c0d0
         psraw       xmm7, 11                ; sign extended shift right by 3
+        psraw       xmm5, 11                ; sign extended shift right by 3
 
-        packsswb    xmm0, xmm7              ; Filter2 >>=3;
-        movdqa      xmm5, xmm0              ; Filter2
+        packsswb    xmm5, xmm7              ; Filter2 >>=3;
 
         paddsb      xmm2, [t4 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 4)
 
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
-        punpcklbw   xmm0, xmm2              ; e0f0g0h0
+        punpcklbw   xmm0, xmm2              ; exfxgxhx
+        punpckhbw   xmm7, xmm2              ; axbxcxdx
 
         psraw       xmm0, 11                ; sign extended shift right by 3
-        punpckhbw   xmm7, xmm2              ; a0b0c0d0
-
         psraw       xmm7, 11                ; sign extended shift right by 3
 
+        packsswb    xmm0, xmm7              ; Filter2 >>=3;
 
         psubsb      xmm3, xmm0              ; qs0 =qs0 - filter1
@@ -1344,7 +1325,6 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
 ;        *oq0 = s^0x80;
 ;        s = vp8_signed_char_clamp(ps0 + u);
 ;        *op0 = s^0x80;
-        pxor        xmm0, xmm0
         pxor        xmm1, xmm1
         pxor        xmm2, xmm2
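
The pattern every hunk above rewrites is the same: SSE2 has no per-byte arithmetic shift, so each signed byte is widened to a 16-bit word (punpcklbw/punpckhbw), shifted with psraw, and packed back down with packsswb. The old sequence zeroed a scratch register with pxor so that each word held the byte in its high half and 0x00 in its low half; unpacking the register against itself instead leaves a copy of the byte in the low half, but the arithmetic shift by 11 moves those 8 bits out of the word entirely, so the result is unchanged and the pxor is dead work. Below is a minimal C sketch of the same idea using SSE2 intrinsics; it is an illustration, not libvpx code, and the function name and test data are invented:

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stdint.h>
    #include <stdio.h>

    /* Per-byte arithmetic shift right by 3, the idiom used by the
     * loop-filter macros: widen bytes to words, psraw, then repack
     * with signed saturation (packsswb). */
    static __m128i per_byte_sar3(__m128i v)
    {
        /* Old form: interleave with an explicitly zeroed register so
         * each word holds (b << 8):
         *     __m128i zero = _mm_setzero_si128();         -- pxor
         *     __m128i lo   = _mm_unpacklo_epi8(zero, v);
         * New form: interleave v with itself.  Each word now holds
         * (b << 8) | b, but psraw by 11 shifts the low byte out, so
         * the zeroing was never needed. */
        __m128i lo = _mm_unpacklo_epi8(v, v);
        __m128i hi = _mm_unpackhi_epi8(v, v);
        lo = _mm_srai_epi16(lo, 11);   /* ((b << 8) | b) >> 11 == b >> 3 */
        hi = _mm_srai_epi16(hi, 11);
        return _mm_packs_epi16(lo, hi);
    }

    int main(void)
    {
        int8_t in[16] = { -128, -100, -9, -8, -7, -1, 0, 1,
                          7, 8, 9, 33, 64, 100, 127, -64 };
        int8_t out[16];

        _mm_storeu_si128((__m128i *)out,
                         per_byte_sar3(_mm_loadu_si128((const __m128i *)in)));

        for (int i = 0; i < 16; i++)   /* e.g. -9 becomes -2, 127 becomes 15 */
            printf("%4d >> 3 = %4d\n", in[i], out[i]);
        return 0;
    }

The >> 3 corresponds to the "sign extended shift right by 3" comments in the diff; the shift count of 11 is 8 (to undo the byte-to-word widening) plus 3.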