From: Fiona Glaser Date: Fri, 6 Feb 2009 10:59:36 +0000 (-0800) Subject: More intra pred asm optimizations X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0743869d0743d47dc93c22b6d55ca84e1851ebc2;p=libx264 More intra pred asm optimizations SSSE3 version of predict_8x8_hu SSE2 version of predict_8x8c_p SSSE3 versions of both planar prediction functions Optimizations to predict_16x16_p_sse2 Some unnecessary REP_RETs -> RETs. SSE2 version of predict_8x8_vr by Holger. SSE2 version of predict_8x8_hd. Don't compile MMX versions of some of the pred functions on x86_64. Remove now-useless x86_64 C versions of 4x4 pred functions. Rewrite some of the x86_64-only C functions in asm. --- diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 5596189e..899c24c8 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -4,6 +4,7 @@ ;* Copyright (C) 2005-2008 x264 project ;* ;* Authors: Loren Merritt +;* Holger Lubitz ;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify @@ -86,6 +87,8 @@ pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff +pw_ff00: times 8 dw 0xff00 +pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0 SECTION .text @@ -107,6 +110,20 @@ SECTION .text PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa %endmacro +%macro LOAD_PLANE_ARGS 0 +%ifdef ARCH_X86_64 + movd mm0, r1d + movd mm2, r2d + movd mm4, r3d + pshufw mm0, mm0, 0 + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%else + pshufw mm0, r1m, 0 + pshufw mm2, r2m, 0 + pshufw mm4, r3m, 0 +%endif +%endmacro ;----------------------------------------------------------------------------- ; void predict_4x4_ddl_mmxext( uint8_t *src ) @@ -141,7 +158,7 @@ cglobal predict_4x4_ddr_%1, 1,1 punpckhbw mm2, [r0-1*FDEC_STRIDE-8] movd mm3, [r0-1*FDEC_STRIDE] punpckhwd mm1, mm2 - PALIGNR mm3, mm1, 5, mm4 + PALIGNR mm3, mm1, 5, mm1 movq mm1, mm3 PALIGNR mm3, [r0+2*FDEC_STRIDE-8], 7, mm4 movq mm2, mm3 @@ -175,7 +192,7 @@ cglobal predict_4x4_vr_%1, 1,1 PALIGNR mm7, mm1, 7, mm2 psllq mm1, 8 movd [r0+2*FDEC_STRIDE], mm7 - PALIGNR mm3, mm1, 7, mm2 + PALIGNR mm3, mm1, 7, mm1 movd [r0+3*FDEC_STRIDE], mm3 RET @@ -539,6 +556,156 @@ cglobal predict_8x8_ddr_mmxext, 2,2 movq [r0+Y*FDEC_STRIDE], mm0 RET +;----------------------------------------------------------------------------- +; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +cglobal predict_8x8_hu_mmxext, 2,2 + movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 + add r0, 4*FDEC_STRIDE + pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 + psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
+ movq mm2, mm0 + psllw mm0, 8 + psrlw mm2, 8 + por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 + movq mm3, mm2 + movq mm4, mm2 + movq mm5, mm2 + psrlq mm2, 8 + psrlq mm3, 16 + por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 + punpckhbw mm1, mm1 + por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 + pavgb mm4, mm2 + PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 + movq mm5, mm4 + punpcklbw mm4, mm1 ; p4 p3 p2 p1 + punpckhbw mm5, mm1 ; p8 p7 p6 p5 + movq mm6, mm5 + movq mm7, mm5 + movq mm0, mm5 + PALIGNR mm5, mm4, 2, mm1 + pshufw mm1, mm6, 11111001b + PALIGNR mm6, mm4, 4, mm2 + pshufw mm2, mm7, 11111110b + PALIGNR mm7, mm4, 6, mm4 + pshufw mm3, mm0, 11111111b + movq [r0-4*FDEC_STRIDE], mm4 + movq [r0-3*FDEC_STRIDE], mm5 + movq [r0-2*FDEC_STRIDE], mm6 + movq [r0-1*FDEC_STRIDE], mm7 + movq [r0+0*FDEC_STRIDE], mm0 + movq [r0+1*FDEC_STRIDE], mm1 + movq [r0+2*FDEC_STRIDE], mm2 + movq [r0+3*FDEC_STRIDE], mm3 + RET + +;----------------------------------------------------------------------------- +; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- + +; fills only some pixels: +; f01234567 +; 0........ +; 1,,,,,,,, +; 2 ....... +; 3 ,,,,,,, +; 4 ...... +; 5 ,,,,,, +; 6 ..... +; 7 ,,,,, + +cglobal predict_8x8_vr_core_mmxext, 2,2 + movq mm2, [r1+16] + movq mm3, [r1+15] + movq mm1, [r1+14] + movq mm4, mm3 + pavgb mm3, mm2 + PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7 + +%assign Y 0 +%rep 3 + movq [r0+ Y *FDEC_STRIDE], mm3 + movq [r0+(Y+1)*FDEC_STRIDE], mm0 + psllq mm3, 8 + psllq mm0, 8 +%assign Y (Y+2) +%endrep + movq [r0+ Y *FDEC_STRIDE], mm3 + movq [r0+(Y+1)*FDEC_STRIDE], mm0 + + RET + +;----------------------------------------------------------------------------- +; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ) +;----------------------------------------------------------------------------- +cglobal predict_8x8c_p_core_mmxext, 1,2 + LOAD_PLANE_ARGS + movq mm1, mm2 + pmullw mm2, [pw_3210 GLOBAL] + psllw mm1, 2 + paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} + paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} + + mov r1d, 8 +ALIGN 4 +.loop: + movq mm5, mm0 + movq mm6, mm1 + psraw mm5, 5 + psraw mm6, 5 + packuswb mm5, mm6 + movq [r0], mm5 + + paddsw mm0, mm4 + paddsw mm1, mm4 + add r0, FDEC_STRIDE + dec r1d + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c ) +;----------------------------------------------------------------------------- +cglobal predict_16x16_p_core_mmxext, 1,2 + LOAD_PLANE_ARGS + movq mm5, mm2 + movq mm1, mm2 + pmullw mm5, [pw_3210 GLOBAL] + psllw mm2, 3 + psllw mm1, 2 + movq mm3, mm2 + paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} + paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} + paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} + paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} + + mov r1d, 16 +ALIGN 4 +.loop: + movq mm5, mm0 + movq mm6, mm1 + psraw mm5, 5 + psraw mm6, 5 + packuswb mm5, mm6 + movq [r0], mm5 + + movq mm5, mm2 + movq mm6, mm3 + psraw mm5, 5 + psraw mm6, 5 + packuswb mm5, mm6 + movq [r0+8], mm5 + + paddsw mm0, mm4 + paddsw mm1, mm4 + paddsw mm2, mm4 + paddsw mm3, mm4 + add r0, FDEC_STRIDE + dec r1d + jg .loop + REP_RET + %endif ; !ARCH_X86_64 ;----------------------------------------------------------------------------- @@ -614,42 +781,40 @@ cglobal predict_8x8_vl_sse2, 2,2 RET 
;----------------------------------------------------------------------------- -; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge ) +; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- - -; fills only some pixels: -; f01234567 -; 0........ -; 1,,,,,,,, -; 2 ....... -; 3 ,,,,,,, -; 4 ...... -; 5 ,,,,,, -; 6 ..... -; 7 ,,,,, - -cglobal predict_8x8_vr_core_mmxext, 2,2 - movq mm2, [r1+16] - movq mm3, [r1+15] - movq mm1, [r1+14] - movq mm4, mm3 - pavgb mm3, mm2 - PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7 - -%assign Y 0 +cglobal predict_8x8_vr_sse2, 2,2 + movdqu xmm0, [r1+8] + movdqa xmm6, [pw_ff00 GLOBAL] + add r0, 4*FDEC_STRIDE + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pslldq xmm0, 1 + pslldq xmm1, 2 + pavgb xmm2, xmm0 + PRED8x8_LOWPASS_XMM xmm4, xmm3, xmm1, xmm0, xmm5 + pandn xmm6, xmm4 + movdqa xmm5, xmm4 + psrlw xmm4, 8 + packuswb xmm6, xmm4 + movhlps xmm4, xmm6 + movhps [r0-3*FDEC_STRIDE], xmm5 + movhps [r0-4*FDEC_STRIDE], xmm2 + psrldq xmm5, 4 + movss xmm5, xmm6 + psrldq xmm2, 4 + movss xmm2, xmm4 +%assign Y 3 %rep 3 - movq [r0+ Y *FDEC_STRIDE], mm3 - movq [r0+(Y+1)*FDEC_STRIDE], mm0 - psllq mm3, 8 - psllq mm0, 8 -%assign Y (Y+2) + psrldq xmm5, 1 + psrldq xmm2, 1 + movq [r0+Y*FDEC_STRIDE], xmm5 + movq [r0+(Y-1)*FDEC_STRIDE], xmm2 +%assign Y (Y-2) %endrep - movq [r0+ Y *FDEC_STRIDE], mm3 - movq [r0+(Y+1)*FDEC_STRIDE], mm0 - RET - ;----------------------------------------------------------------------------- ; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- @@ -684,7 +849,7 @@ cglobal predict_8x8_hd_mmxext, 2,2 movq [r0+2*FDEC_STRIDE], mm7 PALIGNR mm1, mm3, 4, mm5 movq [r0+1*FDEC_STRIDE], mm1 - PALIGNR mm0, mm3, 6, mm5 + PALIGNR mm0, mm3, 6, mm3 movq [r0+0*FDEC_STRIDE], mm0 movq mm2, mm6 movq mm3, mm6 @@ -693,23 +858,24 @@ cglobal predict_8x8_hd_mmxext, 2,2 movq [r0-2*FDEC_STRIDE], mm6 PALIGNR mm2, mm4, 4, mm5 movq [r0-3*FDEC_STRIDE], mm2 - PALIGNR mm3, mm4, 6, mm5 + PALIGNR mm3, mm4, 6, mm4 movq [r0-4*FDEC_STRIDE], mm3 RET ;----------------------------------------------------------------------------- ; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_hd_ssse3, 2,2 +%macro PREDICT_8x8_HD 1 +cglobal predict_8x8_hd_%1, 2,2 add r0, 4*FDEC_STRIDE movdqa xmm0, [r1] movdqa xmm1, [r1+16] movdqa xmm2, xmm1 movdqa xmm3, xmm1 - palignr xmm1, xmm0, 7 - palignr xmm2, xmm0, 9 - palignr xmm3, xmm0, 8 - movdqa xmm4, xmm1 + PALIGNR xmm1, xmm0, 7, xmm4 + PALIGNR xmm2, xmm0, 9, xmm5 + PALIGNR xmm3, xmm0, 8, xmm0 + movdqa xmm4, xmm1 pavgb xmm4, xmm3 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5 punpcklbw xmm4, xmm0 @@ -726,63 +892,41 @@ cglobal predict_8x8_hd_ssse3, 2,2 movq [r0+(Y)*FDEC_STRIDE], xmm4 movq [r0+(Y-4)*FDEC_STRIDE], xmm0 RET +%endmacro -;----------------------------------------------------------------------------- -; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_hu_mmxext, 2,2 - movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 - add r0, 4*FDEC_STRIDE - pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 - psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
- movq mm2, mm0 - psllw mm0, 8 - psrlw mm2, 8 - por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 - movq mm3, mm2 - movq mm4, mm2 - movq mm5, mm2 - psrlq mm2, 8 - psrlq mm3, 16 - por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 - punpckhbw mm1, mm1 - por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 - pavgb mm4, mm2 - PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 - movq mm5, mm4 - punpcklbw mm4, mm1 ; p4 p3 p2 p1 - punpckhbw mm5, mm1 ; p8 p7 p6 p5 - movq mm6, mm5 - movq mm7, mm5 - movq mm0, mm5 - PALIGNR mm5, mm4, 2, mm1 - pshufw mm1, mm6, 11111001b - PALIGNR mm6, mm4, 4, mm2 - pshufw mm2, mm7, 11111110b - PALIGNR mm7, mm4, 6, mm3 - pshufw mm3, mm0, 11111111b - movq [r0-4*FDEC_STRIDE], mm4 - movq [r0-3*FDEC_STRIDE], mm5 - movq [r0-2*FDEC_STRIDE], mm6 - movq [r0-1*FDEC_STRIDE], mm7 - movq [r0+0*FDEC_STRIDE], mm0 - movq [r0+1*FDEC_STRIDE], mm1 - movq [r0+2*FDEC_STRIDE], mm2 - movq [r0+3*FDEC_STRIDE], mm3 - RET +INIT_XMM +PREDICT_8x8_HD sse2 +%define PALIGNR PALIGNR_SSSE3 +PREDICT_8x8_HD ssse3 +INIT_MMX +%define PALIGNR PALIGNR_MMX ;----------------------------------------------------------------------------- ; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_hu_sse2, 2,2 - movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 +%macro PREDICT_8x8_HU 1 +cglobal predict_8x8_hu_%1, 2,2 add r0, 4*FDEC_STRIDE +%ifidn %1, ssse3 + movq mm5, [r1+7] + movq mm6, [pb_reverse GLOBAL] + movq mm1, mm5 + movq mm2, mm5 + movq mm3, mm5 + pshufb mm5, mm6 + psrlq mm6, 8 + pshufb mm2, mm6 + psrlq mm6, 8 + pshufb mm3, mm6 + movq mm4, mm5 +%else + movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 - psllq mm1, 56 ; l7 .. .. .. .. .. .. .. movq mm2, mm0 psllw mm0, 8 psrlw mm2, 8 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 + psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
movq mm3, mm2 movq mm4, mm2 movq mm5, mm2 @@ -791,30 +935,33 @@ cglobal predict_8x8_hu_sse2, 2,2 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 punpckhbw mm1, mm1 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 +%endif pavgb mm4, mm2 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 movq2dq xmm0, mm4 movq2dq xmm1, mm1 punpcklbw xmm0, xmm1 - - movhlps xmm4, xmm0 - pshuflw xmm5, xmm4, 11111001b - pshuflw xmm6, xmm4, 11111110b - pshuflw xmm7, xmm4, 11111111b + punpckhbw mm4, mm1 %assign Y -4 %rep 3 movq [r0+Y*FDEC_STRIDE], xmm0 psrldq xmm0, 2 %assign Y (Y+1) %endrep + pshufw mm5, mm4, 11111001b + pshufw mm6, mm4, 11111110b + pshufw mm7, mm4, 11111111b movq [r0+Y*FDEC_STRIDE], xmm0 - movq [r0+0*FDEC_STRIDE], xmm4 - movq [r0+1*FDEC_STRIDE], xmm5 - movq [r0+2*FDEC_STRIDE], xmm6 - movq [r0+3*FDEC_STRIDE], xmm7 + movq [r0+0*FDEC_STRIDE], mm4 + movq [r0+1*FDEC_STRIDE], mm5 + movq [r0+2*FDEC_STRIDE], mm6 + movq [r0+3*FDEC_STRIDE], mm7 RET +%endmacro +PREDICT_8x8_HU sse2 +PREDICT_8x8_HU ssse3 ;----------------------------------------------------------------------------- ; void predict_8x8c_v_mmx( uint8_t *src ) @@ -885,90 +1032,65 @@ cglobal predict_8x8c_dc_core_mmxext, 1,1 STORE8x8 mm0, mm2 RET -%macro LOAD_PLANE_ARGS 0 -%ifdef ARCH_X86_64 - movd mm0, r1d - movd mm2, r2d - movd mm4, r3d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 -%else - pshufw mm0, r1m, 0 - pshufw mm2, r2m, 0 - pshufw mm4, r3m, 0 -%endif -%endmacro - -;----------------------------------------------------------------------------- -; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ) -;----------------------------------------------------------------------------- -cglobal predict_8x8c_p_core_mmxext, 1,2 - LOAD_PLANE_ARGS - movq mm1, mm2 - pmullw mm2, [pw_3210 GLOBAL] - psllw mm1, 2 - paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} - paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} - - mov r1d, 8 -ALIGN 4 -.loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0], mm5 - - paddsw mm0, mm4 - paddsw mm1, mm4 - add r0, FDEC_STRIDE - dec r1d - jg .loop - REP_RET +cglobal predict_8x8c_dc_top_mmxext, 1,1 + movq mm0, [r0 - FDEC_STRIDE] + pxor mm1, mm1 + pxor mm2, mm2 + punpckhbw mm1, mm0 + punpcklbw mm0, mm2 + psadbw mm1, mm2 ; s1 + psadbw mm0, mm2 ; s0 + psrlw mm1, 1 + psrlw mm0, 1 + pavgw mm1, mm2 + pavgw mm0, mm2 + pshufw mm1, mm1, 0 + pshufw mm0, mm0, 0 ; dc0 (w) + packuswb mm0, mm1 ; dc0,dc1 (b) + STORE8x8 mm0, mm0 + RET ;----------------------------------------------------------------------------- -; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c ) +; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_p_core_mmxext, 1,2 - LOAD_PLANE_ARGS - movq mm5, mm2 - movq mm1, mm2 - pmullw mm5, [pw_3210 GLOBAL] - psllw mm2, 3 - psllw mm1, 2 - movq mm3, mm2 - paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} - paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} - paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} - paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} - mov r1d, 16 -ALIGN 4 +cglobal predict_8x8c_p_core_sse2, 1,1 + movd xmm0, r1m + movd xmm2, r2m + movd xmm4, r3m + pshuflw xmm0, xmm0, 0 + pshuflw xmm2, xmm2, 0 + pshuflw xmm4, xmm4, 0 + punpcklqdq xmm0, xmm0 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm4, xmm4 + pmullw xmm2, [pw_76543210 GLOBAL] + paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, 
i+6*b, i+7*b} + movdqa xmm3, xmm0 + paddsw xmm3, xmm4 + paddsw xmm4, xmm4 +call .loop + add r0, FDEC_STRIDE*4 .loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0], mm5 - - movq mm5, mm2 - movq mm6, mm3 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0+8], mm5 - - paddsw mm0, mm4 - paddsw mm1, mm4 - paddsw mm2, mm4 - paddsw mm3, mm4 - add r0, FDEC_STRIDE - dec r1d - jg .loop - REP_RET + movdqa xmm5, xmm0 + movdqa xmm1, xmm3 + psraw xmm0, 5 + psraw xmm3, 5 + packuswb xmm0, xmm3 + movq [r0+FDEC_STRIDE*0], xmm0 + movhps [r0+FDEC_STRIDE*1], xmm0 + paddsw xmm5, xmm4 + paddsw xmm1, xmm4 + movdqa xmm0, xmm5 + movdqa xmm3, xmm1 + psraw xmm5, 5 + psraw xmm1, 5 + packuswb xmm5, xmm1 + movq [r0+FDEC_STRIDE*2], xmm5 + movhps [r0+FDEC_STRIDE*3], xmm5 + paddsw xmm0, xmm4 + paddsw xmm3, xmm4 + RET ;----------------------------------------------------------------------------- ; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c ) @@ -988,20 +1110,28 @@ cglobal predict_16x16_p_core_sse2, 1,2 psllw xmm1, 3 paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} - - mov r1d, 16 + movdqa xmm7, xmm2 + paddsw xmm7, xmm7 + mov r1d, 8 ALIGN 4 .loop: movdqa xmm3, xmm0 movdqa xmm4, xmm1 + movdqa xmm5, xmm0 + movdqa xmm6, xmm1 psraw xmm3, 5 psraw xmm4, 5 + paddsw xmm5, xmm2 + paddsw xmm6, xmm2 + psraw xmm5, 5 + psraw xmm6, 5 packuswb xmm3, xmm4 - movdqa [r0], xmm3 - - paddsw xmm0, xmm2 - paddsw xmm1, xmm2 - add r0, FDEC_STRIDE + packuswb xmm5, xmm6 + movdqa [r0+FDEC_STRIDE*0], xmm3 + movdqa [r0+FDEC_STRIDE*1], xmm5 + paddsw xmm0, xmm7 + paddsw xmm1, xmm7 + add r0, FDEC_STRIDE*2 dec r1d jg .loop REP_RET @@ -1018,10 +1148,10 @@ cglobal predict_16x16_v_mmx, 1,2 ;----------------------------------------------------------------------------- ; void predict_16x16_v_sse2( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_v_sse2, 1,2 +cglobal predict_16x16_v_sse2, 1,1 movdqa xmm0, [r0 - FDEC_STRIDE] STORE16x16_SSE2 xmm0 - REP_RET + RET ;----------------------------------------------------------------------------- ; void predict_16x16_h_mmxext( uint8_t *src ) @@ -1086,6 +1216,13 @@ cglobal predict_16x16_dc_top_mmxext, 1,2 PRED16x16_DC [pw_8 GLOBAL], 4 REP_RET +cglobal predict_16x16_dc_left_core_mmxext, 1,1 + movd mm0, r1m + pshufw mm0, mm0, 0 + packuswb mm0, mm0 + STORE16x16 mm0, mm0 + REP_RET + ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left ) ;----------------------------------------------------------------------------- @@ -1103,12 +1240,19 @@ cglobal predict_16x16_dc_top_mmxext, 1,2 STORE16x16_SSE2 xmm0 %endmacro -cglobal predict_16x16_dc_core_sse2, 1,2 +cglobal predict_16x16_dc_core_sse2, 1,1 movd xmm2, r1m PRED16x16_DC_SSE2 xmm2, 5 - REP_RET + RET -cglobal predict_16x16_dc_top_sse2, 1,2 +cglobal predict_16x16_dc_top_sse2, 1,1 PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4 - REP_RET + RET +cglobal predict_16x16_dc_left_core_sse2, 1,1 + movd xmm0, r1m + pshuflw xmm0, xmm0, 0 + punpcklqdq xmm0, xmm0 + packuswb xmm0, xmm0 + STORE16x16_SSE2 xmm0 + REP_RET diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index a5665cae..02480386 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -29,10 +29,13 @@ extern void predict_16x16_v_mmx( uint8_t *src ); extern 
void predict_16x16_h_mmxext( uint8_t *src ); extern void predict_16x16_h_ssse3( uint8_t *src ); extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left ); +extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left ); extern void predict_16x16_dc_top_mmxext( uint8_t *src ); extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c ); extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ); +extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c ); extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 ); +extern void predict_8x8c_dc_top_mmxext( uint8_t *src ); extern void predict_8x8c_v_mmx( uint8_t *src ); extern void predict_8x8c_h_mmxext( uint8_t *src ); extern void predict_8x8c_h_ssse3( uint8_t *src ); @@ -48,9 +51,12 @@ extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters ); extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters ); extern void predict_4x4_ddl_mmxext( uint8_t *src ); @@ -65,9 +71,14 @@ extern void predict_4x4_ddr_ssse3( uint8_t *src ); extern void predict_4x4_hu_mmxext( uint8_t *src ); extern void predict_16x16_dc_top_sse2( uint8_t *src ); extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left ); +extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left ); extern void predict_16x16_v_sse2( uint8_t *src ); extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c ); +DECLARE_ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8}; +DECLARE_ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; +DECLARE_ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; + #define PREDICT_P_SUM(j,i)\ H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\ V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\ @@ -94,28 +105,100 @@ static void predict_16x16_p_##name( uint8_t *src )\ predict_16x16_p_core_##name( src, i00, b, c );\ } +#ifndef ARCH_X86_64 PREDICT_16x16_P( mmxext ) +#endif PREDICT_16x16_P( sse2 ) -static void predict_8x8c_p_mmxext( uint8_t *src ) +#ifdef __GNUC__ +static void predict_16x16_p_ssse3( uint8_t *src ) { - int a, b, c; - int H = 0; - int V = 0; - int i00; + int a, b, c, i00; + int H, V; + asm ( + "movq %1, %%mm1 \n" + "movq 8+%1, %%mm0 \n" + "palignr $7, -8+%1, %%mm1 \n" + "pmaddubsw %2, %%mm0 \n" + "pmaddubsw %3, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" + "pshufw $14, %%mm0, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" + "pshufw $1, %%mm0, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" + "movd %%mm0, %0 \n" + "movsx %w0, %0 \n" + :"=r"(H) + :"m"(src[-FDEC_STRIDE]), "m"(*pb_12345678), "m"(*pb_m87654321) + ); + V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] ) + + 7 * ( 
src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] ) + + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] ) + + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] ) + + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] ) + + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] ) + + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] ) + + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] ); + a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] ); + b = ( 5 * H + 32 ) >> 6; + c = ( 5 * V + 32 ) >> 6; + i00 = a - b * 7 - c * 7 + 16; + predict_16x16_p_core_sse2( src, i00, b, c ); +} +#endif - PREDICT_P_SUM(3,1) - PREDICT_P_SUM(3,2) - PREDICT_P_SUM(3,3) - PREDICT_P_SUM(3,4) +#define PREDICT_8x8_P(name)\ +static void predict_8x8c_p_##name( uint8_t *src )\ +{\ + int a, b, c;\ + int H = 0;\ + int V = 0;\ + int i00;\ + PREDICT_P_SUM(3,1)\ + PREDICT_P_SUM(3,2)\ + PREDICT_P_SUM(3,3)\ + PREDICT_P_SUM(3,4)\ + a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ + b = ( 17 * H + 16 ) >> 5;\ + c = ( 17 * V + 16 ) >> 5;\ + i00 = a -3*b -3*c + 16;\ + predict_8x8c_p_core_##name( src, i00, b, c );\ +} +#ifndef ARCH_X86_64 +PREDICT_8x8_P( mmxext ) +#endif +PREDICT_8x8_P( sse2 ) + +#ifdef __GNUC__ +static void predict_8x8c_p_ssse3( uint8_t *src ) +{ + int a, b, c, i00; + int H, V; + asm ( + "movq %1, %%mm0 \n" + "pmaddubsw %2, %%mm0 \n" + "pshufw $14, %%mm0, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" + "pshufw $1, %%mm0, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" + "movd %%mm0, %0 \n" + "movsx %w0, %0 \n" + :"=r"(H) + :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234) + ); + V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] ) + + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] ) + + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] ) + + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] ); + H += -4 * src[-1*FDEC_STRIDE -1]; a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] ); b = ( 17 * H + 16 ) >> 5; c = ( 17 * V + 16 ) >> 5; i00 = a -3*b -3*c + 16; - - predict_8x8c_p_core_mmxext( src, i00, b, c ); + predict_8x8c_p_core_sse2( src, i00, b, c ); } +#endif #define PREDICT_16x16_DC(name)\ static void predict_16x16_dc_##name( uint8_t *src )\ @@ -131,7 +214,23 @@ static void predict_16x16_dc_##name( uint8_t *src )\ } PREDICT_16x16_DC( mmxext ) -PREDICT_16x16_DC( sse2 ) +PREDICT_16x16_DC( sse2 ) + +#define PREDICT_16x16_DC_LEFT(name)\ +static void predict_16x16_dc_left_##name( uint8_t *src )\ +{\ + uint32_t dc=8;\ + int i;\ + for( i = 0; i < 16; i+=2 )\ + {\ + dc += src[-1 + i * FDEC_STRIDE];\ + dc += src[-1 + (i+1) * FDEC_STRIDE];\ + }\ + predict_16x16_dc_left_core_##name( src, dc>>4 );\ +} + +PREDICT_16x16_DC_LEFT( mmxext ) +PREDICT_16x16_DC_LEFT( sse2 ) static void predict_8x8c_dc_mmxext( uint8_t *src ) { @@ -151,26 +250,6 @@ static void predict_8x8c_dc_mmxext( uint8_t *src ) } #ifdef ARCH_X86_64 -static void predict_16x16_dc_left( uint8_t *src ) -{ - uint32_t s = 0; - uint64_t dc; - int y; - - for( y = 0; y < 16; y++ ) - { - s += src[-1 + y * FDEC_STRIDE]; - } - dc = (( s + 8 ) >> 4) * 0x0101010101010101ULL; - - for( y = 0; y < 16; y++ ) - { - uint64_t *p = (uint64_t*)src; - p[0] = p[1] = dc; - src += FDEC_STRIDE; - } -} - static void predict_8x8c_dc_left( uint8_t *src ) { int y; @@ -197,213 +276,6 @@ static void predict_8x8c_dc_left( uint8_t *src ) } } - -static void predict_8x8c_dc_top( uint8_t *src ) -{ - int y, x; - uint32_t s0 = 0, s1 = 0; - uint64_t dc; - - for( x = 0; x < 4; x++ ) - { - s0 += src[x - FDEC_STRIDE]; - s1 += src[x + 4 - FDEC_STRIDE]; - } - dc = (( s0 + 2 ) >> 2) * 
0x01010101 - + (( s1 + 2 ) >> 2) * 0x0101010100000000ULL; - - for( y = 0; y < 8; y++ ) - { - *(uint64_t*)src = dc; - src += FDEC_STRIDE; - } -} -#endif - -/* Diagonals */ - -#define PREDICT_4x4_LOAD_LEFT \ - const int l0 = src[-1+0*FDEC_STRIDE]; \ - const int l1 = src[-1+1*FDEC_STRIDE]; \ - const int l2 = src[-1+2*FDEC_STRIDE]; \ - UNUSED const int l3 = src[-1+3*FDEC_STRIDE]; - -#define PREDICT_4x4_LOAD_TOP \ - const int t0 = src[0-1*FDEC_STRIDE]; \ - const int t1 = src[1-1*FDEC_STRIDE]; \ - const int t2 = src[2-1*FDEC_STRIDE]; \ - UNUSED const int t3 = src[3-1*FDEC_STRIDE]; - -#define PREDICT_4x4_LOAD_TOP_RIGHT \ - const int t4 = src[4-1*FDEC_STRIDE]; \ - const int t5 = src[5-1*FDEC_STRIDE]; \ - const int t6 = src[6-1*FDEC_STRIDE]; \ - UNUSED const int t7 = src[7-1*FDEC_STRIDE]; - -#define F1(a,b) (((a)+(b)+1)>>1) -#define F2(a,b,c) (((a)+2*(b)+(c)+2)>>2) - -#ifdef ARCH_X86_64 // slower on x86 -#if 0 -static void predict_4x4_ddl( uint8_t *src ) -{ - PREDICT_4x4_LOAD_TOP - PREDICT_4x4_LOAD_TOP_RIGHT - uint32_t vec = (F2(t3,t4,t5)<< 0) - + (F2(t4,t5,t6)<< 8) - + (F2(t5,t6,t7)<<16) - + (F2(t6,t7,t7)<<24); - *(uint32_t*)&src[3*FDEC_STRIDE] = vec; - *(uint32_t*)&src[2*FDEC_STRIDE] = vec = (vec<<8) + F2(t2,t3,t4); - *(uint32_t*)&src[1*FDEC_STRIDE] = vec = (vec<<8) + F2(t1,t2,t3); - *(uint32_t*)&src[0*FDEC_STRIDE] = vec = (vec<<8) + F2(t0,t1,t2); -} -#endif - -static void predict_4x4_ddr( uint8_t *src ) -{ - const int lt = src[-1-FDEC_STRIDE]; - PREDICT_4x4_LOAD_LEFT - PREDICT_4x4_LOAD_TOP - uint32_t vec = (F2(l0,lt,t0)<< 0) - + (F2(lt,t0,t1)<< 8) - + (F2(t0,t1,t2)<<16) - + (F2(t1,t2,t3)<<24); - *(uint32_t*)&src[0*FDEC_STRIDE] = vec; - *(uint32_t*)&src[1*FDEC_STRIDE] = vec = (vec<<8) + F2(l1,l0,lt); - *(uint32_t*)&src[2*FDEC_STRIDE] = vec = (vec<<8) + F2(l2,l1,l0); - *(uint32_t*)&src[3*FDEC_STRIDE] = vec = (vec<<8) + F2(l3,l2,l1); -} - -static void predict_4x4_vr( uint8_t *src ) -{ - const int lt = src[-1-FDEC_STRIDE]; - PREDICT_4x4_LOAD_LEFT - PREDICT_4x4_LOAD_TOP - const int ltt0 = lt + t0 + 1; - const int t0t1 = t0 + t1 + 1; - const int t1t2 = t1 + t2 + 1; - const int t2t3 = t2 + t3 + 1; - const int l0lt = l0 + lt + 1; - const int l1l0 = l1 + l0 + 1; - const int l2l1 = l2 + l1 + 1; - - src[0*FDEC_STRIDE+0]= - src[2*FDEC_STRIDE+1]= ltt0 >> 1; - - src[0*FDEC_STRIDE+1]= - src[2*FDEC_STRIDE+2]= t0t1 >> 1; - - src[0*FDEC_STRIDE+2]= - src[2*FDEC_STRIDE+3]= t1t2 >> 1; - - src[0*FDEC_STRIDE+3]= t2t3 >> 1; - - src[1*FDEC_STRIDE+0]= - src[3*FDEC_STRIDE+1]= (l0lt + ltt0) >> 2; - - src[1*FDEC_STRIDE+1]= - src[3*FDEC_STRIDE+2]= (ltt0 + t0t1) >> 2; - - src[1*FDEC_STRIDE+2]= - src[3*FDEC_STRIDE+3]= (t0t1 + t1t2) >> 2; - - src[1*FDEC_STRIDE+3]= (t1t2 + t2t3) >> 2; - src[2*FDEC_STRIDE+0]= (l1l0 + l0lt) >> 2; - src[3*FDEC_STRIDE+0]= (l2l1 + l1l0) >> 2; -} - -static void predict_4x4_hd( uint8_t *src ) -{ - const int lt= src[-1-1*FDEC_STRIDE]; - PREDICT_4x4_LOAD_LEFT - PREDICT_4x4_LOAD_TOP - const int ltt0 = lt + t0 + 1; - const int t0t1 = t0 + t1 + 1; - const int t1t2 = t1 + t2 + 1; - const int l0lt = l0 + lt + 1; - const int l1l0 = l1 + l0 + 1; - const int l2l1 = l2 + l1 + 1; - const int l3l2 = l3 + l2 + 1; - - src[0*FDEC_STRIDE+0]= - src[1*FDEC_STRIDE+2]= l0lt >> 1; - src[0*FDEC_STRIDE+1]= - src[1*FDEC_STRIDE+3]= (l0lt + ltt0) >> 2; - src[0*FDEC_STRIDE+2]= (ltt0 + t0t1) >> 2; - src[0*FDEC_STRIDE+3]= (t0t1 + t1t2) >> 2; - src[1*FDEC_STRIDE+0]= - src[2*FDEC_STRIDE+2]= l1l0 >> 1; - src[1*FDEC_STRIDE+1]= - src[2*FDEC_STRIDE+3]= (l0lt + l1l0) >> 2; - src[2*FDEC_STRIDE+0]= - src[3*FDEC_STRIDE+2]= l2l1 >> 1; - 
src[2*FDEC_STRIDE+1]= - src[3*FDEC_STRIDE+3]= (l1l0 + l2l1) >> 2; - src[3*FDEC_STRIDE+0]= l3l2 >> 1; - src[3*FDEC_STRIDE+1]= (l2l1 + l3l2) >> 2; -} - -#if 0 -static void predict_4x4_vl( uint8_t *src ) -{ - PREDICT_4x4_LOAD_TOP - PREDICT_4x4_LOAD_TOP_RIGHT - const int t0t1 = t0 + t1 + 1; - const int t1t2 = t1 + t2 + 1; - const int t2t3 = t2 + t3 + 1; - const int t3t4 = t3 + t4 + 1; - const int t4t5 = t4 + t5 + 1; - const int t5t6 = t5 + t6 + 1; - - src[0*FDEC_STRIDE+0]= t0t1 >> 1; - src[0*FDEC_STRIDE+1]= - src[2*FDEC_STRIDE+0]= t1t2 >> 1; - src[0*FDEC_STRIDE+2]= - src[2*FDEC_STRIDE+1]= t2t3 >> 1; - src[0*FDEC_STRIDE+3]= - src[2*FDEC_STRIDE+2]= t3t4 >> 1; - src[2*FDEC_STRIDE+3]= t4t5 >> 1; - src[1*FDEC_STRIDE+0]= (t0t1 + t1t2) >> 2; - src[1*FDEC_STRIDE+1]= - src[3*FDEC_STRIDE+0]= (t1t2 + t2t3) >> 2; - src[1*FDEC_STRIDE+2]= - src[3*FDEC_STRIDE+1]= (t2t3 + t3t4) >> 2; - src[1*FDEC_STRIDE+3]= - src[3*FDEC_STRIDE+2]= (t3t4 + t4t5) >> 2; - src[3*FDEC_STRIDE+3]= (t4t5 + t5t6) >> 2; -} -#endif - -static void predict_4x4_hu( uint8_t *src ) -{ - PREDICT_4x4_LOAD_LEFT - const int l1l0 = l1 + l0 + 1; - const int l2l1 = l2 + l1 + 1; - const int l3l2 = l3 + l2 + 1; - - src[0*FDEC_STRIDE+0]= l1l0 >> 1; - src[0*FDEC_STRIDE+1]= (l1l0 + l2l1) >> 2; - - src[0*FDEC_STRIDE+2]= - src[1*FDEC_STRIDE+0]= l2l1 >> 1; - - src[0*FDEC_STRIDE+3]= - src[1*FDEC_STRIDE+1]= (l2l1 + l3l2) >> 2; - - src[1*FDEC_STRIDE+2]= - src[2*FDEC_STRIDE+0]= l3l2 >> 1; - - src[1*FDEC_STRIDE+3]= - src[2*FDEC_STRIDE+1]= (l2 + 3*l3 + 2) >> 2; - - src[2*FDEC_STRIDE+3]= - src[3*FDEC_STRIDE+1]= - src[3*FDEC_STRIDE+0]= - src[2*FDEC_STRIDE+2]= - src[3*FDEC_STRIDE+2]= - src[3*FDEC_STRIDE+3]= l3; -} #endif /**************************************************************************** @@ -431,6 +303,7 @@ static void predict_4x4_hu( uint8_t *src ) #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE] +#ifndef ARCH_X86_64 static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] ) { predict_8x8_vr_core_mmxext( src, edge ); @@ -445,6 +318,7 @@ static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] ) SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; } } +#endif #define SUMSUB(a,b,c,d,e,f,g,h)\ t=a; a+=b; b-=t;\ @@ -498,15 +372,15 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_MMX) ) return; -#ifdef ARCH_X86_64 - pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left; -#endif pf[I_PRED_16x16_V] = predict_16x16_v_mmx; if( !(cpu&X264_CPU_MMXEXT) ) return; pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext; pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext; + pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_mmxext; +#ifndef ARCH_X86_64 pf[I_PRED_16x16_P] = predict_16x16_p_mmxext; +#endif pf[I_PRED_16x16_H] = predict_16x16_h_mmxext; if( !(cpu&X264_CPU_SSE2) ) return; @@ -515,10 +389,14 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] ) if( cpu&X264_CPU_SSE2_IS_SLOW ) return; pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2; + pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_sse2; pf[I_PRED_16x16_P] = predict_16x16_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_16x16_H] = predict_16x16_h_ssse3; +#ifdef __GNUC__ + pf[I_PRED_16x16_P] = predict_16x16_p_ssse3; +#endif } void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) @@ -527,17 +405,25 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) return; #ifdef ARCH_X86_64 pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left; - pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top; #endif pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx; if( 
!(cpu&X264_CPU_MMXEXT) ) return; + pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top_mmxext; pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext; +#ifndef ARCH_X86_64 pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext; +#endif pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext; + if( !(cpu&X264_CPU_SSE2) ) + return; + pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3; +#ifdef __GNUC__ + pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3; +#endif } void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ) @@ -549,36 +435,31 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext; pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext; pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext; - pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext; pf[I_PRED_8x8_HD] = predict_8x8_hd_mmxext; - pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext; *predict_8x8_filter = predict_8x8_filter_mmxext; #ifdef ARCH_X86 pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext; pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext; + pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext; + pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext; #endif if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2; pf[I_PRED_8x8_VL] = predict_8x8_vl_sse2; + pf[I_PRED_8x8_VR] = predict_8x8_vr_sse2; pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2; + pf[I_PRED_8x8_HD] = predict_8x8_hd_sse2; pf[I_PRED_8x8_HU] = predict_8x8_hu_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_8x8_HD] = predict_8x8_hd_ssse3; + pf[I_PRED_8x8_HU] = predict_8x8_hu_ssse3; *predict_8x8_filter = predict_8x8_filter_ssse3; } void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] ) { - if( !(cpu&X264_CPU_MMX) ) - return; -#ifdef ARCH_X86_64 - pf[I_PRED_4x4_DDR] = predict_4x4_ddr; - pf[I_PRED_4x4_VR] = predict_4x4_vr; - pf[I_PRED_4x4_HD] = predict_4x4_hd; - pf[I_PRED_4x4_HU] = predict_4x4_hu; -#endif if( !(cpu&X264_CPU_MMXEXT) ) return; pf[I_PRED_4x4_VR] = predict_4x4_vr_mmxext;
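

[Illustrative note, not part of the commit] The plane-mode math that the new SSE2/SSSE3 cores accelerate is the H/V/a/b/c/i00 computation visible in the C wrappers in predict-c.c above. The scalar sketch below is a reference only: the function name predict_16x16_p_ref is hypothetical, and FDEC_STRIDE is assumed to be x264's decoded-frame stride constant (32 here for a self-contained example).

#include <stdint.h>

#define FDEC_STRIDE 32  /* assumed: x264's decoded-frame stride */

/* Scalar reference for 16x16 plane prediction: the wrappers compute the
 * top-row gradient H, the left-column gradient V, then a, b, c and i00
 * exactly as below, and hand i00/b/c to the asm core, which evaluates the
 * plane row by row with a saturating >>5 (packuswb). */
static void predict_16x16_p_ref( uint8_t *src )
{
    int i, x, y;
    int H = 0, V = 0;
    int a, b, c, i00;

    /* weighted sums of differences across the top row and left column */
    for( i = 1; i <= 8; i++ )
    {
        H += i * ( src[7+i - FDEC_STRIDE]     - src[7-i - FDEC_STRIDE] );
        V += i * ( src[(7+i)*FDEC_STRIDE - 1] - src[(7-i)*FDEC_STRIDE - 1] );
    }

    a   = 16 * ( src[15*FDEC_STRIDE - 1] + src[15 - FDEC_STRIDE] );
    b   = ( 5 * H + 32 ) >> 6;
    c   = ( 5 * V + 32 ) >> 6;
    i00 = a - 7*b - 7*c + 16;

    /* evaluate the plane; the asm keeps per-row accumulators and adds c
     * (or 2*c per unrolled pair of rows) instead of multiplying */
    for( y = 0; y < 16; y++ )
        for( x = 0; x < 16; x++ )
        {
            int pix = ( i00 + b*x + c*y ) >> 5;
            src[y*FDEC_STRIDE + x] = pix < 0 ? 0 : pix > 255 ? 255 : pix;
        }
}

The 8x8 chroma variant differs only in the gradient sums (PREDICT_P_SUM(3,1) through PREDICT_P_SUM(3,4)) and the scaling constants (b = (17*H+16)>>5, c = (17*V+16)>>5, i00 = a - 3*b - 3*c + 16), as in the PREDICT_8x8_P macro above.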