;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Holger Lubitz <holger@lubitz.org>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
db 0xff
+pw_ff00: times 8 dw 0xff00
+pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
SECTION .text
PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
%endmacro
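+
+; LOAD_PLANE_ARGS broadcasts the plane-prediction arguments (i00, b, c) into
+; the word lanes of mm0, mm2 and mm4: on x86_64 they arrive in registers,
+; on x86_32 they are read straight from the stack.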
+%macro LOAD_PLANE_ARGS 0
+%ifdef ARCH_X86_64
+ movd mm0, r1d
+ movd mm2, r2d
+ movd mm4, r3d
+ pshufw mm0, mm0, 0
+ pshufw mm2, mm2, 0
+ pshufw mm4, mm4, 0
+%else
+ pshufw mm0, r1m, 0
+ pshufw mm2, r2m, 0
+ pshufw mm4, r3m, 0
+%endif
+%endmacro
;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
punpckhbw mm2, [r0-1*FDEC_STRIDE-8]
movd mm3, [r0-1*FDEC_STRIDE]
punpckhwd mm1, mm2
- PALIGNR mm3, mm1, 5, mm4
+ PALIGNR mm3, mm1, 5, mm1
movq mm1, mm3
PALIGNR mm3, [r0+2*FDEC_STRIDE-8], 7, mm4
movq mm2, mm3
PALIGNR mm7, mm1, 7, mm2
psllq mm1, 8
movd [r0+2*FDEC_STRIDE], mm7
- PALIGNR mm3, mm1, 7, mm2
+ PALIGNR mm3, mm1, 7, mm1
movd [r0+3*FDEC_STRIDE], mm3
RET
movq [r0+Y*FDEC_STRIDE], mm0
RET
+;-----------------------------------------------------------------------------
+; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
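+; horizontal-up prediction uses only the eight left neighbours, loaded from
+; edge[7..14] as l0..l7; they are byte-reversed below so each output row can
+; be formed with shifts/PALIGNR from the averaged and lowpass-filtered vectors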
+cglobal predict_8x8_hu_mmxext, 2,2
+ movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
+ add r0, 4*FDEC_STRIDE
+ pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
+ psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
+ movq mm2, mm0
+ psllw mm0, 8
+ psrlw mm2, 8
+ por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
+ movq mm3, mm2
+ movq mm4, mm2
+ movq mm5, mm2
+ psrlq mm2, 8
+ psrlq mm3, 16
+ por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
+ punpckhbw mm1, mm1
+ por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
+ pavgb mm4, mm2
+ PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
+ movq mm5, mm4
+ punpcklbw mm4, mm1 ; p4 p3 p2 p1
+ punpckhbw mm5, mm1 ; p8 p7 p6 p5
+ movq mm6, mm5
+ movq mm7, mm5
+ movq mm0, mm5
+ PALIGNR mm5, mm4, 2, mm1
+ pshufw mm1, mm6, 11111001b
+ PALIGNR mm6, mm4, 4, mm2
+ pshufw mm2, mm7, 11111110b
+ PALIGNR mm7, mm4, 6, mm3
+ pshufw mm3, mm0, 11111111b
+ movq [r0-4*FDEC_STRIDE], mm4
+ movq [r0-3*FDEC_STRIDE], mm5
+ movq [r0-2*FDEC_STRIDE], mm6
+ movq [r0-1*FDEC_STRIDE], mm7
+ movq [r0+0*FDEC_STRIDE], mm0
+ movq [r0+1*FDEC_STRIDE], mm1
+ movq [r0+2*FDEC_STRIDE], mm2
+ movq [r0+3*FDEC_STRIDE], mm3
+ RET
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+
+; fills only some pixels:
+; f01234567
+; 0........
+; 1,,,,,,,,
+; 2 .......
+; 3 ,,,,,,,
+; 4 ......
+; 5 ,,,,,,
+; 6 .....
+; 7 ,,,,,
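+;
+; the remaining (lower-left) pixels are filled from the left column by the
+; C wrapper predict_8x8_vr_mmxext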
+
+cglobal predict_8x8_vr_core_mmxext, 2,2
+ movq mm2, [r1+16]
+ movq mm3, [r1+15]
+ movq mm1, [r1+14]
+ movq mm4, mm3
+ pavgb mm3, mm2
+ PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
+
+%assign Y 0
+%rep 3
+ movq [r0+ Y *FDEC_STRIDE], mm3
+ movq [r0+(Y+1)*FDEC_STRIDE], mm0
+ psllq mm3, 8
+ psllq mm0, 8
+%assign Y (Y+2)
+%endrep
+ movq [r0+ Y *FDEC_STRIDE], mm3
+ movq [r0+(Y+1)*FDEC_STRIDE], mm0
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+;-----------------------------------------------------------------------------
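+; plane prediction: each output pixel is clip((i00 + b*x + c*y) >> 5), with
+; i00, b and c precomputed by the C wrapper; the clip to [0,255] is done by
+; packuswb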
+cglobal predict_8x8c_p_core_mmxext, 1,2
+ LOAD_PLANE_ARGS
+ movq mm1, mm2
+ pmullw mm2, [pw_3210 GLOBAL]
+ psllw mm1, 2
+ paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
+ paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
+
+ mov r1d, 8
+ALIGN 4
+.loop:
+ movq mm5, mm0
+ movq mm6, mm1
+ psraw mm5, 5
+ psraw mm6, 5
+ packuswb mm5, mm6
+ movq [r0], mm5
+
+ paddsw mm0, mm4
+ paddsw mm1, mm4
+ add r0, FDEC_STRIDE
+ dec r1d
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+;-----------------------------------------------------------------------------
+cglobal predict_16x16_p_core_mmxext, 1,2
+ LOAD_PLANE_ARGS
+ movq mm5, mm2
+ movq mm1, mm2
+ pmullw mm5, [pw_3210 GLOBAL]
+ psllw mm2, 3
+ psllw mm1, 2
+ movq mm3, mm2
+ paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
+ paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
+ paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
+ paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
+
+ mov r1d, 16
+ALIGN 4
+.loop:
+ movq mm5, mm0
+ movq mm6, mm1
+ psraw mm5, 5
+ psraw mm6, 5
+ packuswb mm5, mm6
+ movq [r0], mm5
+
+ movq mm5, mm2
+ movq mm6, mm3
+ psraw mm5, 5
+ psraw mm6, 5
+ packuswb mm5, mm6
+ movq [r0+8], mm5
+
+ paddsw mm0, mm4
+ paddsw mm1, mm4
+ paddsw mm2, mm4
+ paddsw mm3, mm4
+ add r0, FDEC_STRIDE
+ dec r1d
+ jg .loop
+ REP_RET
+
%endif ; !ARCH_X86_64
;-----------------------------------------------------------------------------
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
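+; unlike predict_8x8_vr_core_mmxext, this version fills the whole 8x8 block,
+; so it is registered directly and needs no C wrapper for the lower-left pixels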
-
-; fills only some pixels:
-; f01234567
-; 0........
-; 1,,,,,,,,
-; 2 .......
-; 3 ,,,,,,,
-; 4 ......
-; 5 ,,,,,,
-; 6 .....
-; 7 ,,,,,
-
-cglobal predict_8x8_vr_core_mmxext, 2,2
- movq mm2, [r1+16]
- movq mm3, [r1+15]
- movq mm1, [r1+14]
- movq mm4, mm3
- pavgb mm3, mm2
- PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
-
-%assign Y 0
+cglobal predict_8x8_vr_sse2, 2,2
+ movdqu xmm0, [r1+8]
+ movdqa xmm6, [pw_ff00 GLOBAL]
+ add r0, 4*FDEC_STRIDE
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslldq xmm0, 1
+ pslldq xmm1, 2
+ pavgb xmm2, xmm0
+ PRED8x8_LOWPASS_XMM xmm4, xmm3, xmm1, xmm0, xmm5
+ pandn xmm6, xmm4
+ movdqa xmm5, xmm4
+ psrlw xmm4, 8
+ packuswb xmm6, xmm4
+ movhlps xmm4, xmm6
+ movhps [r0-3*FDEC_STRIDE], xmm5
+ movhps [r0-4*FDEC_STRIDE], xmm2
+ psrldq xmm5, 4
+ movss xmm5, xmm6
+ psrldq xmm2, 4
+ movss xmm2, xmm4
+%assign Y 3
%rep 3
- movq [r0+ Y *FDEC_STRIDE], mm3
- movq [r0+(Y+1)*FDEC_STRIDE], mm0
- psllq mm3, 8
- psllq mm0, 8
-%assign Y (Y+2)
+ psrldq xmm5, 1
+ psrldq xmm2, 1
+ movq [r0+Y*FDEC_STRIDE], xmm5
+ movq [r0+(Y-1)*FDEC_STRIDE], xmm2
+%assign Y (Y-2)
%endrep
- movq [r0+ Y *FDEC_STRIDE], mm3
- movq [r0+(Y+1)*FDEC_STRIDE], mm0
-
RET
-
;-----------------------------------------------------------------------------
; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
movq [r0+2*FDEC_STRIDE], mm7
PALIGNR mm1, mm3, 4, mm5
movq [r0+1*FDEC_STRIDE], mm1
- PALIGNR mm0, mm3, 6, mm5
+ PALIGNR mm0, mm3, 6, mm3
movq [r0+0*FDEC_STRIDE], mm0
movq mm2, mm6
movq mm3, mm6
movq [r0-2*FDEC_STRIDE], mm6
PALIGNR mm2, mm4, 4, mm5
movq [r0-3*FDEC_STRIDE], mm2
- PALIGNR mm3, mm4, 6, mm5
+ PALIGNR mm3, mm4, 6, mm4
movq [r0-4*FDEC_STRIDE], mm3
RET
;-----------------------------------------------------------------------------
; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_hd_ssse3, 2,2
+%macro PREDICT_8x8_HD 1
+cglobal predict_8x8_hd_%1, 2,2
add r0, 4*FDEC_STRIDE
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdqa xmm2, xmm1
movdqa xmm3, xmm1
- palignr xmm1, xmm0, 7
- palignr xmm2, xmm0, 9
- palignr xmm3, xmm0, 8
- movdqa xmm4, xmm1
+ PALIGNR xmm1, xmm0, 7, xmm4
+ PALIGNR xmm2, xmm0, 9, xmm5
+ PALIGNR xmm3, xmm0, 8, xmm0
+ movdqa xmm4, xmm1
pavgb xmm4, xmm3
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
movq [r0+(Y)*FDEC_STRIDE], xmm4
movq [r0+(Y-4)*FDEC_STRIDE], xmm0
RET
+%endmacro
-;-----------------------------------------------------------------------------
-; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_hu_mmxext, 2,2
- movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
- add r0, 4*FDEC_STRIDE
- pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
- psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
- movq mm2, mm0
- psllw mm0, 8
- psrlw mm2, 8
- por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
- movq mm3, mm2
- movq mm4, mm2
- movq mm5, mm2
- psrlq mm2, 8
- psrlq mm3, 16
- por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckhbw mm1, mm1
- por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavgb mm4, mm2
- PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
- movq mm5, mm4
- punpcklbw mm4, mm1 ; p4 p3 p2 p1
- punpckhbw mm5, mm1 ; p8 p7 p6 p5
- movq mm6, mm5
- movq mm7, mm5
- movq mm0, mm5
- PALIGNR mm5, mm4, 2, mm1
- pshufw mm1, mm6, 11111001b
- PALIGNR mm6, mm4, 4, mm2
- pshufw mm2, mm7, 11111110b
- PALIGNR mm7, mm4, 6, mm3
- pshufw mm3, mm0, 11111111b
- movq [r0-4*FDEC_STRIDE], mm4
- movq [r0-3*FDEC_STRIDE], mm5
- movq [r0-2*FDEC_STRIDE], mm6
- movq [r0-1*FDEC_STRIDE], mm7
- movq [r0+0*FDEC_STRIDE], mm0
- movq [r0+1*FDEC_STRIDE], mm1
- movq [r0+2*FDEC_STRIDE], mm2
- movq [r0+3*FDEC_STRIDE], mm3
- RET
+INIT_XMM
+PREDICT_8x8_HD sse2
+%define PALIGNR PALIGNR_SSSE3
+PREDICT_8x8_HD ssse3
+INIT_MMX
+%define PALIGNR PALIGNR_MMX
;-----------------------------------------------------------------------------
; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_hu_sse2, 2,2
- movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
+%macro PREDICT_8x8_HU 1
+cglobal predict_8x8_hu_%1, 2,2
add r0, 4*FDEC_STRIDE
+%ifidn %1, ssse3
+ movq mm5, [r1+7]
+ movq mm6, [pb_reverse GLOBAL]
+ movq mm1, mm5
+ movq mm2, mm5
+ movq mm3, mm5
+ pshufb mm5, mm6
+ psrlq mm6, 8
+ pshufb mm2, mm6
+ psrlq mm6, 8
+ pshufb mm3, mm6
+ movq mm4, mm5
+%else
+ movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
- psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
movq mm2, mm0
psllw mm0, 8
psrlw mm2, 8
por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
+ psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
movq mm3, mm2
movq mm4, mm2
movq mm5, mm2
por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckhbw mm1, mm1
por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
+%endif
pavgb mm4, mm2
PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
movq2dq xmm0, mm4
movq2dq xmm1, mm1
punpcklbw xmm0, xmm1
-
- movhlps xmm4, xmm0
- pshuflw xmm5, xmm4, 11111001b
- pshuflw xmm6, xmm4, 11111110b
- pshuflw xmm7, xmm4, 11111111b
+ punpckhbw mm4, mm1
%assign Y -4
%rep 3
movq [r0+Y*FDEC_STRIDE], xmm0
psrldq xmm0, 2
%assign Y (Y+1)
%endrep
+ pshufw mm5, mm4, 11111001b
+ pshufw mm6, mm4, 11111110b
+ pshufw mm7, mm4, 11111111b
movq [r0+Y*FDEC_STRIDE], xmm0
- movq [r0+0*FDEC_STRIDE], xmm4
- movq [r0+1*FDEC_STRIDE], xmm5
- movq [r0+2*FDEC_STRIDE], xmm6
- movq [r0+3*FDEC_STRIDE], xmm7
+ movq [r0+0*FDEC_STRIDE], mm4
+ movq [r0+1*FDEC_STRIDE], mm5
+ movq [r0+2*FDEC_STRIDE], mm6
+ movq [r0+3*FDEC_STRIDE], mm7
RET
+%endmacro
+PREDICT_8x8_HU sse2
+PREDICT_8x8_HU ssse3
;-----------------------------------------------------------------------------
; void predict_8x8c_v_mmx( uint8_t *src )
STORE8x8 mm0, mm2
RET
-%macro LOAD_PLANE_ARGS 0
-%ifdef ARCH_X86_64
- movd mm0, r1d
- movd mm2, r2d
- movd mm4, r3d
- pshufw mm0, mm0, 0
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
-%else
- pshufw mm0, r1m, 0
- pshufw mm2, r2m, 0
- pshufw mm4, r3m, 0
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8c_p_core_mmxext, 1,2
- LOAD_PLANE_ARGS
- movq mm1, mm2
- pmullw mm2, [pw_3210 GLOBAL]
- psllw mm1, 2
- paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
- paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
-
- mov r1d, 8
-ALIGN 4
-.loop:
- movq mm5, mm0
- movq mm6, mm1
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [r0], mm5
-
- paddsw mm0, mm4
- paddsw mm1, mm4
- add r0, FDEC_STRIDE
- dec r1d
- jg .loop
- REP_RET
+cglobal predict_8x8c_dc_top_mmxext, 1,1
+ movq mm0, [r0 - FDEC_STRIDE]
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpckhbw mm1, mm0
+ punpcklbw mm0, mm2
+ psadbw mm1, mm2 ; s1
+ psadbw mm0, mm2 ; s0
+ psrlw mm1, 1
+ psrlw mm0, 1
+ pavgw mm1, mm2
+ pavgw mm0, mm2
+ pshufw mm1, mm1, 0
+ pshufw mm0, mm0, 0 ; dc0 (w)
+ packuswb mm0, mm1 ; dc0,dc1 (b)
+ STORE8x8 mm0, mm0
+ RET
;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_mmxext, 1,2
- LOAD_PLANE_ARGS
- movq mm5, mm2
- movq mm1, mm2
- pmullw mm5, [pw_3210 GLOBAL]
- psllw mm2, 3
- psllw mm1, 2
- movq mm3, mm2
- paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
- paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
- paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
- mov r1d, 16
-ALIGN 4
+cglobal predict_8x8c_p_core_sse2, 1,1
+ movd xmm0, r1m
+ movd xmm2, r2m
+ movd xmm4, r3m
+ pshuflw xmm0, xmm0, 0
+ pshuflw xmm2, xmm2, 0
+ pshuflw xmm4, xmm4, 0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm4, xmm4
+ pmullw xmm2, [pw_76543210 GLOBAL]
+ paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
+ movdqa xmm3, xmm0
+ paddsw xmm3, xmm4
+ paddsw xmm4, xmm4
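+    ; call .loop writes rows 0-3 and its RET returns here; r0 then advances
+    ; four rows and execution falls through into .loop again for rows 4-7,
+    ; whose RET returns to the actual caller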
+call .loop
+ add r0, FDEC_STRIDE*4
.loop:
- movq mm5, mm0
- movq mm6, mm1
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [r0], mm5
-
- movq mm5, mm2
- movq mm6, mm3
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [r0+8], mm5
-
- paddsw mm0, mm4
- paddsw mm1, mm4
- paddsw mm2, mm4
- paddsw mm3, mm4
- add r0, FDEC_STRIDE
- dec r1d
- jg .loop
- REP_RET
+ movdqa xmm5, xmm0
+ movdqa xmm1, xmm3
+ psraw xmm0, 5
+ psraw xmm3, 5
+ packuswb xmm0, xmm3
+ movq [r0+FDEC_STRIDE*0], xmm0
+ movhps [r0+FDEC_STRIDE*1], xmm0
+ paddsw xmm5, xmm4
+ paddsw xmm1, xmm4
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm1
+ psraw xmm5, 5
+ psraw xmm1, 5
+ packuswb xmm5, xmm1
+ movq [r0+FDEC_STRIDE*2], xmm5
+ movhps [r0+FDEC_STRIDE*3], xmm5
+ paddsw xmm0, xmm4
+ paddsw xmm3, xmm4
+ RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
psllw xmm1, 3
paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
-
- mov r1d, 16
+ movdqa xmm7, xmm2
+ paddsw xmm7, xmm7
+ mov r1d, 8
ALIGN 4
.loop:
movdqa xmm3, xmm0
movdqa xmm4, xmm1
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm1
psraw xmm3, 5
psraw xmm4, 5
+ paddsw xmm5, xmm2
+ paddsw xmm6, xmm2
+ psraw xmm5, 5
+ psraw xmm6, 5
packuswb xmm3, xmm4
- movdqa [r0], xmm3
-
- paddsw xmm0, xmm2
- paddsw xmm1, xmm2
- add r0, FDEC_STRIDE
+ packuswb xmm5, xmm6
+ movdqa [r0+FDEC_STRIDE*0], xmm3
+ movdqa [r0+FDEC_STRIDE*1], xmm5
+ paddsw xmm0, xmm7
+ paddsw xmm1, xmm7
+ add r0, FDEC_STRIDE*2
dec r1d
jg .loop
REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_v_sse2( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_v_sse2, 1,2
+cglobal predict_16x16_v_sse2, 1,1
movdqa xmm0, [r0 - FDEC_STRIDE]
STORE16x16_SSE2 xmm0
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void predict_16x16_h_mmxext( uint8_t *src )
PRED16x16_DC [pw_8 GLOBAL], 4
REP_RET
+cglobal predict_16x16_dc_left_core_mmxext, 1,1
+ movd mm0, r1m
+ pshufw mm0, mm0, 0
+ packuswb mm0, mm0
+ STORE16x16 mm0, mm0
+ REP_RET
+
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
STORE16x16_SSE2 xmm0
%endmacro
-cglobal predict_16x16_dc_core_sse2, 1,2
+cglobal predict_16x16_dc_core_sse2, 1,1
movd xmm2, r1m
PRED16x16_DC_SSE2 xmm2, 5
- REP_RET
+ RET
-cglobal predict_16x16_dc_top_sse2, 1,2
+cglobal predict_16x16_dc_top_sse2, 1,1
PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
- REP_RET
+ RET
+cglobal predict_16x16_dc_left_core_sse2, 1,1
+ movd xmm0, r1m
+ pshuflw xmm0, xmm0, 0
+ punpcklqdq xmm0, xmm0
+ packuswb xmm0, xmm0
+ STORE16x16_SSE2 xmm0
+ RET
extern void predict_16x16_h_mmxext( uint8_t *src );
extern void predict_16x16_h_ssse3( uint8_t *src );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
+extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_top_mmxext( uint8_t *src );
extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
+extern void predict_8x8c_dc_top_mmxext( uint8_t *src );
extern void predict_8x8c_v_mmx( uint8_t *src );
extern void predict_8x8c_h_mmxext( uint8_t *src );
extern void predict_8x8c_h_ssse3( uint8_t *src );
extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
extern void predict_4x4_ddl_mmxext( uint8_t *src );
extern void predict_4x4_hu_mmxext( uint8_t *src );
extern void predict_16x16_dc_top_sse2( uint8_t *src );
extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
+extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
extern void predict_16x16_v_sse2( uint8_t *src );
extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+DECLARE_ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
+DECLARE_ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+DECLARE_ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
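+/* signed per-pixel weights for the pmaddubsw-based H gradients below:
+ * pb_12345678/pb_m87654321 cover the 16x16 top row (weights -8..-1, 1..8),
+ * pb_m32101234 the 8x8c top row (the missing -4*top_left term is added
+ * separately). */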
+
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\
predict_16x16_p_core_##name( src, i00, b, c );\
}
+#ifndef ARCH_X86_64
PREDICT_16x16_P( mmxext )
+#endif
PREDICT_16x16_P( sse2 )
-static void predict_8x8c_p_mmxext( uint8_t *src )
+#ifdef __GNUC__
+static void predict_16x16_p_ssse3( uint8_t *src )
{
- int a, b, c;
- int H = 0;
- int V = 0;
- int i00;
+ int a, b, c, i00;
+ int H, V;
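+    /* pmaddubsw computes H = sum_{i=1..8} i*(top[7+i] - top[7-i]) in one
+     * pass over the top row (top[-1] being the top-left neighbour); the
+     * 16-bit sum is sign-extended by the final movsx. */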
+ asm (
+ "movq %1, %%mm1 \n"
+ "movq 8+%1, %%mm0 \n"
+ "palignr $7, -8+%1, %%mm1 \n"
+ "pmaddubsw %2, %%mm0 \n"
+ "pmaddubsw %3, %%mm1 \n"
+ "paddw %%mm1, %%mm0 \n"
+ "pshufw $14, %%mm0, %%mm1 \n"
+ "paddw %%mm1, %%mm0 \n"
+ "pshufw $1, %%mm0, %%mm1 \n"
+ "paddw %%mm1, %%mm0 \n"
+ "movd %%mm0, %0 \n"
+ "movsx %w0, %0 \n"
+ :"=r"(H)
+ :"m"(src[-FDEC_STRIDE]), "m"(*pb_12345678), "m"(*pb_m87654321)
+ );
+ V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
+ + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )
+ + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )
+ + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )
+ + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )
+ + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )
+ + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )
+ + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );
+ a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
+ b = ( 5 * H + 32 ) >> 6;
+ c = ( 5 * V + 32 ) >> 6;
+ i00 = a - b * 7 - c * 7 + 16;
+ predict_16x16_p_core_sse2( src, i00, b, c );
+}
+#endif
- PREDICT_P_SUM(3,1)
- PREDICT_P_SUM(3,2)
- PREDICT_P_SUM(3,3)
- PREDICT_P_SUM(3,4)
+#define PREDICT_8x8_P(name)\
+static void predict_8x8c_p_##name( uint8_t *src )\
+{\
+ int a, b, c;\
+ int H = 0;\
+ int V = 0;\
+ int i00;\
+ PREDICT_P_SUM(3,1)\
+ PREDICT_P_SUM(3,2)\
+ PREDICT_P_SUM(3,3)\
+ PREDICT_P_SUM(3,4)\
+ a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
+ b = ( 17 * H + 16 ) >> 5;\
+ c = ( 17 * V + 16 ) >> 5;\
+ i00 = a -3*b -3*c + 16;\
+ predict_8x8c_p_core_##name( src, i00, b, c );\
+}
+#ifndef ARCH_X86_64
+PREDICT_8x8_P( mmxext )
+#endif
+PREDICT_8x8_P( sse2 )
+
+#ifdef __GNUC__
+static void predict_8x8c_p_ssse3( uint8_t *src )
+{
+ int a, b, c, i00;
+ int H, V;
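+    /* pb_m32101234 weights the top row t0..t7; the remaining -4*top_left
+     * term of the H gradient is added after the asm block. */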
+ asm (
+ "movq %1, %%mm0 \n"
+ "pmaddubsw %2, %%mm0 \n"
+ "pshufw $14, %%mm0, %%mm1 \n"
+ "paddw %%mm1, %%mm0 \n"
+ "pshufw $1, %%mm0, %%mm1 \n"
+ "paddw %%mm1, %%mm0 \n"
+ "movd %%mm0, %0 \n"
+ "movsx %w0, %0 \n"
+ :"=r"(H)
+ :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
+ );
+ V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
+ + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
+ + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
+ + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );
+ H += -4 * src[-1*FDEC_STRIDE -1];
a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );
b = ( 17 * H + 16 ) >> 5;
c = ( 17 * V + 16 ) >> 5;
i00 = a -3*b -3*c + 16;
-
- predict_8x8c_p_core_mmxext( src, i00, b, c );
+ predict_8x8c_p_core_sse2( src, i00, b, c );
}
+#endif
#define PREDICT_16x16_DC(name)\
static void predict_16x16_dc_##name( uint8_t *src )\
}
PREDICT_16x16_DC( mmxext )
-PREDICT_16x16_DC( sse2 )
+PREDICT_16x16_DC( sse2 )
+
+#define PREDICT_16x16_DC_LEFT(name)\
+static void predict_16x16_dc_left_##name( uint8_t *src )\
+{\
+ uint32_t dc=8;\
+ int i;\
+ for( i = 0; i < 16; i+=2 )\
+ {\
+ dc += src[-1 + i * FDEC_STRIDE];\
+ dc += src[-1 + (i+1) * FDEC_STRIDE];\
+ }\
+ predict_16x16_dc_left_core_##name( src, dc>>4 );\
+}
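+/* dc accumulates the 16 left neighbours plus a rounding bias of 8, so dc>>4
+ * is their rounded average; the asm core broadcasts it across the block. */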
+
+PREDICT_16x16_DC_LEFT( mmxext )
+PREDICT_16x16_DC_LEFT( sse2 )
static void predict_8x8c_dc_mmxext( uint8_t *src )
{
}
#ifdef ARCH_X86_64
-static void predict_16x16_dc_left( uint8_t *src )
-{
- uint32_t s = 0;
- uint64_t dc;
- int y;
-
- for( y = 0; y < 16; y++ )
- {
- s += src[-1 + y * FDEC_STRIDE];
- }
- dc = (( s + 8 ) >> 4) * 0x0101010101010101ULL;
-
- for( y = 0; y < 16; y++ )
- {
- uint64_t *p = (uint64_t*)src;
- p[0] = p[1] = dc;
- src += FDEC_STRIDE;
- }
-}
-
static void predict_8x8c_dc_left( uint8_t *src )
{
int y;
}
}
-
-static void predict_8x8c_dc_top( uint8_t *src )
-{
- int y, x;
- uint32_t s0 = 0, s1 = 0;
- uint64_t dc;
-
- for( x = 0; x < 4; x++ )
- {
- s0 += src[x - FDEC_STRIDE];
- s1 += src[x + 4 - FDEC_STRIDE];
- }
- dc = (( s0 + 2 ) >> 2) * 0x01010101
- + (( s1 + 2 ) >> 2) * 0x0101010100000000ULL;
-
- for( y = 0; y < 8; y++ )
- {
- *(uint64_t*)src = dc;
- src += FDEC_STRIDE;
- }
-}
-#endif
-
-/* Diagonals */
-
-#define PREDICT_4x4_LOAD_LEFT \
- const int l0 = src[-1+0*FDEC_STRIDE]; \
- const int l1 = src[-1+1*FDEC_STRIDE]; \
- const int l2 = src[-1+2*FDEC_STRIDE]; \
- UNUSED const int l3 = src[-1+3*FDEC_STRIDE];
-
-#define PREDICT_4x4_LOAD_TOP \
- const int t0 = src[0-1*FDEC_STRIDE]; \
- const int t1 = src[1-1*FDEC_STRIDE]; \
- const int t2 = src[2-1*FDEC_STRIDE]; \
- UNUSED const int t3 = src[3-1*FDEC_STRIDE];
-
-#define PREDICT_4x4_LOAD_TOP_RIGHT \
- const int t4 = src[4-1*FDEC_STRIDE]; \
- const int t5 = src[5-1*FDEC_STRIDE]; \
- const int t6 = src[6-1*FDEC_STRIDE]; \
- UNUSED const int t7 = src[7-1*FDEC_STRIDE];
-
-#define F1(a,b) (((a)+(b)+1)>>1)
-#define F2(a,b,c) (((a)+2*(b)+(c)+2)>>2)
-
-#ifdef ARCH_X86_64 // slower on x86
-#if 0
-static void predict_4x4_ddl( uint8_t *src )
-{
- PREDICT_4x4_LOAD_TOP
- PREDICT_4x4_LOAD_TOP_RIGHT
- uint32_t vec = (F2(t3,t4,t5)<< 0)
- + (F2(t4,t5,t6)<< 8)
- + (F2(t5,t6,t7)<<16)
- + (F2(t6,t7,t7)<<24);
- *(uint32_t*)&src[3*FDEC_STRIDE] = vec;
- *(uint32_t*)&src[2*FDEC_STRIDE] = vec = (vec<<8) + F2(t2,t3,t4);
- *(uint32_t*)&src[1*FDEC_STRIDE] = vec = (vec<<8) + F2(t1,t2,t3);
- *(uint32_t*)&src[0*FDEC_STRIDE] = vec = (vec<<8) + F2(t0,t1,t2);
-}
-#endif
-
-static void predict_4x4_ddr( uint8_t *src )
-{
- const int lt = src[-1-FDEC_STRIDE];
- PREDICT_4x4_LOAD_LEFT
- PREDICT_4x4_LOAD_TOP
- uint32_t vec = (F2(l0,lt,t0)<< 0)
- + (F2(lt,t0,t1)<< 8)
- + (F2(t0,t1,t2)<<16)
- + (F2(t1,t2,t3)<<24);
- *(uint32_t*)&src[0*FDEC_STRIDE] = vec;
- *(uint32_t*)&src[1*FDEC_STRIDE] = vec = (vec<<8) + F2(l1,l0,lt);
- *(uint32_t*)&src[2*FDEC_STRIDE] = vec = (vec<<8) + F2(l2,l1,l0);
- *(uint32_t*)&src[3*FDEC_STRIDE] = vec = (vec<<8) + F2(l3,l2,l1);
-}
-
-static void predict_4x4_vr( uint8_t *src )
-{
- const int lt = src[-1-FDEC_STRIDE];
- PREDICT_4x4_LOAD_LEFT
- PREDICT_4x4_LOAD_TOP
- const int ltt0 = lt + t0 + 1;
- const int t0t1 = t0 + t1 + 1;
- const int t1t2 = t1 + t2 + 1;
- const int t2t3 = t2 + t3 + 1;
- const int l0lt = l0 + lt + 1;
- const int l1l0 = l1 + l0 + 1;
- const int l2l1 = l2 + l1 + 1;
-
- src[0*FDEC_STRIDE+0]=
- src[2*FDEC_STRIDE+1]= ltt0 >> 1;
-
- src[0*FDEC_STRIDE+1]=
- src[2*FDEC_STRIDE+2]= t0t1 >> 1;
-
- src[0*FDEC_STRIDE+2]=
- src[2*FDEC_STRIDE+3]= t1t2 >> 1;
-
- src[0*FDEC_STRIDE+3]= t2t3 >> 1;
-
- src[1*FDEC_STRIDE+0]=
- src[3*FDEC_STRIDE+1]= (l0lt + ltt0) >> 2;
-
- src[1*FDEC_STRIDE+1]=
- src[3*FDEC_STRIDE+2]= (ltt0 + t0t1) >> 2;
-
- src[1*FDEC_STRIDE+2]=
- src[3*FDEC_STRIDE+3]= (t0t1 + t1t2) >> 2;
-
- src[1*FDEC_STRIDE+3]= (t1t2 + t2t3) >> 2;
- src[2*FDEC_STRIDE+0]= (l1l0 + l0lt) >> 2;
- src[3*FDEC_STRIDE+0]= (l2l1 + l1l0) >> 2;
-}
-
-static void predict_4x4_hd( uint8_t *src )
-{
- const int lt= src[-1-1*FDEC_STRIDE];
- PREDICT_4x4_LOAD_LEFT
- PREDICT_4x4_LOAD_TOP
- const int ltt0 = lt + t0 + 1;
- const int t0t1 = t0 + t1 + 1;
- const int t1t2 = t1 + t2 + 1;
- const int l0lt = l0 + lt + 1;
- const int l1l0 = l1 + l0 + 1;
- const int l2l1 = l2 + l1 + 1;
- const int l3l2 = l3 + l2 + 1;
-
- src[0*FDEC_STRIDE+0]=
- src[1*FDEC_STRIDE+2]= l0lt >> 1;
- src[0*FDEC_STRIDE+1]=
- src[1*FDEC_STRIDE+3]= (l0lt + ltt0) >> 2;
- src[0*FDEC_STRIDE+2]= (ltt0 + t0t1) >> 2;
- src[0*FDEC_STRIDE+3]= (t0t1 + t1t2) >> 2;
- src[1*FDEC_STRIDE+0]=
- src[2*FDEC_STRIDE+2]= l1l0 >> 1;
- src[1*FDEC_STRIDE+1]=
- src[2*FDEC_STRIDE+3]= (l0lt + l1l0) >> 2;
- src[2*FDEC_STRIDE+0]=
- src[3*FDEC_STRIDE+2]= l2l1 >> 1;
- src[2*FDEC_STRIDE+1]=
- src[3*FDEC_STRIDE+3]= (l1l0 + l2l1) >> 2;
- src[3*FDEC_STRIDE+0]= l3l2 >> 1;
- src[3*FDEC_STRIDE+1]= (l2l1 + l3l2) >> 2;
-}
-
-#if 0
-static void predict_4x4_vl( uint8_t *src )
-{
- PREDICT_4x4_LOAD_TOP
- PREDICT_4x4_LOAD_TOP_RIGHT
- const int t0t1 = t0 + t1 + 1;
- const int t1t2 = t1 + t2 + 1;
- const int t2t3 = t2 + t3 + 1;
- const int t3t4 = t3 + t4 + 1;
- const int t4t5 = t4 + t5 + 1;
- const int t5t6 = t5 + t6 + 1;
-
- src[0*FDEC_STRIDE+0]= t0t1 >> 1;
- src[0*FDEC_STRIDE+1]=
- src[2*FDEC_STRIDE+0]= t1t2 >> 1;
- src[0*FDEC_STRIDE+2]=
- src[2*FDEC_STRIDE+1]= t2t3 >> 1;
- src[0*FDEC_STRIDE+3]=
- src[2*FDEC_STRIDE+2]= t3t4 >> 1;
- src[2*FDEC_STRIDE+3]= t4t5 >> 1;
- src[1*FDEC_STRIDE+0]= (t0t1 + t1t2) >> 2;
- src[1*FDEC_STRIDE+1]=
- src[3*FDEC_STRIDE+0]= (t1t2 + t2t3) >> 2;
- src[1*FDEC_STRIDE+2]=
- src[3*FDEC_STRIDE+1]= (t2t3 + t3t4) >> 2;
- src[1*FDEC_STRIDE+3]=
- src[3*FDEC_STRIDE+2]= (t3t4 + t4t5) >> 2;
- src[3*FDEC_STRIDE+3]= (t4t5 + t5t6) >> 2;
-}
-#endif
-
-static void predict_4x4_hu( uint8_t *src )
-{
- PREDICT_4x4_LOAD_LEFT
- const int l1l0 = l1 + l0 + 1;
- const int l2l1 = l2 + l1 + 1;
- const int l3l2 = l3 + l2 + 1;
-
- src[0*FDEC_STRIDE+0]= l1l0 >> 1;
- src[0*FDEC_STRIDE+1]= (l1l0 + l2l1) >> 2;
-
- src[0*FDEC_STRIDE+2]=
- src[1*FDEC_STRIDE+0]= l2l1 >> 1;
-
- src[0*FDEC_STRIDE+3]=
- src[1*FDEC_STRIDE+1]= (l2l1 + l3l2) >> 2;
-
- src[1*FDEC_STRIDE+2]=
- src[2*FDEC_STRIDE+0]= l3l2 >> 1;
-
- src[1*FDEC_STRIDE+3]=
- src[2*FDEC_STRIDE+1]= (l2 + 3*l3 + 2) >> 2;
-
- src[2*FDEC_STRIDE+3]=
- src[3*FDEC_STRIDE+1]=
- src[3*FDEC_STRIDE+0]=
- src[2*FDEC_STRIDE+2]=
- src[3*FDEC_STRIDE+2]=
- src[3*FDEC_STRIDE+3]= l3;
-}
#endif
/****************************************************************************
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
+#ifndef ARCH_X86_64
static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
{
predict_8x8_vr_core_mmxext( src, edge );
SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
}
}
+#endif
#define SUMSUB(a,b,c,d,e,f,g,h)\
t=a; a+=b; b-=t;\
{
if( !(cpu&X264_CPU_MMX) )
return;
-#ifdef ARCH_X86_64
- pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
-#endif
pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
+ pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_mmxext;
+#ifndef ARCH_X86_64
pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
+#endif
pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
+ pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_sse2;
pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
+#ifdef __GNUC__
+ pf[I_PRED_16x16_P] = predict_16x16_p_ssse3;
+#endif
}
void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
return;
#ifdef ARCH_X86_64
pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
- pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top;
#endif
pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
+ pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top_mmxext;
pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
+#ifndef ARCH_X86_64
pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
+#endif
pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+ pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
+#ifdef __GNUC__
+ pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3;
+#endif
}
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
- pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext;
pf[I_PRED_8x8_HD] = predict_8x8_hd_mmxext;
- pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext;
*predict_8x8_filter = predict_8x8_filter_mmxext;
#ifdef ARCH_X86
pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
+ pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext;
+ pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext;
#endif
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2;
pf[I_PRED_8x8_VL] = predict_8x8_vl_sse2;
+ pf[I_PRED_8x8_VR] = predict_8x8_vr_sse2;
pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2;
+ pf[I_PRED_8x8_HD] = predict_8x8_hd_sse2;
pf[I_PRED_8x8_HU] = predict_8x8_hu_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_8x8_HD] = predict_8x8_hd_ssse3;
+ pf[I_PRED_8x8_HU] = predict_8x8_hu_ssse3;
*predict_8x8_filter = predict_8x8_filter_ssse3;
}
void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
{
- if( !(cpu&X264_CPU_MMX) )
- return;
-#ifdef ARCH_X86_64
- pf[I_PRED_4x4_DDR] = predict_4x4_ddr;
- pf[I_PRED_4x4_VR] = predict_4x4_vr;
- pf[I_PRED_4x4_HD] = predict_4x4_hd;
- pf[I_PRED_4x4_HU] = predict_4x4_hu;
-#endif
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf[I_PRED_4x4_VR] = predict_4x4_vr_mmxext;