db 0xff
shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
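+; pshufb masks for the new ssse3 code paths below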
+shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
+shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
+pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
SECTION .text
cextern pw_4
cextern pw_8
cextern pw_16
-cextern pw_ff00
-cextern pb_reverse
+cextern pw_00ff
cextern pw_pixel_max
%macro STORE8x8 2
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%macro PRED8x8_LOWPASS 5-6
-%ifidn %1, w
- paddw %3, %4
- psrlw %3, 1
- pavgw %2, %5, %3
-%else
- mova %6, %3
- pavgb %3, %4
- pxor %4, %6
- pand %4, [pb_1]
- psubusb %3, %4
- pavgb %2, %5, %3
-%endif
-%endmacro
-
-%macro LOAD_PLANE_ARGS 0
-%ifdef ARCH_X86_64
- movd mm0, r1d
- movd mm2, r2d
- movd mm4, r3d
- pshufw mm0, mm0, 0
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
+%macro PRED8x8_LOWPASS 4-5
+%ifdef HIGH_BIT_DEPTH
+ paddw %2, %3
+ psrlw %2, 1
+ pavgw %1, %4, %2
%else
- pshufw mm0, r1m, 0
- pshufw mm2, r2m, 0
- pshufw mm4, r3m, 0
+ mova %5, %2
+ pavgb %2, %3
+ pxor %3, %5
+ pand %3, [pb_1]
+ psubusb %2, %3
+ pavgb %1, %4, %2
%endif
%endmacro
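; both branches compute dst = (src + ((left+right)>>1) + 1) >> 1, which equals
; (left + 2*src + right + 2) >> 2 exactly. the byte version can't widen to words,
; so it corrects pavgb's round-up on (left+right) by subtracting the lost low bit
; (masked with pb_1) before the final average against src.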
;-----------------------------------------------------------------------------
; void predict_4x4_ddl( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_4x4_DDL 3
+%macro PREDICT_4x4_DDL 0
cglobal predict_4x4_ddl, 1,1
movu m1, [r0-FDEC_STRIDEB]
- psll%1 m2, m1, %2
- mova m3, m1
- mova m4, m1
+ PSLLPIX m2, m1, 1
+ mova m0, m1
+%ifdef HIGH_BIT_DEPTH
+ PSRLPIX m1, m1, 1
+ pshufhw m1, m1, q2210
+%else
pxor m1, m2
- psrl%1 m1, %2
- pxor m3, m1
- PRED8x8_LOWPASS %3, m0, m2, m3, m4, m5
+ PSRLPIX m1, m1, 1
+ pxor m1, m0
+%endif
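+    ; either way m1 = t1..t7 with the last pixel repeated (top row advanced one
+    ; pixel, edge-extended)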
+ PRED8x8_LOWPASS m0, m2, m1, m0, m3
%assign Y 0
%rep 4
- psrl%1 m0, %2
- movh [r0+Y*FDEC_STRIDEB], m0
+ PSRLPIX m0, m0, 1
+ movh [r0+Y*FDEC_STRIDEB], m0
%assign Y (Y+1)
%endrep
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-PREDICT_4x4_DDL dq, 2, w
+PREDICT_4x4_DDL
INIT_XMM avx
-PREDICT_4x4_DDL dq, 2, w
+PREDICT_4x4_DDL
INIT_MMX mmx2
cglobal predict_4x4_ddl, 1,2
- mova m1, [r0-2*FDEC_STRIDE+4]
- mova m2, [r0-2*FDEC_STRIDE+0]
- mova m3, [r0-2*FDEC_STRIDE+2]
- PRED8x8_LOWPASS w, m0, m1, m2, m3
- mova [r0+0*FDEC_STRIDE], m0
-
- mova m5, [r0-2*FDEC_STRIDE+6]
- mova m6, [r0-2*FDEC_STRIDE+8]
- pshufw m7, m6, q3321
- PRED8x8_LOWPASS w, m4, m7, m5, m6
- mova [r0+6*FDEC_STRIDE], m4
-
- psllq m0, 16
- PALIGNR m4, m0, 6, m1
- mova [r0+4*FDEC_STRIDE], m4
-
+ movu m1, [r0-FDEC_STRIDEB+4]
+ PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
+ mova m3, [r0-FDEC_STRIDEB+8]
+ mova [r0+0*FDEC_STRIDEB], m0
+ pshufw m4, m3, q3321
+ PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
+ mova [r0+3*FDEC_STRIDEB], m2
+ pshufw m1, m0, q0021
+ punpckldq m1, m2
+ mova [r0+1*FDEC_STRIDEB], m1
psllq m0, 16
- PALIGNR m4, m0, 6, m0
- mova [r0+2*FDEC_STRIDE], m4
+ PALIGNR m2, m0, 6, m0
+ mova [r0+2*FDEC_STRIDEB], m2
RET
-%else
+%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
-PREDICT_4x4_DDL q, 8, b
+PREDICT_4x4_DDL
%endif
+;-----------------------------------------------------------------------------
+; void predict_4x4_vr( pixel *src )
+;-----------------------------------------------------------------------------
+%ifndef HIGH_BIT_DEPTH
+INIT_MMX ssse3
+cglobal predict_4x4_vr, 1,1
+ movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
+ mova m4, m1
+ palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt
+ pavgb m4, m1
+ palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0
+ mova m0, m1
+ palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1
+ mova m2, m1
+ palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2
+ PRED8x8_LOWPASS m2, m0, m1, m2, m3
+ pshufw m0, m2, 0
+ psrlq m2, 16
+ movd [r0+0*FDEC_STRIDEB], m4
+ palignr m4, m0, 7
+ movd [r0+1*FDEC_STRIDEB], m2
+ psllq m0, 8
+ movd [r0+2*FDEC_STRIDEB], m4
+ palignr m2, m0, 7
+ movd [r0+3*FDEC_STRIDEB], m2
+ RET
+%endif ; !HIGH_BIT_DEPTH
+
;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_4x4 6
+%macro PREDICT_4x4 4
cglobal predict_4x4_ddr, 1,1
- movu m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- movq m2, [r0+0*FDEC_STRIDEB-8]
%ifdef HIGH_BIT_DEPTH
- movh m4, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- punpckl%1 m2, m4
- movh m3, [r0-1*FDEC_STRIDEB]
- punpckh%2 m1, m2
- PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
- mova m1, m3
- movhps m4, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
- mova m2, m3
- movhps m4, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
+ movu m2, [r0-1*FDEC_STRIDEB-8]
+ pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2
+ pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1
+ pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0
+ movhps m3, [r0+3*FDEC_STRIDEB-8]
+%else ; !HIGH_BIT_DEPTH
+ movd m0, [r0+2*FDEC_STRIDEB-4]
+ movd m1, [r0+0*FDEC_STRIDEB-4]
+ punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
+ punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
+ punpckhwd m0, m1
+ movd m2, [r0-1*FDEC_STRIDEB]
+%if cpuflag(ssse3)
+ palignr m2, m0, 4
%else
- punpckh%1 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- movh m3, [r0-1*FDEC_STRIDEB]
- punpckh%2 m1, m2
- PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
- mova m1, m3
- PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
- mova m2, m3
- PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
+ psllq m2, 32
+ punpckhdq m0, m2
+ SWAP 2, 0
%endif
- PRED8x8_LOWPASS %4, m0, m3, m1, m2, m4
+ movd m3, [r0+3*FDEC_STRIDEB-4]
+ psllq m3, 32
+%endif ; !HIGH_BIT_DEPTH
+
+ PSRLPIX m1, m2, 1
+ mova m0, m2
+ PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
+ PRED8x8_LOWPASS m0, m2, m1, m0, m3
%assign Y 3
movh [r0+Y*FDEC_STRIDEB], m0
%rep 3
%assign Y (Y-1)
- psrl%3 m0, %6
+ PSRLPIX m0, m0, 1
movh [r0+Y*FDEC_STRIDEB], m0
%endrep
RET
-cglobal predict_4x4_vr, 1,1,6
- movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
- mova m5, m0
+;-----------------------------------------------------------------------------
+; void predict_4x4_vr( pixel *src )
+;-----------------------------------------------------------------------------
+cglobal predict_4x4_vr, 1,1
%ifdef HIGH_BIT_DEPTH
- movhps m1, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
- pavg%4 m5, m0
- movhps m1, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
- mova m1, m0
- movhps m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- PALIGNR m0, m2, 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
- mova m2, m0
- movhps m3, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- PALIGNR m0, m3, 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
+ movu m1, [r0-1*FDEC_STRIDEB-8]
+ pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2
+ pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1
+ pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0
+%else ; !HIGH_BIT_DEPTH
+ movd m0, [r0+2*FDEC_STRIDEB-4]
+ movd m1, [r0+0*FDEC_STRIDEB-4]
+ punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
+ punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
+ punpckhwd m0, m1
+ movd m1, [r0-1*FDEC_STRIDEB]
+%if cpuflag(ssse3)
+ palignr m1, m0, 4
%else
- PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
- pavg%4 m5, m0
- PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
- mova m1, m0
- PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
- mova m2, m0
- PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
+ psllq m1, 32
+ punpckhdq m0, m1
+ SWAP 1, 0
%endif
- PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4
- psll%3 m1, m3, %6*6
- psrl%3 m3, %6*2
- movh [r0+0*FDEC_STRIDEB], m5
- movh [r0+1*FDEC_STRIDEB], m3
- PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
- psll%3 m1, %6
- movh [r0+2*FDEC_STRIDEB], m5
- PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1
- movh [r0+3*FDEC_STRIDEB], m3
+%endif ; !HIGH_BIT_DEPTH
+ PSRLPIX m2, m1, 1
+ PSRLPIX m0, m1, 2
+ pavg%1 m4, m1, m2
+ PSRLPIX m4, m4, 3
+ PRED8x8_LOWPASS m2, m0, m1, m2, m3
+ PSLLPIX m0, m2, 6
+ PSRLPIX m2, m2, 2
+ movh [r0+0*FDEC_STRIDEB], m4
+ PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3
+ movh [r0+1*FDEC_STRIDEB], m2
+ PSLLPIX m0, m0, 1
+ movh [r0+2*FDEC_STRIDEB], m4
+ PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
+ movh [r0+3*FDEC_STRIDEB], m2
RET
-cglobal predict_4x4_hd, 1,1,6
- movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
+;-----------------------------------------------------------------------------
+; void predict_4x4_hd( pixel *src )
+;-----------------------------------------------------------------------------
+cglobal predict_4x4_hd, 1,1
%ifdef HIGH_BIT_DEPTH
- movh m1, [r0-1*FDEC_STRIDEB]
- punpckl%5 m0, m1 ; t3 t2 t1 t0 lt .. .. ..
- psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. ..
- movh m1, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l3
- movh m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- punpckl%1 m1, m2 ; l2 l3
- movh m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l1
- movh m3, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- punpckl%1 m2, m3 ; l0 l1
+ movu m1, [r0-1*FDEC_STRIDEB-8]
+ PSLLPIX m1, m1, 1
+ pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3
+ pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2
+ pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1
+ pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0
%else
- punpckl%5 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
- psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. ..
- movu m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
- punpckh%1 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
- movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
- punpckh%1 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
+ movd m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
+ punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
+ PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. ..
+ movd m1, [r0+3*FDEC_STRIDEB-4] ; l3
+ punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
+ movd m2, [r0+1*FDEC_STRIDEB-4] ; l1
+ punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
+ punpckh%3 m1, m2 ; l0 l1 l2 l3
+ punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
%endif
- punpckh%2 m1, m2 ; l0 l1 l2 l3
- punpckh%5 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- psrl%3 m2, m1, %6 ; .. t2 t1 t0 lt l0 l1 l2
- psrl%3 m0, m1, %6*2 ; .. .. t2 t1 t0 lt l0 l1
- pavg%4 m5, m1, m2
- PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4
- punpckl%1 m5, m3
- psrl%3 m3, %6*4
+ PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2
+ PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1
+ pavg%1 m5, m1, m2
+ PRED8x8_LOWPASS m3, m1, m0, m2, m4
+ punpckl%2 m5, m3
+ PSRLPIX m3, m3, 4
PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
%assign Y 3
movh [r0+Y*FDEC_STRIDEB], m5
%rep 2
%assign Y (Y-1)
- psrl%3 m5, %6*2
+ PSRLPIX m5, m5, 2
movh [r0+Y*FDEC_STRIDEB], m5
%endrep
movh [r0+0*FDEC_STRIDEB], m3
RET
-%endmacro
+%endmacro ; PREDICT_4x4
+;-----------------------------------------------------------------------------
+; void predict_4x4_ddr( pixel *src )
+;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_4x4_ddr, 1,1
- movq m3, [r0+3*FDEC_STRIDEB-8]
- psrlq m3, 48
- PALIGNR m3, [r0+2*FDEC_STRIDEB-8], 6, m6
- PALIGNR m3, [r0+1*FDEC_STRIDEB-8], 6, m7
- movq m6, [r0+0*FDEC_STRIDEB-8]
- PALIGNR m3, m6, 6, m5
-
- movq m4, [r0-1*FDEC_STRIDEB-8]
- movq m2, m3
- movq m1, m3
- PALIGNR m2, m4, 6, m5
- movq m1, m2
- psllq m1, 16
- PRED8x8_LOWPASS w, m0, m3, m1, m2
- pshufw m0, m0, q0123
+ mova m0, [r0+1*FDEC_STRIDEB-8]
+ punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
+ mova m3, [r0+3*FDEC_STRIDEB-8]
+ punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
+ punpckhdq m3, m0
+
+ pshufw m0, m3, q3321
+ pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3
+ pshufw m1, m0, q3321
+ PRED8x8_LOWPASS m0, m1, m3, m0
movq [r0+3*FDEC_STRIDEB], m0
movq m2, [r0-1*FDEC_STRIDEB-0]
- movq m5, m2
- PALIGNR m5, m4, 6, m4
- movq m3, m5
- PALIGNR m5, m6, 6, m6
- PRED8x8_LOWPASS w, m1, m5, m2, m3
+ pshufw m4, m2, q2100
+ pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0
+ movq m1, m4
+ PALIGNR m4, m3, 6, m3
+ PRED8x8_LOWPASS m1, m4, m2, m1
movq [r0+0*FDEC_STRIDEB], m1
- psllq m0, 16
- PALIGNR m1, m0, 6, m2
- movq [r0+1*FDEC_STRIDEB], m1
+ pshufw m2, m0, q3321
+ punpckldq m2, m1
psllq m0, 16
PALIGNR m1, m0, 6, m0
- movq [r0+2*FDEC_STRIDEB], m1
- psrlq m1, 16
+ movq [r0+1*FDEC_STRIDEB], m1
+ movq [r0+2*FDEC_STRIDEB], m2
movd [r0+3*FDEC_STRIDEB+4], m1
RET
+;-----------------------------------------------------------------------------
+; void predict_4x4_hd( pixel *src )
+;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
mova m0, [r0+1*FDEC_STRIDEB-8]
punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
punpckhdq m1, m0
mova m0, m1
- mova m4, m1
movu m3, [r0-1*FDEC_STRIDEB-2]
+ pshufw m4, m1, q0032
mova m7, m3
- punpckhdq m4, [r0-1*FDEC_STRIDEB-6]
+ punpckldq m4, m3
PALIGNR m3, m1, 2, m2
- PRED8x8_LOWPASS w, m2, m4, m1, m3, m6
+ PRED8x8_LOWPASS m2, m4, m1, m3
pavgw m0, m3
- mova m5, m0
- punpcklwd m5, m2
- mova m4, m0
- punpckhwd m4, m2
+ punpcklwd m5, m0, m2
+ punpckhwd m4, m0, m2
mova [r0+3*FDEC_STRIDEB], m5
mova [r0+1*FDEC_STRIDEB], m4
+ psrlq m5, 32
+ punpckldq m5, m4
+ mova [r0+2*FDEC_STRIDEB], m5
- mova m4, m7
+ pshufw m4, m7, q2100
mova m6, [r0-1*FDEC_STRIDEB+0]
- PALIGNR m7, [r0+0*FDEC_STRIDEB-8], 6, m5
- PRED8x8_LOWPASS w, m3, m7, m6, m4, m1
-
- PALIGNR m3, m0, 6, m5
+ pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0
+ PRED8x8_LOWPASS m3, m4, m6, m7
+ PALIGNR m3, m0, 6, m0
mova [r0+0*FDEC_STRIDEB], m3
- psrlq m0, 16
- psrlq m2, 16
- punpcklwd m0, m2
- mova [r0+2*FDEC_STRIDEB], m0
RET
INIT_XMM sse2
-PREDICT_4x4 wd, dq, dq, w, qdq, 2
+PREDICT_4x4 w, wd, dq, qdq
INIT_XMM ssse3
-PREDICT_4x4 wd, dq, dq, w, qdq, 2
+PREDICT_4x4 w, wd, dq, qdq
INIT_XMM avx
-PREDICT_4x4 wd, dq, dq, w, qdq, 2
-%else
+PREDICT_4x4 w, wd, dq, qdq
+%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
-PREDICT_4x4 bw, wd, q , b, dq , 8
+PREDICT_4x4 b, bw, wd, dq
INIT_MMX ssse3
-PREDICT_4x4 bw, wd, q , b, dq , 8
+%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
+PREDICT_4x4 b, bw, wd, dq
%endif
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
- movq m0, [r0+0*FDEC_STRIDEB-4*2]
- punpckhwd m0, [r0+1*FDEC_STRIDEB-4*2]
- movq m1, [r0+2*FDEC_STRIDEB-4*2]
- punpckhwd m1, [r0+3*FDEC_STRIDEB-4*2]
+ movq m0, [r0+0*FDEC_STRIDEB-8]
+ punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
+ movq m1, [r0+2*FDEC_STRIDEB-8]
+ punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
punpckhdq m0, m1
pshufw m1, m1, q3333
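    ; m0 = l0 l1 l2 l3; m1 = l3 broadcast to every word (the bottom row of hu)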
movq [r0+3*FDEC_STRIDEB], m1
- movd [r0+2*FDEC_STRIDEB+4], m1
- mova m2, m0
- psrlq m2, 16
- pavgw m2, m0
-
- pshufw m1, m0, q3321
- pshufw m5, m0, q3332
- PRED8x8_LOWPASS w, m3, m0, m5, m1, m7
- movq m6, m2
- punpcklwd m6, m3
- mova [r0+0*FDEC_STRIDEB], m6
+ pshufw m3, m0, q3321
+ pshufw m4, m0, q3332
+ pavgw m2, m0, m3
+ PRED8x8_LOWPASS m3, m0, m4, m3
+ punpcklwd m4, m2, m3
+ mova [r0+0*FDEC_STRIDEB], m4
psrlq m2, 16
psrlq m3, 16
punpcklwd m2, m3
mova [r0+1*FDEC_STRIDEB], m2
- psrlq m2, 32
- movd [r0+2*FDEC_STRIDEB+0], m2
+ punpckhdq m2, m1
+ mova [r0+2*FDEC_STRIDEB], m2
RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
- movq mm0, [r0+0*FDEC_STRIDE-8]
- punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
- movq mm1, [r0+2*FDEC_STRIDE-8]
- punpckhbw mm1, [r0+3*FDEC_STRIDE-8]
- punpckhwd mm0, mm1
- movq mm1, mm0
- punpckhbw mm1, mm1
- pshufw mm1, mm1, q3333
- punpckhdq mm0, mm1
- movq mm2, mm0
- movq mm3, mm0
- movq mm7, mm0
- psrlq mm2, 16
- psrlq mm3, 8
- pavgb mm7, mm3
- PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5
- punpcklbw mm7, mm4
-%assign Y 0
- movd [r0+Y*FDEC_STRIDE], mm7
-%rep 2
-%assign Y (Y+1)
- psrlq mm7, 16
- movd [r0+Y*FDEC_STRIDE], mm7
-%endrep
- movd [r0+3*FDEC_STRIDE], mm1
+ movd m1, [r0+0*FDEC_STRIDEB-4]
+ punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
+ movd m0, [r0+2*FDEC_STRIDEB-4]
+ punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
+ punpckhwd m1, m0
+ movq m0, m1
+ punpckhbw m1, m1
+ pshufw m1, m1, q3333
+ punpckhdq m0, m1
+ movq m2, m0
+ movq m3, m0
+ movq m5, m0
+ psrlq m3, 8
+ psrlq m2, 16
+ pavgb m5, m3
+ PRED8x8_LOWPASS m3, m0, m2, m3, m4
+ movd [r0+3*FDEC_STRIDEB], m1
+ punpcklbw m5, m3
+ movd [r0+0*FDEC_STRIDEB], m5
+ psrlq m5, 16
+ movd [r0+1*FDEC_STRIDEB], m5
+ psrlq m5, 16
+ movd [r0+2*FDEC_STRIDEB], m5
RET
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_4x4_vl( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_4x4_V1 3
-cglobal predict_4x4_vl, 1,1,6
+%macro PREDICT_4x4_V1 1
+cglobal predict_4x4_vl, 1,1
movu m1, [r0-FDEC_STRIDEB]
- psrl%1 m3, m1, %2
- psrl%1 m2, m1, %2*2
- pavg%3 m4, m3, m1
- PRED8x8_LOWPASS %3, m0, m1, m2, m3, m5
+ PSRLPIX m3, m1, 1
+ PSRLPIX m2, m1, 2
+ pavg%1 m4, m3, m1
+ PRED8x8_LOWPASS m0, m1, m2, m3, m5
movh [r0+0*FDEC_STRIDEB], m4
movh [r0+1*FDEC_STRIDEB], m0
- psrl%1 m4, %2
- psrl%1 m0, %2
+ PSRLPIX m4, m4, 1
+ PSRLPIX m0, m0, 1
movh [r0+2*FDEC_STRIDEB], m4
movh [r0+3*FDEC_STRIDEB], m0
RET
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-PREDICT_4x4_V1 dq, 2, w
-%ifdef ARCH_X86_64
+PREDICT_4x4_V1 w
INIT_XMM avx
-PREDICT_4x4_V1 dq, 2, w
-%endif
+PREDICT_4x4_V1 w
INIT_MMX mmx2
cglobal predict_4x4_vl, 1,4
mova m1, [r0-FDEC_STRIDEB+0]
mova m2, [r0-FDEC_STRIDEB+8]
- mova m3, m2
- PALIGNR m2, m1, 4, m6
- PALIGNR m3, m1, 2, m5
- mova m4, m3
- pavgw m4, m1
- mova [r0+0*FDEC_STRIDEB], m4
- psrlq m4, 16
- mova [r0+2*FDEC_STRIDEB], m4
- PRED8x8_LOWPASS w, m0, m1, m2, m3, m6
+ mova m0, m2
+ PALIGNR m2, m1, 4, m4
+ PALIGNR m0, m1, 2, m4
+ mova m3, m0
+ pavgw m3, m1
+ mova [r0+0*FDEC_STRIDEB], m3
+ psrlq m3, 16
+ mova [r0+2*FDEC_STRIDEB], m3
+ PRED8x8_LOWPASS m0, m1, m2, m0
mova [r0+1*FDEC_STRIDEB], m0
psrlq m0, 16
mova [r0+3*FDEC_STRIDEB], m0
mov [r0+2*FDEC_STRIDEB+6], r1w
mov [r0+3*FDEC_STRIDEB+6], r3w
RET
-%else
+%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
-PREDICT_4x4_V1 q, 8, b
+PREDICT_4x4_V1 b
%endif
;-----------------------------------------------------------------------------
mova [r0+3*FDEC_STRIDEB], m0
RET
-%else
+%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_dc_mmx2, 1,4
pxor mm7, mm7
- movd mm0, [r0-FDEC_STRIDE]
+ movd mm0, [r0-FDEC_STRIDEB]
psadbw mm0, mm7
movd r3d, mm0
movzx r1d, byte [r0-1]
-%assign n 1
+%assign Y 1
%rep 3
- movzx r2d, byte [r0+FDEC_STRIDE*n-1]
+ movzx r2d, byte [r0+FDEC_STRIDEB*Y-1]
add r1d, r2d
-%assign n n+1
+%assign Y Y+1
%endrep
lea r1d, [r1+r3+4]
shr r1d, 3
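    ; r1d = dc = (sum_left + sum_top + 4) >> 3; the imul below replicates the dc
    ; byte into all four bytes so each row is stored with a single dword mov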
imul r1d, 0x01010101
- mov [r0+FDEC_STRIDE*0], r1d
- mov [r0+FDEC_STRIDE*1], r1d
- mov [r0+FDEC_STRIDE*2], r1d
- mov [r0+FDEC_STRIDE*3], r1d
+ mov [r0+FDEC_STRIDEB*0], r1d
+ mov [r0+FDEC_STRIDEB*1], r1d
+ mov [r0+FDEC_STRIDEB*2], r1d
+ mov [r0+FDEC_STRIDEB*3], r1d
RET
%endif ; HIGH_BIT_DEPTH
-%macro PREDICT_FILTER 5
+%macro PREDICT_FILTER 4
;-----------------------------------------------------------------------------
; void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
punpckh%3%4 m3, m1
mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
mova m1, [src-1*FDEC_STRIDEB]
- mova m4, m3
- mova m2, m3
- PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
- PALIGNR m1, m2, 1*SIZEOF_PIXEL, m2
- PRED8x8_LOWPASS %1, m2, m1, m4, m3, m5
- mova [t1+8*SIZEOF_PIXEL], m2
+ PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0
+ PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2
+ PRED8x8_LOWPASS m3, m1, m4, m3, m5
+ mova [t1+8*SIZEOF_PIXEL], m3
movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
lea t4d, [t4*3+2]
psrldq m1, m3, 15
PALIGNR m2, m3, m0, 15, m0
PALIGNR m1, m3, 1, m5
- PRED8x8_LOWPASS %1, m0, m2, m1, m3, m5
+ PRED8x8_LOWPASS m0, m2, m1, m3, m5
mova [t1+16*SIZEOF_PIXEL], m0
psrldq m0, 15
movd [t1+32*SIZEOF_PIXEL], m0
.do_top:
PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0
PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5
- PRED8x8_LOWPASS %1, m4, m2, m0, m3, m5
+ PRED8x8_LOWPASS m4, m2, m0, m3, m5
mova [t1+16*SIZEOF_PIXEL], m4
test r3b, 4
je .done
- mova m2, m1
- mova m4, m1
- psrl%4 m5, m1, 7*%5
- PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
- PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4
- PRED8x8_LOWPASS %1, m0, m2, m5, m1, m4
+ PSRLPIX m5, m1, 7
+ PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3
+ PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4
+ PRED8x8_LOWPASS m0, m2, m5, m1, m4
mova [t1+24*SIZEOF_PIXEL], m0
- psrl%4 m0, 7*%5
+ PSRLPIX m0, m0, 7
movd [t1+32*SIZEOF_PIXEL], m0
.done:
REP_RET
.fix_lt_2:
- psll%4 m0, m3, 7*%5
+ PSLLPIX m0, m3, 7
test r2b, 4
jne .do_top
.fix_tr_1:
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-PREDICT_FILTER w, d, q, dq, 2
+PREDICT_FILTER w, d, q, dq
INIT_XMM ssse3
-PREDICT_FILTER w, d, q, dq, 2
+PREDICT_FILTER w, d, q, dq
INIT_XMM avx
-PREDICT_FILTER w, d, q, dq, 2
+PREDICT_FILTER w, d, q, dq
%else
INIT_MMX mmx2
-PREDICT_FILTER b, w, d, q , 8
+PREDICT_FILTER b, w, d, q
INIT_MMX ssse3
-PREDICT_FILTER b, w, d, q , 8
+PREDICT_FILTER b, w, d, q
%endif
;-----------------------------------------------------------------------------
add r0, 4*FDEC_STRIDEB
punpckl%1 m2, m1, m1
punpckh%1 m1, m1
-%assign n 0
+%assign Y 0
%rep 8
-%assign i 1+n/4
- SPLAT%2 m0, m %+ i, (3-n)&3
- mova [r0+(n-4)*FDEC_STRIDEB], m0
-%assign n n+1
+%assign i 1+Y/4
+ SPLAT%2 m0, m %+ i, (3-Y)&3
+ mova [r0+(Y-4)*FDEC_STRIDEB], m0
+%assign Y Y+1
%endrep
RET
%endmacro
STORE8x8 m0, m0
REP_RET
-%else
+%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x8_dc_mmx2, 2,2
pxor mm0, mm0
PREDICT_8x8_DC predict_8x8_dc_top_sse2 , 32, mova
PREDICT_8x8_DC predict_8x8_dc_left_sse2, 14, movu
-%else
+%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
cglobal %1, 2,2
pxor mm0, mm0
; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
; size on the 8-bit mmx functions below if we know sse2 is available.
-%macro PREDICT_8x8 3
+%macro PREDICT_8x8_DDLR 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl, 2,2,8
- mova m5, [r1+16*SIZEOF_PIXEL]
- movu m2, [r1+17*SIZEOF_PIXEL]
- movu m3, [r1+23*SIZEOF_PIXEL]
- movu m4, [r1+25*SIZEOF_PIXEL]
- psll%2 m1, m5, %3
+cglobal predict_8x8_ddl, 2,2,7
+ mova m0, [r1+16*SIZEOF_PIXEL]
+ mova m1, [r1+24*SIZEOF_PIXEL]
+%if cpuflag(cache64)
+ movd m5, [r1+32*SIZEOF_PIXEL]
+ palignr m3, m1, m0, 1*SIZEOF_PIXEL
+ palignr m5, m5, m1, 1*SIZEOF_PIXEL
+ palignr m4, m1, m0, 7*SIZEOF_PIXEL
+%else
+ movu m3, [r1+17*SIZEOF_PIXEL]
+ movu m4, [r1+23*SIZEOF_PIXEL]
+ movu m5, [r1+25*SIZEOF_PIXEL]
+%endif
+ PSLLPIX m2, m0, 1
add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %1, m0, m1, m2, m5, m7
-%assign %%bak avx_enabled
-%assign avx_enabled 0
- PRED8x8_LOWPASS %1, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
-%assign avx_enabled %%bak
-%assign Y 3
+ PRED8x8_LOWPASS m0, m2, m3, m0, m6
+ PRED8x8_LOWPASS m1, m4, m5, m1, m6
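+    ; m0/m1 = lowpassed t0..t7 / t8..t15; each row going up realigns the pair
+    ; one pixel closer to t0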
+ mova [r0+3*FDEC_STRIDEB], m1
+%assign Y 2
%rep 6
+ PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
+ PSLLPIX m0, m0, 1
mova [r0+Y*FDEC_STRIDEB], m1
- psll%2 m1, %3
- psrl%2 m2, m0, 7*%3
- psll%2 m0, %3
- por m1, m2
%assign Y (Y-1)
%endrep
- mova [r0+Y*FDEC_STRIDEB], m1
- psll%2 m1, %3
- psrl%2 m0, 7*%3
- por m1, m0
-%assign Y (Y-1)
+ PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
mova [r0+Y*FDEC_STRIDEB], m1
RET
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%if avx_enabled == 0
cglobal predict_8x8_ddr, 2,2,7
- movu m1, [r1+ 7*SIZEOF_PIXEL]
- movu m2, [r1+ 9*SIZEOF_PIXEL]
- movu m3, [r1+15*SIZEOF_PIXEL]
- movu m4, [r1+17*SIZEOF_PIXEL]
add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %1, m0, m1, m2, [r1+ 8*SIZEOF_PIXEL], m5
- PRED8x8_LOWPASS %1, m1, m3, m4, [r1+16*SIZEOF_PIXEL], m6
-%assign Y 3
+ mova m0, [r1+ 8*SIZEOF_PIXEL]
+ mova m1, [r1+16*SIZEOF_PIXEL]
+    ; edge[] is 32-byte aligned, so some of the unaligned loads are known not to be cacheline-split
+ movu m2, [r1+ 7*SIZEOF_PIXEL]
+ movu m5, [r1+17*SIZEOF_PIXEL]
+%if cpuflag(cache64)
+ palignr m3, m1, m0, 1*SIZEOF_PIXEL
+ palignr m4, m1, m0, 7*SIZEOF_PIXEL
+%else
+ movu m3, [r1+ 9*SIZEOF_PIXEL]
+ movu m4, [r1+15*SIZEOF_PIXEL]
+%endif
+ PRED8x8_LOWPASS m0, m2, m3, m0, m6
+ PRED8x8_LOWPASS m1, m4, m5, m1, m6
+ mova [r0+3*FDEC_STRIDEB], m0
+%assign Y -4
%rep 6
- mova [r0+Y*FDEC_STRIDEB], m0
- psrl%2 m0, %3
- psll%2 m2, m1, 7*%3
- psrl%2 m1, %3
- por m0, m2
-%assign Y (Y-1)
+ PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
+ PSLLPIX m0, m0, 1
+ mova [r0+Y*FDEC_STRIDEB], m1
+%assign Y (Y+1)
%endrep
- mova [r0+Y*FDEC_STRIDEB], m0
- psrl%2 m0, %3
- psll%2 m1, 7*%3
- por m0, m1
-%assign Y (Y-1)
- mova [r0+Y*FDEC_STRIDEB], m0
+ PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
+ mova [r0+Y*FDEC_STRIDEB], m1
RET
-%endif
-%endmacro ; PREDICT_8x8
+%endmacro ; PREDICT_8x8_DDLR
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-PREDICT_8x8 w, dq, 2
-INIT_XMM avx
-PREDICT_8x8 w, dq, 2
+PREDICT_8x8_DDLR
+INIT_XMM ssse3
+PREDICT_8x8_DDLR
+INIT_XMM ssse3, cache64
+PREDICT_8x8_DDLR
%elifndef ARCH_X86_64
INIT_MMX mmx2
-PREDICT_8x8 b, q , 8
+PREDICT_8x8_DDLR
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_hu( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HU 5
+%macro PREDICT_8x8_HU 2
cglobal predict_8x8_hu, 2,2,8
- movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
add r0, 4*FDEC_STRIDEB
- pshuf%3 m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
- psll%2 m1, 7*%5 ; l7 .. .. .. .. .. .. ..
+%ifdef HIGH_BIT_DEPTH
+%if cpuflag(ssse3)
+ movu m5, [r1+7*SIZEOF_PIXEL]
+ pshufb m5, [pw_reverse]
+%else
+ movq m6, [r1+7*SIZEOF_PIXEL]
+ movq m5, [r1+11*SIZEOF_PIXEL]
+ pshuflw m6, m6, q0123
+ pshuflw m5, m5, q0123
+ movlhps m5, m6
+%endif ; cpuflag
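+    ; m5 now holds the eight left pixels in reversed order on either path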
+ psrldq m2, m5, 2
+ pshufd m3, m5, q0321
+ pshufhw m2, m2, q2210
+ pshufhw m3, m3, q1110
+ pavgw m4, m5, m2
+%else ; !HIGH_BIT_DEPTH
+ movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
+ pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
+ psllq m1, 56 ; l7 .. .. .. .. .. .. ..
mova m2, m0
- psll%3 m0, 8*SIZEOF_PIXEL
- psrl%3 m2, 8*SIZEOF_PIXEL
- por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
+ psllw m0, 8
+ psrlw m2, 8
+ por m2, m0
+ mova m3, m2
mova m4, m2
- mova m5, m2
- psrl%2 m3, m2, 2*%5
- psrl%2 m2, %5
+ mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0
+ psrlq m3, 16
+ psrlq m2, 8
por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckh%4 m1, m1
+ punpckhbw m1, m1
por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavg%1 m4, m2
- PRED8x8_LOWPASS %1, m1, m3, m5, m2, m6
- punpckh%4 m5, m4, m1 ; p8 p7 p6 p5
- punpckl%4 m4, m1 ; p4 p3 p2 p1
- mova m6, m5
- mova m7, m5
- mova m0, m5
- PALIGNR m5, m4, 2*SIZEOF_PIXEL, m1
- pshuf%3 m1, m6, q3321
- PALIGNR m6, m4, 4*SIZEOF_PIXEL, m2
- pshuf%3 m2, m7, q3332
- PALIGNR m7, m4, 6*SIZEOF_PIXEL, m3
- pshuf%3 m3, m0, q3333
+ pavgb m4, m2
+%endif ; !HIGH_BIT_DEPTH
+ PRED8x8_LOWPASS m2, m3, m5, m2, m6
+ punpckh%2 m0, m4, m2 ; p8 p7 p6 p5
+ punpckl%2 m4, m2 ; p4 p3 p2 p1
+ PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3
+ pshuf%1 m1, m0, q3321
+ PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3
+ pshuf%1 m2, m0, q3332
+ PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3
+ pshuf%1 m3, m0, q3333
mova [r0-4*FDEC_STRIDEB], m4
mova [r0-3*FDEC_STRIDEB], m5
mova [r0-2*FDEC_STRIDEB], m6
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-PREDICT_8x8_HU w, dq, d, wd, 2
+PREDICT_8x8_HU d, wd
INIT_XMM ssse3
-PREDICT_8x8_HU w, dq, d, wd, 2
+PREDICT_8x8_HU d, wd
INIT_XMM avx
-PREDICT_8x8_HU w, dq, d, wd, 2
+PREDICT_8x8_HU d, wd
%elifndef ARCH_X86_64
INIT_MMX mmx2
-PREDICT_8x8_HU b, q , w, bw, 8
+PREDICT_8x8_HU w, bw
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_vr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_VR 3
-cglobal predict_8x8_vr, 2,3,7
+%macro PREDICT_8x8_VR 1
+cglobal predict_8x8_vr, 2,3
mova m2, [r1+16*SIZEOF_PIXEL]
+%ifidn cpuname, ssse3
+ mova m0, [r1+8*SIZEOF_PIXEL]
+ palignr m3, m2, m0, 7*SIZEOF_PIXEL
+ palignr m1, m2, m0, 6*SIZEOF_PIXEL
+%else
movu m3, [r1+15*SIZEOF_PIXEL]
movu m1, [r1+14*SIZEOF_PIXEL]
+%endif
pavg%1 m4, m3, m2
add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %1, m0, m1, m2, m3, m5
+ PRED8x8_LOWPASS m3, m1, m2, m3, m5
mova [r0-4*FDEC_STRIDEB], m4
- mova [r0-3*FDEC_STRIDEB], m0
- mova m5, m0
- mova m6, m4
+ mova [r0-3*FDEC_STRIDEB], m3
mova m1, [r1+8*SIZEOF_PIXEL]
- mova m2, m1
- psll%2 m2, %3
- mova m3, m1
- psll%2 m3, 2*%3
- PRED8x8_LOWPASS %1, m0, m1, m3, m2, m4
+ PSLLPIX m0, m1, 1
+ PSLLPIX m2, m1, 2
+ PRED8x8_LOWPASS m0, m1, m2, m0, m6
%assign Y -2
%rep 5
- %assign i (5 + ((Y+3)&1))
- PALIGNR m %+ i, m0, 7*SIZEOF_PIXEL, m2
- mova [r0+Y*FDEC_STRIDEB], m %+ i
- psll%2 m0, %3
+ PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5
+ mova [r0+Y*FDEC_STRIDEB], m4
+ PSLLPIX m0, m0, 1
+ SWAP 3, 4
%assign Y (Y+1)
%endrep
- PALIGNR m5, m0, 7*SIZEOF_PIXEL, m0
- mova [r0+Y*FDEC_STRIDEB], m5
+ PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
+ mova [r0+Y*FDEC_STRIDEB], m4
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-PREDICT_8x8_VR w, dq, 2
+PREDICT_8x8_VR w
INIT_XMM ssse3
-PREDICT_8x8_VR w, dq, 2
+PREDICT_8x8_VR w
INIT_XMM avx
-PREDICT_8x8_VR w, dq, 2
+PREDICT_8x8_VR w
%elifndef ARCH_X86_64
INIT_MMX mmx2
-PREDICT_8x8_VR b, q , 8
+PREDICT_8x8_VR b
+%endif
+
+%macro LOAD_PLANE_ARGS 0
+%ifdef ARCH_X86_64
+ movd mm0, r1d
+ movd mm2, r2d
+ movd mm4, r3d
+ pshufw mm0, mm0, 0
+ pshufw mm2, mm2, 0
+ pshufw mm4, mm4, 0
+%else
+ pshufw mm0, r1m, 0
+ pshufw mm2, r2m, 0
+ pshufw mm4, r3m, 0
%endif
+%endmacro
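+; broadcast the low word of each plane coefficient (i00, b, c in r1..r3) into
+; all four words of mm0/mm2/mm4; on x86_32 the args are read from the stack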
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2
- movdqa xmm3, [r1+16]
- movdqu xmm2, [r1+17]
- pslldq xmm1, xmm3, 1
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
+ mova m0, [r1+16]
+%ifidn cpuname, ssse3
+ movd m2, [r1+32]
+ palignr m2, m0, 1
+%else
+ movu m2, [r1+17]
+%endif
+ pslldq m1, m0, 1
+ add r0, FDEC_STRIDE*4
+ PRED8x8_LOWPASS m0, m1, m2, m0, m3
%assign Y -4
%rep 8
- psrldq xmm0, 1
- movq [r0+Y*FDEC_STRIDE], xmm0
+ psrldq m0, 1
+ movq [r0+Y*FDEC_STRIDE], m0
%assign Y (Y+1)
%endrep
RET
+%ifnidn cpuname, ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2
- movdqu xmm3, [r1+8]
- movdqu xmm1, [r1+7]
- psrldq xmm2, xmm3, 1
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
+ movu m0, [r1+8]
+ movu m1, [r1+7]
+ psrldq m2, m0, 1
+ add r0, FDEC_STRIDE*4
+ PRED8x8_LOWPASS m0, m1, m2, m0, m3
- psrldq xmm1, xmm0, 1
+ psrldq m1, m0, 1
%assign Y 3
%rep 3
- movq [r0+Y*FDEC_STRIDE], xmm0
- movq [r0+(Y-1)*FDEC_STRIDE], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
+ movq [r0+Y*FDEC_STRIDE], m0
+ movq [r0+(Y-1)*FDEC_STRIDE], m1
+ psrldq m0, 2
+ psrldq m1, 2
%assign Y (Y-2)
%endrep
- movq [r0-3*FDEC_STRIDE], xmm0
- movq [r0-4*FDEC_STRIDE], xmm1
+ movq [r0-3*FDEC_STRIDE], m0
+ movq [r0-4*FDEC_STRIDE], m1
RET
;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl, 2,2
- movdqa xmm4, [r1+16]
- pslldq xmm1, xmm4, 1
- psrldq xmm2, xmm4, 1
- pavgb xmm3, xmm4, xmm2
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5
-; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
-; xmm3: (t0 + t1 + 1) >> 1
+ mova m0, [r1+16]
+ pslldq m1, m0, 1
+ psrldq m2, m0, 1
+ pavgb m3, m0, m2
+ add r0, FDEC_STRIDE*4
+ PRED8x8_LOWPASS m0, m1, m2, m0, m5
+; m0: (t0 + 2*t1 + t2 + 2) >> 2
+; m3: (t0 + t1 + 1) >> 1
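+; even rows get the half-pel average (m3), odd rows the lowpass (m0), advancing
+; one pixel to the right every two rows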
%assign Y -4
%rep 3
- psrldq xmm0, 1
- movq [r0+ Y *FDEC_STRIDE], xmm3
- movq [r0+(Y+1)*FDEC_STRIDE], xmm0
- psrldq xmm3, 1
+ psrldq m0, 1
+ movq [r0+ Y *FDEC_STRIDE], m3
+ movq [r0+(Y+1)*FDEC_STRIDE], m0
+ psrldq m3, 1
%assign Y (Y+2)
%endrep
- psrldq xmm0, 1
- movq [r0+ Y *FDEC_STRIDE], xmm3
- movq [r0+(Y+1)*FDEC_STRIDE], xmm0
-
+ psrldq m0, 1
+ movq [r0+ Y *FDEC_STRIDE], m3
+ movq [r0+(Y+1)*FDEC_STRIDE], m0
RET
+%endif ; !ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vr, 2,2,7
- movdqu xmm0, [r1+8]
- movdqa xmm6, [pw_ff00]
+cglobal predict_8x8_vr, 2,2
+ movu m2, [r1+8]
add r0, 4*FDEC_STRIDE
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- pslldq xmm1, xmm0, 2
- pslldq xmm0, 1
- pavgb xmm2, xmm0
- PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5
- pandn xmm6, xmm4
- movdqa xmm5, xmm4
- psrlw xmm4, 8
- packuswb xmm6, xmm4
- movhlps xmm4, xmm6
- movhps [r0-3*FDEC_STRIDE], xmm5
- movhps [r0-4*FDEC_STRIDE], xmm2
- psrldq xmm5, 4
- movss xmm5, xmm6
- psrldq xmm2, 4
- movss xmm2, xmm4
-%assign Y 3
-%rep 3
- psrldq xmm5, 1
- psrldq xmm2, 1
- movq [r0+Y*FDEC_STRIDE], xmm5
- movq [r0+(Y-1)*FDEC_STRIDE], xmm2
-%assign Y (Y-2)
-%endrep
+ pslldq m1, m2, 2
+ pslldq m0, m2, 1
+ pavgb m3, m2, m0
+ PRED8x8_LOWPASS m0, m2, m1, m0, m4
+ movhps [r0-4*FDEC_STRIDE], m3
+ movhps [r0-3*FDEC_STRIDE], m0
+%if cpuflag(ssse3)
+ movhlps m3, m3
+ pshufb m0, [shuf_vr]
+ palignr m3, m0, 13
+%else
+ mova m2, m0
+ mova m1, [pw_00ff]
+ pand m1, m0
+ psrlw m0, 8
+ packuswb m1, m0
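+    ; de-interleave the lowpass result: even bytes into the low half, odd bytes
+    ; into the high half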
+ pslldq m1, 4
+ movhlps m3, m1
+ shufps m1, m2, q3210
+ psrldq m3, 5
+ psrldq m1, 5
+ SWAP 0, 1
+%endif
+ movq [r0+3*FDEC_STRIDE], m0
+ movq [r0+2*FDEC_STRIDE], m3
+ psrldq m0, 1
+ psrldq m3, 1
+ movq [r0+1*FDEC_STRIDE], m0
+ movq [r0+0*FDEC_STRIDE], m3
+ psrldq m0, 1
+ psrldq m3, 1
+ movq [r0-1*FDEC_STRIDE], m0
+ movq [r0-2*FDEC_STRIDE], m3
RET
%endmacro ; PREDICT_8x8
INIT_XMM sse2
PREDICT_8x8
+INIT_XMM ssse3
+PREDICT_8x8
INIT_XMM avx
PREDICT_8x8
;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HD 4
-cglobal predict_8x8_hd, 2,2,8
+%macro PREDICT_8x8_HD 2
+cglobal predict_8x8_hd, 2,2
add r0, 4*FDEC_STRIDEB
- mova m0, [r1] ; l7 .. .. .. .. .. .. ..
- mova m1, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
+ mova m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
+ movu m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
+%ifidn cpuname, ssse3
mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0
- mova m3, m1 ; lt l0 l1 l2 l3 l4 l5 l6
mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0
- PALIGNR m2, m1, 7*SIZEOF_PIXEL, m5 ; t6 t5 t4 t3 t2 t1 t0 lt
- PALIGNR m1, m0, 7*SIZEOF_PIXEL, m6 ; l0 l1 l2 l3 l4 l5 l6 l7
- PALIGNR m4, m3, 1*SIZEOF_PIXEL, m7 ; t0 lt l0 l1 l2 l3 l4 l5
- mova m5, m3
- pavg%1 m3, m1
- PRED8x8_LOWPASS %1, m0, m4, m1, m5, m7
- psrl%2 m4, m2, 2*%4 ; .. .. t6 t5 t4 t3 t2 t1
- psrl%2 m1, m2, %4 ; .. t6 t5 t4 t3 t2 t1 t0
- PRED8x8_LOWPASS %1, m6, m4, m2, m1, m5
+ palignr m2, m0, 7*SIZEOF_PIXEL ; t6 t5 t4 t3 t2 t1 t0 lt
+ palignr m4, m0, 1*SIZEOF_PIXEL ; t0 lt l0 l1 l2 l3 l4 l5
+%else
+ movu m2, [r1+15*SIZEOF_PIXEL]
+ movu m4, [r1+ 9*SIZEOF_PIXEL]
+%endif ; cpuflag
+ pavg%1 m3, m0, m1
+ PRED8x8_LOWPASS m0, m4, m1, m0, m5
+ PSRLPIX m4, m2, 2 ; .. .. t6 t5 t4 t3 t2 t1
+ PSRLPIX m1, m2, 1 ; .. t6 t5 t4 t3 t2 t1 t0
+ PRED8x8_LOWPASS m1, m4, m2, m1, m5
; .. p11 p10 p9
- punpckh%3 m7, m3, m0 ; p8 p7 p6 p5
- punpckl%3 m3, m0 ; p4 p3 p2 p1
- mova m1, m7
- mova m0, m7
- mova m4, m7
+ punpckh%2 m2, m3, m0 ; p8 p7 p6 p5
+ punpckl%2 m3, m0 ; p4 p3 p2 p1
mova [r0+3*FDEC_STRIDEB], m3
- PALIGNR m7, m3, 2*SIZEOF_PIXEL, m5
- mova [r0+2*FDEC_STRIDEB], m7
- PALIGNR m1, m3, 4*SIZEOF_PIXEL, m5
- mova [r0+1*FDEC_STRIDEB], m1
- PALIGNR m0, m3, 6*SIZEOF_PIXEL, m3
+ PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5
+ mova [r0+2*FDEC_STRIDEB], m0
+ PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5
+ mova [r0+1*FDEC_STRIDEB], m0
+ PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3
mova [r0+0*FDEC_STRIDEB], m0
- mova m2, m6
- mova m3, m6
- mova [r0-1*FDEC_STRIDEB], m4
- PALIGNR m6, m4, 2*SIZEOF_PIXEL, m5
- mova [r0-2*FDEC_STRIDEB], m6
- PALIGNR m2, m4, 4*SIZEOF_PIXEL, m5
- mova [r0-3*FDEC_STRIDEB], m2
- PALIGNR m3, m4, 6*SIZEOF_PIXEL, m4
- mova [r0-4*FDEC_STRIDEB], m3
+ mova [r0-1*FDEC_STRIDEB], m2
+ PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5
+ mova [r0-2*FDEC_STRIDEB], m0
+ PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5
+ mova [r0-3*FDEC_STRIDEB], m0
+ PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2
+ mova [r0-4*FDEC_STRIDEB], m1
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-PREDICT_8x8_HD w, dq, wd, 2
+PREDICT_8x8_HD w, wd
INIT_XMM ssse3
-PREDICT_8x8_HD w, dq, wd, 2
+PREDICT_8x8_HD w, wd
INIT_XMM avx
-PREDICT_8x8_HD w, dq, wd, 2
+PREDICT_8x8_HD w, wd
%else
INIT_MMX mmx2
-PREDICT_8x8_HD b, q , bw, 8
+PREDICT_8x8_HD b, bw
;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 0
cglobal predict_8x8_hd, 2,2
- add r0, 4*FDEC_STRIDE
- movdqa xmm0, [r1]
- movdqa xmm1, [r1+16]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- PALIGNR xmm1, xmm0, 7, xmm4
- PALIGNR xmm2, xmm0, 9, xmm5
- PALIGNR xmm3, xmm0, 8, xmm0
- pavgb xmm4, xmm1, xmm3
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5
- punpcklbw xmm4, xmm0
- movhlps xmm0, xmm4
+ add r0, 4*FDEC_STRIDE
+ movu m1, [r1+7]
+ movu m3, [r1+8]
+ movu m2, [r1+9]
+ pavgb m4, m1, m3
+ PRED8x8_LOWPASS m0, m1, m2, m3, m5
+ punpcklbw m4, m0
+ movhlps m0, m4
%assign Y 3
%rep 3
- movq [r0+(Y)*FDEC_STRIDE], xmm4
- movq [r0+(Y-4)*FDEC_STRIDE], xmm0
- psrldq xmm4, 2
- psrldq xmm0, 2
+ movq [r0+(Y)*FDEC_STRIDE], m4
+ movq [r0+(Y-4)*FDEC_STRIDE], m0
+ psrldq m4, 2
+ psrldq m0, 2
%assign Y (Y-1)
%endrep
- movq [r0+(Y)*FDEC_STRIDE], xmm4
- movq [r0+(Y-4)*FDEC_STRIDE], xmm0
+ movq [r0+(Y)*FDEC_STRIDE], m4
+ movq [r0+(Y-4)*FDEC_STRIDE], m0
RET
%endmacro
INIT_XMM sse2
PREDICT_8x8_HD
-INIT_XMM ssse3
-PREDICT_8x8_HD
INIT_XMM avx
PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH
+%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HU 0
-cglobal predict_8x8_hu, 2,2
+INIT_MMX
+cglobal predict_8x8_hu_sse2, 2,2
add r0, 4*FDEC_STRIDE
-%if cpuflag(ssse3)
- movq mm5, [r1+7]
- movq mm6, [pb_reverse]
- movq mm1, mm5
- movq mm2, mm5
- movq mm3, mm5
- pshufb mm5, mm6
- psrlq mm6, 8
- pshufb mm2, mm6
- psrlq mm6, 8
- pshufb mm3, mm6
- movq mm4, mm5
-%else
movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
movq mm2, mm0
por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckhbw mm1, mm1
por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
-%endif
pavgb mm4, mm2
- PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
+ PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
movq2dq xmm0, mm4
movq2dq xmm1, mm1
movq [r0+2*FDEC_STRIDE], mm6
movq [r0+3*FDEC_STRIDE], mm7
RET
-%endmacro
-%ifndef HIGH_BIT_DEPTH
-INIT_MMX sse2
-PREDICT_8x8_HU
-INIT_MMX ssse3
-PREDICT_8x8_HU
-%endif
+INIT_XMM
+cglobal predict_8x8_hu_ssse3, 2,2
+ add r0, 4*FDEC_STRIDE
+ movq m3, [r1+7]
+ pshufb m3, [shuf_hu]
+ psrldq m1, m3, 1
+ psrldq m2, m3, 2
+ pavgb m0, m1, m3
+ PRED8x8_LOWPASS m1, m3, m2, m1, m4
+ punpcklbw m0, m1
+%assign Y -4
+%rep 3
+ movq [r0+ Y *FDEC_STRIDE], m0
+ movhps [r0+(Y+4)*FDEC_STRIDE], m0
+ psrldq m0, 2
+    pshufhw m0, m0, q2210 ; re-duplicate the last word after the shift
+%assign Y (Y+1)
+%endrep
+ movq [r0+ Y *FDEC_STRIDE], m0
+ movhps [r0+(Y+4)*FDEC_STRIDE], m0
+ RET
+%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8c_v( uint8_t *src )
cglobal predict_8x8c_v_mmx, 1,1
mova m0, [r0 - FDEC_STRIDEB]
mova m1, [r0 - FDEC_STRIDEB + 8]
-%assign n 0
+%assign Y 0
%rep 8
- mova [r0 + (n&1)*FDEC_STRIDEB], m0
- mova [r0 + (n&1)*FDEC_STRIDEB + 8], m1
-%if (n&1) && (n!=7)
+ mova [r0 + (Y&1)*FDEC_STRIDEB], m0
+ mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
+%if (Y&1) && (Y!=7)
add r0, FDEC_STRIDEB*2
%endif
-%assign n n+1
+%assign Y Y+1
%endrep
RET
INIT_XMM sse2
cglobal predict_8x8c_h, 1,1
add r0, FDEC_STRIDEB*4
-%assign n -4
+%assign Y -4
%rep 8
- movd m0, [r0+FDEC_STRIDEB*n-SIZEOF_PIXEL*2]
+ movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
SPLATW m0, m0, 1
- mova [r0+FDEC_STRIDEB*n], m0
-%assign n n+1
+ mova [r0+FDEC_STRIDEB*Y], m0
+%assign Y Y+1
%endrep
RET
-%else
+%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8C_H 0
cglobal predict_8x8c_h, 1,1
mova m1, [pb_3]
%endif
add r0, FDEC_STRIDE*4
-%assign n -4
+%assign Y -4
%rep 8
- SPLATB_LOAD m0, r0+FDEC_STRIDE*n-1, m1
- mova [r0+FDEC_STRIDE*n], m0
-%assign n n+1
+ SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1
+ mova [r0+FDEC_STRIDE*Y], m0
+%assign Y Y+1
%endrep
RET
%endmacro
movq m1, [r0-FDEC_STRIDEB+8]
HADDW m0, m2
HADDW m1, m2
-%else
+%else ; !HIGH_BIT_DEPTH
movd m0, [r0-FDEC_STRIDEB+0]
movd m1, [r0-FDEC_STRIDEB+4]
psadbw m0, m7 ; s0
punpcklwd xmm0, xmm0
pshufd xmm1, xmm0, q3322
punpckldq xmm0, xmm0
-%assign n 0
+%assign Y 0
%rep 8
-%assign i (0 + (n/4))
- movdqa [r0+FDEC_STRIDEB*(n-4)+0], xmm %+ i
-%assign n n+1
+%assign i (0 + (Y/4))
+ movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
+%assign Y Y+1
%endrep
-%else
+%else ; !sse2
pshufw m1, m0, q0000
pshufw m2, m0, q1111
pshufw m3, m0, q2222
pshufw m4, m0, q3333
-%assign n 0
+%assign Y 0
%rep 8
-%assign i (1 + (n/4)*2)
-%assign j (2 + (n/4)*2)
- movq [r0+FDEC_STRIDEB*(n-4)+0], m %+ i
- movq [r0+FDEC_STRIDEB*(n-4)+8], m %+ j
-%assign n n+1
+%assign i (1 + (Y/4)*2)
+%assign j (2 + (Y/4)*2)
+ movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
+ movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
+%assign Y Y+1
%endrep
%endif
-%else
+%else ; !HIGH_BIT_DEPTH
packuswb m0, m0
punpcklbw m0, m0
movq m1, m0
punpcklbw m0, m0
punpckhbw m1, m1
-%assign n 0
+%assign Y 0
%rep 8
-%assign i (0 + (n/4))
- movq [r0+FDEC_STRIDEB*(n-4)], m %+ i
-%assign n n+1
+%assign i (0 + (Y/4))
+ movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
+%assign Y Y+1
%endrep
%endif
RET
%endif
%ifdef HIGH_BIT_DEPTH
-
INIT_XMM
cglobal predict_8x8c_dc_top_sse2, 1,1
pxor m2, m2
pavgw m0, m2
STORE8x8 m0, m0
RET
-
-%else
-
+%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x8c_dc_top_mmx2, 1,1
movq mm0, [r0 - FDEC_STRIDE]
packuswb mm0, mm1 ; dc0,dc1 (b)
STORE8x8 mm0, mm0
RET
-
%endif
;-----------------------------------------------------------------------------
mova m1, [r0 - FDEC_STRIDEB+16]
STORE16x16_SSE2 m0, m1
REP_RET
-%else
+%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx, 1,2
movq m0, [r0 - FDEC_STRIDE + 0]
mov r1, 12*FDEC_STRIDEB
%ifdef HIGH_BIT_DEPTH
.vloop:
-%assign n 0
+%assign Y 0
%rep 4
- movd m0, [r0+r1+n*FDEC_STRIDEB-2*SIZEOF_PIXEL]
+ movd m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL]
SPLATW m0, m0, 1
- mova [r0+r1+n*FDEC_STRIDEB+ 0], m0
- mova [r0+r1+n*FDEC_STRIDEB+16], m0
+ mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0
+ mova [r0+r1+Y*FDEC_STRIDEB+16], m0
%if mmsize==8
- mova [r0+r1+n*FDEC_STRIDEB+ 8], m0
- mova [r0+r1+n*FDEC_STRIDEB+24], m0
+ mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0
+ mova [r0+r1+Y*FDEC_STRIDEB+24], m0
%endif
-%assign n n+1
+%assign Y Y+1
%endrep
-%else
+%else ; !HIGH_BIT_DEPTH
%if cpuflag(ssse3)
mova m1, [pb_3]
%endif
.vloop:
-%assign n 0
+%assign Y 0
%rep 4
- SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*n-1, m1
- mova [r0+r1+FDEC_STRIDE*n], m0
+ SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1
+ mova [r0+r1+FDEC_STRIDE*Y], m0
%if mmsize==8
- mova [r0+r1+FDEC_STRIDE*n+8], m0
+ mova [r0+r1+FDEC_STRIDE*Y+8], m0
%endif
-%assign n n+1
+%assign Y Y+1
%endrep
%endif ; HIGH_BIT_DEPTH
sub r1, 4*FDEC_STRIDEB
psrlw m0, %2
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
-%else
+%else ; !HIGH_BIT_DEPTH
pxor m0, m0
pxor m1, m1
psadbw m0, [r0 - FDEC_STRIDE]
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
REP_RET
-%else
+%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core_mmx2, 1,1
movd m0, r1m
pshufw m0, m0, 0
psrlw m0, %2
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
-%else
+%else ; !HIGH_BIT_DEPTH
pxor m0, m0
psadbw m0, [r0 - FDEC_STRIDE]
movhlps m1, m0
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
REP_RET
-%else
+%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core_sse2, 1,1
movd m0, r1m
SPLATW m0, m0