Reduces the number of macro args that need to be passed around.
Allows multiple implementations of a given macro (e.g. PALIGNR) to check
cpuflags at the location where the macro is defined, instead of having
to select implementations by %define at toplevel.
Removes INIT_AVX, as it's replaced by "INIT_XMM avx".
This commit does not change the stripped executable.
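
For readers unfamiliar with the new convention, a minimal sketch of the style
these hunks convert to is shown below. It assumes only the x86inc cpuflags
interface introduced by this change; ABS_W and abs_w are made-up illustration
names, not functions in x264.

    ; Illustrative only: one macro body, with per-cpu behaviour picked by cpuflag().
    %macro ABS_W 0
    cglobal abs_w, 2,2          ; abs_w(int16_t *dst, const int16_t *src)
        mova   m0, [r1]         ; assumes 16-byte-aligned buffers
    %if cpuflag(ssse3)          ; true for ssse3 and every cpu above it (incl. avx)
        pabsw  m0, m0
    %else
        pxor   m1, m1
        psubw  m1, m0
        pmaxsw m0, m1           ; abs via max(x, -x), the usual SSE2 idiom
    %endif
        mova   [r0], m0
        RET
    %endmacro

    INIT_XMM sse2
    ABS_W                       ; emits abs_w_sse2
    INIT_XMM ssse3
    ABS_W                       ; emits abs_w_ssse3
    INIT_XMM avx                ; replaces the old INIT_AVX
    ABS_W                       ; emits abs_w_avx

cglobal appends the suffix set by INIT_*, so the cpu name no longer has to be
threaded through every macro as an argument.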
jl %1
%endmacro
-%macro NAL_ESCAPE 1
+%macro NAL_ESCAPE 0
-cglobal nal_escape_%1, 3,5
+cglobal nal_escape, 3,5
mov r3w, [r1]
sub r1, r2 ; r1 = offset of current src pointer from end of src
pxor m4, m4
jmp .no_escape
%endmacro
-INIT_MMX
-NAL_ESCAPE mmx2
-INIT_XMM
-NAL_ESCAPE sse2
-INIT_AVX
-NAL_ESCAPE avx
+INIT_MMX mmx2
+NAL_ESCAPE
+INIT_XMM sse2
+NAL_ESCAPE
+INIT_XMM avx
+NAL_ESCAPE
movq m0, [r0]
ret
-INIT_MMX
-ALIGN 16
-dct8_mmx:
+cglobal dct8_mmx
DCT8_1D 0,1,2,3,4,5,6,7,r0
- SAVE_MM_PERMUTATION dct8_mmx
+ SAVE_MM_PERMUTATION
ret
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
cglobal sub8x8_dct8_mmx, 3,3
global sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
- INIT_MMX
+ RESET_MM_PERMUTATION
call load_diff_4x8_mmx
call dct8_mmx
UNSPILL r0, 0
UNSPILL r0, 4,6
TRANSPOSE4x4W 4,5,6,7,0
SPILL r0, 4,5,6,7
- INIT_MMX
+ RESET_MM_PERMUTATION
add r1, 4
add r2, 4
add r0, 8
movq mm1, m5
movq mm2, mm4
movq mm3, m7
- INIT_MMX
+ RESET_MM_PERMUTATION
UNSPILL r0+8, 4,5,6,7
add r0, 8
call dct8_mmx
sub r0, 8
SPILL r0+8, 1,2,3,5,7
- INIT_MMX
+ RESET_MM_PERMUTATION
UNSPILL r0, 0,1,2,3,4,5,6,7
call dct8_mmx
SPILL r0, 1,2,3,5,7
ret
-INIT_MMX
-ALIGN 16
-idct8_mmx:
+cglobal idct8_mmx
IDCT8_1D 0,1,2,3,4,5,6,7,r1
- SAVE_MM_PERMUTATION idct8_mmx
+ SAVE_MM_PERMUTATION
ret
%macro ADD_STORE_ROW 3
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
-%macro DCT_SUB8 1
-cglobal sub8x8_dct_%1, 3,3
+%macro DCT_SUB8 0
+cglobal sub8x8_dct, 3,3
add r2, 4*FDEC_STRIDE
-global sub8x8_dct_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
-%ifnidn %1, sse2
+%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal sub8x8_dct8_%1, 3,3
+cglobal sub8x8_dct8, 3,3
add r2, 4*FDEC_STRIDE
-global sub8x8_dct8_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
-%ifidn %1, sse2
+%if cpuflag(ssse3)
+ mova m7, [hsub_mul]
+ LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
+ SPILL r0, 0,1
+ SWAP 1, 7
+ LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
+ UNSPILL r0, 0,1
+%else
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
-%else
- mova m7, [hsub_mul]
- LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
- SPILL r0, 0,1
- SWAP 1, 7
- LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
- UNSPILL r0, 0,1
%endif
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
ret
%endmacro
-INIT_XMM
-%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
+INIT_XMM sse2
%define movdqa movaps
%define punpcklqdq movlhps
-DCT_SUB8 sse2
+DCT_SUB8
%undef movdqa
%undef punpcklqdq
-%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
-DCT_SUB8 ssse3
-INIT_AVX
-DCT_SUB8 avx
+INIT_XMM ssse3
+DCT_SUB8
+INIT_XMM avx
+DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-%macro ADD8x8 1
-cglobal add8x8_idct_%1, 2,2
+%macro ADD8x8 0
+cglobal add8x8_idct, 2,2
add r0, 4*FDEC_STRIDE
-global add8x8_idct_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
ret
%endmacro ; ADD8x8
-INIT_XMM
-ADD8x8 sse2
-INIT_AVX
-ADD8x8 avx
+INIT_XMM sse2
+ADD8x8
+INIT_XMM avx
+ADD8x8
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-%macro ADD8x8_IDCT8 1
-cglobal add8x8_idct8_%1, 2,2
+%macro ADD8x8_IDCT8 0
+cglobal add8x8_idct8, 2,2
add r0, 4*FDEC_STRIDE
-global add8x8_idct8_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
ret
%endmacro ; ADD8x8_IDCT8
-INIT_XMM
-ADD8x8_IDCT8 sse2
-INIT_AVX
-ADD8x8_IDCT8 avx
+INIT_XMM sse2
+ADD8x8_IDCT8
+INIT_XMM avx
+ADD8x8_IDCT8
%endif ; !HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
cextern pw_32
cextern hsub_mul
-INIT_XMM
%macro DCT8_1D 10
SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
SWAP %3, %8, %7
%endmacro
-%macro DCT_SUB8 1
-cglobal sub8x8_dct_%1, 3,3,11
+%macro DCT_SUB8 0
+cglobal sub8x8_dct, 3,3,11
add r2, 4*FDEC_STRIDE
-%ifnidn %1, sse2
+%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
RET
%endif
-global sub8x8_dct_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 9
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal sub8x8_dct8_%1, 3,3,11
+cglobal sub8x8_dct8, 3,3,11
add r2, 4*FDEC_STRIDE
-%ifnidn %1, sse2
+%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
RET
%endif
-global sub8x8_dct8_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
ret
%endmacro
-%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
+INIT_XMM sse2
%define movdqa movaps
%define punpcklqdq movlhps
-DCT_SUB8 sse2
+DCT_SUB8
%undef movdqa
%undef punpcklqdq
-%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
-DCT_SUB8 ssse3
-INIT_AVX
-DCT_SUB8 avx
+INIT_XMM ssse3
+DCT_SUB8
+INIT_XMM avx
+DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-%macro ADD8x8_IDCT8 1
-cglobal add8x8_idct8_%1, 2,2,11
+%macro ADD8x8_IDCT8 0
+cglobal add8x8_idct8, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global add8x8_idct8_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
ret
%endmacro ; ADD8x8_IDCT8
-INIT_XMM
-ADD8x8_IDCT8 sse2
-INIT_AVX
-ADD8x8_IDCT8 avx
+INIT_XMM sse2
+ADD8x8_IDCT8
+INIT_XMM avx
+ADD8x8_IDCT8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-%macro ADD8x8 1
-cglobal add8x8_idct_%1, 2,2,11
+%macro ADD8x8 0
+cglobal add8x8_idct, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global add8x8_idct_%1.skip_prologue
+global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
ret
%endmacro ; ADD8x8
-INIT_XMM
-ADD8x8 sse2
-INIT_AVX
-ADD8x8 avx
+INIT_XMM sse2
+ADD8x8
+INIT_XMM avx
+ADD8x8
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
-%macro DCT4x4_DC 1
-cglobal dct4x4dc_%1, 1,1,5
+%macro DCT4x4_DC 0
+cglobal dct4x4dc, 1,1,5
mova m0, [r0+ 0]
mova m1, [r0+16]
mova m2, [r0+32]
RET
%endmacro ; DCT4x4_DC
-INIT_XMM
-DCT4x4_DC sse2
-INIT_AVX
-DCT4x4_DC avx
+INIT_XMM sse2
+DCT4x4_DC
+INIT_XMM avx
+DCT4x4_DC
%else
-INIT_MMX
-cglobal dct4x4dc_mmx, 1,1
+INIT_MMX mmx
+cglobal dct4x4dc, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
-%macro IDCT4x4DC 1
-cglobal idct4x4dc_%1, 1,1
+%macro IDCT4x4DC 0
+cglobal idct4x4dc, 1,1
mova m3, [r0+48]
mova m2, [r0+32]
mova m1, [r0+16]
RET
%endmacro ; IDCT4x4DC
-INIT_XMM
-IDCT4x4DC sse2
-INIT_AVX
-IDCT4x4DC avx
+INIT_XMM sse2
+IDCT4x4DC
+INIT_XMM avx
+IDCT4x4DC
%else
-INIT_MMX
;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal idct4x4dc_mmx, 1,1
+INIT_MMX mmx
+cglobal idct4x4dc, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
RET
%endif ; HIGH_BIT_DEPTH
-INIT_MMX
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
-cglobal sub4x4_dct_mmx, 3,3
+INIT_MMX mmx
+cglobal sub4x4_dct, 3,3
.skip_prologue:
LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
RET
%else
-%macro SUB_DCT4 1
-cglobal sub4x4_dct_%1, 3,3
-%ifidn %1, mmx
+%macro SUB_DCT4 0
+cglobal sub4x4_dct, 3,3
.skip_prologue:
- LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-%else
+%if cpuflag(ssse3)
mova m5, [hsub_mul]
- LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
%endif
+ LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
DCT4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
DCT4_1D 0,1,2,3,4
RET
%endmacro
-SUB_DCT4 mmx
-SUB_DCT4 ssse3
+INIT_MMX mmx
+SUB_DCT4
+INIT_MMX ssse3
+SUB_DCT4
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
movhps %6, %1
%endmacro
-%macro ADD4x4_IDCT 1
-cglobal add4x4_idct_%1, 2,2,6
+%macro ADD4x4_IDCT 0
+cglobal add4x4_idct, 2,2,6
add r0, 2*FDEC_STRIDEB
.skip_prologue:
mova m1, [r1+16]
RET
%endmacro
-INIT_XMM
-ADD4x4_IDCT sse2
-INIT_AVX
-ADD4x4_IDCT avx
+INIT_XMM sse2
+ADD4x4_IDCT
+INIT_XMM avx
+ADD4x4_IDCT
%else ; !HIGH_BIT_DEPTH
-cglobal add4x4_idct_mmx, 2,2
+INIT_MMX mmx
+cglobal add4x4_idct, 2,2
pxor m7, m7
.skip_prologue:
movq m1, [r1+ 8]
STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
RET
-%macro ADD4x4 1
-cglobal add4x4_idct_%1, 2,2,6
+%macro ADD4x4 0
+cglobal add4x4_idct, 2,2,6
mova m1, [r1+0x00] ; row1/row0
mova m3, [r1+0x10] ; row3/row2
psraw m0, m1, 1 ; row1>>1/...
RET
%endmacro ; ADD4x4
-INIT_XMM
-ADD4x4 sse4
-INIT_AVX
-ADD4x4 avx
+INIT_XMM sse4
+ADD4x4
+INIT_XMM avx
+ADD4x4
%endif ; HIGH_BIT_DEPTH
INIT_MMX
%ifdef WIN64
sub rsp, 8
%endif
- call %2
+ call %2.skip_prologue
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
- call %2
+ call %2.skip_prologue
add r0, %3
add r1, (%4-%6)*FENC_STRIDE-%5-%4
add r2, (%4-%6)*FDEC_STRIDE-%5-%4
- call %2
+ call %2.skip_prologue
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
- call %2
+ call %2.skip_prologue
add rsp, 8
RET
%else
- jmp %2
+ jmp %2.skip_prologue
%endif
%endmacro
%ifdef WIN64
sub rsp, 8
%endif
- call %2
+ call %2.skip_prologue
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
- call %2
+ call %2.skip_prologue
add r0, (%4-%6)*FDEC_STRIDE-%5-%4
add r1, %3
- call %2
+ call %2.skip_prologue
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
%ifdef WIN64
- call %2
+ call %2.skip_prologue
add rsp, 8
RET
%else
- jmp %2
+ jmp %2.skip_prologue
%endif
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8
INIT_XMM
-ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2.skip_prologue,64, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct_sse2,add8x8_idct_sse2.skip_prologue,64, 16, 8, 8
-INIT_AVX
-ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx.skip_prologue, 64, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct_avx ,add8x8_idct_avx.skip_prologue, 64, 16, 8, 8
+ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8
+ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
-SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
-ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
-SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
-ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
+INIT_MMX
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0
+ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4
+ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif
INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
-SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
-ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 128, 8, 0, 0
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
-ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
-SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
mova [%1+FDEC_STRIDEB*3], %2
%endmacro
-%macro ADD_IDCT_DC 1
-cglobal add8x8_idct_dc_%1, 2,2,7
+%macro ADD_IDCT_DC 0
+cglobal add8x8_idct_dc, 2,2,7
mova m6, [pw_pixel_max]
pxor m5, m5
mova m3, [r1]
ADD_DC r0+FDEC_STRIDEB*4, m3
RET
-cglobal add16x16_idct_dc_%1, 2,3,8
+cglobal add16x16_idct_dc, 2,3,8
mov r2, 4
mova m6, [pw_pixel_max]
mova m7, [pd_32]
REP_RET
%endmacro ; ADD_IDCT_DC
-INIT_XMM
-ADD_IDCT_DC sse2
-INIT_AVX
-ADD_IDCT_DC avx
+INIT_XMM sse2
+ADD_IDCT_DC
+INIT_XMM avx
+ADD_IDCT_DC
%else ;!HIGH_BIT_DEPTH
%macro ADD_DC 3
movq [%3+FDEC_STRIDE*3], %1
%endmacro
+INIT_MMX
cglobal add8x8_idct_dc_mmx, 2,2
movq mm0, [r1]
pxor mm1, mm1
IDCT_DC_STORE 0, xmm2, xmm3
ret
-%macro ADD16x16 1
-cglobal add16x16_idct_dc_%1, 2,2,8
+%macro ADD16x16 0
+cglobal add16x16_idct_dc, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
ret
%endmacro ; ADD16x16
-INIT_XMM
-ADD16x16 ssse3
-INIT_AVX
-ADD16x16 avx
+INIT_XMM ssse3
+ADD16x16
+INIT_XMM avx
+ADD16x16
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-%macro SCAN_8x8 1
-cglobal zigzag_scan_8x8_frame_%1, 2,2,8
+%macro SCAN_8x8 0
+cglobal zigzag_scan_8x8_frame, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
movdq2q mm5, xmm5
PALIGNR xmm6, xmm6, 10, xmm3
movdq2q mm6, xmm6
-%ifnidn %1, sse2
+%if cpuflag(ssse3)
PALIGNR xmm7, xmm7, 8, xmm3
movdq2q mm7, xmm7
%else
%endmacro
%ifndef HIGH_BIT_DEPTH
-INIT_XMM
-%define PALIGNR PALIGNR_MMX
-SCAN_8x8 sse2
-%define PALIGNR PALIGNR_SSSE3
-SCAN_8x8 ssse3
+INIT_XMM sse2
+SCAN_8x8
+INIT_XMM ssse3
+SCAN_8x8
%endif
;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
;-----------------------------------------------------------------------------
-%macro SCAN_8x8_FRAME 6
-cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
+%macro SCAN_8x8_FRAME 5
+cglobal zigzag_scan_8x8_frame, 2,2,8*(mmsize/16)
mova m0, [r1]
mova m1, [r1+ 8*SIZEOF_DCTCOEF]
movu m2, [r1+14*SIZEOF_DCTCOEF]
movu m3, [r1+21*SIZEOF_DCTCOEF]
mova m4, [r1+28*SIZEOF_DCTCOEF]
- punpckl%5 m5, m0, m1
- psrl%3 m0, %2
- punpckh%5 m6, m1, m0
- punpckl%4 m5, m0
- punpckl%4 m1, m1
- punpckh%5 m1, m3
+ punpckl%4 m5, m0, m1
+ psrl%2 m0, %1
+ punpckh%4 m6, m1, m0
+ punpckl%3 m5, m0
+ punpckl%3 m1, m1
+ punpckh%4 m1, m3
mova m7, [r1+52*SIZEOF_DCTCOEF]
mova m0, [r1+60*SIZEOF_DCTCOEF]
- punpckh%5 m1, m2
- punpckl%5 m2, m4
- punpckh%5 m4, m3
- punpckl%4 m3, m3
- punpckh%5 m3, m2
+ punpckh%4 m1, m2
+ punpckl%4 m2, m4
+ punpckh%4 m4, m3
+ punpckl%3 m3, m3
+ punpckh%4 m3, m2
mova [r0], m5
mova [r0+ 4*SIZEOF_DCTCOEF], m1
mova [r0+ 8*SIZEOF_DCTCOEF], m6
- punpckl%5 m6, m0
- punpckl%5 m6, m7
+ punpckl%4 m6, m0
+ punpckl%4 m6, m7
mova m1, [r1+32*SIZEOF_DCTCOEF]
movu m5, [r1+39*SIZEOF_DCTCOEF]
movu m2, [r1+46*SIZEOF_DCTCOEF]
movu [r0+35*SIZEOF_DCTCOEF], m3
movu [r0+47*SIZEOF_DCTCOEF], m4
- punpckh%5 m7, m0
- psll%3 m0, %2
- punpckh%4 m3, m5, m5
- punpckl%5 m5, m1
- punpckh%5 m1, m2
+ punpckh%4 m7, m0
+ psll%2 m0, %1
+ punpckh%3 m3, m5, m5
+ punpckl%4 m5, m1
+ punpckh%4 m1, m2
mova [r0+52*SIZEOF_DCTCOEF], m6
movu [r0+13*SIZEOF_DCTCOEF], m5
movu m4, [r1+11*SIZEOF_DCTCOEF]
movu m6, [r1+25*SIZEOF_DCTCOEF]
- punpckl%5 m5, m7
- punpckl%5 m1, m3
- punpckh%4 m0, m7
+ punpckl%4 m5, m7
+ punpckl%4 m1, m3
+ punpckh%3 m0, m7
mova m3, [r1+ 4*SIZEOF_DCTCOEF]
movu m7, [r1+18*SIZEOF_DCTCOEF]
- punpckl%5 m2, m5
+ punpckl%4 m2, m5
movu [r0+25*SIZEOF_DCTCOEF], m1
mova m1, m4
mova m5, m6
- punpckl%5 m4, m3
- punpckl%5 m6, m7
- punpckh%5 m1, m3
- punpckh%5 m5, m7
- punpckh%4 m3, m6, m4
- punpckh%4 m7, m5, m1
- punpckl%4 m6, m4
- punpckl%4 m5, m1
+ punpckl%4 m4, m3
+ punpckl%4 m6, m7
+ punpckh%4 m1, m3
+ punpckh%4 m5, m7
+ punpckh%3 m3, m6, m4
+ punpckh%3 m7, m5, m1
+ punpckl%3 m6, m4
+ punpckl%3 m5, m1
movu m4, [r1+35*SIZEOF_DCTCOEF]
movu m1, [r1+49*SIZEOF_DCTCOEF]
- pshuf%6 m6, m6, 0x1b
- pshuf%6 m5, m5, 0x1b
+ pshuf%5 m6, m6, 0x1b
+ pshuf%5 m5, m5, 0x1b
mova [r0+60*SIZEOF_DCTCOEF], m0
mova [r0+56*SIZEOF_DCTCOEF], m2
movu m0, [r1+42*SIZEOF_DCTCOEF]
mova [r0+32*SIZEOF_DCTCOEF], m7
movu [r0+10*SIZEOF_DCTCOEF], m6
movu [r0+21*SIZEOF_DCTCOEF], m5
- punpckh%5 m3, m0, m4
- punpckh%5 m7, m2, m1
- punpckl%5 m0, m4
- punpckl%5 m2, m1
- punpckl%4 m4, m2, m0
- punpckl%4 m1, m7, m3
- punpckh%4 m2, m0
- punpckh%4 m7, m3
- pshuf%6 m2, m2, 0x1b
- pshuf%6 m7, m7, 0x1b
+ punpckh%4 m3, m0, m4
+ punpckh%4 m7, m2, m1
+ punpckl%4 m0, m4
+ punpckl%4 m2, m1
+ punpckl%3 m4, m2, m0
+ punpckl%3 m1, m7, m3
+ punpckh%3 m2, m0
+ punpckh%3 m7, m3
+ pshuf%5 m2, m2, 0x1b
+ pshuf%5 m7, m7, 0x1b
mova [r0+28*SIZEOF_DCTCOEF], m4
movu [r0+43*SIZEOF_DCTCOEF], m1
movu [r0+39*SIZEOF_DCTCOEF], m2
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-SCAN_8x8_FRAME sse2, 4 , dq, qdq, dq, d
-INIT_AVX
-SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
+INIT_XMM sse2
+SCAN_8x8_FRAME 4 , dq, qdq, dq, d
+INIT_XMM avx
+SCAN_8x8_FRAME 4 , dq, qdq, dq, d
%else
-INIT_MMX
-SCAN_8x8_FRAME mmx2, 16, q , dq , wd, w
+INIT_MMX mmx2
+SCAN_8x8_FRAME 16, q , dq , wd, w
%endif
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
-%macro SCAN_4x4 5
-cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize)/16
+%macro SCAN_4x4 4
+cglobal zigzag_scan_4x4_frame, 2,2,8*(mmsize)/16
mova m0, [r1]
mova m1, [r1+ 4*SIZEOF_DCTCOEF]
mova m2, [r1+ 8*SIZEOF_DCTCOEF]
mova m3, [r1+12*SIZEOF_DCTCOEF]
- punpckl%5 m4, m0, m1
+ punpckl%4 m4, m0, m1
mova m5, m1
mova m6, m2
mova m7, m3
- psll%3 m3, %2
- psrl%3 m0, %2
- punpckl%4 m2, m2
- punpckh%4 m1, m1
- punpckl%5 m5, m3
- punpckl%4 m4, m0
- punpckh%5 m5, m2
- punpckh%5 m0, m6
- punpckh%5 m6, m7
- punpckl%5 m1, m0
- punpckh%4 m3, m6
+ psll%2 m3, %1
+ psrl%2 m0, %1
+ punpckl%3 m2, m2
+ punpckh%3 m1, m1
+ punpckl%4 m5, m3
+ punpckl%3 m4, m0
+ punpckh%4 m5, m2
+ punpckh%4 m0, m6
+ punpckh%4 m6, m7
+ punpckl%4 m1, m0
+ punpckh%3 m3, m6
mova [r0], m4
mova [r0+ 4*SIZEOF_DCTCOEF], m5
mova [r0+ 8*SIZEOF_DCTCOEF], m1
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-SCAN_4x4 sse2, 4 , dq, qdq, dq
-INIT_AVX
-SCAN_4x4 avx , 4 , dq, qdq, dq
+INIT_XMM sse2
+SCAN_4x4 4 , dq, qdq, dq
+INIT_XMM avx
+SCAN_4x4 4 , dq, qdq, dq
%else
-INIT_MMX
-SCAN_4x4 mmx , 16, q , dq , wd
+INIT_MMX mmx
+SCAN_4x4 16, q , dq , wd
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-%macro SCAN_4x4_FRAME 1
-cglobal zigzag_scan_4x4_frame_%1, 2,2
+%macro SCAN_4x4_FRAME 0
+cglobal zigzag_scan_4x4_frame, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
pshufb xmm1, [pb_scan4frameb]
RET
%endmacro
-INIT_XMM
-SCAN_4x4_FRAME ssse3
-INIT_AVX
-SCAN_4x4_FRAME avx
+INIT_XMM ssse3
+SCAN_4x4_FRAME
+INIT_XMM avx
+SCAN_4x4_FRAME
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
+INIT_XMM
cglobal zigzag_scan_4x4_field_sse2, 2,3
movu m4, [r1+8]
pshufd m0, m4, 0xd2
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
+INIT_MMX
cglobal zigzag_scan_4x4_field_mmx2, 2,3
pshufw mm0, [r1+4], 0xd2
movq mm1, [r1+16]
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
%undef SCAN_8x8
-%macro SCAN_8x8 6
-cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
+%macro SCAN_8x8 5
+cglobal zigzag_scan_8x8_field, 2,3,8*(mmsize/16)
mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
- pshuf%2 m3, m0, 011111111b ; 03 03 03 03
+ pshuf%1 m3, m0, 011111111b ; 03 03 03 03
movd r2, m2 ; 09 08
- pshuf%2 m2, m2, 000111001b ; 08 11 10 09
- punpckl%3 m3, m1 ; 05 03 04 03
- pinsr%2 m0, r2d, 3 ; 08 02 01 00
- punpckl%3 m4, m2, m3 ; 04 10 03 09
- pshuf%2 m4, m4, 010110100b ; 10 04 03 09
+ pshuf%1 m2, m2, 000111001b ; 08 11 10 09
+ punpckl%2 m3, m1 ; 05 03 04 03
+ pinsr%1 m0, r2d, 3 ; 08 02 01 00
+ punpckl%2 m4, m2, m3 ; 04 10 03 09
+ pshuf%1 m4, m4, 010110100b ; 10 04 03 09
mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
- punpckl%4 m6, m5 ; 17 16 XX XX
- psrl%5 m1, %6 ; XX 07 06 05
- punpckh%3 m6, m2 ; 08 17 11 16
- punpckl%4 m6, m1 ; 06 05 11 16
+ punpckl%3 m6, m5 ; 17 16 XX XX
+ psrl%4 m1, %5 ; XX 07 06 05
+ punpckh%2 m6, m2 ; 08 17 11 16
+ punpckl%3 m6, m1 ; 06 05 11 16
mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
- psrl%5 m1, %6 ; XX XX 07 06
- punpckl%3 m1, m5 ; 17 07 16 06
+ psrl%4 m1, %5 ; XX XX 07 06
+ punpckl%2 m1, m5 ; 17 07 16 06
mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
- punpckh%4 m1, m1 ; 17 07 17 07
- punpckl%3 m6, m3, m2 ; 25 13 24 12
- pextr%2 r2d, m5, 2
+ punpckh%3 m1, m1 ; 17 07 17 07
+ punpckl%2 m6, m3, m2 ; 25 13 24 12
+ pextr%1 r2d, m5, 2
mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
- punpckl%3 m1, m6 ; 24 17 12 07
+ punpckl%2 m1, m6 ; 24 17 12 07
mova [r0+12*SIZEOF_DCTCOEF], m1
- pinsr%2 m3, r2d, 0 ; 15 14 13 18
+ pinsr%1 m3, r2d, 0 ; 15 14 13 18
mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
mova m7, [r1+28*SIZEOF_DCTCOEF]
mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
- psrl%5 m5, %6*3 ; XX XX XX 19
- pshuf%2 m1, m2, 011111001b ; 27 27 26 25
- punpckl%3 m5, m0 ; 33 XX 32 19
- psrl%5 m2, %6*3 ; XX XX XX 27
- punpckl%3 m5, m1 ; 26 32 25 19
+ psrl%4 m5, %5*3 ; XX XX XX 19
+ pshuf%1 m1, m2, 011111001b ; 27 27 26 25
+ punpckl%2 m5, m0 ; 33 XX 32 19
+ psrl%4 m2, %5*3 ; XX XX XX 27
+ punpckl%2 m5, m1 ; 26 32 25 19
mova [r0+32*SIZEOF_DCTCOEF], m7
mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
mova m7, [r1+36*SIZEOF_DCTCOEF]
mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
- pshuf%2 m3, m0, 011111001b ; 35 35 34 33
- punpckl%3 m2, m1 ; 41 XX 40 27
+ pshuf%1 m3, m0, 011111001b ; 35 35 34 33
+ punpckl%2 m2, m1 ; 41 XX 40 27
mova [r0+40*SIZEOF_DCTCOEF], m7
- punpckl%3 m2, m3 ; 34 40 33 27
+ punpckl%2 m2, m3 ; 34 40 33 27
mova [r0+28*SIZEOF_DCTCOEF], m2
mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
- psrl%5 m0, %6*3 ; XX XX XX 35
- punpckl%3 m0, m2 ; 49 XX 48 35
- pshuf%2 m3, m1, 011111001b ; 43 43 42 41
- punpckl%3 m0, m3 ; 42 48 41 35
+ psrl%4 m0, %5*3 ; XX XX XX 35
+ punpckl%2 m0, m2 ; 49 XX 48 35
+ pshuf%1 m3, m1, 011111001b ; 43 43 42 41
+ punpckl%2 m0, m3 ; 42 48 41 35
mova [r0+36*SIZEOF_DCTCOEF], m0
- pextr%2 r2d, m2, 3 ; 51
- psrl%5 m1, %6*3 ; XX XX XX 43
- punpckl%3 m1, m7 ; 45 XX 44 43
- psrl%5 m2, %6 ; XX 51 50 49
- punpckl%3 m1, m2 ; 50 44 49 43
- pshuf%2 m1, m1, 010110100b ; 44 50 49 43
+ pextr%1 r2d, m2, 3 ; 51
+ psrl%4 m1, %5*3 ; XX XX XX 43
+ punpckl%2 m1, m7 ; 45 XX 44 43
+ psrl%4 m2, %5 ; XX 51 50 49
+ punpckl%2 m1, m2 ; 50 44 49 43
+ pshuf%1 m1, m1, 010110100b ; 44 50 49 43
mova [r0+44*SIZEOF_DCTCOEF], m1
- psrl%5 m7, %6 ; XX 47 46 45
- pinsr%2 m7, r2d, 3 ; 51 47 46 45
+ psrl%4 m7, %5 ; XX 47 46 45
+ pinsr%1 m7, r2d, 3 ; 51 47 46 45
mova [r0+48*SIZEOF_DCTCOEF], m7
mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
mova m7, [r1+60*SIZEOF_DCTCOEF]
- punpckl%4 m2, m0, m1 ; 53 52 57 56
- punpckh%4 m1, m0 ; 59 58 55 54
+ punpckl%3 m2, m0, m1 ; 53 52 57 56
+ punpckh%3 m1, m0 ; 59 58 55 54
mova [r0+52*SIZEOF_DCTCOEF], m2
mova [r0+56*SIZEOF_DCTCOEF], m1
mova [r0+60*SIZEOF_DCTCOEF], m7
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-SCAN_8x8 sse4, d, dq, qdq, dq, 4
-INIT_AVX
-SCAN_8x8 avx , d, dq, qdq, dq, 4
+INIT_XMM sse4
+SCAN_8x8 d, dq, qdq, dq, 4
+INIT_XMM avx
+SCAN_8x8 d, dq, qdq, dq, 4
%else
-INIT_MMX
-SCAN_8x8 mmx2, w, wd, dq , q , 16
+INIT_MMX mmx2
+SCAN_8x8 w, wd, dq , q , 16
%endif
;-----------------------------------------------------------------------------
; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
-%macro ZIGZAG_SUB_4x4 3
+%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
-cglobal zigzag_sub_4x4%1_%2_%3, 4,4,8
+cglobal zigzag_sub_4x4%1_%2, 4,4,8
%else
-cglobal zigzag_sub_4x4%1_%2_%3, 3,3,8
+cglobal zigzag_sub_4x4%1_%2, 3,3,8
%endif
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
RET
%endmacro
-INIT_XMM
-ZIGZAG_SUB_4x4 , frame, ssse3
-ZIGZAG_SUB_4x4 ac, frame, ssse3
-ZIGZAG_SUB_4x4 , field, ssse3
-ZIGZAG_SUB_4x4 ac, field, ssse3
-INIT_AVX
-ZIGZAG_SUB_4x4 , frame, avx
-ZIGZAG_SUB_4x4 ac, frame, avx
-ZIGZAG_SUB_4x4 , field, avx
-ZIGZAG_SUB_4x4 ac, field, avx
+INIT_XMM ssse3
+ZIGZAG_SUB_4x4 , frame
+ZIGZAG_SUB_4x4 ac, frame
+ZIGZAG_SUB_4x4 , field
+ZIGZAG_SUB_4x4 ac, field
+INIT_XMM avx
+ZIGZAG_SUB_4x4 , frame
+ZIGZAG_SUB_4x4 ac, frame
+ZIGZAG_SUB_4x4 , field
+ZIGZAG_SUB_4x4 ac, field
;-----------------------------------------------------------------------------
; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
%endif
%endmacro
-%macro ZIGZAG_8x8_CAVLC 2
-cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8*(mmsize/16)
- INTERLEAVE 0, %2
- INTERLEAVE 8, %2
- INTERLEAVE 16, %2
- INTERLEAVE 24, %2
+%macro ZIGZAG_8x8_CAVLC 1
+cglobal zigzag_interleave_8x8_cavlc, 3,3,8*(mmsize/16)
+ INTERLEAVE 0, %1
+ INTERLEAVE 8, %1
+ INTERLEAVE 16, %1
+ INTERLEAVE 24, %1
packsswb m6, m7
packsswb m5, m6
packsswb m5, m5
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-ZIGZAG_8x8_CAVLC sse2, D
-INIT_AVX
-ZIGZAG_8x8_CAVLC avx , D
+INIT_XMM sse2
+ZIGZAG_8x8_CAVLC D
+INIT_XMM avx
+ZIGZAG_8x8_CAVLC D
%else
-INIT_MMX
-ZIGZAG_8x8_CAVLC mmx , W
+INIT_MMX mmx
+ZIGZAG_8x8_CAVLC W
%endif
%macro INTERLEAVE_XMM 1
%endmacro
%ifndef HIGH_BIT_DEPTH
-%macro ZIGZAG_8x8_CAVLC 1
-cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8
+%macro ZIGZAG_8x8_CAVLC 0
+cglobal zigzag_interleave_8x8_cavlc, 3,3,8
INTERLEAVE_XMM 0
INTERLEAVE_XMM 16
packsswb m2, m3
RET
%endmacro
-INIT_XMM
-ZIGZAG_8x8_CAVLC sse2
-INIT_AVX
-ZIGZAG_8x8_CAVLC avx
+INIT_XMM sse2
+ZIGZAG_8x8_CAVLC
+INIT_XMM avx
+ZIGZAG_8x8_CAVLC
%endif ; !HIGH_BIT_DEPTH
%endif
%endmacro
-%macro DEBLOCK_LUMA 1
+%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_%1, 5,5,8*(mmsize/16)
+cglobal deblock_v_luma, 5,5,8*(mmsize/16)
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
ADD rsp, pad
RET
-cglobal deblock_h_luma_%1, 5,6,8*(mmsize/16)
+cglobal deblock_h_luma, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
RET
%endmacro
-INIT_XMM
%ifdef ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
SWAP 3, 9
%endmacro
-%macro DEBLOCK_LUMA_64 1
-cglobal deblock_v_luma_%1, 5,5,15
+%macro DEBLOCK_LUMA_64 0
+cglobal deblock_v_luma, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
jg .loop
REP_RET
-cglobal deblock_h_luma_%1, 5,7,15
+cglobal deblock_h_luma, 5,7,15
add r1, r1
LOAD_AB m12, m13, r2, r3
mov r2, r1
REP_RET
%endmacro
-INIT_XMM
-DEBLOCK_LUMA_64 sse2
-INIT_AVX
-DEBLOCK_LUMA_64 avx
+INIT_XMM sse2
+DEBLOCK_LUMA_64
+INIT_XMM avx
+DEBLOCK_LUMA_64
%endif
%macro SWAPMOVA 2
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-%macro DEBLOCK_LUMA_INTRA_64 1
-cglobal deblock_v_luma_intra_%1, 4,7,16
+%macro DEBLOCK_LUMA_INTRA_64 0
+cglobal deblock_v_luma_intra, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_%1, 4,7,16
+cglobal deblock_h_luma_intra, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
RET
%endmacro
-INIT_XMM
-DEBLOCK_LUMA_INTRA_64 sse2
-INIT_AVX
-DEBLOCK_LUMA_INTRA_64 avx
+INIT_XMM sse2
+DEBLOCK_LUMA_INTRA_64
+INIT_XMM avx
+DEBLOCK_LUMA_INTRA_64
%endif
-%macro DEBLOCK_LUMA_INTRA 1
+%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra_%1, 4,7,8*(mmsize/16)
+cglobal deblock_v_luma_intra, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_%1, 4,7,8*(mmsize/16)
+cglobal deblock_h_luma_intra, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
%endmacro
%ifndef ARCH_X86_64
-INIT_MMX
-DEBLOCK_LUMA mmx2
-DEBLOCK_LUMA_INTRA mmx2
-INIT_XMM
-DEBLOCK_LUMA sse2
-DEBLOCK_LUMA_INTRA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
-DEBLOCK_LUMA_INTRA avx
+INIT_MMX mmx2
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+INIT_XMM sse2
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+INIT_XMM avx
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
%endif
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-%macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_%1, 5,5,10
+%macro DEBLOCK_LUMA 0
+cglobal deblock_v_luma, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal deblock_h_luma_%1, 5,7
+INIT_MMX cpuname
+cglobal deblock_h_luma, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea r6, [r0-4]
%ifdef WIN64
mov [rsp+0x20], r4
%endif
- call deblock_v_luma_%1
+ call deblock_v_luma
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add r6, 2
RET
%endmacro
-INIT_XMM
-DEBLOCK_LUMA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
+INIT_XMM sse2
+DEBLOCK_LUMA
+INIT_XMM avx
+DEBLOCK_LUMA
%else
-%macro DEBLOCK_LUMA 3
+%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_%1, 5,5
+cglobal deblock_%1_luma, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
- %assign pad 2*%3+12-(stack_offset&15)
+ %assign pad 2*%2+12-(stack_offset&15)
SUB esp, pad
mova m0, [r4+r1] ; p1
movd m4, [r3] ; tc0
punpcklbw m4, m4
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
- mova [esp+%3], m4 ; tc
+ mova [esp+%2], m4 ; tc
pcmpeqb m3, m3
pcmpgtb m4, m3
pand m4, m7
mova m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m4
- pand m4, [esp+%3] ; tc
+ pand m4, [esp+%2] ; tc
psubb m7, m4, m6
pand m6, m4
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
mova m5, [esp] ; mask
pand m6, m5
- mova m5, [esp+%3] ; tc
+ mova m5, [esp+%2] ; tc
pand m5, m6
psubb m7, m6
mova m3, [r0+r1]
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal deblock_h_luma_%1, 0,5
+INIT_MMX cpuname
+cglobal deblock_h_luma, 0,5
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
PUSH dword r2m
PUSH dword 16
PUSH dword r0
- call deblock_%2_luma_%1
-%ifidn %2, v8
+ call deblock_%1_luma
+%ifidn %1, v8
add dword [esp ], 8 ; pix_tmp+0x38
add dword [esp+16], 2 ; tc0+2
- call deblock_%2_luma_%1
+ call deblock_%1_luma
%endif
ADD esp, 20
RET
%endmacro ; DEBLOCK_LUMA
-INIT_MMX
-DEBLOCK_LUMA mmx2, v8, 8
-INIT_XMM
-DEBLOCK_LUMA sse2, v, 16
-INIT_AVX
-DEBLOCK_LUMA avx, v, 16
+INIT_MMX mmx2
+DEBLOCK_LUMA v8, 8
+INIT_XMM sse2
+DEBLOCK_LUMA v, 16
+INIT_XMM avx
+DEBLOCK_LUMA v, 16
%endif ; ARCH
%define mask1p mask1q
%endmacro
-%macro DEBLOCK_LUMA_INTRA 2
+%macro DEBLOCK_LUMA_INTRA 1
%define p1 m0
%define p0 m1
%define q0 m2
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%1_luma_intra, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
%endif
RET
-INIT_MMX
+INIT_MMX cpuname
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra, 4,7
movsxd r10, r1d
lea r11, [r10*3]
lea r6, [r0-4]
lea r0, [pix_tmp+0x40]
mov r1, 0x10
- call deblock_v_luma_intra_%1
+ call deblock_v_luma_intra
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r5, [r6+r11]
add rsp, 0x88
RET
%else
-cglobal deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra, 2,4
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
PUSH dword r2m
PUSH dword 16
PUSH r0
- call deblock_%2_luma_intra_%1
-%ifidn %2, v8
+ call deblock_%1_luma_intra
+%ifidn %1, v8
add dword [rsp], 8 ; pix_tmp+8
- call deblock_%2_luma_intra_%1
+ call deblock_%1_luma_intra
%endif
ADD esp, 16
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
-INIT_XMM
-DEBLOCK_LUMA_INTRA sse2, v
-INIT_AVX
-DEBLOCK_LUMA_INTRA avx , v
+INIT_XMM sse2
+DEBLOCK_LUMA_INTRA v
+INIT_XMM avx
+DEBLOCK_LUMA_INTRA v
%ifndef ARCH_X86_64
-INIT_MMX
-DEBLOCK_LUMA_INTRA mmx2, v8
+INIT_MMX mmx2
+DEBLOCK_LUMA_INTRA v8
%endif
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_%1, 7,7,8*(mmsize/16)
+cglobal deblock_v_chroma, 7,7,8*(mmsize/16)
FIX_STRIDES r1
mov r5, r0
sub r0, r1
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_%1, 5,7,8*(mmsize/16)
+cglobal deblock_h_chroma, 5,7,8*(mmsize/16)
add r1, r1
mov r5, 32/mmsize
%if mmsize == 16
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_%1, 4,6,8*(mmsize/16)
+cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16)
add r1, r1
mov r5, 32/mmsize
movd m5, r3
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_%1, 4,6,8*(mmsize/16)
+cglobal deblock_h_chroma_intra, 4,6,8*(mmsize/16)
add r1, r1
mov r4, 32/mmsize
%if mmsize == 16
%endmacro
%ifndef ARCH_X86_64
-INIT_MMX
+INIT_MMX mmx2
DEBLOCK_CHROMA mmx2
%endif
-INIT_XMM
+INIT_XMM sse2
DEBLOCK_CHROMA sse2
-INIT_AVX
+INIT_XMM avx
DEBLOCK_CHROMA avx
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_%1, 5,6,8
+cglobal deblock_v_chroma, 5,6,8
CHROMA_V_START
mova m0, [t5]
mova m1, [t5+r1]
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_%1, 5,7,8
+cglobal deblock_h_chroma, 5,7,8
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_inter_body_%1
ret
%endmacro ; DEBLOCK_CHROMA
-INIT_XMM
+INIT_XMM sse2
DEBLOCK_CHROMA sse2
-INIT_AVX
+INIT_XMM avx
DEBLOCK_CHROMA avx
%ifndef ARCH_X86_64
-INIT_MMX
+INIT_MMX mmx2
DEBLOCK_CHROMA mmx2
%endif
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_%1, 4,5,8
+cglobal deblock_v_chroma_intra, 4,5,8
CHROMA_V_START
mova m0, [t5]
mova m1, [t5+r1]
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_%1, 4,6,8
+cglobal deblock_h_chroma_intra, 4,6,8
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body_%1
ret
%endmacro ; DEBLOCK_CHROMA_INTRA
-INIT_XMM
+INIT_XMM sse2
DEBLOCK_CHROMA_INTRA sse2
-INIT_AVX
+INIT_XMM avx
DEBLOCK_CHROMA_INTRA avx
%ifndef ARCH_X86_64
-INIT_MMX
+INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA mmx2
%endif
%endif ; !HIGH_BIT_DEPTH
por m1, m3 ; top neighbors
%endmacro
-INIT_MMX
-cglobal deblock_strength_mmx2, 6,6
+INIT_MMX mmx2
+cglobal deblock_strength, 6,6
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
mova [bs1+8], m3
RET
-%macro DEBLOCK_STRENGTH_XMM 1
-cglobal deblock_strength_%1, 6,6,8
+%macro DEBLOCK_STRENGTH_XMM 0
+cglobal deblock_strength, 6,6,8
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
por m5, m1
; Check mvs
-%ifnidn %1, sse2
+%if cpuflag(ssse3)
mova m0, [mv+4*8*0]
mova m1, [mv+4*8*1]
palignr m3, m0, [mv+4*8*0-16], 12
paddb m1, m1
pmaxub m4, m0
pmaxub m5, m1
-%ifnidn %1, sse2
+%if cpuflag(ssse3)
pshufb m4, [transpose_shuf]
%else
movhlps m3, m4
RET
%endmacro
-INIT_XMM
-DEBLOCK_STRENGTH_XMM sse2
-%define ABSB2 ABSB2_SSSE3
-DEBLOCK_STRENGTH_XMM ssse3
-INIT_AVX
-DEBLOCK_STRENGTH_XMM avx
+INIT_XMM sse2
+DEBLOCK_STRENGTH_XMM
+INIT_XMM ssse3
+DEBLOCK_STRENGTH_XMM
+INIT_XMM avx
+DEBLOCK_STRENGTH_XMM
;-----------------------------------------------------------------------------
; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
;-----------------------------------------------------------------------------
-%macro AVG_WEIGHT 2-3 0
-cglobal pixel_avg_weight_w%2_%1
+%macro AVG_WEIGHT 1-2 0
+cglobal pixel_avg_weight_w%1
BIWEIGHT_START
- AVG_START %3
+ AVG_START %2
%ifdef HIGH_BIT_DEPTH
mova m7, [pw_pixel_max]
%endif
.height_loop:
-%if mmsize==16 && %2==mmsize/(2*SIZEOF_PIXEL)
+%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
BIWEIGHT [t2], [t4]
SWAP 0, 6
BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
movhps [t0+SIZEOF_PIXEL*t1], m6
%else
%assign x 0
-%rep (%2*SIZEOF_PIXEL+mmsize-1)/mmsize
- BIWEIGHT_ROW t0+x, t2+x, t4+x, %2
- BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %2
+%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
+ BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
+ BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
%assign x x+mmsize
%endrep
%endif
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
-INIT_MMX
-AVG_WEIGHT mmx2, 4
-AVG_WEIGHT mmx2, 8
-AVG_WEIGHT mmx2, 16
+INIT_MMX mmx2
+AVG_WEIGHT 4
+AVG_WEIGHT 8
+AVG_WEIGHT 16
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-AVG_WEIGHT sse2, 4, 8
-AVG_WEIGHT sse2, 8, 8
-AVG_WEIGHT sse2, 16, 8
+INIT_XMM sse2
+AVG_WEIGHT 4, 8
+AVG_WEIGHT 8, 8
+AVG_WEIGHT 16, 8
%else ;!HIGH_BIT_DEPTH
-INIT_XMM
-AVG_WEIGHT sse2, 8, 7
-AVG_WEIGHT sse2, 16, 7
+INIT_XMM sse2
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
-INIT_MMX
-AVG_WEIGHT ssse3, 4
-INIT_XMM
-AVG_WEIGHT ssse3, 8, 7
-AVG_WEIGHT ssse3, 16, 7
+INIT_MMX ssse3
+AVG_WEIGHT 4
+INIT_XMM ssse3
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
%endif ;HIGH_BIT_DEPTH
;=============================================================================
%assign XMMREGS 8
%endif
-%macro WEIGHTER 2
- cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
+%macro WEIGHTER 1
+ cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
FIX_STRIDES r1, r3
WEIGHT_START %1
LOAD_HEIGHT
REP_RET
%endmacro
-INIT_MMX
-WEIGHTER 4, mmx2
-WEIGHTER 8, mmx2
-WEIGHTER 12, mmx2
-WEIGHTER 16, mmx2
-WEIGHTER 20, mmx2
-INIT_XMM
-WEIGHTER 8, sse2
-WEIGHTER 16, sse2
-WEIGHTER 20, sse2
+INIT_MMX mmx2
+WEIGHTER 4
+WEIGHTER 8
+WEIGHTER 12
+WEIGHTER 16
+WEIGHTER 20
+INIT_XMM sse2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
%ifdef HIGH_BIT_DEPTH
-WEIGHTER 12, sse2
-INIT_AVX
-WEIGHTER 8, avx
-WEIGHTER 12, avx
-WEIGHTER 16, avx
-WEIGHTER 20, avx
+WEIGHTER 12
+INIT_XMM avx
+WEIGHTER 8
+WEIGHTER 12
+WEIGHTER 16
+WEIGHTER 20
%else
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
-INIT_MMX
-WEIGHTER 4, ssse3
-INIT_XMM
-WEIGHTER 8, ssse3
-WEIGHTER 16, ssse3
-WEIGHTER 20, ssse3
-INIT_AVX
-WEIGHTER 8, avx
-WEIGHTER 16, avx
-WEIGHTER 20, avx
+INIT_MMX ssse3
+WEIGHTER 4
+INIT_XMM ssse3
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+INIT_XMM avx
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
%endif
%macro OFFSET_OP 7
;-----------------------------------------------------------------------------
;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
-%macro OFFSET 3
- cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+%macro OFFSET 2
+ cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
FIX_STRIDES r1, r3
mova m2, [r4]
%ifdef HIGH_BIT_DEPTH
-%ifidn %3,add
+%ifidn %2,add
mova m3, [pw_pixel_max]
%endif
%endif
LOAD_HEIGHT
.loop:
- OFFSET_TWO_ROW r2, r0, %1, %3
+ OFFSET_TWO_ROW r2, r0, %1, %2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub HEIGHT_REG, 2
REP_RET
%endmacro
-%macro OFFSETPN 2
- OFFSET %1, %2, add
- OFFSET %1, %2, sub
+%macro OFFSETPN 1
+ OFFSET %1, add
+ OFFSET %1, sub
%endmacro
-INIT_MMX
-OFFSETPN 4, mmx2
-OFFSETPN 8, mmx2
-OFFSETPN 12, mmx2
-OFFSETPN 16, mmx2
-OFFSETPN 20, mmx2
-INIT_XMM
-OFFSETPN 12, sse2
-OFFSETPN 16, sse2
-OFFSETPN 20, sse2
-INIT_AVX
-OFFSETPN 12, avx
-OFFSETPN 16, avx
-OFFSETPN 20, avx
+INIT_MMX mmx2
+OFFSETPN 4
+OFFSETPN 8
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+INIT_XMM sse2
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+INIT_XMM avx
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-OFFSETPN 8, sse2
-INIT_AVX
-OFFSETPN 8, avx
+INIT_XMM sse2
+OFFSETPN 8
+INIT_XMM avx
+OFFSETPN 8
%endif
%undef LOAD_HEIGHT
%undef HEIGHT_REG
; void pixel_avg_4x4( pixel *dst, int dst_stride,
; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
-%macro AVGH 3
-cglobal pixel_avg_%1x%2_%3
+%macro AVGH 2
+cglobal pixel_avg_%1x%2
mov eax, %2
cmp dword r6m, 32
- jne pixel_avg_weight_w%1_%3
+ jne pixel_avg_weight_w%1 %+ SUFFIX
%if mmsize == 16 && %1 == 16
test dword r4m, 15
jz pixel_avg_w%1_sse2
; int height, int weight );
;-----------------------------------------------------------------------------
-%macro AVG_FUNC 4
-cglobal pixel_avg_w%1_%4
+%macro AVG_FUNC 3
+cglobal pixel_avg_w%1
AVG_START
.height_loop:
%assign x 0
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-AVG_FUNC 4, movq, movq, mmx2
-AVGH 4, 8, mmx2
-AVGH 4, 4, mmx2
-AVGH 4, 2, mmx2
+INIT_MMX mmx2
+AVG_FUNC 4, movq, movq
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
-AVG_FUNC 8, movq, movq, mmx2
-AVGH 8, 16, mmx2
-AVGH 8, 8, mmx2
-AVGH 8, 4, mmx2
+AVG_FUNC 8, movq, movq
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
-AVG_FUNC 16, movq, movq, mmx2
-AVGH 16, 16, mmx2
-AVGH 16, 8, mmx2
+AVG_FUNC 16, movq, movq
+AVGH 16, 16
+AVGH 16, 8
-INIT_XMM
-AVG_FUNC 4, movq, movq, sse2
-AVGH 4, 8, sse2
-AVGH 4, 4, sse2
-AVGH 4, 2, sse2
+INIT_XMM sse2
+AVG_FUNC 4, movq, movq
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
-AVG_FUNC 8, movdqu, movdqa, sse2
-AVGH 8, 16, sse2
-AVGH 8, 8, sse2
-AVGH 8, 4, sse2
+AVG_FUNC 8, movdqu, movdqa
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
-AVG_FUNC 16, movdqu, movdqa, sse2
-AVGH 16, 16, sse2
-AVGH 16, 8, sse2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
%else ;!HIGH_BIT_DEPTH
-INIT_MMX
-AVG_FUNC 4, movd, movd, mmx2
-AVGH 4, 8, mmx2
-AVGH 4, 4, mmx2
-AVGH 4, 2, mmx2
-
-AVG_FUNC 8, movq, movq, mmx2
-AVGH 8, 16, mmx2
-AVGH 8, 8, mmx2
-AVGH 8, 4, mmx2
-
-AVG_FUNC 16, movq, movq, mmx2
-AVGH 16, 16, mmx2
-AVGH 16, 8, mmx2
-
-INIT_XMM
-AVG_FUNC 16, movdqu, movdqa, sse2
-AVGH 16, 16, sse2
-AVGH 16, 8, sse2
-AVGH 8, 16, sse2
-AVGH 8, 8, sse2
-AVGH 8, 4, sse2
-AVGH 16, 16, ssse3
-AVGH 16, 8, ssse3
-AVGH 8, 16, ssse3
-AVGH 8, 8, ssse3
-AVGH 8, 4, ssse3
-INIT_MMX
-AVGH 4, 8, ssse3
-AVGH 4, 4, ssse3
-AVGH 4, 2, ssse3
+INIT_MMX mmx2
+AVG_FUNC 4, movd, movd
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_FUNC 8, movq, movq
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_FUNC 16, movq, movq
+AVGH 16, 16
+AVGH 16, 8
+
+INIT_XMM sse2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+INIT_XMM ssse3
+AVGH 16, 16
+AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+INIT_MMX ssse3
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
%endif ;HIGH_BIT_DEPTH
+
;=============================================================================
; pixel avg2
;=============================================================================
; uint16_t *src1, int src_stride,
; uint16_t *src2, int height );
;-----------------------------------------------------------------------------
-%macro AVG2_W_ONE 2
-cglobal pixel_avg2_w%1_%2, 6,7,4*(mmsize/16)
+%macro AVG2_W_ONE 1
+cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16)
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
REP_RET
%endmacro
-%macro AVG2_W_TWO 4
-cglobal pixel_avg2_w%1_%4, 6,7,8*(mmsize/16)
+%macro AVG2_W_TWO 3
+cglobal pixel_avg2_w%1, 6,7,8*(mmsize/16)
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
REP_RET
%endmacro
-INIT_MMX
-AVG2_W_ONE 4, mmx2
-AVG2_W_TWO 8, movu, mova, mmx2
-INIT_XMM
-AVG2_W_ONE 8, sse2
-AVG2_W_TWO 10, movd, movd, sse2
-AVG2_W_TWO 16, movu, mova, sse2
+INIT_MMX mmx2
+AVG2_W_ONE 4
+AVG2_W_TWO 8, movu, mova
+INIT_XMM sse2
+AVG2_W_ONE 8
+AVG2_W_TWO 10, movd, movd
+AVG2_W_TWO 16, movu, mova
INIT_MMX
cglobal pixel_avg2_w10_mmx2, 6,7
REP_RET
%endmacro
+INIT_MMX
AVG2_W8 4, movd
AVG2_W8 8, movq
jz pixel_avg2_w%1_%3
mov eax, r2m
%endif
-%ifidn %3, sse2
- AVG_CACHELINE_FUNC %1, %2
-%elif %1==8 && %2==64
+%if mmsize==16 || (%1==8 && %2==64)
AVG_CACHELINE_FUNC %1, %2
%else
jmp cachesplit
%endif
%endmacro
+INIT_MMX
AVG_CACHELINE_CHECK 8, 64, mmx2
AVG_CACHELINE_CHECK 12, 64, mmx2
%ifndef ARCH_X86_64
AVG_CACHELINE_CHECK 16, 32, mmx2
AVG_CACHELINE_CHECK 20, 32, mmx2
%endif
+INIT_XMM
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
; pixel copy
;=============================================================================
-%macro COPY4 4
- %2 m0, [r2]
- %2 m1, [r2+r3]
- %2 m2, [r2+r3*2]
- %2 m3, [r2+%4]
- %1 [r0], m0
- %1 [r0+r1], m1
- %1 [r0+r1*2], m2
- %1 [r0+%3], m3
+%macro COPY4 2-*
+ movu m0, [r2]
+ movu m1, [r2+r3]
+ movu m2, [r2+r3*2]
+ movu m3, [r2+%2]
+ mova [r0], m0
+ mova [r0+r1], m1
+ mova [r0+r1*2], m2
+ mova [r0+%1], m3
%endmacro
-%ifdef HIGH_BIT_DEPTH
-%macro COPY_ONE 6
- COPY4 %1, %2, %3, %4
+%macro COPY_ONE 4
+ COPY4 %1, %2
%endmacro
-%macro COPY_TWO 6
- %2 m0, [r2+%5]
- %2 m1, [r2+%6]
- %2 m2, [r2+r3+%5]
- %2 m3, [r2+r3+%6]
- %2 m4, [r2+r3*2+%5]
- %2 m5, [r2+r3*2+%6]
- %2 m6, [r2+%4+%5]
- %2 m7, [r2+%4+%6]
- %1 [r0+%5], m0
- %1 [r0+%6], m1
- %1 [r0+r1+%5], m2
- %1 [r0+r1+%6], m3
- %1 [r0+r1*2+%5], m4
- %1 [r0+r1*2+%6], m5
- %1 [r0+%3+%5], m6
- %1 [r0+%3+%6], m7
+%macro COPY_TWO 4
+ movu m0, [r2+%3]
+ movu m1, [r2+%4]
+ movu m2, [r2+r3+%3]
+ movu m3, [r2+r3+%4]
+ movu m4, [r2+r3*2+%3]
+ movu m5, [r2+r3*2+%4]
+ movu m6, [r2+%2+%3]
+ movu m7, [r2+%2+%4]
+ mova [r0+%3], m0
+ mova [r0+%4], m1
+ mova [r0+r1+%3], m2
+ mova [r0+r1+%4], m3
+ mova [r0+r1*2+%3], m4
+ mova [r0+r1*2+%4], m5
+ mova [r0+%1+%3], m6
+ mova [r0+%1+%4], m7
%endmacro
+;-----------------------------------------------------------------------------
+; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride, int i_height )
+;-----------------------------------------------------------------------------
INIT_MMX
cglobal mc_copy_w4_mmx, 4,6
FIX_STRIDES r1, r3
lea r5, [r3*3]
lea r4, [r1*3]
je .end
- COPY4 mova, mova, r4, r5
+%ifndef HIGH_BIT_DEPTH
+ %define mova movd
+ %define movu movd
+%endif
+ COPY4 r4, r5
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
-.end
- COPY4 movu, mova, r4, r5
+.end:
+ COPY4 r4, r5
RET
+%ifdef HIGH_BIT_DEPTH
cglobal mc_copy_w16_mmx, 5,7
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
- COPY_TWO mova, movu, r5, r6, mmsize*0, mmsize*1
- COPY_TWO mova, movu, r5, r6, mmsize*2, mmsize*3
+ COPY_TWO r5, r6, mmsize*0, mmsize*1
+ COPY_TWO r5, r6, mmsize*2, mmsize*3
sub r4d, 4
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
jg .height_loop
REP_RET
-%macro MC_COPY 5
-cglobal mc_copy_w%2_%4, 5,7,%5
+%macro MC_COPY 2
+cglobal mc_copy_w%2, 5,7,%2-8
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
- COPY_%1 mova, %3, r5, r6, 0, mmsize
+ COPY_%1 r5, r6, 0, mmsize
sub r4d, 4
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
REP_RET
%endmacro
-MC_COPY TWO, 8, movu, mmx, 0
-INIT_XMM
-MC_COPY ONE, 8, movu, sse2, 0
-MC_COPY TWO, 16, movu, sse2, 8
-MC_COPY TWO, 16, mova, aligned_sse2, 8
+INIT_MMX mmx
+MC_COPY TWO, 8
+INIT_XMM sse2
+MC_COPY ONE, 8
+MC_COPY TWO, 16
+INIT_XMM aligned, sse2
+MC_COPY TWO, 16
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-INIT_MMX
-;-----------------------------------------------------------------------------
-; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal mc_copy_w4_mmx, 4,6
- cmp dword r4m, 4
- lea r5, [r3*3]
- lea r4, [r1*3]
- je .end
- COPY4 movd, movd, r4, r5
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
-.end:
- COPY4 movd, movd, r4, r5
- RET
-
-cglobal mc_copy_w8_mmx, 5,7
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- COPY4 movq, movq, r5, r6
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- REP_RET
-
-cglobal mc_copy_w16_mmx, 5,7
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- movq mm0, [r2]
- movq mm1, [r2+8]
- movq mm2, [r2+r3]
- movq mm3, [r2+r3+8]
- movq mm4, [r2+r3*2]
- movq mm5, [r2+r3*2+8]
- movq mm6, [r2+r6]
- movq mm7, [r2+r6+8]
- movq [r0], mm0
- movq [r0+8], mm1
- movq [r0+r1], mm2
- movq [r0+r1+8], mm3
- movq [r0+r1*2], mm4
- movq [r0+r1*2+8], mm5
- movq [r0+r5], mm6
- movq [r0+r5+8], mm7
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- REP_RET
-
-INIT_XMM
-%macro COPY_W16_SSE2 2
-cglobal %1, 5,7
+%macro MC_COPY 2
+cglobal mc_copy_w%2, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
- COPY4 movdqa, %2, r5, r6
+ %1 r5, r6, 0, mmsize
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
sub r4d, 4
REP_RET
%endmacro
-COPY_W16_SSE2 mc_copy_w16_sse2, movdqu
+INIT_MMX mmx
+MC_COPY COPY4, 8
+MC_COPY COPY_TWO, 16
+INIT_XMM sse2
+MC_COPY COPY4, 16
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
-COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
-COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
+INIT_XMM sse3
+MC_COPY COPY4, 16
+INIT_XMM aligned, sse2
+MC_COPY COPY4, 16
%endif ; !HIGH_BIT_DEPTH
; void prefetch_fenc( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
+INIT_MMX
%ifdef ARCH_X86_64
cglobal prefetch_fenc_mmx2, 5,5
and r4d, 3
%endif
%endmacro
%else ; !HIGH_BIT_DEPTH
-%macro UNPACK_UNALIGNED_MEM 3
+%macro UNPACK_UNALIGNED 3
+%if mmsize == 8 || cpuflag(misalign)
punpcklwd %1, %3
-%endmacro
-
-%macro UNPACK_UNALIGNED_LOAD 3
+%else
movh %2, %3
punpcklwd %1, %2
+%endif
%endmacro
%endif ; HIGH_BIT_DEPTH
; int dx, int dy,
; int width, int height )
;-----------------------------------------------------------------------------
-%macro MC_CHROMA 1
-cglobal mc_chroma_%1, 0,6
+%macro MC_CHROMA 0
+cglobal mc_chroma, 0,6
MC_CHROMA_START
FIX_STRIDES r4
and r5d, 7
%endmacro ; MC_CHROMA
-%macro MC_CHROMA_SSSE3 1-2
-cglobal mc_chroma_%1, 0,6,9
+%macro MC_CHROMA_SSSE3 0
+cglobal mc_chroma, 0,6,9
MC_CHROMA_START
and r5d, 7
and t2d, 7
imul r5d, t0d ; (x*255+8)*(8-y)
movd m6, t2d
movd m7, r5d
-%ifidn %2, _cache64
+%if cpuflag(cache64)
mov t0d, r3d
and t0d, 7
%ifdef PIC
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-MC_CHROMA mmx2
-INIT_XMM
-MC_CHROMA sse2
-INIT_AVX
-MC_CHROMA avx
+INIT_MMX mmx2
+MC_CHROMA
+INIT_XMM sse2
+MC_CHROMA
+INIT_XMM avx
+MC_CHROMA
%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
-MC_CHROMA mmx2
-INIT_XMM
-MC_CHROMA sse2_misalign
-%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
-MC_CHROMA sse2
-MC_CHROMA_SSSE3 ssse3
-MC_CHROMA_SSSE3 ssse3_cache64, _cache64
-INIT_AVX
-MC_CHROMA_SSSE3 avx ; No known AVX CPU will trigger CPU_CACHELINE_64
+INIT_MMX mmx2
+MC_CHROMA
+INIT_XMM sse2, misalign
+MC_CHROMA
+INIT_XMM sse2
+MC_CHROMA
+INIT_XMM ssse3
+MC_CHROMA_SSSE3
+INIT_XMM ssse3, cache64
+MC_CHROMA_SSSE3
+INIT_XMM avx
+MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-%macro HPEL_FILTER 1
-cglobal hpel_filter_v_%1, 5,6,11*(mmsize/16)
+%macro HPEL_FILTER 0
+cglobal hpel_filter_v, 5,6,11*(mmsize/16)
FIX_STRIDES r3d, r4d
%ifdef WIN64
movsxd r4, r4d
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_c_%1, 3,3,10*(mmsize/16)
+cglobal hpel_filter_c, 3,3,10*(mmsize/16)
add r2, r2
add r0, r2
lea r1, [r1+r2]
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_%1, 3,4,8*(mmsize/16)
+cglobal hpel_filter_h, 3,4,8*(mmsize/16)
%define src r1+r2
add r2, r2
add r0, r2
REP_RET
%endmacro ; HPEL_FILTER
-INIT_MMX
-HPEL_FILTER mmx2
-INIT_XMM
-HPEL_FILTER sse2
+INIT_MMX mmx2
+HPEL_FILTER
+INIT_XMM sse2
+HPEL_FILTER
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-%macro HPEL_V 3
+%macro HPEL_V 1
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_v_%1, 5,6,%2
+cglobal hpel_filter_v, 5,6,%1
%ifdef WIN64
movsxd r4, r4d
%endif
add r0, r4
lea r2, [r2+r4*2]
neg r4
-%if %3
- pxor m0, m0
-%else
+%if cpuflag(ssse3)
mova m0, [filt_mul15]
+%else
+ pxor m0, m0
%endif
.loop:
-%if %3
- LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
- LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
- LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
- LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
- FILT_V2 m1, m2, m3, m4, m5, m6
-%else
+%if cpuflag(ssse3)
mova m1, [r1]
mova m4, [r1+r3]
mova m2, [r5+r3*2]
paddw m4, m5
paddw m1, m3
paddw m4, m6
+%else
+ LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
+ LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
+ LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
+ LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
+ FILT_V2 m1, m2, m3, m4, m5, m6
%endif
mova m7, [pw_16]
mova [r2+r4*2], m1
INIT_XMM
-%macro HPEL_C 1
+%macro HPEL_C 0
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_c_%1, 3,3,9
+cglobal hpel_filter_c, 3,3,9
add r0, r2
lea r1, [r1+r2*2]
neg r2
%define src r1+r2*2
-%ifnidn %1, sse2
+%ifnidn cpuname, sse2
mova m7, [pw_32]
%define tpw_32 m7
%elifdef ARCH_X86_64
%else
%define tpw_32 [pw_32]
%endif
-%ifidn %1,sse2_misalign
+%if cpuflag(misalign)
.loop:
movu m4, [src-4]
movu m5, [src-2]
REP_RET
%endif ; !ARCH_X86_64
-%define PALIGNR PALIGNR_MMX
-INIT_MMX
-HPEL_V mmx2, 0, 1
-INIT_XMM
-HPEL_V sse2, 8, 1
-HPEL_C sse2_misalign
+INIT_MMX mmx2
+HPEL_V 0
+INIT_XMM sse2
+HPEL_V 8
+INIT_XMM sse2, misalign
+HPEL_C
%ifndef ARCH_X86_64
-HPEL_C sse2
-%define PALIGNR PALIGNR_SSSE3
-HPEL_C ssse3
-HPEL_V ssse3, 0, 0
-INIT_AVX
-HPEL_C avx
-HPEL_V avx, 0, 0
+INIT_XMM sse2
+HPEL_C
+INIT_XMM ssse3
+HPEL_C
+HPEL_V 0
+INIT_XMM avx
+HPEL_C
+HPEL_V 0
%endif
%ifdef ARCH_X86_64
-%macro DO_FILT_V 6
+%macro DO_FILT_V 5
;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+16 is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
prefetcht0 [r1+r2*2+16]
-%ifnidn %6, sse2
+%if cpuflag(ssse3)
mova m1, [r3]
mova m2, [r3+r2]
mova %3, [r3+r2*2]
paddw %1, %4
%endmacro
-%macro DO_FILT_H 4
+%macro DO_FILT_H 3
PALIGNR m1, %2, %1, 14, m3
PALIGNR m2, %2, %1, 15, m3
PALIGNR m4, %3, %2, 1 , m3
PALIGNR m5, %3, %2, 2 , m3
PALIGNR m6, %3, %2, 3 , m3
mova %1, %2
-%ifidn %4, sse2
- ADD8TO16 m1, m6, m12, m3, m0 ; a
- ADD8TO16 m2, m5, m12, m3, m0 ; b
- ADD8TO16 %2, m4, m12, m3, m0 ; c
- FILT_V2 m1, m2, %2, m6, m5, m4
- FILT_PACK m1, m6, 5, m15
-%else ; ssse3, avx
+%if cpuflag(ssse3)
pmaddubsw m1, m12
pmaddubsw m2, m12
pmaddubsw %2, m14
paddw m2, m6
FILT_PACK m1, m2, 5, m15
pshufb m1, [hpel_shuf]
+%else ; sse2
+ ADD8TO16 m1, m6, m12, m3, m0 ; a
+ ADD8TO16 m2, m5, m12, m3, m0 ; b
+ ADD8TO16 %2, m4, m12, m3, m0 ; c
+ FILT_V2 m1, m2, %2, m6, m5, m4
+ FILT_PACK m1, m6, 5, m15
%endif
movntps [r0+r4], m1
mova %2, %3
%endmacro
-%macro HPEL 1
+%macro HPEL 0
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
-cglobal hpel_filter_%1, 7,7,16
+cglobal hpel_filter, 7,7,16
%ifdef WIN64
movsxd r4, r4d
movsxd r5, r5d
sub r3, r2
mov r4, r10
mova m15, [pw_16]
-%ifidn %1, sse2
- pxor m0, m0
-%else ; ssse3
+%if cpuflag(ssse3)
mova m0, [filt_mul51]
mova m12, [filt_mul15]
mova m14, [filt_mul20]
+%else
+ pxor m0, m0
%endif
;ALIGN 16
.loopy:
; first filter_v
- DO_FILT_V m8, m7, m13, m12, 0, %1
+ DO_FILT_V m8, m7, m13, m12, 0
;ALIGN 16
.loopx:
- DO_FILT_V m6, m5, m11, m12, 16, %1
+ DO_FILT_V m6, m5, m11, m12, 16
.lastx:
paddw m15, m15 ; pw_32
DO_FILT_C m9, m8, m7, m6
psrlw m15, 1 ; pw_16
movdqa m7, m5
- DO_FILT_H m10, m13, m11, %1
+ DO_FILT_H m10, m13, m11
add r4, 16
jl .loopx
cmp r4, 16
RET
%endmacro
-INIT_XMM
-%define PALIGNR PALIGNR_MMX
-HPEL sse2
-%define PALIGNR PALIGNR_SSSE3
-HPEL ssse3
-INIT_AVX
-HPEL avx
+INIT_XMM sse2
+HPEL
+INIT_XMM ssse3
+HPEL
+INIT_XMM avx
+HPEL
%endif ; ARCH_X86_64
%undef movntq
%endif ; HIGH_BIT_DEPTH
%endmacro
-%macro DEINTERLEAVE 7 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant, is aligned
+%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
%ifdef HIGH_BIT_DEPTH
%assign n 0
%rep 16/mmsize
mova m1, [%3+(n+1)*mmsize]
psrld m2, m0, 16
psrld m3, m1, 16
- pand m0, %6
- pand m1, %6
+ pand m0, %5
+ pand m1, %5
packssdw m0, m1
packssdw m2, m3
- mov%7 [%1+(n/2)*mmsize], m0
- mov%7 [%2+(n/2)*mmsize], m2
+ mov%6 [%1+(n/2)*mmsize], m0
+ mov%6 [%2+(n/2)*mmsize], m2
%assign n (n+2)
%endrep
%else ; !HIGH_BIT_DEPTH
%if mmsize==16
mova m0, [%3]
-%ifidn %5, ssse3
- pshufb m0, %6
+%if cpuflag(ssse3)
+ pshufb m0, %5
%else
mova m1, m0
- pand m0, %6
+ pand m0, %5
psrlw m1, 8
packuswb m0, m1
%endif
mova m1, [%3+8]
mova m2, m0
mova m3, m1
- pand m0, %6
- pand m1, %6
+ pand m0, %5
+ pand m1, %5
psrlw m2, 8
psrlw m3, 8
packuswb m0, m1
%endif ; HIGH_BIT_DEPTH
%endmacro
-%macro PLANE_INTERLEAVE 1
+%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
; uint8_t *srcu, int i_srcu,
; uint8_t *srcv, int i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core_%1, 7,7
+cglobal plane_copy_interleave_core, 7,7
FIX_STRIDES r1d, r3d, r5d, r6d
%ifdef HIGH_BIT_DEPTH
mov r1m, r1d
;-----------------------------------------------------------------------------
; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
;-----------------------------------------------------------------------------
-cglobal store_interleave_8x8x2_%1, 4,5
+cglobal store_interleave_8x8x2, 4,5
mov r4d, 4
FIX_STRIDES r1d
.loop:
REP_RET
%endmacro ; PLANE_INTERLEAVE
-%macro DEINTERLEAVE_START 1
+%macro DEINTERLEAVE_START 0
%ifdef HIGH_BIT_DEPTH
mova m4, [pd_ffff]
-%elifidn %1, ssse3
+%elif cpuflag(ssse3)
mova m4, [deinterleave_shuf]
%else
mova m4, [pw_00ff]
%endif ; HIGH_BIT_DEPTH
%endmacro
-%macro PLANE_DEINTERLEAVE 1
+%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
; pixel *dstv, int i_dstv,
; pixel *src, int i_src, int w, int h )
;-----------------------------------------------------------------------------
-cglobal plane_copy_deinterleave_%1, 6,7
- DEINTERLEAVE_START %1
+cglobal plane_copy_deinterleave, 6,7
+ DEINTERLEAVE_START
mov r6d, r6m
FIX_STRIDES r1d, r3d, r5d, r6d
%ifdef HIGH_BIT_DEPTH
mov r6d, r6m
neg r6
.loopx:
- DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, %1, m4, u
- DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, %1, m4, u
+ DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
+ DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
add r6, 16*SIZEOF_PIXEL
jl .loopx
add r0, r1
;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fenc_%1, 3,4
- DEINTERLEAVE_START %1
+cglobal load_deinterleave_8x8x2_fenc, 3,4
+ DEINTERLEAVE_START
mov r3d, 4
FIX_STRIDES r2d
.loop:
- DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, %1, m4, a
- DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, %1, m4, a
+ DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
+ DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
add r0, FENC_STRIDEB*2
lea r1, [r1+r2*2]
dec r3d
;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
- DEINTERLEAVE_START %1
+cglobal load_deinterleave_8x8x2_fdec, 3,4
+ DEINTERLEAVE_START
mov r3d, 4
FIX_STRIDES r2d
.loop:
- DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, %1, m4, a
- DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, %1, m4, a
+ DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
+ DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
add r0, FDEC_STRIDEB*2
lea r1, [r1+r2*2]
dec r3d
%endmacro ; PLANE_DEINTERLEAVE
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-PLANE_INTERLEAVE mmx2
-PLANE_DEINTERLEAVE mmx
-INIT_XMM
-PLANE_INTERLEAVE sse2
-PLANE_DEINTERLEAVE sse2
-INIT_AVX
-PLANE_INTERLEAVE avx
-PLANE_DEINTERLEAVE avx
+INIT_MMX mmx2
+PLANE_INTERLEAVE
+INIT_MMX mmx
+PLANE_DEINTERLEAVE
+INIT_XMM sse2
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
+INIT_XMM avx
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
%else
-INIT_MMX
-PLANE_INTERLEAVE mmx2
-PLANE_DEINTERLEAVE mmx
-INIT_XMM
-PLANE_INTERLEAVE sse2
-PLANE_DEINTERLEAVE sse2
-PLANE_DEINTERLEAVE ssse3
+INIT_MMX mmx2
+PLANE_INTERLEAVE
+INIT_MMX mmx
+PLANE_DEINTERLEAVE
+INIT_XMM sse2
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
+INIT_XMM ssse3
+PLANE_DEINTERLEAVE
%endif
; These functions are not general-use; not only do the SSE ones require aligned input,
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
+INIT_MMX
cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
jz .copy32start
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
-%macro MEMZERO 1
-cglobal memzero_aligned_%1, 2,2
+%macro MEMZERO 0
+cglobal memzero_aligned, 2,2
add r0, r1
neg r1
pxor m0, m0
REP_RET
%endmacro
-INIT_MMX
-MEMZERO mmx
-INIT_XMM
-MEMZERO sse2
+INIT_MMX mmx
+MEMZERO
+INIT_XMM sse2
+MEMZERO
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
+INIT_XMM
cglobal integral_init4h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
jl .loop
REP_RET
-%macro INTEGRAL_INIT8H 1
-cglobal integral_init8h_%1, 3,4
+%macro INTEGRAL_INIT8H 0
+cglobal integral_init8h, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
REP_RET
%endmacro
-INIT_XMM
-INTEGRAL_INIT8H sse4
-INIT_AVX
-INTEGRAL_INIT8H avx
+INIT_XMM sse4
+INTEGRAL_INIT8H
+INIT_XMM avx
+INTEGRAL_INIT8H
-%macro INTEGRAL_INIT_8V 1
+%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
-cglobal integral_init8v_%1, 3,3
+cglobal integral_init8v, 3,3
shl r1, 1
add r0, r1
lea r2, [r0+r1*8]
REP_RET
%endmacro
-INIT_MMX
-INTEGRAL_INIT_8V mmx
-INIT_XMM
-INTEGRAL_INIT_8V sse2
+INIT_MMX mmx
+INTEGRAL_INIT_8V
+INIT_XMM sse2
+INTEGRAL_INIT_8V
;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
-%macro FRAME_INIT_LOWRES 1
-cglobal frame_init_lowres_core_%1, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%macro FRAME_INIT_LOWRES 0
+cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%ifdef HIGH_BIT_DEPTH
shl dword r6m, 1
FIX_STRIDES r5d
psrld m7, 16
.vloop:
mov r6d, r7m
-%ifnidn %1,mmx2
+%ifnidn cpuname, mmx2
mova m0, [r0]
mova m1, [r0+r5]
pavgw m0, m1
sub r2, mmsize
sub r3, mmsize
sub r4, mmsize
-%ifidn %1,mmx2
+%ifidn cpuname, mmx2
FILT8xU r1, r2, 0
FILT8xU r3, r4, r5
%else
psrlw m7, 8
.vloop:
mov r6d, r7m
-%ifnidn %1, mmx2
+%ifnidn cpuname, mmx2
mova m0, [r0]
mova m1, [r0+r5]
pavgb m0, m1
mova [r2], m4
mova [r3], m3
mova [r4], m5
-%elifidn %1, mmx2
+%elifidn cpuname, mmx2
FILT8x2U r1, r2, 0
FILT8x2U r3, r4, r5
%else
RET
%endmacro ; FRAME_INIT_LOWRES
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-FRAME_INIT_LOWRES mmx2
+INIT_MMX mmx2
+FRAME_INIT_LOWRES
%ifndef ARCH_X86_64
-FRAME_INIT_LOWRES cache32_mmx2
+INIT_MMX cache32, mmx2
+FRAME_INIT_LOWRES
%endif
-INIT_XMM
-FRAME_INIT_LOWRES sse2
-%define PALIGNR PALIGNR_SSSE3
-FRAME_INIT_LOWRES ssse3
+INIT_XMM sse2
+FRAME_INIT_LOWRES
+INIT_XMM ssse3
+FRAME_INIT_LOWRES
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
+INIT_XMM
cglobal mbtree_propagate_cost_sse2, 7,7,7
add r6d, r6d
lea r0, [r0+r6*2]
%include "x86util.asm"
SECTION .text
-INIT_MMX
+INIT_MMX mmx2
%macro LOAD_DIFF_4x8P 1 ; dx
LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal pixel_sa8d_8x8_internal_mmx2
+cglobal pixel_sa8d_8x8_internal
push r0
push r2
sub esp, 0x74
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8_core_mmx2
+cglobal intra_sa8d_x3_8x8_core
mov eax, [esp+4]
mov ecx, [esp+8]
sub esp, 0x70
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_4x4x2_core_mmx2
+cglobal pixel_ssim_4x4x2_core
push ebx
push edi
mov ebx, [esp+16]
;-----------------------------------------------------------------------------
; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
-%macro SSD_ONE 3
-cglobal pixel_ssd_%1x%2_%3, 4,5,6*(mmsize/16)
+%macro SSD_ONE 2
+cglobal pixel_ssd_%1x%2, 4,5,6*(mmsize/16)
mov r4, %1*%2/mmsize
pxor m0, m0
.loop
%endmacro
%macro SSD_16_MMX 2
-cglobal pixel_ssd_%1x%2_mmx2, 4,5
+cglobal pixel_ssd_%1x%2, 4,5
mov r4, %1*%2/mmsize/2
pxor m0, m0
.loop
RET
%endmacro
-INIT_MMX
-SSD_ONE 4, 4, mmx2
-SSD_ONE 4, 8, mmx2
-SSD_ONE 8, 4, mmx2
-SSD_ONE 8, 8, mmx2
-SSD_ONE 8, 16, mmx2
+INIT_MMX mmx2
+SSD_ONE 4, 4
+SSD_ONE 4, 8
+SSD_ONE 8, 4
+SSD_ONE 8, 8
+SSD_ONE 8, 16
SSD_16_MMX 16, 8
SSD_16_MMX 16, 16
-INIT_XMM
-SSD_ONE 8, 4, sse2
-SSD_ONE 8, 8, sse2
-SSD_ONE 8, 16, sse2
-SSD_ONE 16, 8, sse2
-SSD_ONE 16, 16, sse2
+INIT_XMM sse2
+SSD_ONE 8, 4
+SSD_ONE 8, 8
+SSD_ONE 8, 16
+SSD_ONE 16, 8
+SSD_ONE 16, 16
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-%macro SSD 3-4 0
+%macro SSD 2
%if %1 != %2
%assign function_align 8
%else
%assign function_align 16
%endif
-cglobal pixel_ssd_%1x%2_%3, 0,0,0
+cglobal pixel_ssd_%1x%2, 0,0,0
mov al, %1*%2/mmsize/2
%if %1 != %2
- jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
+ jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
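+    ; SUFFIX is the per-cpu name suffix appended by x86inc (_mmx, _sse2, ...), so each version still jumps into its matching %1x%1 function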
%else
.startloop:
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3
-%ifnidn %3, mmx
+%if cpuflag(ssse3) ; FIXME wrong, but correcting this modifies the binary
PROLOGUE 0,0,8
+%else
+ PROLOGUE 0,0,8*(mmsize/16)
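+    ; for sse2 this still evaluates to PROLOGUE 0,0,8, the same as the old "%ifnidn %3, mmx" branch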
%endif
%else
PROLOGUE 0,5
mov t3, r3m
%endif
-%ifidn %3, ssse3
- mova m7, [hsub_mul]
-%elifidn %3, avx
+%if cpuflag(ssse3)
mova m7, [hsub_mul]
-%elifidn %3, sse2
+%elifidn cpuname, sse2
mova m7, [pw_00ff]
%elif %1 >= mmsize
pxor m7, m7
%endif
%endmacro
-INIT_MMX
-SSD 16, 16, mmx
-SSD 16, 8, mmx
-SSD 8, 8, mmx
-SSD 8, 16, mmx
-SSD 4, 4, mmx
-SSD 8, 4, mmx
-SSD 4, 8, mmx
-INIT_XMM
-SSD 16, 16, sse2slow, 8
-SSD 8, 8, sse2slow, 8
-SSD 16, 8, sse2slow, 8
-SSD 8, 16, sse2slow, 8
-SSD 8, 4, sse2slow, 8
+INIT_MMX mmx
+SSD 16, 16
+SSD 16, 8
+SSD 8, 8
+SSD 8, 16
+SSD 4, 4
+SSD 8, 4
+SSD 4, 8
+INIT_XMM sse2slow
+SSD 16, 16
+SSD 8, 8
+SSD 16, 8
+SSD 8, 16
+SSD 8, 4
+INIT_XMM sse2
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
-SSD 16, 16, sse2, 8
-SSD 8, 8, sse2, 8
-SSD 16, 8, sse2, 8
-SSD 8, 16, sse2, 8
-SSD 8, 4, sse2, 8
+SSD 16, 16
+SSD 8, 8
+SSD 16, 8
+SSD 8, 16
+SSD 8, 4
+INIT_XMM ssse3
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
-SSD 16, 16, ssse3, 8
-SSD 8, 8, ssse3, 8
-SSD 16, 8, ssse3, 8
-SSD 8, 16, ssse3, 8
-SSD 8, 4, ssse3, 8
-INIT_AVX
-SSD 16, 16, avx, 8
-SSD 8, 8, avx, 8
-SSD 16, 8, avx, 8
-SSD 8, 16, avx, 8
-SSD 8, 4, avx, 8
-INIT_MMX
-SSD 4, 4, ssse3
-SSD 4, 8, ssse3
+SSD 16, 16
+SSD 8, 8
+SSD 16, 8
+SSD 8, 16
+SSD 8, 4
+INIT_XMM avx
+SSD 16, 16
+SSD 8, 8
+SSD 16, 8
+SSD 8, 16
+SSD 8, 4
+INIT_MMX ssse3
+SSD 4, 4
+SSD 4, 8
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
; distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
-%macro SSD_NV12 1-2 0
-cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16)
+%macro SSD_NV12 0
+cglobal pixel_ssd_nv12_core, 6,7,7*(mmsize/16)
shl r4d, 2
FIX_STRIDES r1, r3
add r0, r4
; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
; 20). At sane distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
-%macro SSD_NV12 1-2 0
-cglobal pixel_ssd_nv12_core_%1, 6,7
+%macro SSD_NV12 0
+cglobal pixel_ssd_nv12_core, 6,7
shl r4d, 1
add r0, r4
add r2, r4
%endmacro ; SSD_NV12
%endif ; !HIGH_BIT_DEPTH
-INIT_MMX
-SSD_NV12 mmx2
-INIT_XMM
-SSD_NV12 sse2
-INIT_AVX
-SSD_NV12 avx
+INIT_MMX mmx2
+SSD_NV12
+INIT_XMM sse2
+SSD_NV12
+INIT_XMM avx
+SSD_NV12
;=============================================================================
; variance
VAR_END 8, 8
%ifdef HIGH_BIT_DEPTH
-%macro VAR 1
-cglobal pixel_var_16x16_%1, 2,3,8
+%macro VAR 0
+cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 16, 16
-cglobal pixel_var_8x8_%1, 2,3,8
+cglobal pixel_var_8x8, 2,3,8
lea r2, [r1*3]
VAR_START 0
mova m0, [r0]
VAR_END 8, 8
%endmacro ; VAR
-INIT_XMM
-VAR sse2
-INIT_AVX
-VAR avx
+INIT_XMM sse2
+VAR
+INIT_XMM avx
+VAR
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-%macro VAR 1
-cglobal pixel_var_16x16_%1, 2,3,8
+%macro VAR 0
+cglobal pixel_var_16x16, 2,3,8
VAR_START 1
mov r2d, 8
.loop:
jg .loop
VAR_END 16, 16
-cglobal pixel_var_8x8_%1, 2,4,8
+cglobal pixel_var_8x8, 2,4,8
VAR_START 1
mov r2d, 2
lea r3, [r1*3]
VAR_END 8, 8
%endmacro ; VAR
-INIT_XMM
-VAR sse2
-INIT_AVX
-VAR avx
+INIT_XMM sse2
+VAR
+INIT_XMM avx
+VAR
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_END 0
%define TRANS TRANS_SSE2
-%macro JDUP_SSE2 2
- punpckldq %1, %2
- ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
-%endmacro
-
-%macro JDUP_CONROE 2
+%macro JDUP 2
+%if cpuflag(sse4)
+ ; just use shufps on anything post conroe
+ shufps %1, %2, 0
+%elif cpuflag(ssse3)
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
punpcklqdq %1, %2
movsldup %1, %1
-%endmacro
-
-%macro JDUP_PENRYN 2
- ; just use shufps on anything post conroe
- shufps %1, %2, 0
+%else
+ ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
+ punpckldq %1, %2
+%endif
%endmacro
%macro HSUMSUB 5
%else
HADAMARD4_V %2, %3, %4, %5, %6
; doing the abs first is a slight advantage
- ABS4 m%2, m%4, m%3, m%5, m%6, m%7
+ ABS2 m%2, m%4, m%6, m%7
+ ABS2 m%3, m%5, m%6, m%7
HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
;-----------------------------------------------------------------------------
; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal pixel_satd_16x4_internal_mmx2
+INIT_MMX mmx2
+cglobal pixel_satd_16x4_internal
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
paddw m0, m1
ret
-cglobal pixel_satd_8x8_internal_mmx2
+cglobal pixel_satd_8x8_internal
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 1
paddw m0, m2
%ifdef HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2_mmx2, 4,7
+cglobal pixel_satd_%1x%2, 4,7
SATD_START_MMX
pxor m0, m0
call pixel_satd_%1x%3_internal_mmx2
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-cglobal pixel_satd_16x16_mmx2, 4,6
+cglobal pixel_satd_16x16, 4,6
SATD_START_MMX
pxor m0, m0
%rep 3
movd eax, m0
RET
-cglobal pixel_satd_16x8_mmx2, 4,6
+cglobal pixel_satd_16x8, 4,6
SATD_START_MMX
pxor m0, m0
call pixel_satd_16x4_internal_mmx2
call pixel_satd_16x4_internal_mmx2
SATD_END_MMX
-cglobal pixel_satd_8x16_mmx2, 4,6
+cglobal pixel_satd_8x16, 4,6
SATD_START_MMX
pxor m0, m0
call pixel_satd_8x8_internal_mmx2
SATD_END_MMX
%endif ; !HIGH_BIT_DEPTH
-cglobal pixel_satd_8x8_mmx2, 4,6
+cglobal pixel_satd_8x8, 4,6
SATD_START_MMX
pxor m0, m0
call pixel_satd_8x8_internal_mmx2
SATD_END_MMX
-cglobal pixel_satd_8x4_mmx2, 4,6
+cglobal pixel_satd_8x4, 4,6
SATD_START_MMX
pxor m0, m0
call pixel_satd_8x4_internal_mmx2
SATD_END_MMX
-cglobal pixel_satd_4x8_mmx2, 4,6
+cglobal pixel_satd_4x8, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 0
paddw m0, m1
SATD_END_MMX
-cglobal pixel_satd_4x4_mmx2, 4,6
+cglobal pixel_satd_4x4, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
-%macro SATD_START_SSE2 3
-%ifnidn %1, sse2
- mova %3, [hmul_8p]
+%macro SATD_START_SSE2 2
+%if cpuflag(ssse3)
+ mova %2, [hmul_8p]
%endif
lea r4, [3*r1]
lea r5, [3*r3]
- pxor %2, %2
+ pxor %1, %1
%endmacro
-%macro SATD_END_SSE2 2
- HADDW %2, m7
- movd eax, %2
+%macro SATD_END_SSE2 1
+ HADDW %1, m7
+ movd eax, %1
RET
%endmacro
;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-%macro SATDS_SSE2 1
-%ifnidn %1, sse2
-cglobal pixel_satd_4x4_%1, 4, 6, 6
+%macro SATDS_SSE2 0
+%if cpuflag(ssse3)
+cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
RET
%endif
-cglobal pixel_satd_4x8_%1, 4, 6, 8
+cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
-%ifnidn %1, sse2
+%if cpuflag(ssse3)
mova m7, [hmul_4p]
%endif
movd m4, [r2]
movd m4, [r0+r1]
JDUP m3, m4
DIFFOP 2, 6, 3, 5, 7
- SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap
+ SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6, swap
HADDW m6, m1
movd eax, m6
RET
-cglobal pixel_satd_8x8_internal_%1
+cglobal pixel_satd_8x8_internal
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
- SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
-pixel_satd_8x4_internal_%1:
+ SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
+%%pixel_satd_8x4_internal:
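+; a macro-local label (%%) replaces the old pixel_satd_8x4_internal_%1 name; each SATDS_SSE2 expansion gets its own copy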
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
- SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
+ SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
ret
%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
-cglobal pixel_satd_16x4_internal_%1
+cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
+    ; FIXME: the "ssse3" argument doesn't refer to the instruction set here; it appears to just select between two behaviors that are both implemented with sse2
SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
ret
-cglobal pixel_satd_16x8_%1, 4,6,12
- SATD_START_SSE2 %1, m10, m7
-%ifidn %1, sse2
+cglobal pixel_satd_16x8, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if notcpuflag(ssse3)
mova m7, [pw_00ff]
%endif
- jmp pixel_satd_16x8_internal_%1
+ jmp %%pixel_satd_16x8_internal
-cglobal pixel_satd_16x16_%1, 4,6,12
- SATD_START_SSE2 %1, m10, m7
-%ifidn %1, sse2
+cglobal pixel_satd_16x16, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if notcpuflag(ssse3)
mova m7, [pw_00ff]
%endif
- call pixel_satd_16x4_internal_%1
- call pixel_satd_16x4_internal_%1
-pixel_satd_16x8_internal_%1:
- call pixel_satd_16x4_internal_%1
- call pixel_satd_16x4_internal_%1
- SATD_END_SSE2 %1, m10
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+%%pixel_satd_16x8_internal:
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ SATD_END_SSE2 m10
%else
-cglobal pixel_satd_16x8_%1, 4,6,8
- SATD_START_SSE2 %1, m6, m7
+cglobal pixel_satd_16x8, 4,6,8
+ SATD_START_SSE2 m6, m7
BACKUP_POINTERS
- call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal
RESTORE_AND_INC_POINTERS
- call pixel_satd_8x8_internal_%1
- SATD_END_SSE2 %1, m6
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
-cglobal pixel_satd_16x16_%1, 4,6,8
- SATD_START_SSE2 %1, m6, m7
+cglobal pixel_satd_16x16, 4,6,8
+ SATD_START_SSE2 m6, m7
BACKUP_POINTERS
- call pixel_satd_8x8_internal_%1
- call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
RESTORE_AND_INC_POINTERS
- call pixel_satd_8x8_internal_%1
- call pixel_satd_8x8_internal_%1
- SATD_END_SSE2 %1, m6
-%endif
-
-cglobal pixel_satd_8x16_%1, 4,6,8
- SATD_START_SSE2 %1, m6, m7
- call pixel_satd_8x8_internal_%1
- call pixel_satd_8x8_internal_%1
- SATD_END_SSE2 %1, m6
-
-cglobal pixel_satd_8x8_%1, 4,6,8
- SATD_START_SSE2 %1, m6, m7
- call pixel_satd_8x8_internal_%1
- SATD_END_SSE2 %1, m6
-
-cglobal pixel_satd_8x4_%1, 4,6,8
- SATD_START_SSE2 %1, m6, m7
- call pixel_satd_8x4_internal_%1
- SATD_END_SSE2 %1, m6
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
+%endif
+
+cglobal pixel_satd_8x16, 4,6,8
+ SATD_START_SSE2 m6, m7
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
+
+cglobal pixel_satd_8x8, 4,6,8
+ SATD_START_SSE2 m6, m7
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
+
+cglobal pixel_satd_8x4, 4,6,8
+ SATD_START_SSE2 m6, m7
+ call %%pixel_satd_8x4_internal
+ SATD_END_SSE2 m6
%endmacro ; SATDS_SSE2
%macro SA8D_INTER 0
%endif ; HIGH_BIT_DEPTH
%endmacro
-%macro SA8D 1
+%macro SA8D 0
%ifdef HIGH_BIT_DEPTH
%define vertical 1
-%elifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things
- %define vertical 1
-%else
- %define vertical 0
+%else ; sse2 doesn't seem to like the horizontal way of doing things
+ %define vertical (cpuflags == cpuflags_sse2)
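+    ; cpuflags accumulates lower ISA bits, so this equality holds only for the plain sse2 build; ssse3/sse4/avx take the horizontal path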
%endif
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal
lea r10, [r0+4*r1]
lea r11, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
paddw m0, m1
paddw m0, m2
paddw m0, m8
- SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
+ SAVE_MM_PERMUTATION
ret
-cglobal pixel_sa8d_8x8_%1, 4,6,12
+cglobal pixel_sa8d_8x8, 4,6,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
%if vertical == 0
mova m7, [hmul_8p]
%endif
- call pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal
%ifdef HIGH_BIT_DEPTH
HADDUW m0, m1
%else
shr eax, 1
RET
-cglobal pixel_sa8d_16x16_%1, 4,6,12
+cglobal pixel_sa8d_16x16, 4,6,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
%if vertical == 0
mova m7, [hmul_8p]
%endif
- call pixel_sa8d_8x8_internal_%1 ; pix[0]
+ call pixel_sa8d_8x8_internal ; pix[0]
add r2, 8*SIZEOF_PIXEL
add r0, 8*SIZEOF_PIXEL
%ifdef HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova m10, m0
- call pixel_sa8d_8x8_internal_%1 ; pix[8]
+ call pixel_sa8d_8x8_internal ; pix[8]
lea r2, [r2+8*r3]
lea r0, [r0+8*r1]
SA8D_INTER
- call pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
+ call pixel_sa8d_8x8_internal ; pix[8*stride+8]
sub r2, 8*SIZEOF_PIXEL
sub r0, 8*SIZEOF_PIXEL
SA8D_INTER
- call pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
+ call pixel_sa8d_8x8_internal ; pix[8*stride]
SA8D_INTER
SWAP 0, 10
%ifndef HIGH_BIT_DEPTH
RET
%else ; ARCH_X86_32
-%ifnidn %1, mmx2
-cglobal pixel_sa8d_8x8_internal_%1
+%if mmsize == 16
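+; mmsize==16 selects the xmm implementations, replacing the old "%ifnidn %1, mmx2" check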
+cglobal pixel_sa8d_8x8_internal
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
movdqa m3, spill0
paddw m0, m1
HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
-%else ; non-sse2
+%else ; !vertical
mova m7, [hmul_8p]
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
; could do first HADAMARD4_V here to save spilling later
%endif ; sse2/non-sse2
paddw m0, m2
paddw m0, m3
- SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
+ SAVE_MM_PERMUTATION
ret
-%endif ; ifndef mmx2
+%endif ; mmsize == 16
-cglobal pixel_sa8d_8x8_%1, 4,7
+cglobal pixel_sa8d_8x8, 4,7
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 48
lea r4, [3*r1]
lea r5, [3*r3]
- call pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal
%ifdef HIGH_BIT_DEPTH
HADDUW m0, m1
%else
mov esp, r6
RET
-cglobal pixel_sa8d_16x16_%1, 4,7
+cglobal pixel_sa8d_16x16, 4,7
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [3*r1]
lea r5, [3*r3]
- call pixel_sa8d_8x8_internal_%1
-%ifidn %1, mmx2
+ call pixel_sa8d_8x8_internal
+%if mmsize == 8
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
HADDUW m0, m1
%endif
mova [esp+48], m0
- call pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_INTER
mova [esp+48], m0
- call pixel_sa8d_8x8_internal_%1
-%ifidn %1, mmx2
+ call pixel_sa8d_8x8_internal
+%if mmsize == 8
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
-%endif
-%if mmsize == 16
+%else
SA8D_INTER
%endif
mova [esp+64-mmsize], m0
- call pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal
%ifdef HIGH_BIT_DEPTH
SA8D_INTER
%else ; !HIGH_BIT_DEPTH
; INTRA SATD
;=============================================================================
-%macro INTRA_SA8D_SSE2 1
+%macro INTRA_SA8D_SSE2 0
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
+cglobal intra_sa8d_x3_8x8_core, 3,3,16
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
movdqa m9, m3
movdqa m10, m4
movdqa m11, m5
- ABS4 m8, m9, m10, m11, m12, m13
+ ABS2 m8, m9, m12, m13
+ ABS2 m10, m11, m12, m13
paddusw m8, m10
paddusw m9, m11
-%ifidn %1, ssse3
+%ifidn cpuname, ssse3
pabsw m10, m6
pabsw m11, m7
pabsw m15, m1
punpcklbw m2, m7
punpcklbw m3, m7
HADAMARD4_2D 0, 1, 2, 3, 4
- SAVE_MM_PERMUTATION hadamard_load
+ SAVE_MM_PERMUTATION
ret
%macro SCALAR_SUMSUB 4
; in: m1..m3
; out: m7
; clobber: m4..m6
-%macro SUM3x4 1
-%ifidn %1, ssse3
+%macro SUM3x4 0
+%if cpuflag(ssse3)
pabsw m4, m1
pabsw m5, m2
pabsw m7, m3
ABS1 m0, m1 ; 4x1 sum
%endmacro
-%macro INTRA_SATDS_MMX 1
+%macro INTRA_SATDS_MMX 0
;-----------------------------------------------------------------------------
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal intra_satd_x3_4x4_%1, 2,6
+cglobal intra_satd_x3_4x4, 2,6
%ifdef ARCH_X86_64
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
and t0d, -8
shl t0d, 1 ; dc
- SUM3x4 %1
+ SUM3x4
SUM4x3 t0d, [left_1d], [top_1d]
paddw m4, m7
paddw m5, m7
;-----------------------------------------------------------------------------
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal intra_satd_x3_16x16_%1, 0,7
+cglobal intra_satd_x3_16x16, 0,7
%ifdef ARCH_X86_64
%assign stack_pad 88
%else
.loop_x:
call hadamard_load
- SUM3x4 %1
+ SUM3x4
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
pavgw m4, m7
pavgw m5, m7
;-----------------------------------------------------------------------------
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal intra_satd_x3_8x8c_%1, 0,6
+cglobal intra_satd_x3_8x8c, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, 72
%define sums rsp+48 ; size 24
.loop_x:
call hadamard_load
- SUM3x4 %1
+ SUM3x4
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
pavgw m4, m7
pavgw m5, m7
%endmacro ; INTRA_SATDS_MMX
-%macro ABS_MOV_SSSE3 2
+%macro ABS_MOV 2
+%if cpuflag(ssse3)
pabsw %1, %2
-%endmacro
-
-%macro ABS_MOV_MMX 2
+%else
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
+%endif
%endmacro
-%define ABS_MOV ABS_MOV_MMX
-
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
-cglobal hadamard_ac_4x4_mmx2
+INIT_MMX mmx2
+cglobal hadamard_ac_4x4
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1]
paddw m0, m1
paddw m2, m3
paddw m0, m2
- SAVE_MM_PERMUTATION hadamard_ac_4x4_mmx2
+ SAVE_MM_PERMUTATION
ret
-cglobal hadamard_ac_2x2max_mmx2
+cglobal hadamard_ac_2x2max
mova m0, [r3+0x00]
mova m1, [r3+0x20]
mova m2, [r3+0x40]
mova m3, [r3+0x60]
sub r3, 8
SUMSUB_BADC w, 0, 1, 2, 3, 4
- ABS4 m0, m2, m1, m3, m4, m5
+ ABS2 m0, m2, m4, m5
+ ABS2 m1, m3, m4, m5
HADAMARD 0, max, 0, 2, 4, 5
HADAMARD 0, max, 1, 3, 4, 5
%ifdef HIGH_BIT_DEPTH
paddw m7, m0
paddw m7, m1
%endif ; HIGH_BIT_DEPTH
- SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmx2
+ SAVE_MM_PERMUTATION
ret
%macro AC_PREP 2
%endif ; HIGH_BIT_DEPTH
%endmacro
-cglobal hadamard_ac_8x8_mmx2
+cglobal hadamard_ac_8x8
mova m6, [mask_ac4]
%ifdef HIGH_BIT_DEPTH
mova m7, [pw_1]
mova m3, [r3+0x60]
SUMSUB_BADC w, 0, 1, 2, 3, 4
HADAMARD 0, sumsub, 0, 2, 4, 5
- ABS4 m1, m3, m0, m2, m4, m5
+ ABS2 m1, m3, m4, m5
+ ABS2 m0, m2, m4, m5
HADAMARD 0, max, 1, 3, 4, 5
%ifdef HIGH_BIT_DEPTH
pand m0, [mask_ac4]
%endif ; HIGH_BIT_DEPTH
mova [rsp+gprsize], m6 ; save sa8d
SWAP 0, 6
- SAVE_MM_PERMUTATION hadamard_ac_8x8_mmx2
+ SAVE_MM_PERMUTATION
ret
%macro HADAMARD_AC_WXH_SUM_MMX 2
%endmacro
%macro HADAMARD_AC_WXH_MMX 2
-cglobal pixel_hadamard_ac_%1x%2_mmx2, 2,4
+cglobal pixel_hadamard_ac_%1x%2, 2,4
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
FIX_STRIDES r1
HSUMSUB %1, %2, %3, %4, %5
%endmacro
-%macro HADAMARD_AC_SSE2 1
+%macro HADAMARD_AC_SSE2 0
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
-cglobal hadamard_ac_8x8_%1
+cglobal hadamard_ac_8x8
%ifdef ARCH_X86_64
%define spill0 m8
%define spill1 m9
%endif
%ifdef HIGH_BIT_DEPTH
%define vertical 1
-%elifidn %1, sse2
- %define vertical 1
- ;LOAD_INC only unpacks to words
- pxor m7, m7
-%else
+%elif cpuflag(ssse3)
%define vertical 0
;LOAD_INC loads sumsubs
mova m7, [hmul_8p]
+%else
+ %define vertical 1
+ ;LOAD_INC only unpacks to words
+ pxor m7, m7
%endif
LOAD_INC_8x4W 0, 1, 2, 3, 7
%if vertical
HADAMARD4_2D_SSE 4, 5, 6, 7, 1
%else
HADAMARD4_V 4, 5, 6, 7, 1
+ ; FIXME SWAP
mova m1, spill0
mova spill0, m6
mova spill1, m7
AC_PADD m2, m0, [pw_1]
mova [rsp+gprsize+16], m2 ; save sa8d
SWAP 0, 2
- SAVE_MM_PERMUTATION hadamard_ac_8x8_%1
+ SAVE_MM_PERMUTATION
ret
-HADAMARD_AC_WXH_SSE2 16, 16, %1
-HADAMARD_AC_WXH_SSE2 8, 16, %1
-HADAMARD_AC_WXH_SSE2 16, 8, %1
-HADAMARD_AC_WXH_SSE2 8, 8, %1
+HADAMARD_AC_WXH_SSE2 16, 16
+HADAMARD_AC_WXH_SSE2 8, 16
+HADAMARD_AC_WXH_SSE2 16, 8
+HADAMARD_AC_WXH_SSE2 8, 8
%endmacro ; HADAMARD_AC_SSE2
%macro HADAMARD_AC_WXH_SUM_SSE2 2
%endmacro
; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
-%macro HADAMARD_AC_WXH_SSE2 3
-cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11
+%macro HADAMARD_AC_WXH_SSE2 2
+cglobal pixel_hadamard_ac_%1x%2, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
FIX_STRIDES r1
sub rsp, 48+pad
lea r2, [r1*3]
- call hadamard_ac_8x8_%3
+ call hadamard_ac_8x8
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 32
- call hadamard_ac_8x8_%3
+ call hadamard_ac_8x8
%endif
%if %1==16
neg ysub
sub rsp, 32
lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
neg ysub
- call hadamard_ac_8x8_%3
+ call hadamard_ac_8x8
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 32
- call hadamard_ac_8x8_%3
+ call hadamard_ac_8x8
%endif
%endif
HADAMARD_AC_WXH_SUM_SSE2 %1, %2
%ifndef ARCH_X86_64
cextern pixel_sa8d_8x8_internal_mmx2
-SA8D mmx2
+INIT_MMX mmx2
+SA8D
%endif
%define TRANS TRANS_SSE2
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
%define DIFFOP DIFF_UNPACK_SSE2
-%define JDUP JDUP_SSE2
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
-INIT_XMM
-SA8D sse2
-SATDS_SSE2 sse2
-INTRA_SA8D_SSE2 sse2
+INIT_XMM sse2
+SA8D
+SATDS_SSE2
+INTRA_SA8D_SSE2
%ifndef HIGH_BIT_DEPTH
-INIT_MMX
-INTRA_SATDS_MMX mmx2
+INIT_MMX mmx2
+INTRA_SATDS_MMX
%endif
-INIT_XMM
-HADAMARD_AC_SSE2 sse2
+INIT_XMM sse2
+HADAMARD_AC_SSE2
-%define ABS1 ABS1_SSSE3
-%define ABS2 ABS2_SSSE3
-%define ABS_MOV ABS_MOV_SSSE3
%define DIFFOP DIFF_SUMSUB_SSSE3
-%define JDUP JDUP_CONROE
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%ifndef HIGH_BIT_DEPTH
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
%endif
-INIT_XMM
-SATDS_SSE2 ssse3
-SA8D ssse3
-HADAMARD_AC_SSE2 ssse3
+INIT_XMM ssse3
+SATDS_SSE2
+SA8D
+HADAMARD_AC_SSE2
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
-INTRA_SA8D_SSE2 ssse3
-INIT_MMX
-INTRA_SATDS_MMX ssse3
+INTRA_SA8D_SSE2
+INIT_MMX ssse3
+INTRA_SATDS_MMX
%define TRANS TRANS_SSE4
-%define JDUP JDUP_PENRYN
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
-INIT_XMM
-SATDS_SSE2 sse4
-SA8D sse4
-HADAMARD_AC_SSE2 sse4
+INIT_XMM sse4
+SATDS_SSE2
+SA8D
+HADAMARD_AC_SSE2
-INIT_AVX
-SATDS_SSE2 avx
-SA8D avx
-INTRA_SA8D_SSE2 avx
-HADAMARD_AC_SSE2 avx
+INIT_XMM avx
+SATDS_SSE2
+SA8D
+INTRA_SA8D_SSE2
+HADAMARD_AC_SSE2
;=============================================================================
; SSIM
paddd m3, m6
%endmacro
-%macro SSIM 1
-cglobal pixel_ssim_4x4x2_core_%1, 4,4,8
+%macro SSIM 0
+cglobal pixel_ssim_4x4x2_core, 4,4,8
FIX_STRIDES r1, r3
pxor m0, m0
SSIM_ITER 0
;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4_%1, 3,3,7
+cglobal pixel_ssim_end4, 3,3,7
movdqa m0, [r0+ 0]
movdqa m1, [r0+16]
movdqa m2, [r0+32]
RET
%endmacro ; SSIM
-INIT_XMM
-SSIM sse2
-INIT_AVX
-SSIM avx
+INIT_XMM sse2
+SSIM
+INIT_XMM avx
+SSIM
;=============================================================================
; Successive Elimination ADS
jmp ads_mvs
%endmacro
-%define ABS1 ABS1_MMX
-
;-----------------------------------------------------------------------------
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
-cglobal pixel_ads4_mmx2, 6,7
+INIT_MMX mmx2
+cglobal pixel_ads4, 6,7
movq mm6, [r0]
movq mm4, [r0+8]
pshufw mm7, mm6, 0
movd [r6], mm1
ADS_END 1
-cglobal pixel_ads2_mmx2, 6,7
+cglobal pixel_ads2, 6,7
movq mm6, [r0]
pshufw mm5, r6m, 0
pshufw mm7, mm6, 0
movd [r6], mm4
ADS_END 1
-cglobal pixel_ads1_mmx2, 6,7
+cglobal pixel_ads1, 6,7
pshufw mm7, [r0], 0
pshufw mm6, r6m, 0
ADS_START
movq [r6], mm4
ADS_END 2
-%macro ADS_SSE2 1
-cglobal pixel_ads4_%1, 6,7,12
+%macro ADS_XMM 0
+cglobal pixel_ads4, 6,7,12
movdqa xmm4, [r0]
pshuflw xmm7, xmm4, 0
pshuflw xmm6, xmm4, 0xAA
%endif ; ARCH
ADS_END 2
-cglobal pixel_ads2_%1, 6,7,8
+cglobal pixel_ads2, 6,7,8
movq xmm6, [r0]
movd xmm5, r6m
pshuflw xmm7, xmm6, 0
movq [r6], xmm1
ADS_END 2
-cglobal pixel_ads1_%1, 6,7,8
+cglobal pixel_ads1, 6,7,8
movd xmm7, [r0]
movd xmm6, r6m
pshuflw xmm7, xmm7, 0
ADS_END 4
%endmacro
-INIT_XMM
-ADS_SSE2 sse2
-%define ABS1 ABS1_SSSE3
-ADS_SSE2 ssse3
-INIT_AVX
-ADS_SSE2 avx
+INIT_XMM sse2
+ADS_XMM
+INIT_XMM ssse3
+ADS_XMM
+INIT_XMM avx
+ADS_XMM
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
inc r1d
%endmacro
+INIT_MMX
cglobal pixel_ads_mvs, 0,7,0
ads_mvs:
lea r6, [r4+r5+15]
;-----------------------------------------------------------------------------
; void predict_4x4_ddl( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_4x4_DDL 4
-cglobal predict_4x4_ddl_%1, 1,1
+%macro PREDICT_4x4_DDL 3
+cglobal predict_4x4_ddl, 1,1
movu m1, [r0-FDEC_STRIDEB]
- psll%2 m2, m1, %3
+ psll%1 m2, m1, %2
mova m3, m1
mova m4, m1
pxor m1, m2
- psrl%2 m1, %3
+ psrl%1 m1, %2
pxor m3, m1
- PRED8x8_LOWPASS %4, m0, m2, m3, m4, m5
+ PRED8x8_LOWPASS %3, m0, m2, m3, m4, m5
%assign Y 0
%rep 4
- psrl%2 m0, %3
+ psrl%1 m0, %2
movh [r0+Y*FDEC_STRIDEB], m0
%assign Y (Y+1)
%endrep
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-PREDICT_4x4_DDL sse2, dq, 2, w
-INIT_AVX
-PREDICT_4x4_DDL avx , dq, 2, w
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-cglobal predict_4x4_ddl_mmx2, 1,2
+INIT_XMM sse2
+PREDICT_4x4_DDL dq, 2, w
+INIT_XMM avx
+PREDICT_4x4_DDL dq, 2, w
+INIT_MMX mmx2
+cglobal predict_4x4_ddl, 1,2
mova m1, [r0-2*FDEC_STRIDE+4]
mova m2, [r0-2*FDEC_STRIDE+0]
mova m3, [r0-2*FDEC_STRIDE+2]
mova [r0+2*FDEC_STRIDE], m4
RET
%else
-INIT_MMX
-PREDICT_4x4_DDL mmx2, q, 8, b
+INIT_MMX mmx2
+PREDICT_4x4_DDL q, 8, b
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_4x4 7
-cglobal predict_4x4_ddr_%1, 1,1
+%macro PREDICT_4x4 6
+cglobal predict_4x4_ddr, 1,1
movu m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
movq m2, [r0+0*FDEC_STRIDEB-8]
%ifdef HIGH_BIT_DEPTH
movh m4, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- punpckl%2 m2, m4
+ punpckl%1 m2, m4
movh m3, [r0-1*FDEC_STRIDEB]
- punpckh%3 m1, m2
+ punpckh%2 m1, m2
PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
mova m1, m3
movhps m4, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
movhps m4, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
%else
- punpckh%2 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
+ punpckh%1 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
movh m3, [r0-1*FDEC_STRIDEB]
- punpckh%3 m1, m2
+ punpckh%2 m1, m2
PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
mova m1, m3
PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
mova m2, m3
PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
%endif
- PRED8x8_LOWPASS %5, m0, m3, m1, m2, m4
+ PRED8x8_LOWPASS %4, m0, m3, m1, m2, m4
%assign Y 3
movh [r0+Y*FDEC_STRIDEB], m0
%rep 3
%assign Y (Y-1)
- psrl%4 m0, %7
+ psrl%3 m0, %6
movh [r0+Y*FDEC_STRIDEB], m0
%endrep
RET
-cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
+cglobal predict_4x4_vr, 1,1,6*(mmsize/16)
movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
mova m5, m0
%ifdef HIGH_BIT_DEPTH
movhps m1, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
- pavg%5 m5, m0
+ pavg%4 m5, m0
movhps m1, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
mova m1, m0
PALIGNR m0, m3, 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
%else
PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
- pavg%5 m5, m0
+ pavg%4 m5, m0
PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
mova m1, m0
PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
mova m2, m0
PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
%endif
- PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
- psll%4 m1, m3, %7*6
- psrl%4 m3, %7*2
+ PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4
+ psll%3 m1, m3, %6*6
+ psrl%3 m3, %6*2
movh [r0+0*FDEC_STRIDEB], m5
movh [r0+1*FDEC_STRIDEB], m3
PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
- psll%4 m1, %7
+ psll%3 m1, %6
movh [r0+2*FDEC_STRIDEB], m5
PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1
movh [r0+3*FDEC_STRIDEB], m3
RET
-cglobal predict_4x4_hd_%1, 1,1,6*(mmsize/16)
+cglobal predict_4x4_hd, 1,1,6*(mmsize/16)
movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
%ifdef HIGH_BIT_DEPTH
movh m1, [r0-1*FDEC_STRIDEB]
- punpckl%6 m0, m1 ; t3 t2 t1 t0 lt .. .. ..
- psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
+ punpckl%5 m0, m1 ; t3 t2 t1 t0 lt .. .. ..
+ psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. ..
movh m1, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l3
movh m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- punpckl%2 m1, m2 ; l2 l3
+ punpckl%1 m1, m2 ; l2 l3
movh m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l1
movh m3, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- punpckl%2 m2, m3 ; l0 l1
+ punpckl%1 m2, m3 ; l0 l1
%else
- punpckl%6 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
- psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
+ punpckl%5 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
+ psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. ..
movu m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
- punpckh%2 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
+ punpckh%1 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
- punpckh%2 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
+ punpckh%1 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
%endif
- punpckh%3 m1, m2 ; l0 l1 l2 l3
- punpckh%6 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- psrl%4 m2, m1, %7 ; .. t2 t1 t0 lt l0 l1 l2
- psrl%4 m0, m1, %7*2 ; .. .. t2 t1 t0 lt l0 l1
- pavg%5 m5, m1, m2
- PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
- punpckl%2 m5, m3
- psrl%4 m3, %7*4
+ punpckh%2 m1, m2 ; l0 l1 l2 l3
+ punpckh%5 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
+ psrl%3 m2, m1, %6 ; .. t2 t1 t0 lt l0 l1 l2
+ psrl%3 m0, m1, %6*2 ; .. .. t2 t1 t0 lt l0 l1
+ pavg%4 m5, m1, m2
+ PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4
+ punpckl%1 m5, m3
+ psrl%3 m3, %6*4
PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
%assign Y 3
movh [r0+Y*FDEC_STRIDEB], m5
%rep 2
%assign Y (Y-1)
- psrl%4 m5, %7*2
+ psrl%3 m5, %6*2
movh [r0+Y*FDEC_STRIDEB], m5
%endrep
movh [r0+0*FDEC_STRIDEB], m3
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-cglobal predict_4x4_ddr_mmx2, 1,1
+INIT_MMX mmx2
+cglobal predict_4x4_ddr, 1,1
movq m3, [r0+3*FDEC_STRIDEB-8]
psrlq m3, 48
PALIGNR m3, [r0+2*FDEC_STRIDEB-8], 6, m6
movd [r0+3*FDEC_STRIDEB+4], m1
RET
-cglobal predict_4x4_hd_mmx2, 1,1
+cglobal predict_4x4_hd, 1,1
mova m0, [r0+1*FDEC_STRIDEB-8]
punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
mova m1, [r0+3*FDEC_STRIDEB-8]
mova [r0+2*FDEC_STRIDEB], m0
RET
-INIT_XMM
-%define PALIGNR PALIGNR_MMX
-PREDICT_4x4 sse2 , wd, dq, dq, w, qdq, 2
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_4x4 ssse3, wd, dq, dq, w, qdq, 2
-INIT_AVX
-PREDICT_4x4 avx , wd, dq, dq, w, qdq, 2
+INIT_XMM sse2
+PREDICT_4x4 wd, dq, dq, w, qdq, 2
+INIT_XMM ssse3
+PREDICT_4x4 wd, dq, dq, w, qdq, 2
+INIT_XMM avx
+PREDICT_4x4 wd, dq, dq, w, qdq, 2
%else
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-PREDICT_4x4 mmx2 , bw, wd, q , b, dq , 8
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_4x4 ssse3, bw, wd, q , b, dq , 8
+INIT_MMX mmx2
+PREDICT_4x4 bw, wd, q , b, dq , 8
+INIT_MMX ssse3
+PREDICT_4x4 bw, wd, q , b, dq , 8
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void predict_4x4_vl( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_4x4_V1 4
-cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
+%macro PREDICT_4x4_V1 3
+cglobal predict_4x4_vl, 1,1,6*(mmsize/16)
movu m1, [r0-FDEC_STRIDEB]
- psrl%2 m3, m1, %3
- psrl%2 m2, m1, %3*2
- pavg%4 m4, m3, m1
- PRED8x8_LOWPASS %4, m0, m1, m2, m3, m5
+ psrl%1 m3, m1, %2
+ psrl%1 m2, m1, %2*2
+ pavg%3 m4, m3, m1
+ PRED8x8_LOWPASS %3, m0, m1, m2, m3, m5
movh [r0+0*FDEC_STRIDEB], m4
movh [r0+1*FDEC_STRIDEB], m0
- psrl%2 m4, %3
- psrl%2 m0, %3
+ psrl%1 m4, %2
+ psrl%1 m0, %2
movh [r0+2*FDEC_STRIDEB], m4
movh [r0+3*FDEC_STRIDEB], m0
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-PREDICT_4x4_V1 sse2, dq, 2, w
+INIT_XMM sse2
+PREDICT_4x4_V1 dq, 2, w
%ifdef ARCH_X86_64
-INIT_AVX
-PREDICT_4x4_V1 avx , dq, 2, w
+INIT_XMM avx
+PREDICT_4x4_V1 dq, 2, w
%endif
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-cglobal predict_4x4_vl_mmx2, 1,4
+INIT_MMX mmx2
+cglobal predict_4x4_vl, 1,4
mova m1, [r0-FDEC_STRIDEB+0]
mova m2, [r0-FDEC_STRIDEB+8]
mova m3, m2
mov [r0+3*FDEC_STRIDEB+6], r3w
RET
%else
-INIT_MMX
-PREDICT_4x4_V1 mmx2, q, 8, b
+INIT_MMX mmx2
+PREDICT_4x4_V1 q, 8, b
%endif
;-----------------------------------------------------------------------------
RET
%endif ; HIGH_BIT_DEPTH
-%macro PREDICT_FILTER 6
+%macro PREDICT_FILTER 5
;-----------------------------------------------------------------------------
;void predict_8x8_filter( pixel *src, pixel edge[33], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_filter_%1, 4,5,7*(mmsize/16)
+cglobal predict_8x8_filter, 4,5,7*(mmsize/16)
add r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
%ifndef ARCH_X86_64
test r3b, 0x01
je .check_top
mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%2%3 m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
+ punpckh%1%2 m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%2%3 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%3%4 m1, m0
+ punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
+ punpckh%2%3 m1, m0
mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%2%3 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
+ punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%2%3 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%3%4 m3, m2
- punpckh%4%5 m3, m1
+ punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
+ punpckh%2%3 m3, m2
+ punpckh%3%4 m3, m1
mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
mova m1, [src-1*FDEC_STRIDEB]
mova m4, m3
je .fix_lt_1
.do_left:
mova m0, m4
- PRED8x8_LOWPASS %2, m2, m1, m4, m3, m5
+ PRED8x8_LOWPASS %1, m2, m1, m4, m3, m5
mova [t1+8*SIZEOF_PIXEL], m2
mova m4, m0
- PRED8x8_LOWPASS %2, m1, m3, m0, m4, m5
+ PRED8x8_LOWPASS %1, m1, m3, m0, m4, m5
movd t4, m1
- mov [t1+7*SIZEOF_PIXEL], t4%2
+ mov [t1+7*SIZEOF_PIXEL], t4%1
.check_top:
test r3b, 0x02
je .done
test r2b, 0x04
je .fix_tr_1
.do_top:
- PRED8x8_LOWPASS %2, m4, m2, m1, m3, m5
+ PRED8x8_LOWPASS %1, m4, m2, m1, m3, m5
mova [t1+16*SIZEOF_PIXEL], m4
test r3b, 0x04
je .done
mova m0, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
mova m2, m0
mova m4, m0
- psrl%5 m5, m0, 7*%6
+ psrl%4 m5, m0, 7*%5
PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4
- PRED8x8_LOWPASS %2, m1, m2, m5, m0, m4
+ PRED8x8_LOWPASS %1, m1, m2, m5, m0, m4
jmp .do_topright
.fix_tr_2:
- punpckh%2%3 m3, m3
- pshuf%3 m1, m3, 0xFF
+ punpckh%1%2 m3, m3
+ pshuf%2 m1, m3, 0xFF
.do_topright:
mova [t1+24*SIZEOF_PIXEL], m1
- psrl%5 m1, 7*%6
+ psrl%4 m1, 7*%5
movd t4, m1
- mov [t1+32*SIZEOF_PIXEL], t4%2
+ mov [t1+32*SIZEOF_PIXEL], t4%1
.done:
REP_RET
.fix_lt_1:
pxor m5, m3, m4
- psrl%5 m5, 7*%6
- psll%5 m5, 6*%6
+ psrl%4 m5, 7*%5
+ psll%4 m5, 6*%5
pxor m1, m5
jmp .do_left
.fix_lt_2:
pxor m5, m3, m2
- psll%5 m5, 7*%6
- psrl%5 m5, 7*%6
+ psll%4 m5, 7*%5
+ psrl%4 m5, 7*%5
pxor m2, m5
test r2b, 0x04
jne .do_top
.fix_tr_1:
pxor m5, m3, m1
- psrl%5 m5, 7*%6
- psll%5 m5, 7*%6
+ psrl%4 m5, 7*%5
+ psll%4 m5, 7*%5
pxor m1, m5
jmp .do_top
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-%define PALIGNR PALIGNR_MMX
-PREDICT_FILTER sse2 , w, d, q, dq, 2
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_FILTER ssse3, w, d, q, dq, 2
-INIT_AVX
-PREDICT_FILTER avx , w, d, q, dq, 2
+INIT_XMM sse2
+PREDICT_FILTER w, d, q, dq, 2
+INIT_XMM ssse3
+PREDICT_FILTER w, d, q, dq, 2
+INIT_XMM avx
+PREDICT_FILTER w, d, q, dq, 2
%else
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-PREDICT_FILTER mmx2 , b, w, d, q , 8
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_FILTER ssse3, b, w, d, q , 8
+INIT_MMX mmx2
+PREDICT_FILTER b, w, d, q , 8
+INIT_MMX ssse3
+PREDICT_FILTER b, w, d, q , 8
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_v( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_V 1
-cglobal predict_8x8_v_%1, 2,2
+%macro PREDICT_8x8_V 0
+cglobal predict_8x8_v, 2,2
mova m0, [r1+16*SIZEOF_PIXEL]
STORE8x8 m0, m0
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-PREDICT_8x8_V sse2
+INIT_XMM sse2
+PREDICT_8x8_V
%else
-INIT_MMX
-PREDICT_8x8_V mmx2
+INIT_MMX mmx2
+PREDICT_8x8_V
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_h( pixel *src, pixel edge[33] )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_H 3
-cglobal predict_8x8_h_%1, 2,2
+%macro PREDICT_8x8_H 2
+cglobal predict_8x8_h, 2,2
movu m1, [r1+7*SIZEOF_PIXEL]
add r0, 4*FDEC_STRIDEB
- punpckl%2 m2, m1, m1
- punpckh%2 m1, m1
+ punpckl%1 m2, m1, m1
+ punpckh%1 m1, m1
%assign n 0
%rep 8
%assign i 1+n/4
- SPLAT%3 m0, m %+ i, (3-n)&3
+ SPLAT%2 m0, m %+ i, (3-n)&3
mova [r0+(n-4)*FDEC_STRIDEB], m0
%assign n n+1
%endrep
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-PREDICT_8x8_H sse2, wd, D
+INIT_XMM sse2
+PREDICT_8x8_H wd, D
%else
-INIT_MMX
-PREDICT_8x8_H mmx2, bw, W
+INIT_MMX mmx2
+PREDICT_8x8_H bw, W
%endif
;-----------------------------------------------------------------------------
; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
; size on the 8-bit mmx functions below if we know sse2 is available.
-%macro PREDICT_8x8 4
+%macro PREDICT_8x8 3
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_%1, 2,2,8*(mmsize/16)
+cglobal predict_8x8_ddl, 2,2,8*(mmsize/16)
mova m5, [r1+16*SIZEOF_PIXEL]
movu m2, [r1+17*SIZEOF_PIXEL]
movu m3, [r1+23*SIZEOF_PIXEL]
movu m4, [r1+25*SIZEOF_PIXEL]
- psll%3 m1, m5, %4
+ psll%2 m1, m5, %3
add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %2, m0, m1, m2, m5, m7
-%if avx_enabled == 1
- INIT_XMM
- PRED8x8_LOWPASS %2, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
- INIT_AVX
-%else
- PRED8x8_LOWPASS %2, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
-%endif
+ PRED8x8_LOWPASS %1, m0, m1, m2, m5, m7
+%assign %%bak avx_enabled
+%assign avx_enabled 0
+ PRED8x8_LOWPASS %1, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
+%assign avx_enabled %%bak
%assign Y 3
%rep 6
mova [r0+Y*FDEC_STRIDEB], m1
- psll%3 m1, %4
- psrl%3 m2, m0, 7*%4
- psll%3 m0, %4
+ psll%2 m1, %3
+ psrl%2 m2, m0, 7*%3
+ psll%2 m0, %3
por m1, m2
%assign Y (Y-1)
%endrep
mova [r0+Y*FDEC_STRIDEB], m1
- psll%3 m1, %4
- psrl%3 m0, 7*%4
+ psll%2 m1, %3
+ psrl%2 m0, 7*%3
por m1, m0
%assign Y (Y-1)
mova [r0+Y*FDEC_STRIDEB], m1
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%if avx_enabled == 0
-cglobal predict_8x8_ddr_%1, 2,2,7*(mmsize/16)
+cglobal predict_8x8_ddr, 2,2,7*(mmsize/16)
movu m1, [r1+ 7*SIZEOF_PIXEL]
movu m2, [r1+ 9*SIZEOF_PIXEL]
movu m3, [r1+15*SIZEOF_PIXEL]
movu m4, [r1+17*SIZEOF_PIXEL]
add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %2, m0, m1, m2, [r1+ 8*SIZEOF_PIXEL], m5
- PRED8x8_LOWPASS %2, m1, m3, m4, [r1+16*SIZEOF_PIXEL], m6
+ PRED8x8_LOWPASS %1, m0, m1, m2, [r1+ 8*SIZEOF_PIXEL], m5
+ PRED8x8_LOWPASS %1, m1, m3, m4, [r1+16*SIZEOF_PIXEL], m6
%assign Y 3
%rep 6
mova [r0+Y*FDEC_STRIDEB], m0
- psrl%3 m0, %4
- psll%3 m2, m1, 7*%4
- psrl%3 m1, %4
+ psrl%2 m0, %3
+ psll%2 m2, m1, 7*%3
+ psrl%2 m1, %3
por m0, m2
%assign Y (Y-1)
%endrep
mova [r0+Y*FDEC_STRIDEB], m0
- psrl%3 m0, %4
- psll%3 m1, 7*%4
+ psrl%2 m0, %3
+ psll%2 m1, 7*%3
por m0, m1
%assign Y (Y-1)
mova [r0+Y*FDEC_STRIDEB], m0
%endmacro ; PREDICT_8x8
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-PREDICT_8x8 sse2, w, dq, 2
-INIT_AVX
-PREDICT_8x8 avx , w, dq, 2
+INIT_XMM sse2
+PREDICT_8x8 w, dq, 2
+INIT_XMM avx
+PREDICT_8x8 w, dq, 2
%elifndef ARCH_X86_64
-INIT_MMX
-PREDICT_8x8 mmx2, b, q , 8
+INIT_MMX mmx2
+PREDICT_8x8 b, q , 8
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_hu( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HU 6
-cglobal predict_8x8_hu_%1, 2,2,8*(mmsize/16)
+%macro PREDICT_8x8_HU 5
+cglobal predict_8x8_hu, 2,2,8*(mmsize/16)
movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
add r0, 4*FDEC_STRIDEB
- pshuf%4 m0, m1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
- psll%3 m1, 7*%6 ; l7 .. .. .. .. .. .. ..
+ pshuf%3 m0, m1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
+ psll%2 m1, 7*%5 ; l7 .. .. .. .. .. .. ..
mova m2, m0
- psll%4 m0, 8*SIZEOF_PIXEL
- psrl%4 m2, 8*SIZEOF_PIXEL
+ psll%3 m0, 8*SIZEOF_PIXEL
+ psrl%3 m2, 8*SIZEOF_PIXEL
por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
mova m4, m2
mova m5, m2
- psrl%3 m3, m2, 2*%6
- psrl%3 m2, %6
+ psrl%2 m3, m2, 2*%5
+ psrl%2 m2, %5
por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckh%5 m1, m1
+ punpckh%4 m1, m1
por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavg%2 m4, m2
- PRED8x8_LOWPASS %2, m1, m3, m5, m2, m6
- punpckh%5 m5, m4, m1 ; p8 p7 p6 p5
- punpckl%5 m4, m1 ; p4 p3 p2 p1
+ pavg%1 m4, m2
+ PRED8x8_LOWPASS %1, m1, m3, m5, m2, m6
+ punpckh%4 m5, m4, m1 ; p8 p7 p6 p5
+ punpckl%4 m4, m1 ; p4 p3 p2 p1
mova m6, m5
mova m7, m5
mova m0, m5
PALIGNR m5, m4, 2*SIZEOF_PIXEL, m1
- pshuf%4 m1, m6, 11111001b
+ pshuf%3 m1, m6, 11111001b
PALIGNR m6, m4, 4*SIZEOF_PIXEL, m2
- pshuf%4 m2, m7, 11111110b
+ pshuf%3 m2, m7, 11111110b
PALIGNR m7, m4, 6*SIZEOF_PIXEL, m3
- pshuf%4 m3, m0, 11111111b
+ pshuf%3 m3, m0, 11111111b
mova [r0-4*FDEC_STRIDEB], m4
mova [r0-3*FDEC_STRIDEB], m5
mova [r0-2*FDEC_STRIDEB], m6
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HU sse2 , w, dq, d, wd, 2
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_8x8_HU ssse3, w, dq, d, wd, 2
-INIT_AVX
-PREDICT_8x8_HU avx , w, dq, d, wd, 2
+INIT_XMM sse2
+PREDICT_8x8_HU w, dq, d, wd, 2
+INIT_XMM ssse3
+PREDICT_8x8_HU w, dq, d, wd, 2
+INIT_XMM avx
+PREDICT_8x8_HU w, dq, d, wd, 2
%elifndef ARCH_X86_64
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HU mmx2 , b, q , w, bw, 8
+INIT_MMX mmx2
+PREDICT_8x8_HU b, q , w, bw, 8
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_vr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_VR 4
-cglobal predict_8x8_vr_%1, 2,3,7*(mmsize/16)
+%macro PREDICT_8x8_VR 3
+cglobal predict_8x8_vr, 2,3,7*(mmsize/16)
mova m2, [r1+16*SIZEOF_PIXEL]
movu m3, [r1+15*SIZEOF_PIXEL]
movu m1, [r1+14*SIZEOF_PIXEL]
- pavg%2 m4, m3, m2
+ pavg%1 m4, m3, m2
add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %2, m0, m1, m2, m3, m5
+ PRED8x8_LOWPASS %1, m0, m1, m2, m3, m5
mova [r0-4*FDEC_STRIDEB], m4
mova [r0-3*FDEC_STRIDEB], m0
mova m5, m0
mova m6, m4
mova m1, [r1+8*SIZEOF_PIXEL]
mova m2, m1
- psll%3 m2, %4
+ psll%2 m2, %3
mova m3, m1
- psll%3 m3, 2*%4
- PRED8x8_LOWPASS %2, m0, m1, m3, m2, m4
+ psll%2 m3, 2*%3
+ PRED8x8_LOWPASS %1, m0, m1, m3, m2, m4
%assign Y -2
%rep 5
%assign i (5 + ((Y+3)&1))
PALIGNR m %+ i, m0, 7*SIZEOF_PIXEL, m2
mova [r0+Y*FDEC_STRIDEB], m %+ i
- psll%3 m0, %4
+ psll%2 m0, %3
%assign Y (Y+1)
%endrep
PALIGNR m5, m0, 7*SIZEOF_PIXEL, m0
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_VR sse2 , w, dq, 2
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_8x8_VR ssse3, w, dq, 2
-INIT_AVX
-PREDICT_8x8_VR avx , w, dq, 2
+INIT_XMM sse2
+PREDICT_8x8_VR w, dq, 2
+INIT_XMM ssse3
+PREDICT_8x8_VR w, dq, 2
+INIT_XMM avx
+PREDICT_8x8_VR w, dq, 2
%else
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_VR mmx2 , b, q , 8
+INIT_MMX mmx2
+PREDICT_8x8_VR b, q , 8
%endif
;-----------------------------------------------------------------------------
REP_RET
%endif ; !ARCH_X86_64
-%macro PREDICT_16x16_P 1
-cglobal predict_16x16_p_core_%1, 1,2,8
+%macro PREDICT_16x16_P 0
+cglobal predict_16x16_p_core, 1,2,8
movd m0, r1m
movd m1, r2m
movd m2, r3m
REP_RET
%endmacro ; PREDICT_16x16_P
-INIT_XMM
-PREDICT_16x16_P sse2
+INIT_XMM sse2
+PREDICT_16x16_P
%ifndef HIGH_BIT_DEPTH
-INIT_AVX
-PREDICT_16x16_P avx
+INIT_XMM avx
+PREDICT_16x16_P
%endif
%ifndef HIGH_BIT_DEPTH
-%macro PREDICT_8x8 1
+%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_%1, 2,2
+cglobal predict_8x8_ddl, 2,2
movdqa xmm3, [r1+16]
movdqu xmm2, [r1+17]
pslldq xmm1, xmm3, 1
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_%1, 2,2
+cglobal predict_8x8_ddr, 2,2
movdqu xmm3, [r1+8]
movdqu xmm1, [r1+7]
psrldq xmm2, xmm3, 1
;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl_%1, 2,2
+cglobal predict_8x8_vl, 2,2
movdqa xmm4, [r1+16]
pslldq xmm1, xmm4, 1
psrldq xmm2, xmm4, 1
;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vr_%1, 2,2,7
+cglobal predict_8x8_vr, 2,2,7
movdqu xmm0, [r1+8]
movdqa xmm6, [pw_ff00]
add r0, 4*FDEC_STRIDE
RET
%endmacro ; PREDICT_8x8
-INIT_XMM
-PREDICT_8x8 sse2
-INIT_AVX
-PREDICT_8x8 avx
+INIT_XMM sse2
+PREDICT_8x8
+INIT_XMM avx
+PREDICT_8x8
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HD 5
-cglobal predict_8x8_hd_%1, 2,2,8*(mmsize/16)
+%macro PREDICT_8x8_HD 4
+cglobal predict_8x8_hd, 2,2,8*(mmsize/16)
add r0, 4*FDEC_STRIDEB
mova m0, [r1] ; l7 .. .. .. .. .. .. ..
mova m1, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
PALIGNR m1, m0, 7*SIZEOF_PIXEL, m6 ; l0 l1 l2 l3 l4 l5 l6 l7
PALIGNR m4, m3, 1*SIZEOF_PIXEL, m7 ; t0 lt l0 l1 l2 l3 l4 l5
mova m5, m3
- pavg%2 m3, m1
- PRED8x8_LOWPASS %2, m0, m4, m1, m5, m7
- psrl%3 m4, m2, 2*%5 ; .. .. t6 t5 t4 t3 t2 t1
- psrl%3 m1, m2, %5 ; .. t6 t5 t4 t3 t2 t1 t0
- PRED8x8_LOWPASS %2, m6, m4, m2, m1, m5
+ pavg%1 m3, m1
+ PRED8x8_LOWPASS %1, m0, m4, m1, m5, m7
+ psrl%2 m4, m2, 2*%4 ; .. .. t6 t5 t4 t3 t2 t1
+ psrl%2 m1, m2, %4 ; .. t6 t5 t4 t3 t2 t1 t0
+ PRED8x8_LOWPASS %1, m6, m4, m2, m1, m5
; .. p11 p10 p9
- punpckh%4 m7, m3, m0 ; p8 p7 p6 p5
- punpckl%4 m3, m0 ; p4 p3 p2 p1
+ punpckh%3 m7, m3, m0 ; p8 p7 p6 p5
+ punpckl%3 m3, m0 ; p4 p3 p2 p1
mova m1, m7
mova m0, m7
mova m4, m7
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HD sse2 , w, dq, wd, 2
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_8x8_HD ssse3, w, dq, wd, 2
-INIT_AVX
-PREDICT_8x8_HD avx , w, dq, wd, 2
+INIT_XMM sse2
+PREDICT_8x8_HD w, dq, wd, 2
+INIT_XMM ssse3
+PREDICT_8x8_HD w, dq, wd, 2
+INIT_XMM avx
+PREDICT_8x8_HD w, dq, wd, 2
%else
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HD mmx2 , b, q , bw, 8
+INIT_MMX mmx2
+PREDICT_8x8_HD b, q , bw, 8
;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HD 1
-cglobal predict_8x8_hd_%1, 2,2
+%macro PREDICT_8x8_HD 0
+cglobal predict_8x8_hd, 2,2
add r0, 4*FDEC_STRIDE
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
RET
%endmacro
-INIT_XMM
-PREDICT_8x8_HD sse2
-%define PALIGNR PALIGNR_SSSE3
-PREDICT_8x8_HD ssse3
-INIT_AVX
-PREDICT_8x8_HD avx
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
+INIT_XMM sse2
+PREDICT_8x8_HD
+INIT_XMM ssse3
+PREDICT_8x8_HD
+INIT_XMM avx
+PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH
-INIT_MMX
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HU 1
-cglobal predict_8x8_hu_%1, 2,2
+%macro PREDICT_8x8_HU 0
+cglobal predict_8x8_hu, 2,2
add r0, 4*FDEC_STRIDE
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
movq mm5, [r1+7]
movq mm6, [pb_reverse]
movq mm1, mm5
%endmacro
%ifndef HIGH_BIT_DEPTH
-PREDICT_8x8_HU sse2
-PREDICT_8x8_HU ssse3
+INIT_MMX sse2
+PREDICT_8x8_HU
+INIT_MMX ssse3
+PREDICT_8x8_HU
%endif
;-----------------------------------------------------------------------------
; void predict_8x8c_v( uint8_t *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8C_V 1
-cglobal predict_8x8c_v_%1, 1,1
+%macro PREDICT_8x8C_V 0
+cglobal predict_8x8c_v, 1,1
mova m0, [r0 - FDEC_STRIDEB]
STORE8x8 m0, m0
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-PREDICT_8x8C_V sse2
+INIT_XMM sse2
+PREDICT_8x8C_V
%else
-INIT_MMX
-PREDICT_8x8C_V mmx
+INIT_MMX mmx
+PREDICT_8x8C_V
%endif
%ifdef HIGH_BIT_DEPTH
; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
-%macro PREDICT_8x8C_H 1
-cglobal predict_8x8c_h_%1, 1,1
+INIT_XMM sse2
+cglobal predict_8x8c_h, 1,1
add r0, FDEC_STRIDEB*4
%assign n -4
%rep 8
%endrep
RET
-%endmacro
-
-INIT_XMM
-PREDICT_8x8C_H sse2
-
%else
-%macro PREDICT_8x8C_H 1
-cglobal predict_8x8c_h_%1, 1,1
-%ifidn %1, ssse3
+%macro PREDICT_8x8C_H 0
+cglobal predict_8x8c_h, 1,1
+%if cpuflag(ssse3)
mova m1, [pb_3]
%endif
add r0, FDEC_STRIDE*4
RET
%endmacro
-INIT_MMX
-%define SPLATB SPLATB_MMX
-PREDICT_8x8C_H mmx2
-%define SPLATB SPLATB_SSSE3
-PREDICT_8x8C_H ssse3
+INIT_MMX mmx2
+PREDICT_8x8C_H
+INIT_MMX ssse3
+PREDICT_8x8C_H
%endif
;-----------------------------------------------------------------------------
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_8x8C_DC 1
-cglobal predict_8x8c_dc_%1, 1,3
+%macro PREDICT_8x8C_DC 0
+cglobal predict_8x8c_dc, 1,3
pxor m7, m7
%ifdef HIGH_BIT_DEPTH
movq m0, [r0-FDEC_STRIDEB+0]
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
%ifdef HIGH_BIT_DEPTH
-%ifidn %1, sse2
+%if cpuflag(sse2)
movq2dq xmm0, m0
punpcklwd xmm0, xmm0
pshufd xmm1, xmm0, 11111010b
RET
%endmacro
-INIT_MMX
-PREDICT_8x8C_DC mmx2
+INIT_MMX mmx2
+PREDICT_8x8C_DC
%ifdef HIGH_BIT_DEPTH
-PREDICT_8x8C_DC sse2
+INIT_MMX sse2
+PREDICT_8x8C_DC
%endif
%ifdef HIGH_BIT_DEPTH
%else
+INIT_MMX
cglobal predict_8x8c_dc_top_mmx2, 1,1
movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
;-----------------------------------------------------------------------------
; void predict_16x16_h( pixel *src )
;-----------------------------------------------------------------------------
-%macro PREDICT_16x16_H 1
-cglobal predict_16x16_h_%1, 1,2
+%macro PREDICT_16x16_H 0
+cglobal predict_16x16_h, 1,2
mov r1, 12*FDEC_STRIDEB
%ifdef HIGH_BIT_DEPTH
.vloop:
%endrep
%else
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
mova m1, [pb_3]
%endif
.vloop:
REP_RET
%endmacro
-INIT_MMX
-%define SPLATB SPLATB_MMX
-PREDICT_16x16_H mmx2
-INIT_XMM
+INIT_MMX mmx2
+PREDICT_16x16_H
+INIT_XMM sse2
%ifdef HIGH_BIT_DEPTH
-PREDICT_16x16_H sse2
+PREDICT_16x16_H
%else
;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
-%define SPLATB SPLATB_SSSE3
-PREDICT_16x16_H ssse3
+INIT_XMM ssse3
+PREDICT_16x16_H
%endif
;-----------------------------------------------------------------------------
pshufb m7, m5
%endmacro
-%macro PABSW_MMX 2
+; PABSW mmx and PSIGNW mmx do not individually perform the same operations as
+; the pabsw and psignw instructions, but applied as a pair they give the same result
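+; (illustrative sketch, not part of this change): on the pre-ssse3 path PABSW
+; leaves |x| in %1 and, via the SWAP, the per-word 0/-1 sign mask of x in %2;
+; PSIGNW %1, %2 then computes (%1 ^ s) - s, i.e. identity for s == 0 and
+; two's-complement negation for s == -1.  E.g. for x = -5 the mask is -1,
+; (x ^ -1) - (-1) = 4 + 1 = 5 = |x|, and re-signing 5 gives (5 ^ -1) + 1 = -5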
+%macro PABSW 2
+%if cpuflag(ssse3)
+ pabsw %1, %2
+%else
pxor %1, %1
pcmpgtw %1, %2
pxor %2, %1
psubw %2, %1
SWAP %1, %2
+%endif
%endmacro
-%macro PSIGNW_MMX 2
+%macro PSIGNW 2
+%if cpuflag(ssse3)
+ psignw %1, %2
+%else
pxor %1, %2
psubw %1, %2
+%endif
%endmacro
-%macro PABSW_SSSE3 2
- pabsw %1, %2
-%endmacro
-
-%macro PSIGNW_SSSE3 2
- psignw %1, %2
+%macro PABSD 2
+%if cpuflag(ssse3)
+ pabsd %1, %2
+%else
+ pxor %1, %1
+ pcmpgtd %1, %2
+ pxor %2, %1
+ psubd %2, %1
+ SWAP %1, %2
+%endif
%endmacro
%macro PSIGND_MMX 2-3
%if %0==3
- mova %1, %2
- pxor %1, %3
+ pxor %1, %2, %3
psubd %1, %3
%else
pxor %1, %2
%endif
%endmacro
-%macro PSIGND_SSSE3 2+
+%macro PSIGND 2+
+%if cpuflag(ssse3)
psignd %1, %2
+%else
+ PSIGND_MMX %1, %2
+%endif
%endmacro
-%macro PABSD_MMX 2
- pxor %1, %1
- pcmpgtd %1, %2
- pxor %2, %1
- psubd %2, %1
- SWAP %1, %2
-%endmacro
-
-%macro PABSD_SSSE3 2
- pabsd %1, %2
-%endmacro
-
-%macro QUANT_END_MMX 0
+%macro QUANT_END 0
+%if cpuflag(sse4)
+ xor eax, eax
+ ptest m5, m5
+ setne al
+%else ; !sse4
xor eax, eax
%ifdef ARCH_X86_64
%if mmsize == 16
%endif
%endif
setne al
-%endmacro
-
-%macro QUANT_END_SSE4 0
- xor eax, eax
- ptest m5, m5
- setne al
+%endif ; cpuflag
%endmacro
%ifdef HIGH_BIT_DEPTH
-%macro QUANT_ONE_DC_MMX 4
+%macro QUANT_ONE_DC 4
+%if cpuflag(sse4)
mova m0, [%1]
PABSD m1, m0
paddd m1, %3
- mova m2, m1
- psrlq m2, 32
- pmuludq m1, %2
- pmuludq m2, %2
- psllq m2, 32
- paddd m1, m2
- psrld m1, 16
+ pmulld m1, %2
+ psrad m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
%else
SWAP 5, 1
%endif
-%endmacro
-
-%macro QUANT_TWO_DC_MMX 4
- QUANT_ONE_DC_MMX %1, %2, %3, %4
- QUANT_ONE_DC_MMX %1+mmsize, %2, %3, %4+mmsize
-%endmacro
-
-%macro QUANT_ONE_DC_SSE4 4
+%else ; !sse4
mova m0, [%1]
PABSD m1, m0
paddd m1, %3
- pmulld m1, %2
- psrad m1, 16
+ mova m2, m1
+ psrlq m2, 32
+ pmuludq m1, %2
+ pmuludq m2, %2
+ psllq m2, 32
+ paddd m1, m2
+ psrld m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
%else
SWAP 5, 1
%endif
+%endif ; cpuflag
%endmacro
-%macro QUANT_TWO_DC_SSE4 4
+%macro QUANT_TWO_DC 4
+%if cpuflag(sse4)
mova m0, [%1]
mova m1, [%1+mmsize]
PABSD m2, m0
SWAP 5, 2
%endif
por m5, m3
+%else ; !sse4
+ QUANT_ONE_DC %1, %2, %3, %4
+ QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
+%endif ; cpuflag
%endmacro
%macro QUANT_ONE_AC_MMX 4
%endif
%endmacro
-%macro QUANT_TWO_AC_MMX 4
- QUANT_ONE_AC_MMX %1, %2, %3, %4
- QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
-%endmacro
-
-%macro QUANT_TWO_AC_SSE4 4
+%macro QUANT_TWO_AC 4
+%if cpuflag(sse4)
mova m0, [%1]
mova m1, [%1+mmsize]
PABSD m2, m0
SWAP 5, 2
%endif
por m5, m3
+%else ; !sse4
+ QUANT_ONE_AC_MMX %1, %2, %3, %4
+ QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
+%endif ; cpuflag
%endmacro
;-----------------------------------------------------------------------------
; int quant_2x2( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
-%macro QUANT_DC 3
-cglobal quant_%1x%2_dc_%3, 3,3,8*(mmsize/16)
+%macro QUANT_DC 2
+cglobal quant_%1x%2_dc, 3,3,8*(mmsize/16)
QUANT_DC_START_MMX
%if %1*%2 <= mmsize/4
QUANT_ONE_DC r0, m6, m7, 0
;-----------------------------------------------------------------------------
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
-%macro QUANT_AC 3
-cglobal quant_%1x%2_%3, 3,3,8*(mmsize/16)
+%macro QUANT_AC 2
+cglobal quant_%1x%2, 3,3,8*(mmsize/16)
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_AC r0+x, r1+x, r2+x, x
RET
%endmacro
-%define QUANT_TWO_AC QUANT_TWO_AC_MMX
-%define QUANT_ONE_DC QUANT_ONE_DC_MMX
-%define QUANT_TWO_DC QUANT_TWO_DC_MMX
-%define QUANT_END QUANT_END_MMX
-%define PABSD PABSD_MMX
-%define PSIGND PSIGND_MMX
-INIT_XMM
-QUANT_DC 2, 2, sse2
-QUANT_DC 4, 4, sse2
-QUANT_AC 4, 4, sse2
-QUANT_AC 8, 8, sse2
-
-%define PABSD PABSD_SSSE3
-%define PSIGND PSIGND_SSSE3
-QUANT_DC 2, 2, ssse3
-QUANT_DC 4, 4, ssse3
-QUANT_AC 4, 4, ssse3
-QUANT_AC 8, 8, ssse3
-
-%define QUANT_TWO_AC QUANT_TWO_AC_SSE4
-%define QUANT_ONE_DC QUANT_ONE_DC_SSE4
-%define QUANT_TWO_DC QUANT_TWO_DC_SSE4
-%define QUANT_END QUANT_END_SSE4
-QUANT_DC 2, 2, sse4
-QUANT_DC 4, 4, sse4
-QUANT_AC 4, 4, sse4
-QUANT_AC 8, 8, sse4
-
-%undef SIGND
-%undef PABSD
-%undef QUANT_END
-%undef QUANT_TWO_AC
-%undef QUANT_ONE_DC
-%undef QUANT_TWO_DC
+INIT_XMM sse2
+QUANT_DC 2, 2
+QUANT_DC 4, 4
+QUANT_AC 4, 4
+QUANT_AC 8, 8
+
+INIT_XMM ssse3
+QUANT_DC 2, 2
+QUANT_DC 4, 4
+QUANT_AC 4, 4
+QUANT_AC 8, 8
+
+INIT_XMM sse4
+QUANT_DC 2, 2
+QUANT_DC 4, 4
+QUANT_AC 4, 4
+QUANT_AC 8, 8
+
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
RET
%endmacro
-INIT_MMX
-%define QUANT_END QUANT_END_MMX
-%define PABSW PABSW_MMX
-%define PSIGNW PSIGNW_MMX
+INIT_MMX mmx2
%define QUANT_DC_START QUANT_DC_START_MMX
-QUANT_DC quant_2x2_dc_mmx2, 1
+QUANT_DC quant_2x2_dc, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
-QUANT_DC quant_4x4_dc_mmx2, 4
-QUANT_AC quant_4x4_mmx, 4
-QUANT_AC quant_8x8_mmx, 16
+QUANT_DC quant_4x4_dc, 4
+INIT_MMX mmx
+QUANT_AC quant_4x4, 4
+QUANT_AC quant_8x8, 16
%endif
-INIT_XMM
-QUANT_DC quant_4x4_dc_sse2, 2, 8
-QUANT_AC quant_4x4_sse2, 2
-QUANT_AC quant_8x8_sse2, 8
+INIT_XMM sse2
+QUANT_DC quant_4x4_dc, 2, 8
+QUANT_AC quant_4x4, 2
+QUANT_AC quant_8x8, 8
-%define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
-QUANT_DC quant_4x4_dc_ssse3, 2, 8
-QUANT_AC quant_4x4_ssse3, 2
-QUANT_AC quant_8x8_ssse3, 8
+INIT_XMM ssse3
+QUANT_DC quant_4x4_dc, 2, 8
+QUANT_AC quant_4x4, 2
+QUANT_AC quant_8x8, 8
-INIT_MMX
-QUANT_DC quant_2x2_dc_ssse3, 1
-%define QUANT_END QUANT_END_SSE4
+INIT_MMX ssse3
+QUANT_DC quant_2x2_dc, 1
+
+INIT_XMM sse4
;Not faster on Conroe, so only used in SSE4 versions
%define QUANT_DC_START QUANT_DC_START_SSSE3
-INIT_XMM
-QUANT_DC quant_4x4_dc_sse4, 2, 8
-QUANT_AC quant_4x4_sse4, 2
-QUANT_AC quant_8x8_sse4, 8
+QUANT_DC quant_4x4_dc, 2, 8
+QUANT_AC quant_4x4, 2
+QUANT_AC quant_8x8, 8
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
-%macro DEQUANT 4
-cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
+%macro DEQUANT 3
+cglobal dequant_%1x%1, 0,3,6*(mmsize/16)
.skip_prologue:
- DEQUANT_START %3+2, %3
+ DEQUANT_START %2+2, %2
.lshift:
movd m2, t0d
- DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
+ DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
.rshift32:
neg t0d
pxor m4, m4
pslld m3, m2
psrld m3, 1
- DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
+ DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
-%ifnidn %1, avx
-cglobal dequant_%2x%2_flat16_%1, 0,3
+%if notcpuflag(avx)
+cglobal dequant_%1x%1_flat16, 0,3
movifnidn t2d, r2m
-%if %2 == 8
+%if %1 == 8
cmp t2d, 12
- jl dequant_%2x%2_%1.skip_prologue
+ jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
sub t2d, 12
%endif
imul t0d, t2d, 0x2b
lea t1, [t0*3]
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
- shl t2d, %3
+ shl t2d, %2
%ifdef PIC
- lea r1, [dequant%2_scale]
+ lea r1, [dequant%1_scale]
add r1, t2
%else
- lea r1, [dequant%2_scale + t2]
+ lea r1, [dequant%1_scale + t2]
%endif
movifnidn r0, r0mp
movd m4, t0d
-%if %2 == 4
-%ifidn %1, mmx
+%if %1 == 4
+%if mmsize == 8
DEQUANT16_FLAT [r1], 0, 16
DEQUANT16_FLAT [r1+8], 8, 24
%else
DEQUANT16_FLAT [r1], 0, 16
%endif
-%elifidn %1, mmx
+%elif mmsize == 8
DEQUANT16_FLAT [r1], 0, 8, 64, 72
DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
%endmacro ; DEQUANT
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-DEQUANT sse2, 4, 4, 1
-DEQUANT sse4, 4, 4, 1
-DEQUANT sse2, 8, 6, 1
-DEQUANT sse4, 8, 6, 1
+INIT_XMM sse2
+DEQUANT 4, 4, 1
+INIT_XMM sse4
+DEQUANT 4, 4, 1
+INIT_XMM sse2
+DEQUANT 8, 6, 1
+INIT_XMM sse4
+DEQUANT 8, 6, 1
%else
%ifndef ARCH_X86_64
-INIT_MMX
-DEQUANT mmx, 4, 4, 1
-DEQUANT mmx, 8, 6, 1
+INIT_MMX mmx
+DEQUANT 4, 4, 1
+DEQUANT 8, 6, 1
%endif
-INIT_XMM
-DEQUANT sse2, 4, 4, 2
-DEQUANT sse2, 8, 6, 2
-INIT_AVX
-DEQUANT avx, 4, 4, 2
-DEQUANT avx, 8, 6, 2
+INIT_XMM sse2
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
+INIT_XMM avx
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
%endif
%macro DEQUANT_DC 2
-cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
+cglobal dequant_4x4dc, 0,3,6*(mmsize/16)
DEQUANT_START 6, 6
.lshift:
movd m3, [r1]
movd m2, t0d
pslld m3, m2
-%ifdef HIGH_BIT_DEPTH
- pshufd m3, m3, 0
+ SPLAT%1 m3, m3, 0
%assign x 0
%rep SIZEOF_PIXEL*16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
- pmaddwd m0, m3
- pmaddwd m1, m3
+ %2 m0, m3
+ %2 m1, m3
mova [r0+mmsize*0+x], m0
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
-
-%else ; !HIGH_BIT_DEPTH
-%if mmsize==16
- pshuflw m3, m3, 0
- punpcklqdq m3, m3
-%else
- pshufw m3, m3, 0
-%endif
-%assign x 0
-%rep SIZEOF_PIXEL*16/mmsize
- mova m0, [r0+mmsize*0+x]
- mova m1, [r0+mmsize*1+x]
- pmullw m0, m3
- pmullw m1, m3
- mova [r0+mmsize*0+x], m0
- mova [r0+mmsize*1+x], m1
-%assign x x+mmsize*2
-%endrep
-%endif ; HIGH_BIT_DEPTH
RET
.rshift32:
neg t0d
movd m3, t0d
- mova m4, [p%2_1]
+ mova m4, [p%1_1]
mova m5, m4
pslld m4, m3
psrld m4, 1
%assign x x+mmsize
%endrep
-%else
+%else ; !HIGH_BIT_DEPTH
%if mmsize==8
punpcklwd m2, m2
%else
mova [r0+x], m0
%assign x x+mmsize
%endrep
-%endif
+%endif ; !HIGH_BIT_DEPTH
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-DEQUANT_DC sse2, d
-DEQUANT_DC sse4, d
-INIT_AVX
-DEQUANT_DC avx, d
+INIT_XMM sse2
+DEQUANT_DC d, pmaddwd
+INIT_XMM sse4
+DEQUANT_DC d, pmaddwd
+INIT_XMM avx
+DEQUANT_DC d, pmaddwd
%else
-INIT_MMX
-DEQUANT_DC mmx2, w
-INIT_XMM
-DEQUANT_DC sse2, w
-INIT_AVX
-DEQUANT_DC avx, w
+INIT_MMX mmx2
+DEQUANT_DC w, pmullw
+INIT_XMM sse2
+DEQUANT_DC w, pmullw
+INIT_XMM avx
+DEQUANT_DC w, pmullw
%endif
; t4 is eax for return value.
; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------
-; %2 == 1 for sse2 or ssse3, 0 for sse4/avx
-%macro OPTIMIZE_CHROMA_DC 2
-%assign %%regs 4+%2
+%macro OPTIMIZE_CHROMA_DC 0
+%assign %%regs 5
+%if cpuflag(sse4)
+ %assign %%regs %%regs-1
+%endif
%ifndef ARCH_X86_64
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
-cglobal optimize_chroma_dc_%1, 0,%%regs,7
+cglobal optimize_chroma_dc, 0,%%regs,7
movifnidn t0, r0mp
movd m2, r1m
movq m1, [t0]
-%if %2
- pxor m4, m4
-%else ; sse4, avx
+%if cpuflag(sse4)
pcmpeqb m4, m4
pslld m4, 11
-%endif
-%ifidn %1, sse2
- mova m3, [chroma_dc_dct_mask_mmx]
- mova m5, [chroma_dc_dmf_mask_mmx]
%else
+ pxor m4, m4
+%endif
+%if cpuflag(ssse3)
mova m3, [chroma_dc_dct_mask]
mova m5, [chroma_dc_dmf_mask]
+%else
+ mova m3, [chroma_dc_dct_mask_mmx]
+ mova m5, [chroma_dc_dmf_mask_mmx]
%endif
pshuflw m2, m2, 0
pshufd m0, m1, 00010001b ; 1 0 3 2 1 0 3 2
mov t1d, 3
paddd m0, m6
xor t4d, t4d
-%ifidn %1, sse2
+%if notcpuflag(ssse3)
psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
%endif
-%if %2
+%if cpuflag(sse4)
+ ptest m0, m4
+%else
mova m6, m0
SWAP 0, 6
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
-%else ; sse4, avx
- ptest m0, m4
%endif
jz .ret ; if the DC coefficients already round to zero, terminate early
mova m3, m0
.inner_loop:
psubd m3, m5 ; coeff -= sign
pxor m6, m0, m3
-%if %2
+%if cpuflag(sse4)
+ ptest m6, m4
+%else
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
-%else ; sse4, avx
- ptest m6, m4
%endif
jz .round_coeff
paddd m3, m5 ; coeff += sign
REP_RET
%endmacro
-INIT_XMM
-%define PSIGNW PSIGNW_MMX
-%define PSIGND PSIGND_MMX
-OPTIMIZE_CHROMA_DC sse2, 1
-%define PSIGNW PSIGNW_SSSE3
-%define PSIGND PSIGND_SSSE3
-OPTIMIZE_CHROMA_DC ssse3, 1
-OPTIMIZE_CHROMA_DC sse4, 0
-INIT_AVX
-OPTIMIZE_CHROMA_DC avx, 0
+INIT_XMM sse2
+OPTIMIZE_CHROMA_DC
+INIT_XMM ssse3
+OPTIMIZE_CHROMA_DC
+INIT_XMM sse4
+OPTIMIZE_CHROMA_DC
+INIT_XMM avx
+OPTIMIZE_CHROMA_DC
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
-%macro DENOISE_DCT 1-2 0
-cglobal denoise_dct_%1, 4,4,%2
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,8*(mmsize/16)
pxor m6, m6
.loop:
sub r3, mmsize/2
REP_RET
%endmacro
-%define PABSD PABSD_MMX
-%define PSIGND PSIGND_MMX
%ifndef ARCH_X86_64
-INIT_MMX
-DENOISE_DCT mmx
+INIT_MMX mmx
+DENOISE_DCT
%endif
-INIT_XMM
-DENOISE_DCT sse2, 8
-%define PABSD PABSD_SSSE3
-%define PSIGND PSIGND_SSSE3
-DENOISE_DCT ssse3, 8
-INIT_AVX
-DENOISE_DCT avx , 8
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
-%macro DENOISE_DCT 1-2 0
-cglobal denoise_dct_%1, 4,4,%2
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,7*(mmsize/16)
pxor m6, m6
.loop:
sub r3, mmsize
REP_RET
%endmacro
-%define PABSW PABSW_MMX
-%define PSIGNW PSIGNW_MMX
%ifndef ARCH_X86_64
-INIT_MMX
-DENOISE_DCT mmx
+INIT_MMX mmx
+DENOISE_DCT
%endif
-INIT_XMM
-DENOISE_DCT sse2, 7
-%define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
-DENOISE_DCT ssse3, 7
-INIT_AVX
-DENOISE_DCT avx, 7
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
%endif ; !HIGH_BIT_DEPTH
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
-%macro DECIMATE_MASK_SSE2 7
+%macro DECIMATE_MASK 5
+%if mmsize==16
%ifdef HIGH_BIT_DEPTH
movdqa xmm0, [%3+ 0]
movdqa xmm1, [%3+32]
packssdw xmm0, [%3+16]
packssdw xmm1, [%3+48]
-%if %7
- pabsw xmm0, xmm0
- pabsw xmm1, xmm1
+ ABS2 xmm0, xmm1, xmm3, xmm4
%else
- ABS2_MMX xmm0, xmm1, xmm3, xmm4
-%endif
-%else
-%if %7
+%if cpuflag(ssse3)
pabsw xmm0, [%3+ 0]
pabsw xmm1, [%3+16]
%else
movdqa xmm0, [%3+ 0]
movdqa xmm1, [%3+16]
- ABS2_MMX xmm0, xmm1, xmm3, xmm4
+ ABS2 xmm0, xmm1, xmm3, xmm4
%endif
%endif
packsswb xmm0, xmm1
pcmpgtb xmm0, %4
pmovmskb %1, xmm2
pmovmskb %2, xmm0
-%endmacro
-%macro DECIMATE_MASK_MMX 7
+%else ; mmsize==8
%ifdef HIGH_BIT_DEPTH
movq mm0, [%3+ 0]
movq mm1, [%3+16]
movq mm2, [%3+16]
movq mm3, [%3+24]
%endif
- ABS2_MMX mm0, mm1, mm6, mm7
- ABS2_MMX mm2, mm3, mm6, mm7
+ ABS2 mm0, mm1, mm6, mm7
+ ABS2 mm2, mm3, mm6, mm7
packsswb mm0, mm1
packsswb mm2, mm3
pxor mm4, mm4
pcmpeqb mm6, mm2
pcmpgtb mm0, %4
pcmpgtb mm2, %4
- pmovmskb %6, mm4
+ pmovmskb %5, mm4
pmovmskb %1, mm6
shl %1, 8
- or %1, %6
- pmovmskb %6, mm0
+ or %1, %5
+ pmovmskb %5, mm0
pmovmskb %2, mm2
shl %2, 8
- or %2, %6
+ or %2, %5
+%endif
%endmacro
cextern decimate_table4
cextern decimate_table8
-%macro DECIMATE4x4 4
+%macro DECIMATE4x4 1
;A LUT is faster than bsf on AMD processors.
;This is not true for score64.
-cglobal decimate_score%1_%2, 1,3
+cglobal decimate_score%1, 1,3
%ifdef PIC
lea r10, [decimate_table4]
lea r11, [decimate_mask_table4]
%define table decimate_table4
%define mask_table decimate_mask_table4
%endif
- DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx, %4
+ DECIMATE_MASK edx, eax, r0, [pb_1], ecx
xor edx, 0xffff
je .ret
test eax, eax
%if %1==15
shr edx, 1
%endif
-%if %3==1
+%if cpuflag(slowctz)
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
%endmacro
%ifndef ARCH_X86_64
-INIT_MMX
-%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmx2, 0, 0
-DECIMATE4x4 16, mmx2, 0, 0
-DECIMATE4x4 15, mmx2_slowctz, 1, 0
-DECIMATE4x4 16, mmx2_slowctz, 1, 0
+INIT_MMX mmx2
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_MMX mmx2, slowctz
+DECIMATE4x4 15
+DECIMATE4x4 16
%endif
-INIT_XMM
-%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2, 0, 0
-DECIMATE4x4 16, sse2, 0, 0
-DECIMATE4x4 15, sse2_slowctz, 1, 0
-DECIMATE4x4 16, sse2_slowctz, 1, 0
-DECIMATE4x4 15, ssse3, 0, 1
-DECIMATE4x4 16, ssse3, 0, 1
-DECIMATE4x4 15, ssse3_slowctz, 1, 1
-DECIMATE4x4 16, ssse3_slowctz, 1, 1
-
-%macro DECIMATE8x8 2
+INIT_XMM sse2
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_XMM sse2, slowctz
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_XMM ssse3
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_XMM ssse3, slowctz
+DECIMATE4x4 15
+DECIMATE4x4 16
+
+%macro DECIMATE8x8 0
%ifdef ARCH_X86_64
-cglobal decimate_score64_%1, 1,4
+cglobal decimate_score64, 1,4
%ifdef PIC
lea r10, [decimate_table8]
%define table r10
%define table decimate_table8
%endif
mova m5, [pb_1]
- DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, %1, null, %2
+ DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
test eax, eax
jne .ret9
- DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, %1, null, %2
+ DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
shl r2d, 16
or r1d, r2d
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, %1, null, %2
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
shl r2, 32
or eax, r3d
or r1, r2
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, %1, null, %2
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
shl r2, 48
or r1, r2
xor r1, -1
RET
%else ; ARCH
-%ifidn %1, mmx2
-cglobal decimate_score64_%1, 1,6
+%if mmsize == 8
+cglobal decimate_score64, 1,6
%else
-cglobal decimate_score64_%1, 1,5
+cglobal decimate_score64, 1,5
%endif
mova m5, [pb_1]
- DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, %1, r5, %2
+ DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
test r2, r2
jne .ret9
- DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, %1, r5, %2
+ DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
shl r4, 16
or r3, r4
- DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, %1, r5, %2
+ DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
or r2, r1
- DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, %1, r5, %2
+ DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
shl r1, 16
or r4, r1
xor r3, -1
%endmacro
%ifndef ARCH_X86_64
-INIT_MMX
-%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE8x8 mmx2, 0
+INIT_MMX mmx2
+DECIMATE8x8
%endif
-INIT_XMM
-%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE8x8 sse2, 0
-DECIMATE8x8 ssse3, 1
+INIT_XMM sse2
+DECIMATE8x8
+INIT_XMM ssse3
+DECIMATE8x8
;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
;-----------------------------------------------------------------------------
-%macro LAST_X86 3
+%macro BSR 3
+%if cpuflag(lzcnt)
+ lzcnt %1, %2
+ xor %1, %3
+%else
bsr %1, %2
+%endif
%endmacro
-%macro LAST_SSE4A 3
+%macro LZCOUNT 3
+%if cpuflag(lzcnt)
lzcnt %1, %2
+%else
+ bsr %1, %2
xor %1, %3
+%endif
%endmacro
%ifdef HIGH_BIT_DEPTH
-%macro LAST_MASK4_MMX 2-3
- movq mm0, [%2]
- packssdw mm0, [%2+8]
+%macro LAST_MASK 3-4
+%if %1 == 4
+ movq mm0, [%3]
+ packssdw mm0, [%3+8]
packsswb mm0, mm0
pcmpeqb mm0, mm2
- pmovmskb %1, mm0
-%endmacro
-
-%macro LAST_MASK_SSE2 2-3
- movdqa xmm0, [%2+ 0]
- movdqa xmm1, [%2+32]
- packssdw xmm0, [%2+16]
- packssdw xmm1, [%2+48]
+ pmovmskb %2, mm0
+%elif mmsize == 16
+ movdqa xmm0, [%3+ 0]
+ movdqa xmm1, [%3+32]
+ packssdw xmm0, [%3+16]
+ packssdw xmm1, [%3+48]
packsswb xmm0, xmm1
pcmpeqb xmm0, xmm2
- pmovmskb %1, xmm0
-%endmacro
-
-%macro LAST_MASK_MMX 3
- movq mm0, [%2+ 0]
- movq mm1, [%2+16]
- packssdw mm0, [%2+ 8]
- packssdw mm1, [%2+24]
- movq mm3, [%2+32]
- movq mm4, [%2+48]
- packssdw mm3, [%2+40]
- packssdw mm4, [%2+56]
+ pmovmskb %2, xmm0
+%else
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ packssdw mm0, [%3+ 8]
+ packssdw mm1, [%3+24]
+ movq mm3, [%3+32]
+ movq mm4, [%3+48]
+ packssdw mm3, [%3+40]
+ packssdw mm4, [%3+56]
packsswb mm0, mm1
packsswb mm3, mm4
pcmpeqb mm0, mm2
pcmpeqb mm3, mm2
- pmovmskb %1, mm0
- pmovmskb %3, mm3
- shl %3, 8
- or %1, %3
+ pmovmskb %2, mm0
+ pmovmskb %4, mm3
+ shl %4, 8
+ or %2, %4
+%endif
%endmacro
-%macro COEFF_LAST4 1
-cglobal coeff_last4_%1, 1,3
+%macro COEFF_LAST4 0
+cglobal coeff_last4, 1,3
pxor mm2, mm2
- LAST_MASK4_MMX r1d, r0
+ LAST_MASK 4, r1d, r0
xor r1d, 0xff
shr r1d, 4
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
RET
%endmacro
-%define LAST LAST_X86
-COEFF_LAST4 mmx2
-%define LAST LAST_SSE4A
-COEFF_LAST4 mmx2_lzcnt
+INIT_MMX mmx2
+COEFF_LAST4
+INIT_MMX mmx2, lzcnt
+COEFF_LAST4
%else ; !HIGH_BIT_DEPTH
-%macro LAST_MASK4_MMX 2-3
- movq mm0, [%2]
+%macro LAST_MASK 3-4
+%if %1 == 4
+ movq mm0, [%3]
packsswb mm0, mm0
pcmpeqb mm0, mm2
- pmovmskb %1, mm0
-%endmacro
-
-%macro LAST_MASK_SSE2 2-3
- movdqa xmm0, [%2+ 0]
- packsswb xmm0, [%2+16]
+ pmovmskb %2, mm0
+%elif mmsize == 16
+ movdqa xmm0, [%3+ 0]
+ packsswb xmm0, [%3+16]
pcmpeqb xmm0, xmm2
- pmovmskb %1, xmm0
-%endmacro
-
-%macro LAST_MASK_MMX 3
- movq mm0, [%2+ 0]
- movq mm1, [%2+16]
- packsswb mm0, [%2+ 8]
- packsswb mm1, [%2+24]
+ pmovmskb %2, xmm0
+%else
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ packsswb mm0, [%3+ 8]
+ packsswb mm1, [%3+24]
pcmpeqb mm0, mm2
pcmpeqb mm1, mm2
- pmovmskb %1, mm0
- pmovmskb %3, mm1
- shl %3, 8
- or %1, %3
+ pmovmskb %2, mm0
+ pmovmskb %4, mm1
+ shl %4, 8
+ or %2, %4
+%endif
%endmacro
-%macro COEFF_LAST4 1
+%macro COEFF_LAST4 0
%ifdef ARCH_X86_64
-cglobal coeff_last4_%1, 1,1
- LAST rax, [r0], 0x3f
+cglobal coeff_last4, 1,1
+ BSR rax, [r0], 0x3f
shr eax, 4
RET
%else
-cglobal coeff_last4_%1, 0,3
+cglobal coeff_last4, 0,3
mov edx, r0mp
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
- LAST eax, eax, 0x1f
+ BSR eax, eax, 0x1f
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif
%endmacro
-%define LAST LAST_X86
-COEFF_LAST4 mmx2
-%define LAST LAST_SSE4A
-COEFF_LAST4 mmx2_lzcnt
+INIT_MMX mmx2
+COEFF_LAST4
+INIT_MMX mmx2, lzcnt
+COEFF_LAST4
%endif ; HIGH_BIT_DEPTH
-%macro COEFF_LAST 1
-cglobal coeff_last15_%1, 1,3
+%macro COEFF_LAST 0
+cglobal coeff_last15, 1,3
pxor m2, m2
- LAST_MASK r1d, r0-SIZEOF_DCTCOEF, r2d
+ LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
xor r1d, 0xffff
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
dec eax
RET
-cglobal coeff_last16_%1, 1,3
+cglobal coeff_last16, 1,3
pxor m2, m2
- LAST_MASK r1d, r0, r2d
+ LAST_MASK 16, r1d, r0, r2d
xor r1d, 0xffff
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
RET
%ifndef ARCH_X86_64
-cglobal coeff_last64_%1, 1, 5-mmsize/16
+cglobal coeff_last64, 1, 5-mmsize/16
pxor m2, m2
- LAST_MASK r2d, r0+SIZEOF_DCTCOEF* 32, r4d
- LAST_MASK r3d, r0+SIZEOF_DCTCOEF* 48, r4d
+ LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
+ LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
shl r3d, 16
or r2d, r3d
xor r2d, -1
jne .secondhalf
- LAST_MASK r1d, r0+SIZEOF_DCTCOEF* 0, r4d
- LAST_MASK r3d, r0+SIZEOF_DCTCOEF*16, r4d
+ LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
+ LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
shl r3d, 16
or r1d, r3d
not r1d
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
RET
.secondhalf:
- LAST eax, r2d, 0x1f
+ BSR eax, r2d, 0x1f
add eax, 32
RET
%else
-cglobal coeff_last64_%1, 1,4
+cglobal coeff_last64, 1,4
pxor m2, m2
- LAST_MASK_SSE2 r1d, r0+SIZEOF_DCTCOEF* 0
- LAST_MASK_SSE2 r2d, r0+SIZEOF_DCTCOEF*16
- LAST_MASK_SSE2 r3d, r0+SIZEOF_DCTCOEF*32
- LAST_MASK_SSE2 r0d, r0+SIZEOF_DCTCOEF*48
+ LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
+ LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
+ LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32
+ LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
shl r2d, 16
shl r0d, 16
or r1d, r2d
shl r3, 32
or r1, r3
not r1
- LAST rax, r1, 0x3f
+ BSR rax, r1, 0x3f
RET
%endif
%endmacro
-%define LAST LAST_X86
%ifndef ARCH_X86_64
-INIT_MMX
-%define LAST_MASK LAST_MASK_MMX
-COEFF_LAST mmx2
+INIT_MMX mmx2
+COEFF_LAST
%endif
-INIT_XMM
-%define LAST_MASK LAST_MASK_SSE2
-COEFF_LAST sse2
-%define LAST LAST_SSE4A
-COEFF_LAST sse2_lzcnt
+INIT_XMM sse2
+COEFF_LAST
+INIT_XMM sse2, lzcnt
+COEFF_LAST
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
-%macro LZCOUNT_X86 3
- bsr %1, %2
- xor %1, %3
-%endmacro
-
-%macro LZCOUNT_SSE4A 3
- lzcnt %1, %2
-%endmacro
-
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%ifdef WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
DECLARE_REG_TMP 6,3,2,1,4,5,0
%endif
-%macro COEFF_LEVELRUN 2
-cglobal coeff_level_run%2_%1,0,7
+%macro COEFF_LEVELRUN 1
+cglobal coeff_level_run%1,0,7
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
- LAST_MASK t5d, t0-(%2&1)*SIZEOF_DCTCOEF, t4d
+ LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
not t5d
- shl t5d, 32-((%2+1)&~1)
- mov t4d, %2-1
+ shl t5d, 32-((%1+1)&~1)
+ mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
add t5d, t5d
REP_RET
%endmacro
-INIT_MMX
-%define LZCOUNT LZCOUNT_X86
+INIT_MMX mmx2
%ifndef ARCH_X86_64
-%define LAST_MASK LAST_MASK_MMX
-COEFF_LEVELRUN mmx2, 15
-COEFF_LEVELRUN mmx2, 16
+COEFF_LEVELRUN 15
+COEFF_LEVELRUN 16
%endif
-%define LAST_MASK LAST_MASK4_MMX
-COEFF_LEVELRUN mmx2, 4
-INIT_XMM
-%define LAST_MASK LAST_MASK_SSE2
-COEFF_LEVELRUN sse2, 15
-COEFF_LEVELRUN sse2, 16
-%define LZCOUNT LZCOUNT_SSE4A
-COEFF_LEVELRUN sse2_lzcnt, 15
-COEFF_LEVELRUN sse2_lzcnt, 16
-INIT_MMX
-%define LAST_MASK LAST_MASK4_MMX
-COEFF_LEVELRUN mmx2_lzcnt, 4
+COEFF_LEVELRUN 4
+INIT_XMM sse2
+COEFF_LEVELRUN 15
+COEFF_LEVELRUN 16
+INIT_XMM sse2, lzcnt
+COEFF_LEVELRUN 15
+COEFF_LEVELRUN 16
+INIT_MMX mmx2, lzcnt
+COEFF_LEVELRUN 4
RET
%endmacro
-%macro SAD_W16 1
+%macro SAD_W16 0
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x16_%1, 4,4,8
- movdqu m0, [r2]
- movdqu m1, [r2+r3]
+cglobal pixel_sad_16x16, 4,4,8
+ movu m0, [r2]
+ movu m1, [r2+r3]
lea r2, [r2+2*r3]
- movdqu m2, [r2]
- movdqu m3, [r2+r3]
+ movu m2, [r2]
+ movu m3, [r2+r3]
lea r2, [r2+2*r3]
psadbw m0, [r0]
psadbw m1, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m4, [r2]
+ movu m4, [r2]
paddw m0, m1
psadbw m2, [r0]
psadbw m3, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m5, [r2+r3]
+ movu m5, [r2+r3]
lea r2, [r2+2*r3]
paddw m2, m3
- movdqu m6, [r2]
- movdqu m7, [r2+r3]
+ movu m6, [r2]
+ movu m7, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m2
psadbw m4, [r0]
psadbw m5, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m1, [r2]
+ movu m1, [r2]
paddw m4, m5
psadbw m6, [r0]
psadbw m7, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m2, [r2+r3]
+ movu m2, [r2+r3]
lea r2, [r2+2*r3]
paddw m6, m7
- movdqu m3, [r2]
+ movu m3, [r2]
paddw m0, m4
- movdqu m4, [r2+r3]
+ movu m4, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m6
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m5, [r2]
+ movu m5, [r2]
paddw m1, m2
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m6, [r2+r3]
+ movu m6, [r2+r3]
lea r2, [r2+2*r3]
paddw m3, m4
- movdqu m7, [r2]
+ movu m7, [r2]
paddw m0, m1
- movdqu m1, [r2+r3]
+ movu m1, [r2+r3]
paddw m0, m3
psadbw m5, [r0]
psadbw m6, [r0+r1]
;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x8_%1, 4,4
- movdqu m0, [r2]
- movdqu m2, [r2+r3]
+cglobal pixel_sad_16x8, 4,4
+ movu m0, [r2]
+ movu m2, [r2+r3]
lea r2, [r2+2*r3]
- movdqu m3, [r2]
- movdqu m4, [r2+r3]
+ movu m3, [r2]
+ movu m4, [r2+r3]
psadbw m0, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
paddw m0, m2
paddw m3, m4
paddw m0, m3
- movdqu m1, [r2]
- movdqu m2, [r2+r3]
+ movu m1, [r2]
+ movu m2, [r2+r3]
lea r2, [r2+2*r3]
- movdqu m3, [r2]
- movdqu m4, [r2+r3]
+ movu m3, [r2]
+ movu m4, [r2+r3]
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
SAD_END_SSE2
%endmacro
-INIT_XMM
-SAD_W16 sse2
-%define movdqu lddqu
-SAD_W16 sse3
-%define movdqu movdqa
-SAD_W16 sse2_aligned
-%undef movdqu
+INIT_XMM sse2
+SAD_W16
+INIT_XMM sse3
+SAD_W16
+INIT_XMM sse2, aligned
+SAD_W16
%macro SAD_INC_4x8P_SSE 1
movq m1, [r0]
paddw m0, m2
%endmacro
+INIT_XMM
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 0
movd [r2+4], mm1 ;H prediction cost
RET
-%macro INTRA_SADx3_4x4 1
-cglobal intra_sad_x3_4x4_%1, 3,3
+%macro INTRA_SADx3_4x4 0
+cglobal intra_sad_x3_4x4, 3,3
movd xmm4, [r1+FDEC_STRIDE*0-4]
pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1
pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2
RET
%endmacro ; INTRA_SADx3_4x4
-INIT_XMM
-INTRA_SADx3_4x4 sse4
-INIT_AVX
-INTRA_SADx3_4x4 avx
+INIT_XMM sse4
+INTRA_SADx3_4x4
+INIT_XMM avx
+INTRA_SADx3_4x4
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
movd [r2+8], m1
RET
-%macro INTRA_SADx3_8x8 1
-cglobal intra_sad_x3_8x8_%1, 3,4,9
+%macro INTRA_SADx3_8x8 0
+cglobal intra_sad_x3_8x8, 3,4,9
%ifdef PIC
lea r11, [h8x8_pred_shuf]
%define shuf r11
RET
%endmacro ; INTRA_SADx3_8x8
-INIT_XMM
-INTRA_SADx3_8x8 ssse3
-INIT_AVX
-INTRA_SADx3_8x8 avx
+INIT_XMM ssse3
+INTRA_SADx3_8x8
+INIT_XMM avx
+INTRA_SADx3_8x8
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
-%macro INTRA_SAD_HV_ITER 2
-%ifidn %2, ssse3
+%macro INTRA_SAD_HV_ITER 1
+%if cpuflag(ssse3)
movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
pshufb m1, m7
%endif
%endmacro
-%macro INTRA_SAD_8x8C 1
-cglobal intra_sad_x3_8x8c_%1, 3,3
+%macro INTRA_SAD_8x8C 0
+cglobal intra_sad_x3_8x8c, 3,3
movq m6, [r1 - FDEC_STRIDE]
add r1, FDEC_STRIDE*4
-%ifidn %1,ssse3
+%if cpuflag(ssse3)
movq m7, [pb_3]
%endif
- INTRA_SAD_HV_ITER 0, %1
- INTRA_SAD_HV_ITER 2, %1
- INTRA_SAD_HV_ITER 4, %1
- INTRA_SAD_HV_ITER 6, %1
+ INTRA_SAD_HV_ITER 0
+ INTRA_SAD_HV_ITER 2
+ INTRA_SAD_HV_ITER 4
+ INTRA_SAD_HV_ITER 6
movd [r2+4], m0
movd [r2+8], m2
pxor m7, m7
paddw m0, m3
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
movq2dq xmm0, m0
pshufb xmm0, [pb_shuf8x8c]
movq xmm1, [r0+FENC_STRIDE*0]
RET
%endmacro
-INIT_MMX
-INTRA_SAD_8x8C mmx2
-INTRA_SAD_8x8C ssse3
+INIT_MMX mmx2
+INTRA_SAD_8x8C
+INIT_MMX ssse3
+INTRA_SAD_8x8C
;-----------------------------------------------------------------------------
;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
-%macro INTRA_SAD16 1-2 0
-cglobal intra_sad_x3_16x16_%1,3,5,%2
+%macro INTRA_SAD16 0
+cglobal intra_sad_x3_16x16, 3,5,8*(mmsize/16)
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1-FDEC_STRIDE+0]
psadbw mm1, [r1-FDEC_STRIDE+8]
paddw mm0, mm1
movd r3d, mm0
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
mova m1, [pb_3]
%endif
%assign x 0
RET
%endmacro
-INIT_MMX
-%define SPLATB SPLATB_MMX
-INTRA_SAD16 mmx2
-INIT_XMM
-INTRA_SAD16 sse2, 8
-%define SPLATB SPLATB_SSSE3
-INTRA_SAD16 ssse3, 8
+INIT_MMX mmx2
+INTRA_SAD16
+INIT_XMM sse2
+INTRA_SAD16
+INIT_XMM ssse3
+INTRA_SAD16
SAD_X%1_END
%endmacro
+INIT_MMX
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
;=============================================================================
%macro SAD_X3_START_1x16P_SSE2 0
- movdqa xmm3, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
+%if cpuflag(misalign)
+ mova xmm2, [r0]
+ movu xmm0, [r1]
+ movu xmm1, [r2]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm2
+ psadbw xmm2, [r3]
+%else
+ mova xmm3, [r0]
+ movu xmm0, [r1]
+ movu xmm1, [r2]
+ movu xmm2, [r3]
psadbw xmm0, xmm3
psadbw xmm1, xmm3
psadbw xmm2, xmm3
+%endif
%endmacro
%macro SAD_X3_1x16P_SSE2 2
- movdqa xmm3, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
+%if cpuflag(misalign)
+ mova xmm3, [r0+%1]
+ movu xmm4, [r1+%2]
+ movu xmm5, [r2+%2]
+ psadbw xmm4, xmm3
+ psadbw xmm5, xmm3
+ psadbw xmm3, [r3+%2]
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm3
+%else
+ mova xmm3, [r0+%1]
+ movu xmm4, [r1+%2]
+ movu xmm5, [r2+%2]
+ movu xmm6, [r3+%2]
psadbw xmm4, xmm3
psadbw xmm5, xmm3
psadbw xmm6, xmm3
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
+%endif
%endmacro
%macro SAD_X3_2x16P_SSE2 1
%endmacro
%macro SAD_X4_START_1x16P_SSE2 0
- movdqa xmm7, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
- movdqu xmm3, [r4]
+%if cpuflag(misalign)
+ mova xmm3, [r0]
+ movu xmm0, [r1]
+ movu xmm1, [r2]
+ movu xmm2, [r3]
+ psadbw xmm0, xmm3
+ psadbw xmm1, xmm3
+ psadbw xmm2, xmm3
+ psadbw xmm3, [r4]
+%else
+ mova xmm7, [r0]
+ movu xmm0, [r1]
+ movu xmm1, [r2]
+ movu xmm2, [r3]
+ movu xmm3, [r4]
psadbw xmm0, xmm7
psadbw xmm1, xmm7
psadbw xmm2, xmm7
psadbw xmm3, xmm7
+%endif
%endmacro
%macro SAD_X4_1x16P_SSE2 2
- movdqa xmm7, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
+%if cpuflag(misalign)
+ mova xmm7, [r0+%1]
+ movu xmm4, [r1+%2]
+ movu xmm5, [r2+%2]
+ movu xmm6, [r3+%2]
+ psadbw xmm4, xmm7
+ psadbw xmm5, xmm7
+ psadbw xmm6, xmm7
+ psadbw xmm7, [r4+%2]
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+ paddw xmm3, xmm7
+%else
+ mova xmm7, [r0+%1]
+ movu xmm4, [r1+%2]
+ movu xmm5, [r2+%2]
+ movu xmm6, [r3+%2]
%ifdef ARCH_X86_64
- movdqu xmm8, [r4+%2]
+ movu xmm8, [r4+%2]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm6, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm4
psadbw xmm6, xmm7
- movdqu xmm4, [r4+%2]
+ movu xmm4, [r4+%2]
paddw xmm1, xmm5
psadbw xmm4, xmm7
paddw xmm2, xmm6
paddw xmm3, xmm4
%endif
+%endif
%endmacro
%macro SAD_X4_2x16P_SSE2 1
RET
%endmacro
-%macro SAD_X3_START_1x16P_SSE2_MISALIGN 0
- movdqa xmm2, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm2
- psadbw xmm2, [r3]
-%endmacro
-
-%macro SAD_X3_1x16P_SSE2_MISALIGN 2
- movdqa xmm3, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm3, [r3+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm3
-%endmacro
-
-%macro SAD_X4_START_1x16P_SSE2_MISALIGN 0
- movdqa xmm3, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
- psadbw xmm3, [r4]
-%endmacro
-
-%macro SAD_X4_1x16P_SSE2_MISALIGN 2
- movdqa xmm7, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm7, [r4+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm7
-%endmacro
-
-%macro SAD_X3_2x16P_SSE2_MISALIGN 1
-%if %1
- SAD_X3_START_1x16P_SSE2_MISALIGN
-%else
- SAD_X3_1x16P_SSE2_MISALIGN 0, 0
-%endif
- SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
-%endmacro
-
-%macro SAD_X4_2x16P_SSE2_MISALIGN 1
-%if %1
- SAD_X4_START_1x16P_SSE2_MISALIGN
-%else
- SAD_X4_1x16P_SSE2_MISALIGN 0, 0
-%endif
- SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
-%endmacro
-
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
-%macro SAD_X_SSE2 4
-cglobal pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
+%macro SAD_X_SSE2 3
+cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
SAD_X%1_END_SSE2
%endmacro
-%macro SAD_X_SSE2_MISALIGN 4
-cglobal pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
-%ifdef WIN64
- %assign i %1+1
- movsxd r %+ i, r %+ i %+ d
-%endif
- SAD_X%1_2x%2P_SSE2_MISALIGN 1
-%rep %3/2-1
- SAD_X%1_2x%2P_SSE2_MISALIGN 0
-%endrep
- SAD_X%1_END_SSE2
-%endmacro
-
-SAD_X_SSE2 3, 16, 16, sse2
-SAD_X_SSE2 3, 16, 8, sse2
-SAD_X_SSE2 3, 8, 16, sse2
-SAD_X_SSE2 3, 8, 8, sse2
-SAD_X_SSE2 3, 8, 4, sse2
-SAD_X_SSE2 4, 16, 16, sse2
-SAD_X_SSE2 4, 16, 8, sse2
-SAD_X_SSE2 4, 8, 16, sse2
-SAD_X_SSE2 4, 8, 8, sse2
-SAD_X_SSE2 4, 8, 4, sse2
+INIT_XMM sse2
+SAD_X_SSE2 3, 16, 16
+SAD_X_SSE2 3, 16, 8
+SAD_X_SSE2 3, 8, 16
+SAD_X_SSE2 3, 8, 8
+SAD_X_SSE2 3, 8, 4
+SAD_X_SSE2 4, 16, 16
+SAD_X_SSE2 4, 16, 8
+SAD_X_SSE2 4, 8, 16
+SAD_X_SSE2 4, 8, 8
+SAD_X_SSE2 4, 8, 4
-SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
-SAD_X_SSE2_MISALIGN 3, 16, 8, sse2
-SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
-SAD_X_SSE2_MISALIGN 4, 16, 8, sse2
+INIT_XMM sse2, misalign
+SAD_X_SSE2 3, 16, 16
+SAD_X_SSE2 3, 16, 8
+SAD_X_SSE2 4, 16, 16
+SAD_X_SSE2 4, 16, 8
-%define movdqu lddqu
-SAD_X_SSE2 3, 16, 16, sse3
-SAD_X_SSE2 3, 16, 8, sse3
-SAD_X_SSE2 4, 16, 16, sse3
-SAD_X_SSE2 4, 16, 8, sse3
-%undef movdqu
+INIT_XMM sse3
+SAD_X_SSE2 3, 16, 16
+SAD_X_SSE2 3, 16, 8
+SAD_X_SSE2 4, 16, 16
+SAD_X_SSE2 4, 16, 8
; instantiate the aligned sads
+INIT_MMX
%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC_MMX2 8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
-%macro SAD_MMX 4
-cglobal pixel_sad_%1x%2_%4, 4,4
+%macro SAD_MMX 3
+cglobal pixel_sad_%1x%2, 4,4
pxor m0, m0
%rep %2/%3
SAD_INC_%3x%1P_MMX
RET
%endmacro
-INIT_MMX
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_MMX 16, 16, 1, mmx2
-SAD_MMX 16, 8, 1, mmx2
-SAD_MMX 8, 16, 2, mmx2
-SAD_MMX 8, 8, 2, mmx2
-SAD_MMX 8, 4, 2, mmx2
-SAD_MMX 4, 8, 2, mmx2
-SAD_MMX 4, 4, 2, mmx2
-%define ABS1 ABS1_SSSE3
-%define ABS2 ABS2_SSSE3
-SAD_MMX 4, 8, 2, ssse3
-SAD_MMX 4, 4, 2, ssse3
-%undef ABS1
-%undef ABS2
+INIT_MMX mmx2
+SAD_MMX 16, 16, 1
+SAD_MMX 16, 8, 1
+SAD_MMX 8, 16, 2
+SAD_MMX 8, 8, 2
+SAD_MMX 8, 4, 2
+SAD_MMX 4, 8, 2
+SAD_MMX 4, 4, 2
+INIT_MMX ssse3
+SAD_MMX 4, 8, 2
+SAD_MMX 4, 4, 2
;=============================================================================
; SAD XMM
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
-%macro SAD_XMM 3
-cglobal pixel_sad_%1x%2_%3, 4,4,8
+%macro SAD_XMM 2
+cglobal pixel_sad_%1x%2, 4,4,8
pxor m0, m0
%rep %2/2
SAD_INC_2x%1P_XMM
RET
%endmacro
-INIT_XMM
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_XMM 16, 16, sse2
-SAD_XMM 16, 8, sse2
-SAD_XMM 8, 16, sse2
-SAD_XMM 8, 8, sse2
-SAD_XMM 8, 4, sse2
-%define movdqu movdqa
-SAD_XMM 16, 16, sse2_aligned
-SAD_XMM 16, 8, sse2_aligned
-SAD_XMM 8, 16, sse2_aligned
-SAD_XMM 8, 8, sse2_aligned
-%undef movdqu
-%define ABS1 ABS1_SSSE3
-%define ABS2 ABS2_SSSE3
-SAD_XMM 16, 16, ssse3
-SAD_XMM 16, 8, ssse3
-SAD_XMM 8, 16, ssse3
-SAD_XMM 8, 8, ssse3
-SAD_XMM 8, 4, ssse3
-%define movdqu movdqa
-SAD_XMM 16, 16, ssse3_aligned
-SAD_XMM 16, 8, ssse3_aligned
-SAD_XMM 8, 16, ssse3_aligned
-SAD_XMM 8, 8, ssse3_aligned
-%undef movdqu
-%undef ABS1
-%undef ABS2
+INIT_XMM sse2
+SAD_XMM 16, 16
+SAD_XMM 16, 8
+SAD_XMM 8, 16
+SAD_XMM 8, 8
+SAD_XMM 8, 4
+INIT_XMM sse2, aligned
+SAD_XMM 16, 16
+SAD_XMM 16, 8
+SAD_XMM 8, 16
+SAD_XMM 8, 8
+INIT_XMM ssse3
+SAD_XMM 16, 16
+SAD_XMM 16, 8
+SAD_XMM 8, 16
+SAD_XMM 8, 8
+SAD_XMM 8, 4
+INIT_XMM ssse3, aligned
+SAD_XMM 16, 16
+SAD_XMM 16, 8
+SAD_XMM 8, 16
+SAD_XMM 8, 8
;=============================================================================
; SAD x3/x4
paddw m1, m6
paddw m2, m7
paddw m3, m8
-%elifidn ABS1, ABS1_SSSE3
+%elif cpuflag(ssse3)
movu m7, [r3+%2]
psubw m5, m4
psubw m6, m4
; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
; uint16_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
-%macro SAD_X 4
-cglobal pixel_sad_x%1_%2x%3_%4, 6,7,XMM_REGS
+%macro SAD_X 3
+cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
%assign regnum %1+1
%xdefine STRIDE r %+ regnum
%ifdef WIN64
SAD_X%1_END %2, %3
%endmacro
-INIT_MMX
+INIT_MMX mmx2
%define XMM_REGS 0
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_X 3, 16, 16, mmx2
-SAD_X 3, 16, 8, mmx2
-SAD_X 3, 8, 16, mmx2
-SAD_X 3, 8, 8, mmx2
-SAD_X 3, 8, 4, mmx2
-SAD_X 3, 4, 8, mmx2
-SAD_X 3, 4, 4, mmx2
-SAD_X 4, 16, 16, mmx2
-SAD_X 4, 16, 8, mmx2
-SAD_X 4, 8, 16, mmx2
-SAD_X 4, 8, 8, mmx2
-SAD_X 4, 8, 4, mmx2
-SAD_X 4, 4, 8, mmx2
-SAD_X 4, 4, 4, mmx2
-%define ABS1 ABS1_SSSE3
-%define ABS2 ABS2_SSSE3
-SAD_X 3, 4, 8, ssse3
-SAD_X 3, 4, 4, ssse3
-SAD_X 4, 4, 8, ssse3
-SAD_X 4, 4, 4, ssse3
-INIT_XMM
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+INIT_MMX ssse3
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+INIT_XMM ssse3
%define XMM_REGS 9
-SAD_X 3, 16, 16, ssse3
-SAD_X 3, 16, 8, ssse3
-SAD_X 3, 8, 16, ssse3
-SAD_X 3, 8, 8, ssse3
-SAD_X 3, 8, 4, ssse3
-SAD_X 4, 16, 16, ssse3
-SAD_X 4, 16, 8, ssse3
-SAD_X 4, 8, 16, ssse3
-SAD_X 4, 8, 8, ssse3
-SAD_X 4, 8, 4, ssse3
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+INIT_XMM sse2
%define XMM_REGS 11
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_X 3, 16, 16, sse2
-SAD_X 3, 16, 8, sse2
-SAD_X 3, 8, 16, sse2
-SAD_X 3, 8, 8, sse2
-SAD_X 3, 8, 4, sse2
-SAD_X 4, 16, 16, sse2
-SAD_X 4, 16, 8, sse2
-SAD_X 4, 8, 16, sse2
-SAD_X 4, 8, 8, sse2
-SAD_X 4, 8, 4, sse2
-%undef ABS1
-%undef ABS2
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
; we need a more flexible macro.
; RET:
-; Pops anything that was pushed by PROLOGUE
+; Pops anything that was pushed by PROLOGUE, and returns.
; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
%assign function_align 16
-; Symbol prefix for C linkage
-%macro cglobal 1-2+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
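+; A minimal usage sketch (assumed here only for illustration):
+;   INIT_XMM avx                ; SUFFIX becomes _avx
+;   cglobal predict_8x8_v, 2,2  ; declares the mangled symbol for predict_8x8_v_avx
+; Later references to the suffixed name (e.g. the
+; "dequant_%1x%1 %+ SUFFIX %+ .skip_prologue" jump above) resolve to the same
+; mangled symbol through the %xdefine set up below.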
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+ cglobal_internal %1 %+ SUFFIX
+%else
+ cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
%ifndef cglobaled_%1
%xdefine %1 mangle(program_name %+ _ %+ %1)
%xdefine %1.skip_prologue %1 %+ .skip_prologue
CAT_XDEFINE cglobaled_, %1, 1
%endif
+ %xdefine current_function %1
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%else
%macro cextern 1
%xdefine %1 mangle(program_name %+ _ %+ %1)
+ CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
-;like cextern, but without the prefix
+; like cextern, but without the prefix
%macro cextern_naked 1
%xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
+; cpuflags
+
+%assign cpuflags_mmx (1<<0)
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_sse (1<<2) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<3) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<4) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<5) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<6) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<7) | cpuflags_ssse3
+%assign cpuflags_sse42 (1<<8) | cpuflags_sse4
+%assign cpuflags_avx (1<<9) | cpuflags_sse42
+
+%assign cpuflags_cache32 (1<<16)
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
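+; Since each feature flag above includes its prerequisites, cpuflag(sse2) is true
+; not only under INIT_XMM sse2 but also under ssse3, sse4, avx, etc.  A typical
+; guard, mirroring the merged ABS1 macro further down:
+;     %if cpuflag(ssse3)
+;         pabsw  m0, m0
+;     %else
+;         pxor   m7, m7
+;         psubw  m7, m0
+;         pmaxsw m0, m7
+;     %endif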
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
+%macro INIT_CPUFLAGS 0-2
+ %if %0 >= 1
+ %xdefine cpuname %1
+ %assign cpuflags cpuflags_%1
+ %if %0 >= 2
+ %xdefine cpuname %1_%2
+ %assign cpuflags cpuflags | cpuflags_%2
+ %endif
+ %xdefine SUFFIX _ %+ cpuname
+ %if cpuflag(avx)
+ %assign avx_enabled 1
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elifidn %1, sse3
+ %define movu lddqu
+ %endif
+ %else
+ %xdefine SUFFIX
+ %undef cpuname
+ %undef cpuflags
+ %endif
+%endmacro
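+; For example, "INIT_XMM ssse3, cache64" (a hypothetical combination) would set
+; cpuname to ssse3_cache64, OR both flag sets into cpuflags, and make SUFFIX
+; "_ssse3_cache64", which cglobal then appends to every function name.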
+
; merge mmx and sse*
%macro CAT_XDEFINE 3
%undef %1%2
%endmacro
-%macro INIT_MMX 0
+%macro INIT_MMX 0-1+
%assign avx_enabled 0
- %define RESET_MM_PERMUTATION INIT_MMX
+ %define RESET_MM_PERMUTATION INIT_MMX %1
%define mmsize 8
%define num_mmregs 8
%define mova movq
CAT_UNDEF nmm, %%i
%assign %%i %%i+1
%endrep
+ INIT_CPUFLAGS %1
%endmacro
-%macro INIT_XMM 0
+%macro INIT_XMM 0-1+
%assign avx_enabled 0
- %define RESET_MM_PERMUTATION INIT_XMM
+ %define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
%define num_mmregs 8
%ifdef ARCH_X86_64
CAT_XDEFINE nxmm, %%i, %%i
%assign %%i %%i+1
%endrep
+ INIT_CPUFLAGS %1
%endmacro
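+; Because RESET_MM_PERMUTATION is now defined as "INIT_MMX %1" / "INIT_XMM %1",
+; resetting the register permutation mid-function re-applies the same cpuflags
+; that were passed to the enclosing INIT_* invocation instead of dropping them.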
-%macro INIT_AVX 0
- INIT_XMM
+%macro INIT_YMM 0-1+
%assign avx_enabled 1
- %define PALIGNR PALIGNR_SSSE3
- %define RESET_MM_PERMUTATION INIT_AVX
-%endmacro
-
-%macro INIT_YMM 0
- %assign avx_enabled 1
- %define RESET_MM_PERMUTATION INIT_YMM
+ %define RESET_MM_PERMUTATION INIT_YMM %1
%define mmsize 32
%define num_mmregs 8
%ifdef ARCH_X86_64
%endif
%define mova vmovaps
%define movu vmovups
+ %undef movh
+ %undef movnta
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, ymm %+ %%i
CAT_XDEFINE nymm, %%i, %%i
%assign %%i %%i+1
%endrep
+ INIT_CPUFLAGS %1
%endmacro
INIT_MMX
%endrep
%endmacro
-; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
-; function name, then any later calls to that function will automatically
-; load the permutation, so values can be returned in mmregs.
-%macro SAVE_MM_PERMUTATION 1 ; name to save as
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
%assign %%i 0
%rep num_mmregs
- CAT_XDEFINE %1_m, %%i, m %+ %%i
+ CAT_XDEFINE %%f, %%i, m %+ %%i
%assign %%i %%i+1
%endrep
%endmacro
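+; E.g. at the end of a hypothetical "cglobal dct_core" helper, a bare
+; SAVE_MM_PERMUTATION records the current register permutation under
+; current_function %+ _m0, _m1, ..., instead of requiring the function name to be
+; passed by hand as before.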
%macro LOAD_MM_PERMUTATION 1 ; name to load from
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, %1_m %+ %%i
- CAT_XDEFINE n, m %+ %%i, %%i
- %assign %%i %%i+1
- %endrep
+ %ifdef %1_m0
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
%endmacro
+; Appends cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
- call %1
- %ifdef %1_m0
- LOAD_MM_PERMUTATION %1
+ call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %1
+ %ifndef cglobaled_%1
+ %ifdef cglobaled_%2
+ %xdefine %%i %2
+ %endif
%endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
%endmacro
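+; Usage sketch (hypothetical helper "transpose_core" built under INIT_XMM sse2):
+;     cglobal transpose_core        ; actually emitted as the _sse2-suffixed symbol
+;         ...
+;         SAVE_MM_PERMUTATION
+;         ret
+; A later "call transpose_core" in the same file resolves to the suffixed name,
+; then LOAD_MM_PERMUTATION picks up the output register layout it saved.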
; Substitutions that reduce instruction size but are functionally equivalent
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
%macro SBUTTERFLY 4
-%if avx_enabled == 0
- mova m%4, m%2
+%if avx_enabled && mmsize == 16
+ punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
%else
- punpckh%1 m%4, m%2, m%3
+ mova m%4, m%2
punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
%endif
SWAP %3, %4
%endmacro
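+; (With AVX and 16-byte registers, the non-destructive 3-operand punpckh above
+; writes m%4 directly, saving the mova copy required by the 2-operand path.)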
%endif
%endmacro
-%macro ABS1_MMX 2 ; a, tmp
+%macro ABS1 2 ; a, tmp
+%if cpuflag(ssse3)
+ pabsw %1, %1
+%else
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
+%endif
%endmacro
-%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
+%macro ABS2 4 ; a, b, tmp0, tmp1
+%if cpuflag(ssse3)
+ pabsw %1, %1
+ pabsw %2, %2
+%else
pxor %3, %3
pxor %4, %4
psubw %3, %1
psubw %4, %2
pmaxsw %1, %3
pmaxsw %2, %4
+%endif
%endmacro
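+; The cpuflag test is evaluated wherever ABS1/ABS2 are expanded, so under
+; INIT_XMM ssse3 or avx they emit a single pabsw per register, while under
+; INIT_MMX mmx2 or INIT_XMM sse2 they fall back to the pxor/psubw/pmaxsw
+; sequence; the old toplevel "%define ABS1 ABS1_MMX" selection is gone.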
-%macro ABS1_SSSE3 2
- pabsw %1, %1
-%endmacro
-
-%macro ABS2_SSSE3 4
- pabsw %1, %1
- pabsw %2, %2
-%endmacro
-
-%macro ABSB_MMX 2
+%macro ABSB 2
+%if cpuflag(ssse3)
+ pabsb %1, %1
+%else
pxor %2, %2
psubb %2, %1
pminub %1, %2
+%endif
%endmacro
-%macro ABSB2_MMX 4
+%macro ABSB2 4
+%if cpuflag(ssse3)
+ pabsb %1, %1
+ pabsb %2, %2
+%else
pxor %3, %3
pxor %4, %4
psubb %3, %1
psubb %4, %2
pminub %1, %3
pminub %2, %4
+%endif
%endmacro
-%macro ABSD2_MMX 4
- pxor %3, %3
- pxor %4, %4
- pcmpgtd %3, %1
- pcmpgtd %4, %2
- pxor %1, %3
- pxor %2, %4
- psubd %1, %3
- psubd %2, %4
-%endmacro
-
-%macro ABSB_SSSE3 2
- pabsb %1, %1
-%endmacro
-
-%macro ABSB2_SSSE3 4
- pabsb %1, %1
- pabsb %2, %2
-%endmacro
-
-%macro ABS4 6
- ABS2 %1, %2, %5, %6
- ABS2 %3, %4, %5, %6
-%endmacro
-
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-%define ABSB ABSB_MMX
-%define ABSB2 ABSB2_MMX
-
-%macro SPLATB_MMX 3
- movd %1, [%2-3] ;to avoid crossing a cacheline
- punpcklbw %1, %1
- SPLATW %1, %1, 3
-%endmacro
-
-%macro SPLATB_SSSE3 3
+%macro SPLATB 3
+%if cpuflag(ssse3)
movd %1, [%2-3]
pshufb %1, %3
-%endmacro
-
-%macro PALIGNR_MMX 4-5 ; [dst,] src1, src2, imm, tmp
- %define %%dst %1
-%if %0==5
-%ifnidn %1, %2
- mova %%dst, %2
-%endif
- %rotate 1
-%endif
-%ifnidn %4, %2
- mova %4, %2
-%endif
-%if mmsize==8
- psllq %%dst, (8-%3)*8
- psrlq %4, %3*8
%else
- pslldq %%dst, 16-%3
- psrldq %4, %3
+ movd %1, [%2-3] ;to avoid crossing a cacheline
+ punpcklbw %1, %1
+ SPLATW %1, %1, 3
%endif
- por %%dst, %4
%endmacro
-%macro PALIGNR_SSSE3 4-5
-%if %0==5
- palignr %1, %2, %3, %4
+%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
+%if cpuflag(ssse3)
+ %if %0==5
+ palignr %1, %2, %3, %4
+ %else
+ palignr %1, %2, %3
+ %endif
%else
- palignr %1, %2, %3
+ %define %%dst %1
+ %if %0==5
+ %ifnidn %1, %2
+ mova %%dst, %2
+ %endif
+ %rotate 1
+ %endif
+ %ifnidn %4, %2
+ mova %4, %2
+ %endif
+ %if mmsize==8
+ psllq %%dst, (8-%3)*8
+ psrlq %4, %3*8
+ %else
+ pslldq %%dst, 16-%3
+ psrldq %4, %3
+ %endif
+ por %%dst, %4
%endif
%endmacro
padd%1 m%2, m%3
padd%1 m%3, m%3
psub%1 m%3, m%2
+%elif avx_enabled
+ padd%1 m%4, m%2, m%3
+ psub%1 m%3, m%2
+ SWAP %2, %4
%else
-%if avx_enabled == 0
mova m%4, m%2
padd%1 m%2, m%3
psub%1 m%3, m%4
-%else
- padd%1 m%4, m%2, m%3
- psub%1 m%3, m%2
- SWAP %2, %4
-%endif
%endif
%endmacro
pblendw m%4, m%3, 01010101b
SWAP %3, %5
%else
-%if avx_enabled == 0
- mova m%5, m%3
- pblendw m%3, m%4, 10101010b
-%else
+%if avx_enabled
pblendw m%5, m%3, m%4, 10101010b
SWAP %3, %5
+%else
+ mova m%5, m%3
+ pblendw m%3, m%4, 10101010b
%endif
psll%1 m%4, 16
psrl%1 m%5, 16
%endmacro
%macro SUMSUB2_BA 4
-%if avx_enabled == 0
- mova m%4, m%2
- padd%1 m%2, m%3
- padd%1 m%2, m%3
- psub%1 m%3, m%4
- psub%1 m%3, m%4
-%else
+%if avx_enabled
padd%1 m%4, m%2, m%3
padd%1 m%4, m%3
psub%1 m%3, m%2
psub%1 m%3, m%2
SWAP %2, %4
+%else
+ mova m%4, m%2
+ padd%1 m%2, m%3
+ padd%1 m%2, m%3
+ psub%1 m%3, m%4
+ psub%1 m%3, m%4
%endif
%endmacro
%endif
%endmacro
-%macro LOAD_DIFF8x4_SSE2 8
- LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
- LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
- LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
- LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
-%endmacro
-
-%macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
+%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
+%if cpuflag(ssse3)
movh m%2, [%8+%1*FDEC_STRIDE]
movh m%1, [%7+%1*FENC_STRIDE]
punpcklbw m%1, m%2
pmaddubsw m%2, m%6
pmaddubsw m%3, m%6
pmaddubsw m%4, m%6
+%else
+ LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
+ LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
+ LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
+ LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
+%endif
%endmacro
%macro STORE_DCT 6
%endif
%endmacro
-%macro SPLATW 2-3 0
+%imacro SPLATW 2-3 0
%if mmsize == 16
pshuflw %1, %2, (%3)*0x55
punpcklqdq %1, %1
%endif
%endmacro
-%macro SPLATD 2-3 0
+%imacro SPLATD 2-3 0
%if mmsize == 16
pshufd %1, %2, (%3)*0x55
%else