%include "vpx_ports/x86_abi_support.asm"
-section .text
- global sym(vp8_short_fdct4x4_mmx)
- global sym(vp8_short_fdct8x4_wmt)
-
-
-%define DCTCONSTANTSBITS (16)
-%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
-%define x_c1 (60547) ; cos(pi /8) * (1<<15)
-%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
-%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-
-
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx)
sym(vp8_short_fdct4x4_mmx):
push rbp
- mov rbp, rsp
+ mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
- push rsi
- push rdi
+ push rsi
+ push rdi
; end prolog
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
-
- lea rdx, [GLOBAL(dct_const_mmx)]
- movsxd rax, dword ptr arg(2) ;pitch
-
- lea rcx, [rsi + rax*2]
- ; read the input data
- movq mm0, [rsi]
- movq mm1, [rsi + rax ]
-
- movq mm2, [rcx]
- movq mm3, [rcx + rax]
- ; get the constants
- ;shift to left by 1 for prescision
- psllw mm0, 3
- psllw mm1, 3
-
- psllw mm2, 3
- psllw mm3, 3
-
- ; transpose for the second stage
- movq mm4, mm0 ; 00 01 02 03
- movq mm5, mm2 ; 10 11 12 03
-
- punpcklwd mm0, mm1 ; 00 10 01 11
- punpckhwd mm4, mm1 ; 02 12 03 13
-
- punpcklwd mm2, mm3 ; 20 30 21 31
- punpckhwd mm5, mm3 ; 22 32 23 33
-
-
- movq mm1, mm0 ; 00 10 01 11
- punpckldq mm0, mm2 ; 00 10 20 30
-
- punpckhdq mm1, mm2 ; 01 11 21 31
-
- movq mm2, mm4 ; 02 12 03 13
- punpckldq mm2, mm5 ; 02 12 22 32
-
- punpckhdq mm4, mm5 ; 03 13 23 33
- movq mm3, mm4
-
-
- ; first stage
- movq mm5, mm0
- movq mm4, mm1
-
- paddw mm0, mm3 ; a = 0 + 3
- paddw mm1, mm2 ; b = 1 + 2
-
- psubw mm4, mm2 ; c = 1 - 2
- psubw mm5, mm3 ; d = 0 - 3
-
-
- ; output 0 and 2
- movq mm6, [rdx + 16] ; c2
- movq mm2, mm0 ; a
-
- paddw mm0, mm1 ; a + b
- psubw mm2, mm1 ; a - b
-
- movq mm1, mm0 ; a + b
- pmulhw mm0, mm6 ; 00 01 02 03
-
- paddw mm0, mm1 ; output 00 01 02 03
- pmulhw mm6, mm2 ; 20 21 22 23
-
- paddw mm2, mm6 ; output 20 21 22 23
-
- ; output 1 and 3
- movq mm6, [rdx + 8] ; c1
- movq mm7, [rdx + 24] ; c3
-
- movq mm1, mm4 ; c
- movq mm3, mm5 ; d
-
- pmulhw mm1, mm7 ; c * c3
- pmulhw mm3, mm6 ; d * c1
-
- paddw mm3, mm5 ; d * c1 rounded
- paddw mm1, mm3 ; output 10 11 12 13
-
- movq mm3, mm4 ; c
- pmulhw mm5, mm7 ; d * c3
-
- pmulhw mm4, mm6 ; c * c1
- paddw mm3, mm4 ; round c* c1
-
- psubw mm5, mm3 ; output 30 31 32 33
- movq mm3, mm5
-
-
- ; done with vertical
- ; transpose for the second stage
- movq mm4, mm0 ; 00 01 02 03
- movq mm5, mm2 ; 10 11 12 03
-
- punpcklwd mm0, mm1 ; 00 10 01 11
- punpckhwd mm4, mm1 ; 02 12 03 13
-
- punpcklwd mm2, mm3 ; 20 30 21 31
- punpckhwd mm5, mm3 ; 22 32 23 33
-
-
- movq mm1, mm0 ; 00 10 01 11
- punpckldq mm0, mm2 ; 00 10 20 30
-
- punpckhdq mm1, mm2 ; 01 11 21 31
-
- movq mm2, mm4 ; 02 12 03 13
- punpckldq mm2, mm5 ; 02 12 22 32
-
- punpckhdq mm4, mm5 ; 03 13 23 33
- movq mm3, mm4
-
-
- ; first stage
- movq mm5, mm0
- movq mm4, mm1
- paddw mm0, mm3 ; a = 0 + 3
- paddw mm1, mm2 ; b = 1 + 2
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
- psubw mm4, mm2 ; c = 1 - 2
- psubw mm5, mm3 ; d = 0 - 3
+ movsxd rax, dword ptr arg(2) ;pitch
-
- ; output 0 and 2
- movq mm6, [rdx + 16] ; c2
- movq mm2, mm0 ; a
- paddw mm0, mm1 ; a + b
-
- psubw mm2, mm1 ; a - b
-
- movq mm1, mm0 ; a + b
- pmulhw mm0, mm6 ; 00 01 02 03
-
- paddw mm0, mm1 ; output 00 01 02 03
- pmulhw mm6, mm2 ; 20 21 22 23
-
- paddw mm2, mm6 ; output 20 21 22 23
-
-
- ; output 1 and 3
- movq mm6, [rdx + 8] ; c1
- movq mm7, [rdx + 24] ; c3
-
- movq mm1, mm4 ; c
- movq mm3, mm5 ; d
-
- pmulhw mm1, mm7 ; c * c3
- pmulhw mm3, mm6 ; d * c1
-
- paddw mm3, mm5 ; d * c1 rounded
- paddw mm1, mm3 ; output 10 11 12 13
-
- movq mm3, mm4 ; c
- pmulhw mm5, mm7 ; d * c3
-
- pmulhw mm4, mm6 ; c * c1
- paddw mm3, mm4 ; round c* c1
-
- psubw mm5, mm3 ; output 30 31 32 33
- movq mm3, mm5
- ; done with vertical
-
- pcmpeqw mm4, mm4
- pcmpeqw mm5, mm5
- psrlw mm4, 15
- psrlw mm5, 15
-
- psllw mm4, 2
- psllw mm5, 2
-
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm4
- paddw mm3, mm5
-
- psraw mm0, 3
- psraw mm1, 3
- psraw mm2, 3
- psraw mm3, 3
-
- movq [rdi ], mm0
- movq [rdi+ 8], mm1
- movq [rdi+16], mm2
- movq [rdi+24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_short_fdct8x4_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
-
- lea rdx, [GLOBAL(dct_const_xmm)]
- movsxd rax, dword ptr arg(2) ;pitch
-
- lea rcx, [rsi + rax*2]
+ lea rcx, [rsi + rax*2]
; read the input data
- movdqa xmm0, [rsi]
- movdqa xmm2, [rsi + rax]
-
- movdqa xmm4, [rcx]
- movdqa xmm3, [rcx + rax]
- ; get the constants
- ;shift to left by 1 for prescision
- psllw xmm0, 3
- psllw xmm2, 3
-
- psllw xmm4, 3
- psllw xmm3, 3
-
- ; transpose for the second stage
- movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
- movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
-
- punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
- punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax]
- punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
- punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+ movq mm2, [rcx]
+ movq mm4, [rcx + rax]
- movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
- punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+ ; transpose for the first stage
+ movq mm3, mm0 ; 00 01 02 03
+ movq mm5, mm2 ; 20 21 22 23
- punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm3, mm1 ; 02 12 03 13
+ punpcklwd mm2, mm4 ; 20 30 21 31
+ punpckhwd mm5, mm4 ; 22 32 23 33
- movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
- punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
- punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
- movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+ punpckhdq mm1, mm2 ; 01 11 21 31
- punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
- punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+ movq mm2, mm3 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
- movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
- punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+ punpckhdq mm3, mm5 ; 03 13 23 33
- punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35
-
- ; xmm0 0
- ; xmm1 1
- ; xmm2 2
- ; xmm3 3
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm3 3
; first stage
- movdqa xmm5, xmm0
- movdqa xmm4, xmm1
-
- paddw xmm0, xmm3 ; a = 0 + 3
- paddw xmm1, xmm2 ; b = 1 + 2
-
- psubw xmm4, xmm2 ; c = 1 - 2
- psubw xmm5, xmm3 ; d = 0 - 3
+ movq mm5, mm0
+ movq mm4, mm1
+ paddw mm0, mm3 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
- ; output 0 and 2
- movdqa xmm6, [rdx + 32] ; c2
- movdqa xmm2, xmm0 ; a
+ psubw mm4, mm2 ; c1 = 1 - 2
+ psubw mm5, mm3 ; d1 = 0 - 3
- paddw xmm0, xmm1 ; a + b
- psubw xmm2, xmm1 ; a - b
+ psllw mm5, 3
+ psllw mm4, 3
- movdqa xmm1, xmm0 ; a + b
- pmulhw xmm0, xmm6 ; 00 01 02 03
+ psllw mm0, 3
+ psllw mm1, 3
- paddw xmm0, xmm1 ; output 00 01 02 03
- pmulhw xmm6, xmm2 ; 20 21 22 23
+ ; output 0 and 2
+ movq mm2, mm0 ; a1
- paddw xmm2, xmm6 ; output 20 21 22 23
+ paddw mm0, mm1 ; op[0] = a1 + b1
+ psubw mm2, mm1 ; op[2] = a1 - b1
; output 1 and 3
- movdqa xmm6, [rdx + 16] ; c1
- movdqa xmm7, [rdx + 48] ; c3
-
- movdqa xmm1, xmm4 ; c
- movdqa xmm3, xmm5 ; d
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm4 ; c1 d1
+ punpckhwd mm5, mm4 ; c1 d1
- pmulhw xmm1, xmm7 ; c * c3
- pmulhw xmm3, xmm6 ; d * c1
+ movq mm3, mm1
+ movq mm4, mm5
- paddw xmm3, xmm5 ; d * c1 rounded
- paddw xmm1, xmm3 ; output 10 11 12 13
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- movdqa xmm3, xmm4 ; c
- pmulhw xmm5, xmm7 ; d * c3
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmulhw xmm4, xmm6 ; c * c1
- paddw xmm3, xmm4 ; round c* c1
+ paddd mm1, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm4, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm3, MMWORD PTR[GLOBAL(_7500)]
+ paddd mm5, MMWORD PTR[GLOBAL(_7500)]
- psubw xmm5, xmm3 ; output 30 31 32 33
- movdqa xmm3, xmm5
+ psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ packssdw mm1, mm4 ; op[1]
+ packssdw mm3, mm5 ; op[3]
; done with vertical
; transpose for the second stage
- movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36
- movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35
+ movq mm4, mm0 ; 00 10 20 30
+ movq mm5, mm2 ; 02 12 22 32
- movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34
- movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36
+ punpcklwd mm0, mm1 ; 00 01 10 11
+ punpckhwd mm4, mm1 ; 20 21 30 31
- punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31
- punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35
+ punpcklwd mm2, mm3 ; 02 03 12 13
+ punpckhwd mm5, mm3 ; 22 23 32 33
- punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33
- punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+ movq mm1, mm0 ; 00 01 10 11
+ punpckldq mm0, mm2 ; 00 01 02 03
- movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31
- punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13
+ punpckhdq mm1, mm2 ; 10 11 12 13
- punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33
+ movq mm2, mm4 ; 20 21 30 31
+ punpckldq mm2, mm5 ; 20 21 22 23
+ punpckhdq mm4, mm5 ; 30 31 32 33
- movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35
- punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm4 3
- punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37
- movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33
+ movq mm5, mm0
+ movq mm3, mm1
- punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37
- punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27
+ paddw mm0, mm4 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
- movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13
- punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07
+ psubw mm3, mm2 ; c1 = 1 - 2
+ psubw mm5, mm4 ; d1 = 0 - 3
- punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17
+ pxor mm6, mm6 ; zero out for compare
- ; first stage
- movdqa xmm5, xmm0
- movdqa xmm4, xmm1
-
- paddw xmm0, xmm3 ; a = 0 + 3
- paddw xmm1, xmm2 ; b = 1 + 2
-
- psubw xmm4, xmm2 ; c = 1 - 2
- psubw xmm5, xmm3 ; d = 0 - 3
+ pcmpeqw mm6, mm5 ; 0xFFFF in each word where d1 == 0
+ pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; invert and mask: each word becomes
+ ; 1 where d1 != 0, otherwise 0
; output 0 and 2
- movdqa xmm6, [rdx + 32] ; c2
- movdqa xmm2, xmm0 ; a
+ movq mm2, mm0 ; a1
- paddw xmm0, xmm1 ; a + b
- psubw xmm2, xmm1 ; a - b
+ paddw mm0, mm1 ; a1 + b1
+ psubw mm2, mm1 ; a1 - b1
- movdqa xmm1, xmm0 ; a + b
- pmulhw xmm0, xmm6 ; 00 01 02 03
+ paddw mm0, MMWORD PTR[GLOBAL(_7w)]
+ paddw mm2, MMWORD PTR[GLOBAL(_7w)]
- paddw xmm0, xmm1 ; output 00 01 02 03
- pmulhw xmm6, xmm2 ; 20 21 22 23
+ psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
- paddw xmm2, xmm6 ; output 20 21 22 23
+ movq MMWORD PTR[rdi + 0 ], mm0
+ movq MMWORD PTR[rdi + 16], mm2
; output 1 and 3
- movdqa xmm6, [rdx + 16] ; c1
- movdqa xmm7, [rdx + 48] ; c3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm3 ; c1 d1
+ punpckhwd mm5, mm3 ; c1 d1
- movdqa xmm1, xmm4 ; c
- movdqa xmm3, xmm5 ; d
+ movq mm3, mm1
+ movq mm4, mm5
- pmulhw xmm1, xmm7 ; c * c3
- pmulhw xmm3, xmm6 ; d * c1
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- paddw xmm3, xmm5 ; d * c1 rounded
- paddw xmm1, xmm3 ; output 10 11 12 13
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- movdqa xmm3, xmm4 ; c
- pmulhw xmm5, xmm7 ; d * c3
+ paddd mm1, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm4, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm3, MMWORD PTR[GLOBAL(_51000)]
+ paddd mm5, MMWORD PTR[GLOBAL(_51000)]
- pmulhw xmm4, xmm6 ; c * c1
- paddw xmm3, xmm4 ; round c* c1
+ psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
- psubw xmm5, xmm3 ; output 30 31 32 33
- movdqa xmm3, xmm5
- ; done with vertical
+ packssdw mm1, mm4 ; op[4]
+ packssdw mm3, mm5 ; op[12]
+
+ paddw mm1, mm6 ; op[4] += (d1!=0)
+ movq MMWORD PTR[rdi + 8 ], mm1
+ movq MMWORD PTR[rdi + 24], mm3
- pcmpeqw xmm4, xmm4
- pcmpeqw xmm5, xmm5;
- psrlw xmm4, 15
- psrlw xmm5, 15
-
- psllw xmm4, 2
- psllw xmm5, 2
-
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
- psraw xmm0, 3
- psraw xmm1, 3
- psraw xmm2, 3
- psraw xmm3, 3
-
- movq QWORD PTR[rdi ], xmm0
- movq QWORD PTR[rdi+ 8], xmm1
- movq QWORD PTR[rdi+16], xmm2
- movq QWORD PTR[rdi+24], xmm3
-
- psrldq xmm0, 8
- psrldq xmm1, 8
- psrldq xmm2, 8
- psrldq xmm3, 8
-
- movq QWORD PTR[rdi+32], xmm0
- movq QWORD PTR[rdi+40], xmm1
- movq QWORD PTR[rdi+48], xmm2
- movq QWORD PTR[rdi+56], xmm3
- ; begin epilog
- pop rdi
- pop rsi
+ ; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
-
SECTION_RODATA
-;static const unsigned int dct1st_stage_rounding_mmx[2] =
-align 16
-dct1st_stage_rounding_mmx:
- times 2 dd 8192
-
-
-;static const unsigned int dct2nd_stage_rounding_mmx[2] =
-align 16
-dct2nd_stage_rounding_mmx:
- times 2 dd 32768
-
-
-;static const short dct_matrix[4][4]=
-align 16
-dct_matrix:
- times 4 dw 23170
-
- dw 30274
- dw 12540
- dw -12540
- dw -30274
-
- dw 23170
- times 2 dw -23170
- dw 23170
-
- dw 12540
- dw -30274
- dw 30274
- dw -12540
-
-
-;static const unsigned short dct_const_mmx[4 * 4]=
-align 16
-dct_const_mmx:
- times 4 dw 0
- times 4 dw 60547
- times 4 dw 46341
- times 4 dw 25080
-
-
-;static const unsigned short dct_const_xmm[8 * 4]=
-align 16
-dct_const_xmm:
- times 8 dw 0
- times 8 dw 60547
- times 8 dw 46341
- times 8 dw 25080
+align 8
+_5352_2217: ; pmaddwd weights (5352, 2217) for each interleaved d1/c1 word pair
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 8
+_2217_neg5352: ; pmaddwd weights (2217, -5352) for each interleaved d1/c1 word pair
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 8
+_cmp_mask: ; 1 in every word; pandn against the d1 == 0 mask leaves (d1 != 0) per word
+ times 4 dw 1
+align 8
+_7w: ; word rounding term added before the psraw 4 in the second pass
+ times 4 dw 7
+align 8
+_14500: ; dword rounding term added before psrad 12 (first pass, op[1])
+ times 2 dd 14500
+align 8
+_7500: ; dword rounding term added before psrad 12 (first pass, op[3])
+ times 2 dd 7500
+align 8
+_12000: ; dword rounding term added before psrad 16 (second pass, op[4])
+ times 2 dd 12000
+align 8
+_51000: ; dword rounding term added before psrad 16 (second pass, op[12])
+ times 2 dd 51000
%include "vpx_ports/x86_abi_support.asm"
-;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2)
-sym(vp8_short_fdct4x4_sse2):
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+ %define input rsi
+ %define output rdi
+ %define pitch rax
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
-;; SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0)
- movsxd rax, DWORD PTR arg(2)
- lea rdi, [rsi + rax*2]
+ mov rdi, arg(1) ; output pointer (aliased as "output" above)
+
+ movsxd rax, dword ptr arg(2) ; pitch, sign-extended (aliased as "pitch" above)
+ lea rcx, [rsi + rax*2] ; input + 2*pitch; NOTE(review): rcx is not read by the functions visible here - confirm it is still needed
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %define input rcx
+ %define output rdx
+ %define pitch r8
+ %else
+ %define input rdi
+ %define output rsi
+ %define pitch rdx
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+ %define input
+ %define output
+ %define pitch
+
+%if ABI_IS_32BIT
+ pop rdi ; restore in reverse order of the pushes in STACK_FRAME_CREATE
+ pop rsi
+ RESTORE_GOT ; pairs with GET_GOT rbx in STACK_FRAME_CREATE
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %endif
+%endif
+ ret ; 64-bit paths restore nothing, matching a prologue that only defines register aliases
+%endmacro
+
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
- movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
- movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
- movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
- movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
+ STACK_FRAME_CREATE
+
+ movq xmm0, MMWORD PTR[input ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
+ lea input, [input+2*pitch]
+ movq xmm1, MMWORD PTR[input ] ;23 22 21 20
+ movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
- mov rdi, arg(1)
-
movdqa xmm2, xmm0
punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+
movdqa xmm1, xmm0
pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
punpcklqdq xmm0, xmm3 ;op[4] op[0]
punpckhqdq xmm1, xmm3 ;op[12] op[8]
- movdqa XMMWORD PTR[rdi + 0], xmm0
- movdqa XMMWORD PTR[rdi + 16], xmm1
+ movdqa XMMWORD PTR[output + 0], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm1
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
-;; RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY
+
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2)
+sym(vp8_short_fdct8x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ ; read four rows of eight words (movdqa requires 16-byte-aligned addresses)
+ movdqa xmm0, [input ]
+ movdqa xmm2, [input+ pitch] ; NOTE(review): 64-bit paths use the full pitch register for an int argument - confirm the upper bits are valid
+ lea input, [input+2*pitch]
+ movdqa xmm4, [input ]
+ movdqa xmm3, [input+ pitch]
+
+ ; transpose for the first stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+ punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm2 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ psllw xmm5, 3
+ psllw xmm4, 3
+
+ psllw xmm0, 3
+ psllw xmm1, 3
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; op[0] = a1 + b1
+ psubw xmm2, xmm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
+
+ psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm1, xmm4 ; op[1]
+ packssdw xmm3, xmm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
+
+ movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
+
+ ; xmm0 0
+ ; xmm4 1
+ ; xmm1 2
+ ; xmm3 3
+
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm4 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ pxor xmm6, xmm6 ; zero out for compare; NOTE(review): xmm6 is callee-saved on Win64 and is not saved by STACK_FRAME_CREATE - confirm
+
+ pcmpeqw xmm6, xmm5 ; 0xFFFF in each word where d1 == 0
+
+ pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; invert and mask: each word becomes
+ ; 1 where d1 != 0, otherwise 0
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; a1 + b1
+ psubw xmm2, xmm1 ; a1 - b1
+
+ paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
+ paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
+
+ psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
+
+ psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm1, xmm4 ; op[4]
+ packssdw xmm3, xmm5 ; op[12]
+
+ paddw xmm1, xmm6 ; op[4] += (d1!=0)
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm2
+
+ punpcklqdq xmm0, xmm1 ; columns 0-3: op[0..3] op[4..7]
+ punpckhqdq xmm4, xmm1 ; columns 4-7: op[0..3] op[4..7]
+
+ punpcklqdq xmm2, xmm3 ; columns 0-3: op[8..11] op[12..15]
+ punpckhqdq xmm5, xmm3 ; columns 4-7: op[8..11] op[12..15]
+
+ movdqa XMMWORD PTR[output + 0 ], xmm0 ; 16 coefficients for columns 0-3 at bytes 0-31
+ movdqa XMMWORD PTR[output + 16], xmm2
+ movdqa XMMWORD PTR[output + 32], xmm4 ; 16 coefficients for columns 4-7 at bytes 32-63
+ movdqa XMMWORD PTR[output + 48], xmm5
+
+ STACK_FRAME_DESTROY
SECTION_RODATA
align 16
_cmp_mask:
times 4 dw 1
times 4 dw 0
-
+align 16
+_cmp_mask8x4:
+ times 8 dw 1
align 16
_mult_sub:
dw 1
_7:
times 4 dd 7
align 16
+_7w:
+ times 8 dw 7
+align 16
_14500:
times 4 dd 14500
align 16