; Review note: this region is a diff fragment. Lines starting with '-' belong
; to the replaced version, lines starting with '+' to the replacement, and
; unprefixed lines are common to both. The routine's label is outside this view.
;
; MMX routine (replacement version), in order:
;   1. loads four quadwords (4 x 16-bit words each) at offsets 0/8/16/24 from
;      the pointer obtained with arg(0)
;   2. first butterfly pass: a1/b1 sums and d1/c1 differences, then
;      a1+b1, a1-b1, d1+c1, d1-c1
;   3. 4x4 word transpose, then the same butterfly pass again
;   4. adds 3 to every word and arithmetic-shifts each word right by 3
;   5. writes the 16 word results through the pointer obtained with arg(1),
;      one word every 32 bytes (offsets 32*0 .. 32*15)
; The replacement touches only rax, rcx, rdx and mm0-mm7, which is consistent
; with dropping the rsi/rdi push/pop pairs of the replaced version.
; arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are macros defined elsewhere.
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
; end prolog
- mov rax, 3
- mov rsi, arg(0)
- mov rdi, arg(1)
- shl rax, 16
+ mov rdx, arg(0) ;rdx = load base, taken from arg(0)
+ mov rax, 30003h ;two 16-bit 3s: seed for the rounding bias
- movq mm0, [rsi + 0] ;ip[0]
- movq mm1, [rsi + 8] ;ip[4]
- or rax, 3 ;00030003h
+ movq mm0, [rdx + 0] ;ip[0]
+ movq mm1, [rdx + 8] ;ip[4]
+ movd mm7, rax ;low dword of mm7 = 00030003h
- movq mm2, [rsi + 16] ;ip[8]
- movq mm3, [rsi + 24] ;ip[12]
+ movq mm2, [rdx + 16] ;ip[8]
+ movq mm3, [rdx + 24] ;ip[12]
+ punpcklwd mm7, mm7 ;0003000300030003h
+ mov rdx, arg(1) ;loads are done: rdx is reused as the store base from arg(1)
- movq mm7, rax
- movq mm4, mm0
+ movq mm4, mm0
+ movq mm5, mm1
- punpcklwd mm7, mm7 ;0003000300030003h
- movq mm5, mm1
+ paddw mm4, mm3 ;ip[0] + ip[12] aka a1
+ paddw mm5, mm2 ;ip[4] + ip[8] aka b1
- paddw mm4, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+ movq mm6, mm4 ;temp a1
+ paddw mm4, mm5 ;a1 + b1
+ psubw mm6, mm5 ;a1 - b1
- movq mm6, mm4 ;temp al
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
- paddw mm4, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm1, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm1 ;dl + cl
- psubw mm5, mm1 ;dl - cl
+ movq mm5, mm0 ;temp d1
+ paddw mm0, mm1 ;d1 + c1
+ psubw mm5, mm1 ;d1 - c1
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
- movq mm3, mm4 ; 03 02 01 00
- punpcklwd mm4, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
- movq mm1, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm1, mm5 ; 33 23 32 22
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
- movq mm0, mm4 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
- punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
- movq mm1, mm0
- movq mm5, mm4
-
- paddw mm1, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm1 ;temp al
-
- paddw mm1, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm4, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm4 ;dl + cl
- psubw mm5, mm4 ;dl - cl
+ movq mm1, mm0
+ movq mm5, mm4
+ paddw mm1, mm3 ;ip[0] + ip[12] aka a1
+ paddw mm5, mm2 ;ip[4] + ip[8] aka b1
+
+ movq mm6, mm1 ;temp a1
+ paddw mm1, mm5 ;a1 + b1
+ psubw mm6, mm5 ;a1 - b1
+ paddw mm1, mm7 ;(a1 + b1) + 3
+ paddw mm6, mm7 ;(a1 - b1) + 3
+ psraw mm1, 3 ;arithmetic >> 3
+ psraw mm6, 3 ;arithmetic >> 3
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp d1
+ paddw mm0, mm4 ;d1 + c1
+ psubw mm5, mm4 ;d1 - c1
+ paddw mm0, mm7 ;(d1 + c1) + 3
+ paddw mm5, mm7 ;(d1 - c1) + 3
+ psraw mm0, 3 ;arithmetic >> 3
+ psraw mm5, 3 ;arithmetic >> 3
;~~~~~~~~~~~~~~~~~~~~~
- movq mm3, mm1 ; 03 02 01 00
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm4, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm4, mm5 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
-
- paddw mm0, mm7
- paddw mm1, mm7
- paddw mm2, mm7
- paddw mm3, mm7
-
- psraw mm0, 3
- psraw mm1, 3
- psraw mm2, 3
- psraw mm3, 3
-
-; movq [rdi + 0], mm0
-; movq [rdi + 8], mm1
-; movq [rdi + 16], mm2
-; movq [rdi + 24], mm3
-
- movd eax, mm0
- psrlq mm0, 32
- mov word ptr[rdi+32*0], ax
- shr eax, 16
- mov word ptr[rdi+32*1], ax
- movd eax, mm0
- mov word ptr[rdi+32*2], ax
- shr eax, 16
- mov word ptr[rdi+32*3], ax
-
- movd ecx, mm1
- psrlq mm1, 32
- mov word ptr[rdi+32*4], cx
- shr ecx, 16
- mov word ptr[rdi+32*5], cx
- movd ecx, mm1
- mov word ptr[rdi+32*6], cx
- shr ecx, 16
- mov word ptr[rdi+32*7], cx
-
- movd eax, mm2
- psrlq mm2, 32
- mov word ptr[rdi+32*8], ax
- shr eax, 16
- mov word ptr[rdi+32*9], ax
- movd eax, mm2
- mov word ptr[rdi+32*10], ax
- shr eax, 16
- mov word ptr[rdi+32*11], ax
-
- movd ecx, mm3
- psrlq mm3, 32
- mov word ptr[rdi+32*12], cx
- shr ecx, 16
- mov word ptr[rdi+32*13], cx
- movd ecx, mm3
- mov word ptr[rdi+32*14], cx
- shr ecx, 16
- mov word ptr[rdi+32*15], cx
+
; The replacement stores without a second transpose: word k (k = 0..3) of
; mm1 (a1 + b1) goes to offset 32*(4k) and word k of mm0 (d1 + c1) goes to
; offset 32*(4k+1).
+ movd eax, mm1 ;words 1:0 of (a1 + b1)
+ movd ecx, mm0 ;words 1:0 of (d1 + c1)
+ psrlq mm0, 32 ;expose words 3:2 for the second movd
+ psrlq mm1, 32
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*1], cx
+ shr eax, 16 ;move word 1 into ax
+ shr ecx, 16 ;move word 1 into cx
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*5], cx
+ movd eax, mm1 ;words 3:2 of (a1 + b1)
+ movd ecx, mm0 ;words 3:2 of (d1 + c1)
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*9], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*13], cx
+
; Same pattern for the differences: word k of mm6 (a1 - b1) goes to offset
; 32*(4k+2) and word k of mm5 (d1 - c1) goes to offset 32*(4k+3).
+ movd eax, mm6 ;words 1:0 of (a1 - b1)
+ movd ecx, mm5 ;words 1:0 of (d1 - c1)
+ psrlq mm5, 32
+ psrlq mm6, 32
+ mov word ptr[rdx+32*2], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*6], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, mm6 ;words 3:2 of (a1 - b1)
+ movd ecx, mm5 ;words 3:2 of (d1 - c1)
+ mov word ptr[rdx+32*10], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*14], ax
+ mov word ptr[rdx+32*15], cx
; begin epilog
- pop rdi
- pop rsi
UNSHADOW_ARGS
pop rbp
ret
; Review note: this region is a diff fragment. Lines starting with '-' belong
; to the replaced version, lines starting with '+' to the replacement, and
; unprefixed lines are common to both. The routine's label is outside this view.
;
; SSE2 routine (replacement version), in order:
;   1. loads two 16-byte vectors (8 x 16-bit words each) at offsets 0 and 16
;      from the pointer obtained with arg(0); movdqa requires that address to
;      be 16-byte aligned
;   2. butterfly pass (a1/b1 sums, d1/c1 differences, then their sums and
;      differences), word transpose, and the same butterfly pass again
;   3. adds 3 to every word and arithmetic-shifts each word right by 3
;   4. writes the 16 word results through the pointer obtained with arg(1),
;      one word every 32 bytes (offsets 32*0 .. 32*15)
; The replacement touches only rax, rcx, rdx and xmm0-xmm5, which is
; consistent with dropping SAVE_XMM 6 / RESTORE_XMM and the rsi/rdi push/pop
; pairs of the replaced version.
; arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are macros defined elsewhere.
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
- SAVE_XMM 6
- push rsi
- push rdi
; end prolog
- mov rsi, arg(0)
- mov rdi, arg(1)
- mov rax, 3
+ mov rcx, arg(0) ;rcx = load base, taken from arg(0)
+ mov rdx, arg(1) ;rdx = store base, taken from arg(1)
+ mov rax, 30003h ;two 16-bit 3s: seed for the rounding bias
- movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
- movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
+ movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
- shl rax, 16
- or rax, 3 ;00030003h
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm0 ;ip[4] ip[0]
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
- paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
- movdqa xmm4, xmm0
+ movdqa xmm4, xmm0 ;copy of b1 a1
punpcklqdq xmm0, xmm3 ;d1 a1
punpckhqdq xmm4, xmm3 ;c1 b1
- movd xmm6, eax
- movdqa xmm1, xmm4 ;c1 b1
- paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-;;;temp output
-;; movdqu [rdi + 0], xmm4
-;; movdqu [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
+ movd xmm0, eax ;xmm0 was last read by the transpose above; low dword = 00030003h
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
- pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
+ pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
- movdqa xmm5, xmm4
+ movdqa xmm5, xmm4 ;copy of b1 a1
punpcklqdq xmm4, xmm3 ;d1 a1
punpckhqdq xmm5, xmm3 ;c1 b1
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- paddw xmm5, xmm6
- paddw xmm1, xmm6
-
- psraw xmm5, 3
- psraw xmm1, 3
-
-;; movdqa [rdi + 0], xmm5
-;; movdqa [rdi + 16], xmm1
-
- movd eax, xmm5
- psrldq xmm5, 4
- mov word ptr[rdi+32*0], ax
- shr eax, 16
- mov word ptr[rdi+32*1], ax
- movd eax, xmm5
- psrldq xmm5, 4
- mov word ptr[rdi+32*2], ax
- shr eax, 16
- mov word ptr[rdi+32*3], ax
-
- movd eax, xmm5
- psrldq xmm5, 4
- mov word ptr[rdi+32*4], ax
- shr eax, 16
- mov word ptr[rdi+32*5], ax
- movd eax, xmm5
- mov word ptr[rdi+32*6], ax
- shr eax, 16
- mov word ptr[rdi+32*7], ax
-
- movd eax, xmm1
- psrldq xmm1, 4
- mov word ptr[rdi+32*8], ax
- shr eax, 16
- mov word ptr[rdi+32*9], ax
- movd eax, xmm1
- psrldq xmm1, 4
- mov word ptr[rdi+32*10], ax
- shr eax, 16
- mov word ptr[rdi+32*11], ax
-
- movd eax, xmm1
- psrldq xmm1, 4
- mov word ptr[rdi+32*12], ax
- shr eax, 16
- mov word ptr[rdi+32*13], ax
- movd eax, xmm1
- mov word ptr[rdi+32*14], ax
- shr eax, 16
- mov word ptr[rdi+32*15], ax
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ paddw xmm5, xmm0 ;add 3 to every word
+ paddw xmm4, xmm0 ;add 3 to every word
+ psraw xmm5, 3 ;arithmetic >> 3
+ psraw xmm4, 3 ;arithmetic >> 3
+
; The replacement stores without a second transpose. Low quadwords first:
; word k (k = 0..3) of a1+b1 (xmm5 low) goes to offset 32*(4k) and word k of
; a1-b1 (xmm4 low) goes to offset 32*(4k+2).
+ movd eax, xmm5 ;words 1:0 of a1+b1
+ movd ecx, xmm4 ;words 1:0 of a1-b1
+ psrldq xmm5, 4 ;drop the consumed dword
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*2], cx
+ shr eax, 16 ;move word 1 into ax
+ shr ecx, 16 ;move word 1 into cx
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*6], cx
+ movd eax, xmm5 ;words 3:2 of a1+b1
+ movd ecx, xmm4 ;words 3:2 of a1-b1
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*10], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*14], cx
+
; High quadwords: word k of d1+c1 goes to offset 32*(4k+1) and word k of
; d1-c1 goes to offset 32*(4k+3).
+ movd eax, xmm5 ;words 1:0 of d1+c1
+ movd ecx, xmm4 ;words 1:0 of d1-c1
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*1], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*5], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, xmm5 ;words 3:2 of d1+c1
+ movd ecx, xmm4 ;words 3:2 of d1-c1
+ mov word ptr[rdx+32*9], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*13], ax
+ mov word ptr[rdx+32*15], cx
; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-
; Review note: the '-' lines below delete a read-only data section holding
; three 16-byte-aligned tables of four 16-bit words each. No instruction
; visible in this chunk references x_s1sqr2, x_c1sqr2less1 or fours.
-SECTION_RODATA
-align 16
-x_s1sqr2:
- times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 4 dw 0x4E7B
-align 16
-fours:
- times 4 dw 0x0004