From dc89ee09f12c5541f391fc89db8458db6cae0950 Mon Sep 17 00:00:00 2001
From: DRC
Date: Mon, 19 Feb 2018 14:16:46 -0600
Subject: [PATCH] 64-bit AVX2 implementation of fast integer FDCT

Still not faster than SSE2.  Improving upon SSE2 performance will
probably require restructuring the algorithm to combine the various
multiply/add operations, but I'm not sure how to do that without
introducing further roundoff error.  Left as an exercise for the reader.

The IFAST FDCT algorithm is sort of a legacy feature anyhow.  Even with
SSE2 instructions, the ISLOW FDCT was almost as fast as the IFAST FDCT.
Since the ISLOW FDCT has been accelerated with AVX2 instructions, it is
now about the same speed as the IFAST FDCT on AVX2-equipped CPUs.
---
 simd/x86_64/jfdctfst-avx2.asm | 342 ++++++++++++++--------------------
 1 file changed, 144 insertions(+), 198 deletions(-)

diff --git a/simd/x86_64/jfdctfst-avx2.asm b/simd/x86_64/jfdctfst-avx2.asm
index 3b881ea..d472ee5 100644
--- a/simd/x86_64/jfdctfst-avx2.asm
+++ b/simd/x86_64/jfdctfst-avx2.asm
@@ -42,6 +42,107 @@ F_0_707 equ DESCALE( 759250124, 30-CONST_BITS)  ; FIX(0.707106781)
 F_1_306 equ DESCALE(1402911301, 30-CONST_BITS)  ; FIX(1.306562965)
 %endif
 
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro ymmtranspose 8
+    ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0xD8
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0x8D
+    ; transpose coefficients(phase 3)
+    ; %1=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
+    ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+    ; %4=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit fast integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dodct 8
+    vpsubw      %5, %1, %4           ; %5=data0_1-data7_6=tmp7_6
+    vpaddw      %6, %1, %4           ; %6=data0_1+data7_6=tmp0_1
+    vpaddw      %7, %2, %3           ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3           ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vpaddw      %1, %6, %7           ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7           ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01     ; %7=tmp11_10
+    vpsignw     %1, %1, [rel PW_1_NEG1]  ; %1=tmp10_neg11
+    vpaddw      %1, %7, %1           ; %1=data0_4
+
+    vperm2i128  %7, %6, %6, 0x01     ; %7=tmp12_13
+    vpaddw      %7, %7, %6           ; %7=(tmp12+13)_(tmp12+13)
+    vpsllw      %7, %7, PRE_MULTIPLY_SCALE_BITS
+    vpmulhw     %7, %7, [rel PW_F0707]   ; %7=z1_z1
+    vpsignw     %7, %7, [rel PW_1_NEG1]  ; %7=z1_negz1
+
+    vperm2i128  %6, %6, %6, 0x00     ; %6=tmp13_13
+    vpaddw      %3, %6, %7           ; %3=data2_6
+
+    ; -- Odd part
+
+    vperm2i128  %6, %8, %5, 0x30     ; %6=tmp4_6
+    vperm2i128  %7, %8, %5, 0x21     ; %7=tmp5_7
+    vpaddw      %6, %6, %7           ; %6=tmp10_12
+
+    vpsllw      %6, %6, PRE_MULTIPLY_SCALE_BITS
+
+    vperm2i128  %7, %6, %6, 0x00     ; %7=tmp10_10
+    vperm2i128  %2, %6, %6, 0x11     ; %2=tmp12_12
+    vpsubw      %7, %7, %2
+    vpmulhw     %7, %7, [rel PW_F0382]   ; %7=z5_z5
+
+    vpmulhw     %6, %6, [rel PW_F0541_F1306]  ; %6=MULTIPLY(tmp10,FIX_0_541196)_MULTIPLY(tmp12,FIX_1_306562)
+    vpaddw      %6, %6, %7           ; %6=z2_z4
+
+    vperm2i128  %7, %8, %5, 0x31     ; %7=tmp5_6
+    vperm2i128  %2, %5, %8, 0x31     ; %2=tmp6_5
+    vpaddw      %7, %7, %2           ; %7=(tmp5+6)_(tmp5+6)
+    vpsllw      %7, %7, PRE_MULTIPLY_SCALE_BITS
+    vpmulhw     %7, %7, [rel PW_F0707]   ; %7=z3_z3
+    vpsignw     %7, %7, [rel PW_NEG1_1]  ; %7=negz3_z3
+
+    vperm2i128  %2, %5, %5, 0x00     ; %2=tmp7_7
+    vpaddw      %2, %2, %7           ; %2=z13_11
+
+    vpsubw      %4, %2, %6           ; %4=z13_11-z2_4=data3_7
+    vpaddw      %2, %2, %6           ; %2=z13_11+z2_4=data5_1
+%endmacro
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
@@ -56,10 +157,14 @@ F_1_306 equ DESCALE(1402911301, 30-CONST_BITS)  ; FIX(1.306562965)
 
 EXTN(jconst_fdct_ifast_avx2):
 
-PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+PW_F0707       times 16 dw F_0_707 << CONST_SHIFT
+PW_F0382       times 16 dw F_0_382 << CONST_SHIFT
+PW_F0541_F1306 times 8  dw F_0_541 << CONST_SHIFT
+               times 8  dw F_1_306 << CONST_SHIFT
+PW_1_NEG1      times 8  dw 1
+               times 8  dw -1
+PW_NEG1_1      times 8  dw -1
+               times 8  dw 1
 
     alignz      32
 
@@ -86,208 +191,49 @@ EXTN(jsimd_fdct_ifast_avx2):
 
     ; ---- Pass 1: process rows.
 
-    mov         rdx, r10                ; (DCTELEM *)
-
-    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-    vpunpckhwd  xmm4, xmm0, xmm1        ; xmm4=(04 14 05 15 06 16 07 17)
-    vpunpcklwd  xmm0, xmm0, xmm1        ; xmm0=(00 10 01 11 02 12 03 13)
-    vpunpckhwd  xmm8, xmm2, xmm3        ; xmm8=(24 34 25 35 26 36 27 37)
-    vpunpcklwd  xmm9, xmm2, xmm3        ; xmm9=(20 30 21 31 22 32 23 33)
-
-    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-    ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
-    ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
-
-    vpunpckhwd  xmm2, xmm6, xmm7        ; xmm2=(44 54 45 55 46 56 47 57)
-    vpunpcklwd  xmm6, xmm6, xmm7        ; xmm6=(40 50 41 51 42 52 43 53)
-    vpunpckhwd  xmm5, xmm1, xmm3        ; xmm5=(64 74 65 75 66 76 67 77)
-    vpunpcklwd  xmm1, xmm1, xmm3        ; xmm1=(60 70 61 71 62 72 63 73)
-
-    vpunpckhdq  xmm10, xmm6, xmm1       ; xmm10=(42 52 62 72 43 53 63 73)
-    vpunpckldq  xmm6, xmm6, xmm1        ; xmm6=(40 50 60 70 41 51 61 71)
-    vpunpckhdq  xmm3, xmm2, xmm5        ; xmm3=(46 56 66 76 47 57 67 77)
-    vpunpckldq  xmm11, xmm2, xmm5       ; xmm11=(44 54 64 74 45 55 65 75)
-
-    vpunpckhdq  xmm7, xmm0, xmm9        ; xmm7=(02 12 22 32 03 13 23 33)
-    vpunpckldq  xmm0, xmm0, xmm9        ; xmm0=(00 10 20 30 01 11 21 31)
-    vpunpckhdq  xmm2, xmm4, xmm8        ; xmm2=(06 16 26 36 07 17 27 37)
-    vpunpckldq  xmm4, xmm4, xmm8        ; xmm4=(04 14 24 34 05 15 25 35)
-
-    vpunpckhqdq xmm1, xmm0, xmm6        ; xmm1=(01 11 21 31 41 51 61 71)=data1
-    vpunpcklqdq xmm0, xmm0, xmm6        ; xmm0=(00 10 20 30 40 50 60 70)=data0
-    vpunpckhqdq xmm5, xmm2, xmm3        ; xmm5=(07 17 27 37 47 57 67 77)=data7
-    vpunpcklqdq xmm2, xmm2, xmm3        ; xmm2=(06 16 26 36 46 56 66 76)=data6
-
-    vpaddw      xmm6, xmm1, xmm2        ; xmm6=data1+data6=tmp1
-    vpaddw      xmm3, xmm0, xmm5        ; xmm3=data0+data7=tmp0
-    vpsubw      xmm8, xmm1, xmm2        ; xmm8=data1-data6=tmp6
-    vpsubw      xmm9, xmm0, xmm5        ; xmm9=data0-data7=tmp7
-
-    vpunpckhqdq xmm1, xmm7, xmm10       ; xmm1=(03 13 23 33 43 53 63 73)=data3
-    vpunpcklqdq xmm7, xmm7, xmm10       ; xmm7=(02 12 22 32 42 52 62 72)=data2
-    vpunpckhqdq xmm0, xmm4, xmm11       ; xmm0=(05 15 25 35 45 55 65 75)=data5
-    vpunpcklqdq xmm4, xmm4, xmm11       ; xmm4=(04 14 24 34 44 54 64 74)=data4
-
-    vpsubw      xmm2, xmm1, xmm4        ; xmm2=data3-data4=tmp4
-    vpsubw      xmm5, xmm7, xmm0        ; xmm5=data2-data5=tmp5
-    vpaddw      xmm1, xmm1, xmm4        ; xmm1=data3+data4=tmp3
-    vpaddw      xmm7, xmm7, xmm0        ; xmm7=data2+data5=tmp2
-
-    ; -- Even part
-
-    vpaddw      xmm4, xmm3, xmm1        ; xmm4=tmp10
-    vpaddw      xmm0, xmm6, xmm7        ; xmm0=tmp11
-    vpsubw      xmm3, xmm3, xmm1        ; xmm3=tmp13
-    vpsubw      xmm6, xmm6, xmm7        ; xmm6=tmp12
-
-    vpaddw      xmm6, xmm6, xmm3
-    vpsllw      xmm6, xmm6, PRE_MULTIPLY_SCALE_BITS
-    vpmulhw     xmm6, xmm6, [rel PW_F0707]  ; xmm6=z1
-
-    vpaddw      xmm1, xmm4, xmm0        ; xmm1=data0
-    vpaddw      xmm7, xmm3, xmm6        ; xmm7=data2
-    vpsubw      xmm10, xmm4, xmm0       ; xmm10=data4
-    vpsubw      xmm11, xmm3, xmm6       ; xmm11=data6
-
-    ; -- Odd part
-
-    vpaddw      xmm2, xmm2, xmm5        ; xmm2=tmp10
-    vpaddw      xmm5, xmm5, xmm8        ; xmm5=tmp11
-    vpaddw      xmm0, xmm8, xmm9        ; xmm0=tmp12, xmm9=tmp7
-
-    vpsllw      xmm2, xmm2, PRE_MULTIPLY_SCALE_BITS
-    vpsllw      xmm0, xmm0, PRE_MULTIPLY_SCALE_BITS
-
-    vpsllw      xmm5, xmm5, PRE_MULTIPLY_SCALE_BITS
-    vpmulhw     xmm5, xmm5, [rel PW_F0707]  ; xmm5=z3
-
-    vpmulhw     xmm4, xmm2, [rel PW_F0541]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-    vpsubw      xmm2, xmm2, xmm0
-    vpmulhw     xmm2, xmm2, [rel PW_F0382]  ; xmm2=z5
-    vpmulhw     xmm0, xmm0, [rel PW_F1306]  ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-    vpaddw      xmm4, xmm4, xmm2        ; xmm4=z2
-    vpaddw      xmm0, xmm0, xmm2        ; xmm0=z4
-
-    vpsubw      xmm6, xmm9, xmm5        ; xmm6=z13
-    vpaddw      xmm3, xmm9, xmm5        ; xmm3=z11
-
-    vpaddw      xmm2, xmm6, xmm4        ; xmm2=data5
-    vpaddw      xmm5, xmm3, xmm0        ; xmm5=data1
-    vpsubw      xmm6, xmm6, xmm4        ; xmm6=data3
-    vpsubw      xmm3, xmm3, xmm0        ; xmm3=data7
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+    ymmtranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data5_1, ymm2=data2_6, ymm3=data3_7
 
     ; ---- Pass 2: process columns.
 
-    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-    vpunpckhwd  xmm4, xmm1, xmm5        ; xmm4=(40 41 50 51 60 61 70 71)
-    vpunpcklwd  xmm1, xmm1, xmm5        ; xmm1=(00 01 10 11 20 21 30 31)
-    vpunpckhwd  xmm9, xmm7, xmm6        ; xmm9=(42 43 52 53 62 63 72 73)
-    vpunpcklwd  xmm8, xmm7, xmm6        ; xmm8=(02 03 12 13 22 23 32 33)
-
-    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-    vpunpcklwd  xmm5, xmm10, xmm2       ; xmm5=(04 05 14 15 24 25 34 35)
-    vpunpckhwd  xmm7, xmm10, xmm2       ; xmm7=(44 45 54 55 64 65 74 75)
-    vpunpcklwd  xmm6, xmm11, xmm3       ; xmm6=(06 07 16 17 26 27 36 37)
-    vpunpckhwd  xmm0, xmm11, xmm3       ; xmm0=(46 47 56 57 66 67 76 77)
-
-    vpunpckhdq  xmm10, xmm5, xmm6       ; xmm10=(24 25 26 27 34 35 36 37)
-    vpunpckldq  xmm5, xmm5, xmm6        ; xmm5=(04 05 06 07 14 15 16 17)
-    vpunpckhdq  xmm3, xmm7, xmm0        ; xmm3=(64 65 66 67 74 75 76 77)
-    vpunpckldq  xmm11, xmm7, xmm0       ; xmm11=(44 45 46 47 54 55 56 57)
-
-    vpunpckhdq  xmm2, xmm1, xmm8        ; xmm2=(20 21 22 23 30 31 32 33)
-    vpunpckldq  xmm1, xmm1, xmm8        ; xmm1=(00 01 02 03 10 11 12 13)
-    vpunpckhdq  xmm7, xmm4, xmm9        ; xmm7=(60 61 62 63 70 71 72 73)
-    vpunpckldq  xmm4, xmm4, xmm9        ; xmm4=(40 41 42 43 50 51 52 53)
-
-    vpunpckhqdq xmm6, xmm1, xmm5        ; xmm6=(10 11 12 13 14 15 16 17)=data1
-    vpunpcklqdq xmm1, xmm1, xmm5        ; xmm1=(00 01 02 03 04 05 06 07)=data0
-    vpunpckhqdq xmm0, xmm7, xmm3        ; xmm0=(70 71 72 73 74 75 76 77)=data7
-    vpunpcklqdq xmm7, xmm7, xmm3        ; xmm7=(60 61 62 63 64 65 66 67)=data6
-
-    vpaddw      xmm5, xmm6, xmm7        ; xmm5=data1+data6=tmp1
-    vpaddw      xmm3, xmm1, xmm0        ; xmm3=data0+data7=tmp0
-    vpsubw      xmm8, xmm6, xmm7        ; xmm6=data1-data6=tmp6
-    vpsubw      xmm9, xmm1, xmm0        ; xmm1=data0-data7=tmp7
-
-    vpunpckhqdq xmm6, xmm2, xmm10       ; xmm6=(30 31 32 33 34 35 36 37)=data3
-    vpunpcklqdq xmm2, xmm2, xmm10       ; xmm2=(20 21 22 23 24 25 26 27)=data2
-    vpunpckhqdq xmm1, xmm4, xmm11       ; xmm1=(50 51 52 53 54 55 56 57)=data5
-    vpunpcklqdq xmm4, xmm4, xmm11       ; xmm4=(40 41 42 43 44 45 46 47)=data4
-
-    vpsubw      xmm7, xmm6, xmm4        ; xmm7=data3-data4=tmp4
-    vpsubw      xmm0, xmm2, xmm1        ; xmm0=data2-data5=tmp5
-    vpaddw      xmm6, xmm6, xmm4        ; xmm6=data3+data4=tmp3
-    vpaddw      xmm2, xmm2, xmm1        ; xmm2=data2+data5=tmp2
-
-    ; -- Even part
-
-    vpaddw      xmm4, xmm3, xmm6        ; xmm4=tmp10
-    vpaddw      xmm1, xmm5, xmm2        ; xmm1=tmp11
-    vpsubw      xmm3, xmm3, xmm6        ; xmm3=tmp13
-    vpsubw      xmm5, xmm5, xmm2        ; xmm5=tmp12
-
-    vpaddw      xmm5, xmm5, xmm3
-    vpsllw      xmm5, xmm5, PRE_MULTIPLY_SCALE_BITS
-    vpmulhw     xmm5, xmm5, [rel PW_F0707]  ; xmm5=z1
-
-    vpaddw      xmm6, xmm4, xmm1        ; xmm6=data0
-    vpaddw      xmm2, xmm3, xmm5        ; xmm2=data2
-    vpsubw      xmm4, xmm4, xmm1        ; xmm4=data4
-    vpsubw      xmm3, xmm3, xmm5        ; xmm3=data6
-
-    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
-    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
-    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
-    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-    ; -- Odd part
-
-    vpaddw      xmm7, xmm7, xmm0        ; xmm7=tmp10
-    vpaddw      xmm0, xmm0, xmm8        ; xmm0=tmp11
-    vpaddw      xmm1, xmm8, xmm9        ; xmm1=tmp12, xmm5=tmp7
-
-    vpsllw      xmm7, xmm7, PRE_MULTIPLY_SCALE_BITS
-    vpsllw      xmm1, xmm1, PRE_MULTIPLY_SCALE_BITS
-
-    vpsllw      xmm0, xmm0, PRE_MULTIPLY_SCALE_BITS
-    vpmulhw     xmm0, xmm0, [rel PW_F0707]  ; xmm0=z3
+    vperm2i128  ymm1, ymm1, ymm1, 0x01  ; ymm1=data1_5
 
-    vpmulhw     xmm4, xmm7, [rel PW_F0541]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-    vpsubw      xmm7, xmm7, xmm1
-    vpmulhw     xmm7, xmm7, [rel PW_F0382]  ; xmm7=z5
-    vpmulhw     xmm1, xmm1, [rel PW_F1306]  ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-    vpaddw      xmm4, xmm4, xmm7        ; xmm4=z2
-    vpaddw      xmm1, xmm1, xmm7        ; xmm1=z4
+    ymmtranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
 
-    vpsubw      xmm5, xmm9, xmm0        ; xmm5=z13
-    vpaddw      xmm3, xmm9, xmm0        ; xmm3=z11
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data5_1, ymm2=data2_6, ymm3=data3_7
 
-    vpaddw      xmm6, xmm5, xmm4        ; xmm6=data5
-    vpaddw      xmm2, xmm3, xmm1        ; xmm2=data1
-    vpsubw      xmm5, xmm5, xmm4        ; xmm5=data3
-    vpsubw      xmm3, xmm3, xmm1        ; xmm3=data7
+    vperm2i128  ymm4, ymm0, ymm1, 0x30  ; ymm4=data0_1
+    vperm2i128  ymm5, ymm2, ymm3, 0x20  ; ymm5=data2_3
+    vperm2i128  ymm6, ymm0, ymm1, 0x21  ; ymm6=data4_5
+    vperm2i128  ymm7, ymm2, ymm3, 0x31  ; ymm7=data6_7
 
-    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
-    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
-    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
-    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
+
     vzeroupper
     uncollect_args 1
     pop         rbp
     ret
--
2.50.1
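
Addendum (not part of the patch): the core trick in the dodct macro above is
lane pairing.  Two rows of eight 16-bit values share one ymm register (for
example tmp10 in the low 128-bit lane and tmp11 in the high lane), and a whole
butterfly is computed with three instructions: vperm2i128 swaps the lanes,
vpsignw against the PW_1_NEG1 constant negates only the high lane, and vpaddw
then yields data0 = tmp10 + tmp11 in the low lane and data4 = tmp10 - tmp11 in
the high lane.  The following minimal C sketch with AVX2 intrinsics mirrors
that three-instruction sequence; it is illustrative only (the variable names
are invented here, and the PRE_MULTIPLY_SCALE_BITS shift and vpmulhw
fixed-point scaling used by the real code are omitted).  Build with something
like "cc -O2 -mavx2 lane_butterfly.c".

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  short in[16], out[16];
  for (int i = 0; i < 8; i++) {
    in[i] = (short)(100 + i);      /* tmp10: low 128-bit lane */
    in[i + 8] = (short)(10 * i);   /* tmp11: high 128-bit lane */
  }
  __m256i tmp10_11 = _mm256_loadu_si256((const __m256i *)in);

  /* vperm2i128 dst, src, src, 0x01: swap the two 128-bit lanes */
  __m256i tmp11_10 = _mm256_permute2x128_si256(tmp10_11, tmp10_11, 0x01);

  /* PW_1_NEG1 equivalent: +1 in the low lane, -1 in the high lane;
     vpsignw negates the high-lane words, giving tmp10_neg11 */
  __m256i pw_1_neg1 = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1,
                                        -1, -1, -1, -1, -1, -1, -1, -1);
  __m256i tmp10_neg11 = _mm256_sign_epi16(tmp10_11, pw_1_neg1);

  /* low lane: tmp11 + tmp10 = data0;  high lane: tmp10 - tmp11 = data4 */
  __m256i data0_4 = _mm256_add_epi16(tmp11_10, tmp10_neg11);

  _mm256_storeu_si256((__m256i *)out, data0_4);
  for (int i = 0; i < 8; i++)
    printf("data0[%d]=%4d  data4[%d]=%4d\n", i, out[i], i, out[i + 8]);
  return 0;
}

The odd part of dodct applies the same pattern with PW_NEG1_1 (signs
reversed), so a single vpaddw produces the z13/z11 pair from tmp7_7 and
negz3_z3.  This is how each dodct invocation evaluates two output rows per
arithmetic instruction instead of one, at the cost of the vperm2i128 and
vpsignw shuffles that the commit message suggests are what keep the AVX2
version from beating SSE2.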