MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
- MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
-
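+ ; (no transpose step: dct matrices are now stored transposed throughout)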
movq mm6, [pw_1 GLOBAL]
paddw mm0, mm6
- paddw mm4, mm6
+ paddw mm2, mm6
psraw mm0, 1
movq [parm1q+ 0],mm0
- psraw mm4, 1
- movq [parm1q+ 8],mm4
- paddw mm1, mm6
+ psraw mm2, 1
+ movq [parm1q+ 8],mm2
paddw mm3, mm6
- psraw mm1, 1
- movq [parm1q+16],mm1
+ paddw mm4, mm6
psraw mm3, 1
- movq [parm1q+24],mm3
+ movq [parm1q+16],mm3
+ psraw mm4, 1
+ movq [parm1q+24],mm4
ret
cglobal x264_idct4x4dc_mmxext
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
- MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
-
movq [parm1q+ 0], mm0
- movq [parm1q+ 8], mm4
- movq [parm1q+16], mm1
- movq [parm1q+24], mm3
+ movq [parm1q+ 8], mm2
+ movq [parm1q+16], mm3
+ movq [parm1q+24], mm4
ret
cglobal x264_sub4x4_dct_mmxext
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
- ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
- MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4
-
movq [r10+ 0], mm1 ; dct
- movq [r10+ 8], mm0
- movq [r10+16], mm4
- movq [r10+24], mm3
+ movq [r10+ 8], mm2
+ movq [r10+16], mm3
+ movq [r10+24], mm0
pop rbx
ret
x264_add4x4_idct_mmxext:
; Load dct coeffs
movq mm0, [parm3q+ 0] ; dct
- movq mm4, [parm3q+ 8]
- movq mm3, [parm3q+16]
- movq mm1, [parm3q+24]
+ movq mm1, [parm3q+ 8]
+ movq mm2, [parm3q+16]
+ movq mm3, [parm3q+24]
mov rax, parm1q ; p_dst
movsxd rcx, parm2d ; i_dst
lea rdx, [rcx+rcx*2]
- ; out:mm0, mm1, mm2, mm3
- MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2
-
MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 (i.e. 1+(3>>1) and (1>>1)-3)
MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [rsi+r9 ], [rcx+r10]
MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]
- SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
- DCT8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm6, xmm9
- SSE2_TRANSPOSE8x8 xmm4, xmm5, xmm7, xmm2, xmm8, xmm3, xmm1, xmm6, xmm0
- DCT8_1D xmm4, xmm3, xmm6, xmm2, xmm0, xmm8, xmm7, xmm5, xmm1, xmm9
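+ ; one transpose instead of two; the output is left transposed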
+ DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
+ SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
+ DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
- movdqa [rdi+0x00], xmm8
+ movdqa [rdi+0x00], xmm4
movdqa [rdi+0x10], xmm3
- movdqa [rdi+0x20], xmm6
- movdqa [rdi+0x30], xmm7
+ movdqa [rdi+0x20], xmm8
+ movdqa [rdi+0x30], xmm2
movdqa [rdi+0x40], xmm0
- movdqa [rdi+0x50], xmm2
- movdqa [rdi+0x60], xmm5
- movdqa [rdi+0x70], xmm1
+ movdqa [rdi+0x50], xmm6
+ movdqa [rdi+0x60], xmm1
+ movdqa [rdi+0x70], xmm7
ret
movdqa xmm6, [rdx+0x60]
movdqa xmm7, [rdx+0x70]
- SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
- IDCT8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm9, xmm6
- SSE2_TRANSPOSE8x8 xmm9, xmm5, xmm1, xmm3, xmm8, xmm0, xmm7, xmm2, xmm4
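+ ; coeffs arrive already transposed, so the leading transpose is gone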
+ IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
+ SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
- IDCT8_1D xmm9, xmm0, xmm2, xmm3, xmm4, xmm8, xmm1, xmm5, xmm6, xmm7
+ IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
MMX_ZERO xmm15
- MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [rdi]
+ MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [rdi]
MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [rdi+rsi]
- MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [rdi+rsi*2]
+ MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [rdi+rsi*2]
lea rax, [rsi+rsi*2]
add rdi, rax
MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [rdi]
- MMX_STORE_DIFF_8P xmm4, xmm14, xmm15, [rdi+rsi]
+ MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [rdi+rsi]
MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [rdi+rsi*2]
- MMX_STORE_DIFF_8P xmm2, xmm14, xmm15, [rdi+rax]
- MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [rdi+rsi*4]
+ MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [rdi+rax]
+ MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [rdi+rsi*4]
ret
tmp[1][1] = d[1][0] - d[1][1];
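+ /* the off-diagonal outputs swap places: the 2x2 dc block is stored transposed too */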
d[0][0] = tmp[0][0] + tmp[0][1];
- d[0][1] = tmp[1][0] + tmp[1][1];
- d[1][0] = tmp[0][0] - tmp[0][1];
+ d[1][0] = tmp[1][0] + tmp[1][1];
+ d[0][1] = tmp[0][0] - tmp[0][1];
d[1][1] = tmp[1][0] - tmp[1][1];
}
s23 = tmp[i][2] + tmp[i][3];
d23 = tmp[i][2] - tmp[i][3];
- d[0][i] = ( s01 + s23 + 1 ) >> 1;
- d[1][i] = ( s01 - s23 + 1 ) >> 1;
- d[2][i] = ( d01 - d23 + 1 ) >> 1;
- d[3][i] = ( d01 + d23 + 1 ) >> 1;
+ d[i][0] = ( s01 + s23 + 1 ) >> 1;
+ d[i][1] = ( s01 - s23 + 1 ) >> 1;
+ d[i][2] = ( d01 - d23 + 1 ) >> 1;
+ d[i][3] = ( d01 + d23 + 1 ) >> 1;
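+ /* indices flipped vs before: the result is stored transposed */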
}
}
for( i = 0; i < 4; i++ )
{
- s01 = d[0][i] + d[1][i];
- d01 = d[0][i] - d[1][i];
- s23 = d[2][i] + d[3][i];
- d23 = d[2][i] - d[3][i];
+ s01 = d[i][0] + d[i][1];
+ d01 = d[i][0] - d[i][1];
+ s23 = d[i][2] + d[i][3];
+ d23 = d[i][2] - d[i][3];
tmp[0][i] = s01 + s23;
tmp[1][i] = s01 - s23;
const int d03 = tmp[i][0] - tmp[i][3];
const int d12 = tmp[i][1] - tmp[i][2];
- dct[0][i] = s03 + s12;
- dct[1][i] = 2*d03 + d12;
- dct[2][i] = s03 - s12;
- dct[3][i] = d03 - 2*d12;
+ dct[i][0] = s03 + s12;
+ dct[i][1] = 2*d03 + d12;
+ dct[i][2] = s03 - s12;
+ dct[i][3] = d03 - 2*d12;
}
}
for( i = 0; i < 4; i++ )
{
- const int s02 = dct[i][0] + dct[i][2];
- const int d02 = dct[i][0] - dct[i][2];
- const int s13 = dct[i][1] + (dct[i][3]>>1);
- const int d13 = (dct[i][1]>>1) - dct[i][3];
+ const int s02 = dct[0][i] + dct[2][i];
+ const int d02 = dct[0][i] - dct[2][i];
+ const int s13 = dct[1][i] + (dct[3][i]>>1);
+ const int d13 = (dct[1][i]>>1) - dct[3][i];
tmp[i][0] = s02 + s13;
tmp[i][1] = d02 + d13;
const int s02 = tmp[0][i] + tmp[2][i];
const int d02 = tmp[0][i] - tmp[2][i];
const int s13 = tmp[1][i] + (tmp[3][i]>>1);
const int d13 = (tmp[1][i]>>1) - tmp[3][i];
d[0][i] = ( s02 + s13 + 32 ) >> 6;
d[1][i] = ( d02 + d13 + 32 ) >> 6;
const int a5 = d07 - d34 - (d25 + (d25>>1));\
const int a6 = d07 + d34 - (d16 + (d16>>1));\
const int a7 = d16 - d25 + (d34 + (d34>>1));\
- SRC(0) = a0 + a1 ;\
- SRC(1) = a4 + (a7>>2);\
- SRC(2) = a2 + (a3>>1);\
- SRC(3) = a5 + (a6>>2);\
- SRC(4) = a0 - a1 ;\
- SRC(5) = a6 - (a5>>2);\
- SRC(6) = (a2>>1) - a3 ;\
- SRC(7) = (a4>>2) - a7 ;\
+ DST(0) = a0 + a1 ;\
+ DST(1) = a4 + (a7>>2);\
+ DST(2) = a2 + (a3>>1);\
+ DST(3) = a5 + (a6>>2);\
+ DST(4) = a0 - a1 ;\
+ DST(5) = a6 - (a5>>2);\
+ DST(6) = (a2>>1) - a3 ;\
+ DST(7) = (a4>>2) - a7 ;\
}
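+/* SRC and DST are separate so a 1-D pass can read one array and write another, transposed */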
static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
int i;
+ int16_t tmp[8][8];
- pixel_sub_wxh( (int16_t*)dct, 8, pix1, i_pix1, pix2, i_pix2 );
+ pixel_sub_wxh( (int16_t*)tmp, 8, pix1, i_pix1, pix2, i_pix2 );
-#define SRC(x) dct[x][i]
+#define SRC(x) tmp[x][i]
+#define DST(x) tmp[x][i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
+#undef DST
-#define SRC(x) dct[i][x]
+#define SRC(x) tmp[i][x]
+#define DST(x) dct[x][i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
+#undef DST
}
static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
dct[0][0] += 32; // rounding for the >>6 at the end
-#define SRC(x) dct[i][x]
-#define DST(x,rhs) dct[i][x] = (rhs)
+#define SRC(x) dct[x][i]
+#define DST(x,rhs) dct[x][i] = (rhs)
for( i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
-#define SRC(x) dct[x][i]
+#define SRC(x) dct[i][x]
#define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
for( i = 0; i < 8; i++ )
IDCT8_1D
dctf->dct4x4dc = x264_dct4x4dc_mmxext;
dctf->idct4x4dc = x264_idct4x4dc_mmxext;
+ }
#ifndef ARCH_X86_64
- dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmxext;
- dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmxext;
+ if( cpu&X264_CPU_MMX )
+ {
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
- dctf->add8x8_idct8 = x264_add8x8_idct8_mmxext;
- dctf->add16x16_idct8= x264_add16x16_idct8_mmxext;
-#endif
+ dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
+ dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
}
#endif
+#endif
#if defined(HAVE_SSE2) && defined(ARCH_X86_64)
if( cpu&X264_CPU_SSE2 )
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
}
#endif
-
+/* FIXME altivec dct is not transposed yet
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
}
#endif
+*/
}
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
- MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
-
movq mm6, [x264_mmx_1 GOT_ebx]
paddw mm0, mm6
- paddw mm4, mm6
+ paddw mm2, mm6
psraw mm0, 1
movq [eax+ 0], mm0
- psraw mm4, 1
- movq [eax+ 8], mm4
- paddw mm1, mm6
+ psraw mm2, 1
+ movq [eax+ 8], mm2
paddw mm3, mm6
- psraw mm1, 1
- movq [eax+16], mm1
+ paddw mm4, mm6
psraw mm3, 1
- movq [eax+24], mm3
+ movq [eax+16], mm3
+ psraw mm4, 1
+ movq [eax+24], mm4
picpop ebx
ret
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
- MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
-
movq [eax+ 0], mm0
- movq [eax+ 8], mm4
- movq [eax+16], mm1
- movq [eax+24], mm3
+ movq [eax+ 8], mm2
+ movq [eax+16], mm3
+ movq [eax+24], mm4
ret
cglobal x264_sub4x4_dct_mmxext
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
- ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
- MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4
-
mov eax, [esp+ 8] ; dct
movq [eax+ 0], mm1
- movq [eax+ 8], mm0
- movq [eax+16], mm4
- movq [eax+24], mm3
+ movq [eax+ 8], mm2
+ movq [eax+16], mm3
+ movq [eax+24], mm0
pop ebx
ret
; Load dct coeffs
mov eax, [esp+12] ; dct
movq mm0, [eax+ 0]
- movq mm4, [eax+ 8]
- movq mm3, [eax+16]
- movq mm1, [eax+24]
+ movq mm1, [eax+ 8]
+ movq mm2, [eax+16]
+ movq mm3, [eax+24]
mov eax, [esp+ 4] ; p_dst
mov ecx, [esp+ 8] ; i_dst
picpush ebx
picgetgot ebx
- ; out:mm0, mm1, mm2, mm3
- MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2
-
MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 (i.e. 1+(3>>1) and (1>>1)-3)
MMX_SUMSUB_BA %1, %2
%endmacro
-%macro MMX_STORE_DIFF_8P 6
- movq %1, %3
- movq %2, %1
- punpcklbw %1, %6
- punpckhbw %2, %6
- paddw %1, %4
- paddw %2, %5
- packuswb %1, %2
- movq %3, %1
-%endmacro
-
cglobal x264_pixel_sub_8x8_mmx
-cglobal x264_xdct8_mmxext
+cglobal x264_pixel_add_8x8_mmx
+cglobal x264_transpose_8x8_mmx
cglobal x264_ydct8_mmx
-
-cglobal x264_xidct8_mmxext
cglobal x264_yidct8_mmx
-cglobal x264_pixel_add_8x8_mmx
ALIGN 16
;-----------------------------------------------------------------------------
pop ebx
ret
-ALIGN 16
-;-----------------------------------------------------------------------------
-; void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-x264_xdct8_mmxext:
- mov eax, [esp+04] ; dest
-
- picpush ebx
- picgetgot ebx
-
- movq mm5, [x264_mmx_PPNN GOT_ebx]
- movq mm6, [x264_mmx_PNNP GOT_ebx]
- movq mm4, [x264_mmx_PPPN GOT_ebx]
- movq mm7, [x264_mmx_PPNP GOT_ebx]
-
- ;-------------------------------------------------------------------------
- ; horizontal dct ( compute 1 row at a time -> 8 loops )
- ;-------------------------------------------------------------------------
-
- %assign disp 0
- %rep 8
-
- movq mm0, [eax+disp]
- movq mm1, [eax+disp+8]
-
- pshufw mm2, mm1, 00011011b
- movq mm1, mm0
- paddw mm0, mm2 ; (low)s07/s16/d25/s34(high)
- psubw mm1, mm2 ; (low)d07/d16/d25/d34(high)
-
- pshufw mm2, mm0, 00011011b ; (low)s34/s25/s16/s07(high)
- pmullw mm0, mm5 ; (low)s07/s16/-s25/-s34(high)
- paddw mm0, mm2 ; (low)a0/a1/a3/a2(high)
-
- movq mm3, mm1
- psraw mm1, 1 ; (low)d07/d16/d25/d34(high) (x>>1)
- pshufw mm2, mm3, 10110001b ; (low)d16/d07/d34/d25(high)
- paddw mm1, mm3 ; (low)d07/d16/d25/d34(high) (x+(x>>1))
- pshufw mm3, mm2, 00011011b ; (low)d25/d34/d07/d16(high)
- pmullw mm2, mm5 ; (low)d16/d07/-d34/-d25(high)
- pmullw mm1, mm6 ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
- paddw mm3, mm2
- paddw mm1, mm3 ; (low)a4/a6/a5/a7(high)
-
-
- pshufw mm2, mm0, 11001001b ; (low)a1/a3/a0/a2(high)
- pshufw mm0, mm0, 10011100b ; (low)a0/a2/a1/a3(high)
- pmullw mm2, [x264_mmx_2121 GOT_ebx]
- pmullw mm0, mm5 ; (low)a0/a2/-a1/-a3(high)
- psraw mm2, 1 ; (low)a1/a3>>1/a0/a2>>1(high)
- paddw mm0, mm2 ; (low)dst0/dst2/dst4/dst6(high)
-
- pshufw mm1, mm1, 00100111b ; (low)a7/a6/a5/a4(high)
- pshufw mm2, mm1, 00011011b ; (low)a4/a5/a6/a7(high)
- psraw mm1, 2 ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
- pmullw mm2, mm4 ; (low)a4/a5/a6/-a7(high)
- pmullw mm1, mm7 ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
- paddw mm1, mm2 ; (low)dst1/dst3/dst5/dst7(high)
-
- movq mm2, mm0
- punpcklwd mm0, mm1 ; (low)dst0/dst1/dst2/dst3(high)
- punpckhwd mm2, mm1 ; (low)dst4/dst5/dst6/dst7(high)
-
- movq [eax+disp], mm0
- movq [eax+disp+8], mm2
-
- %assign disp disp+16
- %endrep
-
- picpop ebx
- ret
-
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
ret
-ALIGN 16
-;-----------------------------------------------------------------------------
-; void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-x264_xidct8_mmxext:
- mov eax, [esp+04] ; dest
-
- picpush ebx
- picgetgot ebx
-
- movq mm4, [x264_mmx_PPNN GOT_ebx]
- movq mm5, [x264_mmx_PNPN GOT_ebx]
- movq mm6, [x264_mmx_PPNP GOT_ebx]
- movq mm7, [x264_mmx_PPPN GOT_ebx]
-
- ;-------------------------------------------------------------------------
- ; horizontal idct ( compute 1 row at a time -> 8 loops )
- ;-------------------------------------------------------------------------
-
- %assign disp 0
- %rep 8
-
- pshufw mm0, [eax+disp], 11011000b ; (low)d0,d2,d1,d3(high)
- pshufw mm2, [eax+disp+8], 11011000b ; (low)d4,d6,d5,d7(high)
- movq mm1, mm0
- punpcklwd mm0, mm2 ; (low)d0,d4,d2,d6(high)
- punpckhwd mm1, mm2 ; (low)d1,d5,d3,d7(high)
-
- pshufw mm2, mm0, 10110001b ; (low)d4,d0,d6,d2(high)
- pmullw mm0, [x264_mmx_p2n2p1p1 GOT_ebx]; (low)2*d0,-2*d4,d2,d6(high)
- pmullw mm2, mm6 ; (low)d4,d0,-d6,d2(high)
- psraw mm0, 1 ; (low)d0,-d4,d2>>1,d6>>1(high)
- paddw mm0, mm2 ; (low)e0,e2,e4,e6(high)
-
- movq mm3, mm1 ; (low)d1,d5,d3,d7(high)
- psraw mm1, 1 ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
- pshufw mm2, mm3, 10110001b ; (low)d5,d1,d7,d3(high)
- paddw mm1, mm3 ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
- pshufw mm3, mm2, 00011011b ; (low)d3,d7,d1,d5(high)
- pmullw mm1, mm4 ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
- pmullw mm2, mm5 ; (low)d5,-d1,d7,-d3(high)
- paddw mm1, mm3
- paddw mm1, mm2 ; (low)e7,e5,e3,e1(high)
-
- pshufw mm2, mm0, 00011011b ; (low)e6,e4,e2,e0(high)
- pmullw mm0, mm4 ; (low)e0,e2,-e4,-e6(high)
- pshufw mm3, mm1, 00011011b ; (low)e1,e3,e5,e7(high)
- psraw mm1, 2 ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
- pmullw mm3, mm6 ; (low)e1,e3,-e5,e7(high)
- pmullw mm1, mm7 ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
- paddw mm0, mm2 ; (low)f0,f2,f4,f6(high)
- paddw mm1, mm3 ; (low)f1,f3,f5,f7(high)
-
- pshufw mm3, mm0, 00011011b ; (low)f6,f4,f2,f0(high)
- pshufw mm2, mm1, 00011011b ; (low)f7,f5,f3,f1(high)
- psubw mm3, mm1
- paddw mm0, mm2
-
- movq [eax+disp], mm0
- movq [eax+disp+8], mm3
-
- %assign disp disp+16
- %endrep
-
- picpop ebx
- ret
-
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
MMX_SUMSUB_BA mm3, mm2 ; mm3 = g2, mm2 = g5
MMX_SUMSUB_BA mm1, mm0 ; mm1 = g3, mm0 = g4
- psraw mm7, 6
- psraw mm6, 6
- psraw mm5, 6
- psraw mm4, 6
- psraw mm3, 6
- psraw mm2, 6
- psraw mm1, 6
- psraw mm0, 6
-
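+ ; (the final >>6 moved to x264_pixel_add_8x8_mmx)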
movq [eax+disp+0*16], mm7
movq [eax+disp+1*16], mm5
movq [eax+disp+2*16], mm3
ALIGN 16
;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_add_8x8_mmx( unit8_t *dst, int i_dst, int16_t src[8][8] );
+; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t src[8][8] );
;-----------------------------------------------------------------------------
x264_pixel_add_8x8_mmx:
mov eax, [esp+04] ; dst
%assign disp 0
%rep 8
- MMX_STORE_DIFF_8P mm0, mm1, [eax], [edx+disp], [edx+disp+8], mm7
+ movq mm0, [eax]         ; one row of dst
+ movq mm2, [edx+disp]    ; diff, low 4 coeffs
+ movq mm3, [edx+disp+8]  ; diff, high 4 coeffs
+ movq mm1, mm0
+ psraw mm2, 6            ; the >>6 that used to live in the idct
+ psraw mm3, 6
+ punpcklbw mm0, mm7      ; widen dst pixels to words (mm7 = 0)
+ punpckhbw mm1, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ packuswb mm0, mm1       ; saturate back to bytes
+ movq [eax], mm0
add eax, ecx
%assign disp disp+16
%endrep
ret
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void __cdecl x264_transpose_8x8_mmx( int16_t src[8][8] );
+;-----------------------------------------------------------------------------
+x264_transpose_8x8_mmx:
+ mov eax, [esp+4]
+
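+ ; transpose the 8x8 as four 4x4 quadrants: the two on the main
+ ; diagonal in place, the two off-diagonal ones swapped with each other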
+ movq mm0, [eax ]
+ movq mm1, [eax+ 16]
+ movq mm2, [eax+ 32]
+ movq mm3, [eax+ 48]
+ MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4
+ movq [eax ], mm0
+ movq [eax+ 16], mm3
+ movq [eax+ 32], mm4
+ movq [eax+ 48], mm2
+
+ movq mm0, [eax+ 72]
+ movq mm1, [eax+ 88]
+ movq mm2, [eax+104]
+ movq mm3, [eax+120]
+ MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4
+ movq [eax+ 72], mm0
+ movq [eax+ 88], mm3
+ movq [eax+104], mm4
+ movq [eax+120], mm2
+
+ movq mm0, [eax+ 8]
+ movq mm1, [eax+ 24]
+ movq mm2, [eax+ 40]
+ movq mm3, [eax+ 56]
+ MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4
+ movq mm1, [eax+ 64]
+ movq mm5, [eax+ 80]
+ movq mm6, [eax+ 96]
+ movq mm7, [eax+112]
+
+ movq [eax+ 64], mm0
+ movq [eax+ 80], mm3
+ movq [eax+ 96], mm4
+ movq [eax+112], mm2
+ MMX_TRANSPOSE mm1, mm5, mm6, mm7, mm4
+ movq [eax+ 8], mm1
+ movq [eax+ 24], mm7
+ movq [eax+ 40], mm4
+ movq [eax+ 56], mm6
+
+ ret
+
void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
void x264_pixel_add_8x8_mmx( uint8_t *pix, int i_pix, uint16_t *diff );
-void x264_xdct8_mmxext( int16_t dct[8][8] );
-void x264_xidct8_mmxext( int16_t dct[8][8] );
+void x264_transpose_8x8_mmx( int16_t src[8][8] );
void x264_ydct8_mmx( int16_t dct[8][8] );
-void x264_yidct8_mmx( int16_t dct[8][8] ); // including >>6 at the end
+void x264_yidct8_mmx( int16_t dct[8][8] );
-inline void x264_sub8x8_dct8_mmxext( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+inline void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
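+ /* vertical dct, transpose, vertical dct again: a full 2-D dct, left transposed */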
x264_pixel_sub_8x8_mmx( (int16_t *)dct, pix1, i_pix1, pix2, i_pix2 );
x264_ydct8_mmx( dct );
- x264_xdct8_mmxext( dct );
+ x264_transpose_8x8_mmx( dct );
+ x264_ydct8_mmx( dct );
}
-void x264_sub16x16_dct8_mmxext( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
- x264_sub8x8_dct8_mmxext( dct[0], pix1, i_pix1, pix2, i_pix2 );
- x264_sub8x8_dct8_mmxext( dct[1], pix1+8, i_pix1, pix2+8, i_pix2 );
- x264_sub8x8_dct8_mmxext( dct[2], pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 );
- x264_sub8x8_dct8_mmxext( dct[3], pix1+8*i_pix1+8, i_pix1, pix2+8*i_pix2+8, i_pix2 );
+ x264_sub8x8_dct8_mmx( dct[0], pix1, i_pix1, pix2, i_pix2 );
+ x264_sub8x8_dct8_mmx( dct[1], pix1+8, i_pix1, pix2+8, i_pix2 );
+ x264_sub8x8_dct8_mmx( dct[2], pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 );
+ x264_sub8x8_dct8_mmx( dct[3], pix1+8*i_pix1+8, i_pix1, pix2+8*i_pix2+8, i_pix2 );
}
-inline void x264_add8x8_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[8][8] )
+inline void x264_add8x8_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[8][8] )
{
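+ /* mirrors the forward path: vertical idct, transpose, vertical idct */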
dct[0][0] += 32;
- x264_xidct8_mmxext( dct );
x264_yidct8_mmx( dct );
- x264_pixel_add_8x8_mmx( dst, i_dst, (uint16_t *)dct );
+ x264_transpose_8x8_mmx( dct );
+ x264_yidct8_mmx( dct );
+ x264_pixel_add_8x8_mmx( dst, i_dst, (uint16_t *)dct ); // including >>6 at the end
}
-void x264_add16x16_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
+void x264_add16x16_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
{
- x264_add8x8_idct8_mmxext( dst, i_dst, dct[0] );
- x264_add8x8_idct8_mmxext( dst+8, i_dst, dct[1] );
- x264_add8x8_idct8_mmxext( dst+8*i_dst, i_dst, dct[2] );
- x264_add8x8_idct8_mmxext( dst+8*i_dst+8, i_dst, dct[3] );
+ x264_add8x8_idct8_mmx( dst, i_dst, dct[0] );
+ x264_add8x8_idct8_mmx( dst+8, i_dst, dct[1] );
+ x264_add8x8_idct8_mmx( dst+8*i_dst, i_dst, dct[2] );
+ x264_add8x8_idct8_mmx( dst+8*i_dst+8, i_dst, dct[3] );
}
#endif
void x264_dct4x4dc_mmxext( int16_t d[4][4] );
void x264_idct4x4dc_mmxext( int16_t d[4][4] );
-void x264_sub8x8_dct8_mmxext( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_sub16x16_dct8_mmxext( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_add8x8_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[8][8] );
+void x264_add16x16_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[4][8][8] );
void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
6, 4, 5, 3, 6, 4, 5, 3, 6, 4, 5, 3, 3, 3, 1, 2, 0
};
+/* zigzags are transposed with respect to the tables in the standard */
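+/* a coefficient at the spec's (row,col) now lives at dct[col][row], so each
+   entry k of a spec scan table becomes (k%4)*4 + k/4 here ((k%8)*8 + k/8 for
+   the 8x8 table); e.g. the spec's scan4[1] = 1 becomes 1*4+0 = 4 below */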
static const int x264_zigzag_scan4[16] =
{
- 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+ 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15
};
static const int x264_zigzag_scan8[64] =
{
- 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
- 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
- 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
- 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
-};
+ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
+};
static const uint8_t block_idx_x[16] =
{
/****************************************************************************
* Scan and Quant functions
****************************************************************************/
-//static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
-//static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
-#define ZIG(i,y,x) level[i] = dct[y][x];
+#define ZIG(i,y,x) level[i] = dct[x][y];
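+/* (y,x) keep the spec's meaning; the transposed matrix is indexed as dct[x][y] */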
static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
#include "config.h"
#endif
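+/* in-place transpose of a w*w matrix of bytes */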
+static void transpose( uint8_t *buf, int w )
+{
+ int i, j;
+ for( i = 0; i < w; i++ )
+ for( j = 0; j < i; j++ )
+ XCHG( uint8_t, buf[w*i+j], buf[w*j+i] );
+}
+
static void scaling_list_write( bs_t *s, x264_pps_t *pps, int idx )
{
const int len = idx<4 ? 16 : 64;
pps->scaling_list[i] = x264_cqm_jvt[i];
break;
case X264_CQM_CUSTOM:
+ /* match the transposed DCT & zigzag */
+ transpose( param->cqm_4iy, 4 );
+ transpose( param->cqm_4ic, 4 );
+ transpose( param->cqm_4py, 4 );
+ transpose( param->cqm_4pc, 4 );
+ transpose( param->cqm_8iy, 8 );
+ transpose( param->cqm_8py, 8 );
pps->scaling_list[CQM_4IY] = param->cqm_4iy;
pps->scaling_list[CQM_4IC] = param->cqm_4ic;
pps->scaling_list[CQM_4PY] = param->cqm_4py;