From: Fiona Glaser
Date: Wed, 12 Oct 2011 00:04:32 +0000 (-0700)
Subject: Some more 4:2:2 x86 asm
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b63a73da3add660358a4bad1a590c2d4ed466dc4;p=libx264

Some more 4:2:2 x86 asm

coeff_last8, coeff_level_run8, var2_8x16, predict_8x16c_dc, satd_4x16, intra_mbcmp_8x16c_x3, deblock_h_chroma_422
---

diff --git a/common/deblock.c b/common/deblock.c
index b7ac0dcf..a6c85487 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -647,6 +647,9 @@ void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, in
 void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
 void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
 void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
@@ -736,6 +739,9 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
+#if !HIGH_BIT_DEPTH
+        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
+#endif
         pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
         pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
         pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
@@ -745,12 +751,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         if( cpu&X264_CPU_SSE2 )
         {
             pf->deblock_strength = x264_deblock_strength_sse2;
+            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
+#if !HIGH_BIT_DEPTH
+            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
+#endif
             if( !(cpu&X264_CPU_STACK_MOD4) )
             {
                 pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
                 pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
                 pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
-                pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
                 pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
                 pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
                 pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
@@ -762,12 +771,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         if( cpu&X264_CPU_AVX )
         {
             pf->deblock_strength = x264_deblock_strength_avx;
+            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
+#if !HIGH_BIT_DEPTH
+            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
+#endif
             if( !(cpu&X264_CPU_STACK_MOD4) )
             {
                 pf->deblock_luma[1] = x264_deblock_v_luma_avx;
                 pf->deblock_luma[0] = x264_deblock_h_luma_avx;
                 pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
-                pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
                 pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
                 pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
                 pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
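
For reference, the new deblock_h_chroma_422 functions apply the same normal (tc0-gated) chroma filter as the existing 4:2:0 path, just down the 16 rows of the 8x16 chroma block a 4:2:2 macroblock has (the asm later in this patch loops 32/mmsize times over mmsize/2 rows each). A rough C sketch of the per-row p0/q0 update that DEBLOCK_P0_Q0 performs, with the alpha/beta masking done by LOAD_MASK omitted; the function and variable names here are illustrative, not x264 API:

    #include <stdint.h>

    static inline int clip3( int v, int lo, int hi )
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* One row across a vertical chroma edge: pix[-2..1] = p1 p0 | q0 q1.
     * For chroma, tc is tc0+1; rows whose tc0 is negative are left untouched. */
    static void chroma_deblock_row( uint8_t *pix, int tc )
    {
        int p1 = pix[-2], p0 = pix[-1], q0 = pix[0], q1 = pix[1];
        int delta = clip3( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
        pix[-1] = (uint8_t)clip3( p0 + delta, 0, 255 );
        pix[ 0] = (uint8_t)clip3( q0 - delta, 0, 255 );
    }
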
diff --git a/common/pixel.c b/common/pixel.c
index e7b9984f..ee0148ff 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -547,7 +547,8 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c )
 INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c )
 INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
 
-#if HIGH_BIT_DEPTH && HAVE_MMX
+#if HAVE_MMX
+#if HIGH_BIT_DEPTH
 INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
 INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
 INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
@@ -559,6 +560,17 @@ INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
 INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
 INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
 INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
+#else
+#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 )
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 )
+#endif
 #endif
 
 // No C implementation of intra_satd_x9. See checkasm for its behavior,
@@ -820,17 +832,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( sad, _mmx2 );
         INIT7( sad_x3, _mmx2 );
         INIT7( sad_x4, _mmx2 );
-        INIT7( satd, _mmx2 );
+        INIT8( satd, _mmx2 );
         INIT7( satd_x3, _mmx2 );
         INIT7( satd_x4, _mmx2 );
         INIT4( hadamard_ac, _mmx2 );
-        INIT7( ssd, _mmx2 );
+        INIT8( ssd, _mmx2 );
         INIT_ADS( _mmx2 );
 
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
+#if ARCH_X86
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
+#endif
 
         pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
         pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
@@ -856,6 +871,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
     }
     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
@@ -941,7 +957,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     INIT8_NAME( sad_aligned, sad, _mmx2 );
     INIT7( sad_x3, _mmx2 );
     INIT7( sad_x4, _mmx2 );
-    INIT7( satd, _mmx2 );
+    INIT8( satd, _mmx2 );
     INIT7( satd_x3, _mmx2 );
     INIT7( satd_x4, _mmx2 );
     INIT4( hadamard_ac, _mmx2 );
@@ -956,6 +972,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
     pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
     pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
+    pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
     pixf->vsad = x264_pixel_vsad_mmx2;
 
     if( cpu&X264_CPU_CACHELINE_32 )
@@ -984,6 +1001,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #endif
     pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
     pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
+    pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
+    pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_mmx2;
    pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
     pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
     pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
@@ -1005,6 +1024,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
 #endif
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
         pixf->vsad = x264_pixel_vsad_sse2;
     }
 
@@ -1014,6 +1034,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x3, _sse2 );
         INIT2( sad_x4, _sse2 );
         INIT6( satd, _sse2 );
+        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
         INIT6( satd_x3, _sse2 );
         INIT6( satd_x4, _sse2 );
         if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1024,6 +1045,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
         pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
         pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
+        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
+        pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_sse2;
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( ssd, _sse2); /* faster for width 16 on p4 */
@@ -1083,15 +1106,17 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT8( ssd, _ssse3 );
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
-        INIT7( satd, _ssse3 );
+        INIT8( satd, _ssse3 );
         INIT7( satd_x3, _ssse3 );
         INIT7( satd_x4, _ssse3 );
     }
     pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
     pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
+    pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
     pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
     pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
     pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
+    pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
     if( cpu&X264_CPU_CACHELINE_64 )
     {
         INIT2( sad, _cache64_ssse3 );
@@ -1106,7 +1131,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
     if( cpu&X264_CPU_SSE4 )
     {
-        INIT7( satd, _sse4 );
+        INIT8( satd, _sse4 );
         INIT7( satd_x3, _sse4 );
         INIT7( satd_x4, _sse4 );
         if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1121,11 +1146,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
+        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
     }
 
     if( cpu&X264_CPU_AVX )
     {
-        INIT7( satd, _avx );
+        INIT8( satd, _avx );
         INIT7( satd_x3, _avx );
         INIT7( satd_x4, _avx );
         INIT_ADS( _avx );
@@ -1142,6 +1168,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT5( ssd, _avx );
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
+        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
         pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx;
@@ -1163,10 +1190,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT5( ssd, _xop );
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
+        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
         pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
     }
 
 #endif //HAVE_MMX
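
The intra_*_x3_8x16c entries registered above are generated by the INTRA_MBCMP macro: each one runs the dc, h and v chroma predictions into the reconstruction buffer and scores every candidate against the source block with the named metric (sad or satd). Roughly, in C — a sketch only; the real functions are macro-generated and dispatch to the asm predict/pixel primitives, so the extern declarations below are stand-ins, and FDEC_STRIDE/FENC_STRIDE are x264's fixed scratch-buffer strides (32 and 16 in the 8-bit build):

    #include <stdint.h>

    extern void predict_8x16c_dc( uint8_t *fdec );
    extern void predict_8x16c_h ( uint8_t *fdec );
    extern void predict_8x16c_v ( uint8_t *fdec );
    extern int  satd_8x16( uint8_t *p1, int s1, uint8_t *p2, int s2 );

    #define FDEC_STRIDE 32
    #define FENC_STRIDE 16

    /* Sketch of x264_intra_satd_x3_8x16c( fenc, fdec, res ):
     * res[0]=dc, res[1]=h, res[2]=v, matching the macro's argument order. */
    static void intra_satd_x3_8x16c_sketch( uint8_t *fenc, uint8_t *fdec, int res[3] )
    {
        predict_8x16c_dc( fdec );
        res[0] = satd_8x16( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
        predict_8x16c_h( fdec );
        res[1] = satd_8x16( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
        predict_8x16c_v( fdec );
        res[2] = satd_8x16( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
    }
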
diff --git a/common/quant.c b/common/quant.c
index 5b3a90d4..a6116b4c 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -435,13 +435,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
         }
         pf->decimate_score64 = x264_decimate_score64_mmx2;
-        pf->coeff_last4 = x264_coeff_last4_mmx2;
+        pf->coeff_last8 = x264_coeff_last8_mmx2;
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
         pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
+        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
 #endif
+        pf->coeff_last4 = x264_coeff_last4_mmx2;
         pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
         if( cpu&X264_CPU_LZCNT )
             pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
@@ -464,17 +466,21 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
             pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
         }
+        pf->coeff_last8 = x264_coeff_last8_sse2;
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
+        pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
         if( cpu&X264_CPU_LZCNT )
         {
             pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+            pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
             pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
             pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
             pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+            pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
         }
@@ -555,11 +561,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
 #endif
         pf->coeff_last4 = x264_coeff_last4_mmx2;
+        pf->coeff_last8 = x264_coeff_last8_mmx2;
         pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
+        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
         if( cpu&X264_CPU_LZCNT )
         {
             pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+            pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
             pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
+            pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
         }
     }
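
coeff_last8 and coeff_level_run8 are the 8-coefficient variants needed because a 4:2:2 macroblock carries a 2x4 chroma DC block (eight coefficients per plane) instead of 2x2. Behaviourally, coeff_last8 just returns the index of the last nonzero coefficient, or -1 if there is none — the same contract as the existing coeff_last4/15/16/64; coeff_level_run8 additionally walks back from that position collecting the (level, run) pairs used by CAVLC. A trivial C sketch of the coeff_last8 contract (dctcoef is int16_t in the 8-bit build, int32_t at high bit depth; the name here is illustrative):

    #include <stdint.h>

    static int coeff_last8_sketch( const int16_t *dct )
    {
        for( int i = 7; i >= 0; i-- )
            if( dct[i] )
                return i;
        return -1;
    }
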
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 65c667de..0459ee80 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -1881,6 +1881,48 @@ INIT_MMX mmx2
 DEBLOCK_CHROMA
 %endif
 
+%macro DEBLOCK_H_CHROMA_422 0
+cglobal deblock_h_chroma_422, 5,7,8
+%ifdef ARCH_X86_64
+    %define cntr r11
+%else
+    %define cntr dword r0m
+%endif
+    dec    r2d
+    dec    r3d
+    sub    r0, 4
+    lea    t6, [r1*3]
+    mov    t5, r0
+    add    r0, t6
+    mov    cntr, 32/mmsize
+.skip_prologue:
+    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+    LOAD_MASK r2d, r3d
+    movd      m6, [r4] ; tc0
+    punpcklbw m6, m6
+%if mmsize == 16
+    punpcklbw m6, m6
+    punpcklbw m6, m6
+%else
+    pshufw    m6, m6, q0000
+%endif
+    pand      m7, m6
+    DEBLOCK_P0_Q0
+    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+    lea    r0, [r0+r1*(mmsize/2)]
+    lea    t5, [t5+r1*(mmsize/2)]
+    add    r4, mmsize/8
+    dec    cntr
+    jg .skip_prologue
+    REP_RET
+%endmacro
+
+INIT_MMX mmx2
+DEBLOCK_H_CHROMA_422
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_422
+INIT_XMM avx
+DEBLOCK_H_CHROMA_422
 
 ; in: %1=p0 %2=p1 %3=q1
 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 2234f979..619af7c4 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -213,6 +213,7 @@ cglobal pixel_ssd_%1x%2, 4,5
 INIT_MMX mmx2
 SSD_ONE 4,  4
 SSD_ONE 4,  8
+SSD_ONE 4, 16
 SSD_ONE 8,  4
 SSD_ONE 8,  8
 SSD_ONE 8, 16
@@ -806,12 +807,12 @@ INIT_XMM xop
 VAR
 %endif ; !HIGH_BIT_DEPTH
 
-%macro VAR2_END 0
+%macro VAR2_END 1
     HADDW   m5, m7
    movd    r1d, m5
     imul    r1d, r1d
     HADDD   m6, m1
-    shr     r1d, 6
+    shr     r1d, %1
     movd    eax, m6
     mov    [r4], eax
     sub     eax, r1d ; sqr - (sum * sum >> shift)
@@ -821,11 +822,11 @@ VAR
 ;-----------------------------------------------------------------------------
 ; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
 ;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_var2_8x8, 5,6
+%macro VAR2_8x8_MMX 2
+cglobal pixel_var2_8x%1, 5,6
     FIX_STRIDES r1, r3
     VAR_START 0
-    mov      r5d, 8
+    mov      r5d, %1
 .loop:
 %ifdef HIGH_BIT_DEPTH
     mova      m0, [r0]
@@ -854,13 +855,19 @@ cglobal pixel_var2_8x8, 5,6
     add       r2, r3
     dec       r5d
     jg .loop
-    VAR2_END
-    RET
+    VAR2_END %2
+%endmacro
 
-INIT_XMM sse2
-cglobal pixel_var2_8x8, 5,6,8
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+VAR2_8x8_MMX  8, 6
+VAR2_8x8_MMX 16, 7
+%endif
+
+%macro VAR2_8x8_SSE2 2
+cglobal pixel_var2_8x%1, 5,6,8
     VAR_START 1
-    mov      r5d, 4
+    mov      r5d, %1/2
 .loop:
 %ifdef HIGH_BIT_DEPTH
     mova      m0, [r0]
@@ -886,16 +893,20 @@ cglobal pixel_var2_8x8, 5,6,8
     lea       r2, [r2+r3*2*SIZEOF_PIXEL]
     dec       r5d
     jg .loop
-    VAR2_END
-    RET
+    VAR2_END %2
+%endmacro
+
+INIT_XMM sse2
+VAR2_8x8_SSE2  8, 6
+VAR2_8x8_SSE2 16, 7
 
 %ifndef HIGH_BIT_DEPTH
-%macro VAR2_8x8 0
-cglobal pixel_var2_8x8, 5,6,8
+%macro VAR2_8x8_SSSE3 2
+cglobal pixel_var2_8x%1, 5,6,8
     pxor      m5, m5    ; sum
     pxor      m6, m6    ; sum squared
     mova      m7, [hsub_mul]
-    mov      r5d, 2
+    mov      r5d, %1/4
 .loop:
     movq      m0, [r0]
     movq      m2, [r2]
@@ -931,14 +942,15 @@ cglobal pixel_var2_8x8, 5,6,8
     lea       r2, [r2+r3*2]
     dec       r5d
     jg .loop
-    VAR2_END
-    RET
+    VAR2_END %2
 %endmacro
 
 INIT_XMM ssse3
-VAR2_8x8
+VAR2_8x8_SSSE3  8, 6
+VAR2_8x8_SSSE3 16, 7
 INIT_XMM xop
-VAR2_8x8
+VAR2_8x8_SSSE3  8, 6
+VAR2_8x8_SSSE3 16, 7
 
 %endif ; !HIGH_BIT_DEPTH
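
VAR2_END now takes the normalization shift as a parameter because the new 8x16 variant sums twice as many pixels: the return value is ssd - (sum*sum >> 6) for 8x8 and ssd - (sum*sum >> 7) for 8x16, i.e. the shift is log2 of the pixel count (see the 6/7 arguments passed to the macros above). In C, the behaviour being implemented is roughly the following 8-bit sketch; like the asm, the raw SSD is also stored through the last argument:

    #include <stdint.h>

    static int pixel_var2_8xh_sketch( const uint8_t *p1, int stride1,
                                      const uint8_t *p2, int stride2,
                                      int h, int *ssd_out )
    {
        int sum = 0, ssd = 0;
        for( int y = 0; y < h; y++, p1 += stride1, p2 += stride2 )
            for( int x = 0; x < 8; x++ )
            {
                int d = p1[x] - p2[x];
                sum += d;
                ssd += d * d;
            }
        *ssd_out = ssd;
        return ssd - ( (sum * sum) >> (h == 16 ? 7 : 6) );
    }
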
@@ -1215,6 +1227,17 @@ cglobal pixel_satd_8x4, 4,6
     call pixel_satd_8x4_internal_mmx2
     SATD_END_MMX
 
+cglobal pixel_satd_4x16, 4,6
+    SATD_START_MMX
+    SATD_4x4_MMX m0, 0, 1
+    SATD_4x4_MMX m1, 0, 1
+    paddw        m0, m1
+    SATD_4x4_MMX m1, 0, 1
+    paddw        m0, m1
+    SATD_4x4_MMX m1, 0, 0
+    paddw        m0, m1
+    SATD_END_MMX
+
 cglobal pixel_satd_4x8, 4,6
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 1
@@ -1261,32 +1284,7 @@ cglobal pixel_satd_4x4, 4,6
 %endif
 %endmacro
 
-;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SATDS_SSE2 0
-%if cpuflag(ssse3)
-cglobal pixel_satd_4x4, 4, 6, 6
-    SATD_START_MMX
-    mova m4, [hmul_4p]
-    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
-    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
-    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
-    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
-    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
-    HADAMARD 0, sumsub, 0, 1, 2, 3
-    HADAMARD 4, sumsub, 0, 1, 2, 3
-    HADAMARD 1, amax, 0, 1, 2, 3
-    HADDW m0, m1
-    movd eax, m0
-    RET
-%endif
-
-cglobal pixel_satd_4x8, 4, 6, 8
-    SATD_START_MMX
-%if cpuflag(ssse3)
-    mova m7, [hmul_4p]
-%endif
+%macro SATD_4x8_SSE 2
     movd m4, [r2]
     movd m5, [r2+r3]
     movd m6, [r2+2*r3]
     movd m7, [r2+r5]
     JDUP m4, m5
     movd m3, [r0]
     JDUP m6, m7
     movd m5, [r0+r1]
     JDUP m0, m3
     movd m3, [r0+2*r1]
     JDUP m1, m3
+%if cpuflag(ssse3) && %1==1
+    mova m3, [hmul_4p]
+    DIFFOP 0, 4, 1, 5, 3
+%else
     DIFFOP 0, 4, 1, 5, 7
+%endif
     movd m5, [r2]
     add r2, r5
     movd m3, [r0]
     add r0, r4
     movd m4, [r2+r3]
     JDUP m5, m4
     movd m4, [r0+r1]
     JDUP m3, m4
+%if cpuflag(ssse3) && %1==1
+    mova m4, [hmul_4p]
+    DIFFOP 2, 6, 3, 5, 4
+%else
     DIFFOP 2, 6, 3, 5, 7
-    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6, swap
-    HADDW m6, m1
-    movd eax, m6
+%endif
+    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+%macro SATDS_SSE2 0
+%if cpuflag(ssse3)
+cglobal pixel_satd_4x4, 4, 6, 6
+    SATD_START_MMX
+    mova m4, [hmul_4p]
+    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
+    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
+    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
+    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
+    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
+    HADAMARD 0, sumsub, 0, 1, 2, 3
+    HADAMARD 4, sumsub, 0, 1, 2, 3
+    HADAMARD 1, amax, 0, 1, 2, 3
+    HADDW m0, m1
+    movd eax, m0
+    RET
+%endif
+
+cglobal pixel_satd_4x8, 4, 6, 8
+    SATD_START_MMX
+%if cpuflag(ssse3)
+    mova m7, [hmul_4p]
+%endif
+    SATD_4x8_SSE 0, swap
+    HADDW m7, m1
+    movd eax, m7
+    RET
+
+cglobal pixel_satd_4x16, 4, 6, 8
+    SATD_START_MMX
+%if cpuflag(ssse3)
+    mova m7, [hmul_4p]
+%endif
+    SATD_4x8_SSE 0, swap
+    lea r0, [r0+r1*2]
+    lea r2, [r2+r3*2]
+    SATD_4x8_SSE 1, add
+    HADDW m7, m1
+    movd eax, m7
     RET
 
 cglobal pixel_satd_8x8_internal
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index b682efe0..5a88b0f2 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -147,6 +147,10 @@ int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
 int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
 int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
 int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
+int x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
+int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
 
 int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
 int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
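
pixel_satd_4x16 is simply the 4x4 Hadamard SATD accumulated over four vertically stacked 4x4 sub-blocks — which is literally what the MMX2 version above does with four SATD_4x4_MMX invocations, and what the SSE versions do as two SATD_4x8_SSE passes. For reference, a plain C sketch of one 4x4 SATD and of the 4x16 sum (this follows x264's convention of halving the summed absolute transform coefficients; names are illustrative):

    #include <stdint.h>
    #include <stdlib.h>

    static int satd_4x4_sketch( const uint8_t *pix1, int i1, const uint8_t *pix2, int i2 )
    {
        int tmp[4][4], sum = 0;
        /* Horizontal 4-point Hadamard of each row of differences... */
        for( int y = 0; y < 4; y++, pix1 += i1, pix2 += i2 )
        {
            int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
            int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
            int t0 = a0 + a1, t1 = a0 - a1, t2 = a2 + a3, t3 = a2 - a3;
            tmp[y][0] = t0 + t2; tmp[y][2] = t0 - t2;
            tmp[y][1] = t1 + t3; tmp[y][3] = t1 - t3;
        }
        /* ...then vertical Hadamard and sum of absolute values. */
        for( int x = 0; x < 4; x++ )
        {
            int t0 = tmp[0][x] + tmp[1][x], t1 = tmp[0][x] - tmp[1][x];
            int t2 = tmp[2][x] + tmp[3][x], t3 = tmp[2][x] - tmp[3][x];
            sum += abs(t0 + t2) + abs(t0 - t2) + abs(t1 + t3) + abs(t1 - t3);
        }
        return sum >> 1;
    }

    static int satd_4x16_sketch( const uint8_t *pix1, int i1, const uint8_t *pix2, int i2 )
    {
        int sum = 0;
        for( int i = 0; i < 4; i++ )
            sum += satd_4x4_sketch( pix1 + 4*i*i1, i1, pix2 + 4*i*i2, i2 );
        return sum;
    }
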
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 973233c0..39017f66 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -1675,6 +1675,16 @@ PREDICT_C_H 16
 ; void predict_8x8c_dc( pixel *src )
 ;-----------------------------------------------------------------------------
 
+%macro LOAD_LEFT 1
+    movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
+    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
+    add   r1d, r2d
+    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
+    add   r1d, r2d
+    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
+    add   r1d, r2d
+%endmacro
+
 %macro PREDICT_8x8C_DC 0
 cglobal predict_8x8c_dc, 1,3
     pxor m7, m7
@@ -1691,23 +1701,10 @@ cglobal predict_8x8c_dc, 1,3
 %endif
     add r0, FDEC_STRIDEB*4
 
-    movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
-    movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
-    add   r1d, r2d
-    movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
-    add   r1d, r2d
-    movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
-    add   r1d, r2d
-    movd  m2, r1d ; s2
-
-    movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
-    movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
-    add   r1d, r2d
-    movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
-    add   r1d, r2d
-    movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
-    add   r1d, r2d
-    movd  m3, r1d ; s3
+    LOAD_LEFT 0 ; s2
+    movd  m2, r1d
+    LOAD_LEFT 4 ; s3
+    movd  m3, r1d
 
     punpcklwd m0, m1
     punpcklwd m2, m3
@@ -1766,6 +1763,124 @@ INIT_MMX sse2
 PREDICT_8x8C_DC
 %endif
 
+%ifdef HIGH_BIT_DEPTH
+%macro STORE_4LINES 3
+%if cpuflag(sse2)
+    movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
+    movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
+    movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
+    movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
+%else
+    movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
+    movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
+    movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
+    movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
+    movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
+    movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
+    movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
+    movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
+%endif
+%endmacro
+%else
+%macro STORE_4LINES 2
+    movq [r0+FDEC_STRIDEB*(%2-4)], %1
+    movq [r0+FDEC_STRIDEB*(%2-3)], %1
+    movq [r0+FDEC_STRIDEB*(%2-2)], %1
+    movq [r0+FDEC_STRIDEB*(%2-1)], %1
+%endmacro
+%endif
+
+%macro PREDICT_8x16C_DC 0
+cglobal predict_8x16c_dc, 1,3
+    pxor m7, m7
+%ifdef HIGH_BIT_DEPTH
+    movq m0, [r0-FDEC_STRIDEB+0]
+    movq m1, [r0-FDEC_STRIDEB+8]
+    HADDW m0, m2
+    HADDW m1, m2
+%else
+    movd m0, [r0-FDEC_STRIDEB+0]
+    movd m1, [r0-FDEC_STRIDEB+4]
+    psadbw m0, m7 ; s0
+    psadbw m1, m7 ; s1
+%endif
+    punpcklwd m0, m1 ; s0, s1
+
+    add r0, FDEC_STRIDEB*4
+    LOAD_LEFT 0 ; s2
+    pinsrw m0, r1d, 2
+    LOAD_LEFT 4 ; s3
+    pinsrw m0, r1d, 3 ; s0, s1, s2, s3
+    add r0, FDEC_STRIDEB*8
+    LOAD_LEFT 0 ; s4
+    pinsrw m1, r1d, 2
+    LOAD_LEFT 4 ; s5
+    pinsrw m1, r1d, 3 ; s1, __, s4, s5
+    sub r0, FDEC_STRIDEB*8
+
+    pshufw m2, m0, q1310 ; s0, s1, s3, s1
+    pshufw m0, m0, q3312 ; s2, s1, s3, s3
+    pshufw m3, m1, q0302 ; s4, s1, s5, s1
+    pshufw m1, m1, q3322 ; s4, s4, s5, s5
+    paddw m0, m2
+    paddw m1, m3
+    psrlw m0, 2
+    psrlw m1, 2
+    pavgw m0, m7
+    pavgw m1, m7
+%ifdef HIGH_BIT_DEPTH
+%if cpuflag(sse2)
+    movq2dq xmm0, m0
+    movq2dq xmm1, m1
+    punpcklwd xmm0, xmm0
+    punpcklwd xmm1, xmm1
+    pshufd xmm2, xmm0, q3322
+    pshufd xmm3, xmm1, q3322
+    punpckldq xmm0, xmm0
+    punpckldq xmm1, xmm1
+    STORE_4LINES xmm0, xmm0, 0
+    STORE_4LINES xmm2, xmm2, 4
+    STORE_4LINES xmm1, xmm1, 8
+    STORE_4LINES xmm3, xmm3, 12
+%else
+    pshufw m2, m0, q0000
+    pshufw m3, m0, q1111
+    pshufw m4, m0, q2222
+    pshufw m5, m0, q3333
+    STORE_4LINES m2, m3, 0
+    STORE_4LINES m4, m5, 4
+    pshufw m2, m1, q0000
+    pshufw m3, m1, q1111
+    pshufw m4, m1, q2222
+    pshufw m5, m1, q3333
+    STORE_4LINES m2, m3, 8
+    STORE_4LINES m4, m5, 12
+%endif
+%else
+    packuswb m0, m0 ; dc0, dc1, dc2, dc3
+    packuswb m1, m1 ; dc4, dc5, dc6, dc7
+    punpcklbw m0, m0
+    punpcklbw m1, m1
+    pshufw m2, m0, q1100
+    pshufw m3, m0, q3322
+    pshufw m4, m1, q1100
+    pshufw m5, m1, q3322
+    STORE_4LINES m2, 0
+    STORE_4LINES m3, 4
+    add r0, FDEC_STRIDEB*8
+    STORE_4LINES m4, 0
+    STORE_4LINES m5, 4
+%endif
+    RET
+%endmacro
+
+INIT_MMX mmx2
+PREDICT_8x16C_DC
+%ifdef HIGH_BIT_DEPTH
+INIT_MMX sse2
+PREDICT_8x16C_DC
+%endif
+
 %macro PREDICT_C_DC_TOP 1
 %ifdef HIGH_BIT_DEPTH
 INIT_XMM
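
predict_8x16c_dc fills each of the eight 4x4 sub-blocks of the 8x16 chroma plane with its own DC value. Reading the MMX code above: s0/s1 are the sums of the left and right halves of the top neighbour row, and s2..s5 are the sums of successive groups of four left-neighbour samples; the shuffle/psrlw/pavgw sequence then produces the per-sub-block DCs below. This is a sketch of the arithmetic only, derived from the code, with the neighbour sums passed in explicitly rather than gathered from the frame buffer:

    static void predict_8x16c_dc_values_sketch( int s0, int s1, int s2, int s3,
                                                int s4, int s5, int dc[8] )
    {
        dc[0] = ( s0 + s2 + 4 ) >> 3;   /* rows 0-3,   left half: top + left */
        dc[1] = ( s1 + 2 ) >> 2;        /* rows 0-3,   right half: top only  */
        dc[2] = ( s3 + 2 ) >> 2;        /* rows 4-7,   left half: left only  */
        dc[3] = ( s1 + s3 + 4 ) >> 3;   /* rows 4-7,   right half            */
        dc[4] = ( s4 + 2 ) >> 2;        /* rows 8-11,  left half             */
        dc[5] = ( s1 + s4 + 4 ) >> 3;   /* rows 8-11,  right half            */
        dc[6] = ( s5 + 2 ) >> 2;        /* rows 12-15, left half             */
        dc[7] = ( s1 + s5 + 4 ) >> 3;   /* rows 12-15, right half            */
    }
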
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 009234f7..86abc4c6 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -200,7 +200,7 @@ PREDICT_8x8_P( sse2 )
     H += -4 * src[-1*FDEC_STRIDE -1];\
     int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
     int b = ( 17 * H + 16 ) >> 5;\
-    int c = ( 17 * V + 16 ) >> 5;\
+    int c = ( 17 * V + 16 ) >> 5;
 
 #if HIGH_BIT_DEPTH
 #define PREDICT_8x8_P2(cpu1, cpu2)\
@@ -383,16 +383,21 @@ void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
     if( !(cpu&X264_CPU_MMX) )
         return;
 #if HIGH_BIT_DEPTH
-    pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
+    if( !(cpu&X264_CPU_MMX2) )
+        return;
+    pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
     if( !(cpu&X264_CPU_SSE2) )
         return;
+    pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
     pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
+    pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2;
     pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
 #else
     pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
     if( !(cpu&X264_CPU_MMX2) )
         return;
     pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2;
+    pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
     pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
     if( !(cpu&X264_CPU_SSSE3) )
         return;
diff --git a/common/x86/predict.h b/common/x86/predict.h
index 3537f02e..63e08de1 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -50,6 +50,8 @@ void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
 void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
 void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
 void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_dc_mmx2( pixel *src );
+void x264_predict_8x16c_dc_sse2( uint16_t *src );
 void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
 void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
 void x264_predict_8x16c_v_mmx( uint8_t *src );
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 6bb25c7a..1087f2c3 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -1157,12 +1157,25 @@ DECIMATE8x8
     pmovmskb %2, mm0
 %elif mmsize == 16
     movdqa xmm0, [%3+ 0]
+%if %1 == 8
+    packssdw xmm0, [%3+16]
+    packsswb xmm0, xmm0
+%else
     movdqa xmm1, [%3+32]
     packssdw xmm0, [%3+16]
     packssdw xmm1, [%3+48]
     packsswb xmm0, xmm1
+%endif
     pcmpeqb xmm0, xmm2
     pmovmskb %2, xmm0
+%elif %1 == 8
+    movq mm0, [%3+ 0]
+    movq mm1, [%3+16]
+    packssdw mm0, [%3+ 8]
+    packssdw mm1, [%3+24]
+    packsswb mm0, mm1
+    pcmpeqb mm0, mm2
+    pmovmskb %2, mm0
 %else
     movq mm0, [%3+ 0]
     movq mm1, [%3+16]
@@ -1198,11 +1211,38 @@ COEFF_LAST4
 INIT_MMX mmx2, lzcnt
 COEFF_LAST4
 
+%macro COEFF_LAST8 0
+cglobal coeff_last8, 1,3
+    pxor m2, m2
+    LAST_MASK 8, r1d, r0
+%if mmsize == 16
+    xor r1d, 0xffff
+    shr r1d, 8
+%else
+    xor r1d, 0xff
+%endif
+    BSR eax, r1d, 0x1f
+    RET
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+COEFF_LAST8
+%endif
+INIT_XMM sse2
+COEFF_LAST8
+INIT_XMM sse2, lzcnt
+COEFF_LAST8
+
 %else ; !HIGH_BIT_DEPTH
 %macro LAST_MASK 3-4
+%if %1 <= 8
+    movq mm0, [%3+ 0]
 %if %1 == 4
-    movq mm0, [%3]
     packsswb mm0, mm0
+%else
+    packsswb mm0, [%3+ 8]
+%endif
     pcmpeqb mm0, mm2
     pmovmskb %2, mm0
 %elif mmsize == 16
@@ -1224,7 +1264,7 @@ COEFF_LAST4
 %endif
 %endmacro
 
-%macro COEFF_LAST4 0
+%macro COEFF_LAST48 0
 %ifdef ARCH_X86_64
 cglobal coeff_last4, 1,1
     BSR rax, [r0], 0x3f
@@ -1243,12 +1283,19 @@ cglobal coeff_last4, 0,3
     lea eax, [eax+ecx*2]
     RET
 %endif
+
+cglobal coeff_last8, 1,3
+    pxor m2, m2
+    LAST_MASK 8, r1d, r0, r2d
+    xor r1d, 0xff
+    BSR eax, r1d, 0x1f
+    RET
 %endmacro
 
 INIT_MMX mmx2
-COEFF_LAST4
+COEFF_LAST48
 INIT_MMX mmx2, lzcnt
-COEFF_LAST4
+COEFF_LAST48
 %endif ; HIGH_BIT_DEPTH
 
 %macro COEFF_LAST 0
@@ -1368,11 +1415,19 @@ COEFF_LEVELRUN 15
 COEFF_LEVELRUN 16
 %endif
 COEFF_LEVELRUN 4
+COEFF_LEVELRUN 8
 INIT_XMM sse2
+%ifdef HIGH_BIT_DEPTH
+COEFF_LEVELRUN 8
+%endif
 COEFF_LEVELRUN 15
 COEFF_LEVELRUN 16
 INIT_XMM sse2, lzcnt
+%ifdef HIGH_BIT_DEPTH
+COEFF_LEVELRUN 8
+%endif
 COEFF_LEVELRUN 15
 COEFF_LEVELRUN 16
 INIT_MMX mmx2, lzcnt
 COEFF_LEVELRUN 4
+COEFF_LEVELRUN 8
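
As an aside, the asm coeff_last8 above doesn't scan: LAST_MASK packs the eight coefficients, compares them against zero, and pmovmskb turns the result into a byte mask, so after inverting the mask a single bsr (or lzcnt where available, via the BSR macro) yields the index of the last nonzero coefficient. The same idea in portable C, using GCC's __builtin_clz as a stand-in for bsr (a hypothetical helper, not part of the patch):

    #include <stdint.h>

    static int coeff_last8_mask_sketch( const int16_t *dct )
    {
        unsigned mask = 0;
        for( int i = 0; i < 8; i++ )
            mask |= (unsigned)(dct[i] != 0) << i;
        return mask ? 31 - __builtin_clz( mask ) : -1;
    }
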
diff --git a/common/x86/quant.h b/common/x86/quant.h
index d5d49eba..3b1d4240 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -84,13 +84,17 @@ int x264_decimate_score64_mmx2( dctcoef *dct );
 int x264_decimate_score64_sse2( dctcoef *dct );
 int x264_decimate_score64_ssse3( dctcoef *dct );
 int x264_coeff_last4_mmx2( dctcoef *dct );
+int x264_coeff_last8_mmx2( dctcoef *dct );
 int x264_coeff_last15_mmx2( dctcoef *dct );
 int x264_coeff_last16_mmx2( dctcoef *dct );
 int x264_coeff_last64_mmx2( dctcoef *dct );
+int x264_coeff_last8_sse2( dctcoef *dct );
 int x264_coeff_last15_sse2( dctcoef *dct );
 int x264_coeff_last16_sse2( dctcoef *dct );
 int x264_coeff_last64_sse2( dctcoef *dct );
 int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
+int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
+int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
 int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
 int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
 int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
@@ -102,5 +106,9 @@ int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 
 #endif