From 1f57251003aa2fa82000ba86fbb04d6911505bd8 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sun, 26 Apr 2009 22:13:17 -0700 Subject: [PATCH] Some cosmetics/cleanup Move some macros to x86util.asm that should have been there to begin with. Fix a typo that didn't cause any issues. --- common/x86/pixel-a.asm | 56 ------------------------------------------ common/x86/x86util.asm | 56 ++++++++++++++++++++++++++++++++++++++++++ encoder/encoder.c | 2 +- 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 6075bb44..9617f9e4 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -391,64 +391,8 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8 ; SATD ;============================================================================= -%macro TRANS_SSE2 5-6 -; TRANSPOSE2x2 -; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq -; %2: ord/unord (for compat with sse4, unused) -; %3/%4: source regs -; %5/%6: tmp regs -%ifidn %1, d -%define mask [mask_10 GLOBAL] -%define shift 16 -%elifidn %1, q -%define mask [mask_1100 GLOBAL] -%define shift 32 -%endif -%if %0==6 ; less dependency if we have two tmp - mova m%5, mask ; ff00 - mova m%6, m%4 ; x5x4 - psll%1 m%4, shift ; x4.. - pand m%6, m%5 ; x5.. - pandn m%5, m%3 ; ..x0 - psrl%1 m%3, shift ; ..x1 - por m%4, m%5 ; x4x0 - por m%3, m%6 ; x5x1 -%else ; more dependency, one insn less. sometimes faster, sometimes not - mova m%5, m%4 ; x5x4 - psll%1 m%4, shift ; x4.. - pxor m%4, m%3 ; (x4^x1)x0 - pand m%4, mask ; (x4^x1).. - pxor m%3, m%4 ; x4x0 - psrl%1 m%4, shift ; ..(x1^x4) - pxor m%5, m%4 ; x5x1 - SWAP %4, %3, %5 -%endif -%endmacro - %define TRANS TRANS_SSE2 -%macro TRANS_SSE4 5-6 ; see above -%ifidn %1, d - mova m%5, m%3 -%ifidn %2, ord - psrl%1 m%3, 16 -%endif - pblendw m%3, m%4, 10101010b - psll%1 m%4, 16 -%ifidn %2, ord - pblendw m%4, m%5, 01010101b -%else - psrl%1 m%5, 16 - por m%4, m%5 -%endif -%elifidn %1, q - mova m%5, m%3 - shufps m%3, m%4, 10001000b - shufps m%5, m%4, 11011101b - SWAP %4, %5 -%endif -%endmacro - %macro JDUP_SSE2 2 punpckldq %1, %2 ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm index 8bfe5520..cfd7767e 100644 --- a/common/x86/x86util.asm +++ b/common/x86/x86util.asm @@ -222,6 +222,62 @@ SUMSUB_BADC %3, %7, %4, %8 %endmacro +%macro TRANS_SSE2 5-6 +; TRANSPOSE2x2 +; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq +; %2: ord/unord (for compat with sse4, unused) +; %3/%4: source regs +; %5/%6: tmp regs +%ifidn %1, d +%define mask [mask_10 GLOBAL] +%define shift 16 +%elifidn %1, q +%define mask [mask_1100 GLOBAL] +%define shift 32 +%endif +%if %0==6 ; less dependency if we have two tmp + mova m%5, mask ; ff00 + mova m%6, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. + pand m%6, m%5 ; x5.. + pandn m%5, m%3 ; ..x0 + psrl%1 m%3, shift ; ..x1 + por m%4, m%5 ; x4x0 + por m%3, m%6 ; x5x1 +%else ; more dependency, one insn less. sometimes faster, sometimes not + mova m%5, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. + pxor m%4, m%3 ; (x4^x1)x0 + pand m%4, mask ; (x4^x1).. + pxor m%3, m%4 ; x4x0 + psrl%1 m%4, shift ; ..(x1^x4) + pxor m%5, m%4 ; x5x1 + SWAP %4, %3, %5 +%endif +%endmacro + +%macro TRANS_SSE4 5-6 ; see above +%ifidn %1, d + mova m%5, m%3 +%ifidn %2, ord + psrl%1 m%3, 16 +%endif + pblendw m%3, m%4, 10101010b + psll%1 m%4, 16 +%ifidn %2, ord + pblendw m%4, m%5, 01010101b +%else + psrl%1 m%5, 16 + por m%4, m%5 +%endif +%elifidn %1, q + mova m%5, m%3 + shufps m%3, m%4, 10001000b + shufps m%5, m%4, 11011101b + SWAP %4, %5 +%endif +%endmacro + %macro HADAMARD 5-6 ; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes) ; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes) diff --git a/encoder/encoder.c b/encoder/encoder.c index 308f8438..4ceb762f 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -728,7 +728,7 @@ x264_t *x264_encoder_open ( x264_param_t *param ) x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); - if( !h->param.b_cabac ); + if( !h->param.b_cabac ) x264_init_vlc_tables(); x264_pixel_init( h->param.cpu, &h->pixf ); x264_dct_init( h->param.cpu, &h->dctf ); -- 2.40.0