ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
common/i386/pixel-a.asm common/i386/mc-a.asm \
common/i386/mc-a2.asm common/i386/predict-a.asm \
- common/i386/pixel-sse2.asm common/i386/quant-a.asm
+ common/i386/pixel-sse2.asm common/i386/quant-a.asm \
+ common/i386/deblock-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
endif
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm \
- common/amd64/pixel-sse2.asm common/amd64/quant-a.asm
+ common/amd64/pixel-sse2.asm common/amd64/quant-a.asm \
+ common/amd64/deblock-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
ASFLAGS += -Icommon/amd64
endif
--- /dev/null
+;*****************************************************************************
+;* deblock-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+BITS 64
+
+%include "amd64inc.asm"
+
+SECTION .rodata align=16
+pb_01: times 16 db 0x01
+pb_3f: times 16 db 0x3f
+pb_ff: times 16 db 0xff
+
+SECTION .text
+cglobal x264_deblock_v_luma_sse2
+cglobal x264_deblock_h_luma_sse2
+cglobal x264_deblock_v_chroma_mmxext
+cglobal x264_deblock_h_chroma_mmxext
+cglobal x264_deblock_v_chroma_intra_mmxext
+cglobal x264_deblock_h_chroma_intra_mmxext
+
+; expands to [base],...,[base+7*stride]
+; callers pass base3 = base+3*stride, so the last row base3+stride*4 == base+7*stride
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base+stride], [base+stride*2], [base3], \
+ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 bytes in %1..%8
+; out: 4 rows of 8 bytes in mm0..mm3
+; clobbers: mm4-mm7
+%macro TRANSPOSE4x8_LOAD 8
+ ; interleave rows 1-4: bytes, then words
+ movd mm0, %1
+ movd mm2, %2
+ movd mm1, %3
+ movd mm3, %4
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+ movq mm2, mm0
+ punpcklwd mm0, mm1
+ punpckhwd mm2, mm1
+
+ ; interleave rows 5-8 the same way
+ movd mm4, %5
+ movd mm6, %6
+ movd mm5, %7
+ movd mm7, %8
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+ movq mm6, mm4
+ punpcklwd mm4, mm5
+ punpckhwd mm6, mm5
+
+ ; combine the two halves dword-wise into the 4 output rows
+ movq mm1, mm0
+ movq mm3, mm2
+ punpckldq mm0, mm4
+ punpckhdq mm1, mm4
+ punpckldq mm2, mm6
+ punpckhdq mm3, mm6
+%endmacro
+
+; in: 4 rows of 8 bytes in mm0..mm3
+; out: 8 rows of 4 bytes in %1..%8
+; clobbers: mm0-mm6 (mm7 untouched)
+%macro TRANSPOSE8x4_STORE 8
+ ; copy the high halves of rows 0-2 (row 3's high half is shifted in place below)
+ movq mm4, mm0
+ movq mm5, mm1
+ movq mm6, mm2
+ punpckhdq mm4, mm4
+ punpckhdq mm5, mm5
+ punpckhdq mm6, mm6
+
+ ; low halves -> output rows 1-4
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm1, mm2
+ movd %1, mm0
+ punpckhdq mm0, mm0
+ movd %2, mm0
+ movd %3, mm1
+ punpckhdq mm1, mm1
+ movd %4, mm1
+
+ ; high halves -> output rows 5-8
+ punpckhdq mm3, mm3
+ punpcklbw mm4, mm5
+ punpcklbw mm6, mm3
+ movq mm5, mm4
+ punpcklwd mm4, mm6
+ punpckhwd mm5, mm6
+ movd %5, mm4
+ punpckhdq mm4, mm4
+ movd %6, mm4
+ movd %7, mm5
+ punpckhdq mm5, mm5
+ movd %8, mm5
+%endmacro
+
+; butterfly/interleave at granularity %1 (bw/wd/dq)
+; in: %2, %3 (register or memory; %3 is preserved), %4 = temp
+; out: %2 = low-half interleave of (%2,%3), %4 = high-half interleave
+%macro SBUTTERFLY 4
+ movq %4, %2
+ punpckl%1 %2, %3
+ punpckh%1 %4, %3
+%endmacro
+
+; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
+; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
+; clobbers: mm0-mm7; [%9+0x10] doubles as scratch until the final stores
+%macro TRANSPOSE6x8_MEM 9
+ movq mm0, %1
+ movq mm1, %3
+ movq mm2, %5
+ movq mm3, %7
+ SBUTTERFLY bw, mm0, %2, mm4
+ SBUTTERFLY bw, mm1, %4, mm5
+ SBUTTERFLY bw, mm2, %6, mm6
+ ; spill mm5 so a register is free for the remaining butterflies
+ movq [%9+0x10], mm5
+ SBUTTERFLY bw, mm3, %8, mm7
+ SBUTTERFLY wd, mm0, mm1, mm5
+ SBUTTERFLY wd, mm2, mm3, mm1
+ punpckhdq mm0, mm2
+ movq [%9+0x00], mm0
+ SBUTTERFLY wd, mm4, [%9+0x10], mm3
+ SBUTTERFLY wd, mm6, mm7, mm2
+ SBUTTERFLY dq, mm4, mm6, mm0
+ SBUTTERFLY dq, mm5, mm1, mm7
+ punpckldq mm3, mm2
+ movq [%9+0x10], mm5
+ movq [%9+0x20], mm7
+ movq [%9+0x30], mm4
+ movq [%9+0x40], mm0
+ movq [%9+0x50], mm3
+%endmacro
+
+; out: %4 = |%1-%2|>%3 (nonzero bytes where the comparison holds)
+; clobbers: %5
+; 6-arg core: %1 = operand-size suffix (q for mmx / dqa for sse2),
+; %2,%3 = inputs, %4 = threshold, %5 = out, %6 = clobbered temp
+%macro DIFF_GT 6
+ mov%1 %6, %3
+ mov%1 %5, %2
+ psubusb %6, %2 ; max(%3-%2, 0)
+ psubusb %5, %3 ; max(%2-%3, 0)
+ por %5, %6 ; |%2-%3| (unsigned saturating)
+ psubusb %5, %4 ; nonzero where |%2-%3| > %4
+%endmacro
+%macro DIFF_GT_MMX 5
+ DIFF_GT q, %1, %2, %3, %4, %5
+%endmacro
+%macro DIFF_GT_SSE2 5
+ DIFF_GT dqa, %1, %2, %3, %4, %5
+%endmacro
+
+; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
+; out: mm5=beta-1, mm7=mask
+; clobbers: mm4,mm6
+%macro LOAD_MASK_MMX 2
+ movd mm4, %1
+ movd mm5, %2
+ pshufw mm4, mm4, 0
+ pshufw mm5, mm5, 0
+ packuswb mm4, mm4 ; 8x alpha-1
+ packuswb mm5, mm5 ; 8x beta-1
+ DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
+ DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
+ por mm7, mm4
+ DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
+ por mm7, mm4
+ pxor mm6, mm6
+ pcmpeqb mm7, mm6 ; mm7 = 0xff where all three tests passed
+%endmacro
+; sse2 version of the above: 16 pixels per pass
+%macro LOAD_MASK_SSE2 2
+ movd xmm4, %1
+ movd xmm5, %2
+ pshuflw xmm4, xmm4, 0
+ pshuflw xmm5, xmm5, 0
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+ packuswb xmm4, xmm4 ; 16x alpha-1
+ packuswb xmm5, xmm5 ; 16x beta-1
+ DIFF_GT_SSE2 xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1
+ DIFF_GT_SSE2 xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1
+ por xmm7, xmm4
+ DIFF_GT_SSE2 xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1
+ por xmm7, xmm4
+ pxor xmm6, xmm6
+ pcmpeqb xmm7, xmm6 ; xmm7 = 0xff where all three tests passed
+%endmacro
+
+; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
+; out: mm1=p0' mm2=q0'
+; clobbers: mm0,3-6
+; %1 = mov suffix (q/dqa), %2 = register prefix (m/xm), so one body
+; serves both the mmx and sse2 instantiations below.
+%macro DEBLOCK_P0_Q0 2
+ ; a = q0^p0^((p1-q1)>>2)
+ mov%1 %2m4, %2m0
+ psubb %2m4, %2m3
+ psrlw %2m4, 2
+ pxor %2m4, %2m1
+ pxor %2m4, %2m2
+ ; b = p0^(q1>>2)
+ psrlw %2m3, 2
+ pand %2m3, [pb_3f GLOBAL] ; word shift + 0x3f mask == per-byte >>2
+ mov%1 %2m5, %2m1
+ pxor %2m5, %2m3
+ ; c = q0^(p1>>2)
+ psrlw %2m0, 2
+ pand %2m0, [pb_3f GLOBAL] ; per-byte >>2, as above
+ mov%1 %2m6, %2m2
+ pxor %2m6, %2m0
+ ; d = (c^b) & ~(b^a) & 1
+ pxor %2m6, %2m5
+ pxor %2m5, %2m4
+ pandn %2m5, %2m6
+ pand %2m5, [pb_01 GLOBAL]
+ ; delta = (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3
+ ; = (avg(q0, p1>>2) + (d&a))
+ ; - (avg(p0, q1>>2) + (d^(d&a)))
+ pavgb %2m0, %2m2
+ pand %2m4, %2m5
+ paddusb %2m0, %2m4
+ pavgb %2m3, %2m1
+ pxor %2m4, %2m5
+ paddusb %2m3, %2m4
+ ; p0 += clip(delta, -tc0, tc0)
+ ; q0 -= clip(delta, -tc0, tc0)
+ ; %2m0/%2m3 hold the +/- magnitudes, clamped to tc (mm7) via pminub
+ mov%1 %2m4, %2m0
+ psubusb %2m0, %2m3
+ psubusb %2m3, %2m4
+ pminub %2m0, %2m7
+ pminub %2m3, %2m7
+ paddusb %2m1, %2m0
+ paddusb %2m2, %2m3
+ psubusb %2m1, %2m3
+ psubusb %2m2, %2m0
+%endmacro
+%macro DEBLOCK_P0_Q0_MMX 0
+ DEBLOCK_P0_Q0 q, m
+%endmacro
+%macro DEBLOCK_P0_Q0_SSE2 0
+ DEBLOCK_P0_Q0 dqa, xm
+%endmacro
+
+; in: mm1=p0 mm2=q0
+; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
+; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+; clobbers: q2, tmp, tc0
+%macro LUMA_Q1_SSE2 6
+ movdqa %6, xmm1
+ pavgb %6, xmm2
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pxor %6, %3
+ pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1  (undo pavgb's round-up)
+ ; clip the result to [%1-tc0, %1+tc0] with unsigned min/max
+ movdqa %6, %1
+ psubusb %6, %5
+ paddusb %5, %1
+ pmaxub %2, %6
+ pminub %2, %5
+ movdqa %4, %2
+%endmacro
+
+
+SECTION .text
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+x264_deblock_v_luma_sse2:
+ ; rdi = pix
+ movsxd rsi, esi ; stride
+ dec edx ; alpha-1
+ dec ecx ; beta-1
+ movd xmm8, [r8] ; tc0 (4 bytes, one per 4-pixel group)
+ mov r8, rdi
+ sub r8, rsi
+ sub r8, rsi
+ sub r8, rsi ; pix-3*stride
+
+ movdqa xmm0, [r8+rsi] ; p1
+ movdqa xmm1, [r8+2*rsi] ; p0
+ movdqa xmm2, [rdi] ; q0
+ movdqa xmm3, [rdi+rsi] ; q1
+ LOAD_MASK_SSE2 edx, ecx
+
+ punpcklbw xmm8, xmm8
+ punpcklbw xmm8, xmm8 ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ movdqa xmm9, [pb_ff GLOBAL]
+ pcmpeqb xmm9, xmm8 ; 0xff where tc0[i] == -1
+ pandn xmm9, xmm7 ; xmm9 = filter mask & (tc0 != -1)
+ pand xmm8, xmm9 ; xmm8 = tc0 & mask
+
+ ; p1 filter: active where |p2-p0| <= beta-1 (and masked); bumps tc by 1
+ movdqa xmm3, [r8] ; p2
+ DIFF_GT_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1
+ pandn xmm6, xmm9
+ pcmpeqb xmm6, xmm9
+ pand xmm6, xmm9 ; xmm6 = (|p2-p0| <= beta-1) & mask
+ movdqa xmm7, [pb_01 GLOBAL]
+ pand xmm7, xmm6 ; the tc++ increment for the p-side
+ pand xmm6, xmm8 ; tc for the p1 filter
+ paddb xmm7, xmm8 ; running tc = tc0 (+1 where p1 filtered)
+ LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4
+
+ ; q1 filter: same scheme on the q side
+ movdqa xmm4, [rdi+2*rsi] ; q2
+ DIFF_GT_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1
+ pandn xmm6, xmm9
+ pcmpeqb xmm6, xmm9
+ pand xmm6, xmm9 ; xmm6 = (|q2-q0| <= beta-1) & mask
+ pand xmm8, xmm6 ; tc for the q1 filter
+ pand xmm6, [pb_01 GLOBAL]
+ paddb xmm7, xmm6 ; xmm7 = final tc for the p0/q0 filter
+ movdqa xmm3, [rdi+rsi] ; reload q1 (xmm3 was clobbered above)
+ LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6
+
+ DEBLOCK_P0_Q0_SSE2
+ movdqa [r8+2*rsi], xmm1 ; p0'
+ movdqa [rdi], xmm2 ; q0'
+
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+; Filters a vertical edge by transposing the 6 relevant columns into a
+; temporary buffer, running the vertical filter on it, and transposing back.
+x264_deblock_h_luma_sse2:
+ movsxd r10, esi
+ lea r11, [r10+r10*2] ; 3*stride
+ lea rax, [rdi-4]
+ lea r9, [rdi-4+r11]
+ ; NOTE(review): the buffer lives below rsp — relies on the x86-64 red zone
+ %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
+ lea rax, [rax+r10*8]
+ lea r9, [r9 +r10*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+
+ ; vertical filter
+ ; alpha, beta, tc0 are still in edx, ecx, r8
+ ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+ lea rdi, [pix_tmp+0x30]
+ mov esi, 0x10
+ call x264_deblock_v_luma_sse2
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ ; +2 columns: rax/r9 now point at the p1 column; second 8 pixels first
+ add rax, 2
+ add r9, 2
+ movq mm0, [pix_tmp+0x18]
+ movq mm1, [pix_tmp+0x28]
+ movq mm2, [pix_tmp+0x38]
+ movq mm3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ ; rewind 8 rows and store the first 8 pixels of the 4 changed rows
+ shl r10, 3
+ sub rax, r10
+ sub r9, r10
+ shr r10, 3
+ movq mm0, [pix_tmp+0x10]
+ movq mm1, [pix_tmp+0x20]
+ movq mm2, [pix_tmp+0x30]
+ movq mm3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ ret
+
+
+; common prologue for the chroma filters on a horizontal edge:
+; rax = pix - 2*stride (the p1 row)
+%macro CHROMA_V_START 0
+ ; rdi = pix
+ movsxd rsi, esi ; stride
+ dec edx ; alpha-1
+ dec ecx ; beta-1
+ mov rax, rdi
+ sub rax, rsi
+ sub rax, rsi
+%endmacro
+
+; common prologue for the chroma filters on a vertical edge:
+; rax = pix-2 (left of the edge), r9 = 3*stride, rdi = rax + 3*stride
+%macro CHROMA_H_START 0
+ movsxd rsi, esi
+ dec edx
+ dec ecx
+ sub rdi, 2
+ lea r9, [rsi+rsi*2]
+ mov rax, rdi
+ add rdi, r9
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+x264_deblock_v_chroma_mmxext:
+ CHROMA_V_START
+
+ movq mm0, [rax] ; p1
+ movq mm1, [rax+rsi] ; p0
+ movq mm2, [rdi] ; q0
+ movq mm3, [rdi+rsi] ; q1
+
+ LOAD_MASK_MMX edx, ecx
+ movd mm6, [r8] ; tc0
+ punpcklbw mm6, mm6 ; duplicate each tc0 byte over its 2-pixel group
+ pand mm7, mm6 ; mm7 = tc & mask, as DEBLOCK_P0_Q0 expects
+ DEBLOCK_P0_Q0_MMX
+
+ movq [rax+rsi], mm1 ; p0'
+ movq [rdi], mm2 ; q0'
+ ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+x264_deblock_h_chroma_mmxext:
+ CHROMA_H_START
+
+ TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
+ ; spill p1/q1 below rsp: DEBLOCK_P0_Q0 clobbers mm0 and mm3
+ movq [rsp-8], mm0
+ movq [rsp-16], mm3
+
+ LOAD_MASK_MMX edx, ecx
+ movd mm6, [r8] ; tc0
+ punpcklbw mm6, mm6 ; duplicate each tc0 byte over its 2-pixel group
+ pand mm7, mm6 ; mm7 = tc & mask
+ DEBLOCK_P0_Q0_MMX
+
+ ; reload p1/q1 so the store-back transpose has all 4 rows
+ movq mm0, [rsp-8]
+ movq mm3, [rsp-16]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
+ ret
+
+
+; in: %1=p0 %2=p1 %3=q1
+; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
+; clobbers: mm4
+%macro CHROMA_INTRA_P0 3
+ movq mm4, %1
+ pxor mm4, %3
+ pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1
+ pavgb %1, %3
+ psubusb %1, mm4
+ pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
+%endmacro
+
+; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 (from LOAD_MASK's expected layout)
+; out: mm1=p0' mm2=q0', filtered only where the mask passes
+%macro CHROMA_INTRA_BODY 0
+ LOAD_MASK_MMX edx, ecx
+ movq mm5, mm1 ; save original p0/q0 for the masked blend
+ movq mm6, mm2
+ CHROMA_INTRA_P0 mm1, mm0, mm3 ; p0'
+ CHROMA_INTRA_P0 mm2, mm3, mm0 ; q0'
+ ; blend: result = old + ((new - old) & mask)
+ psubb mm1, mm5
+ psubb mm2, mm6
+ pand mm1, mm7
+ pand mm2, mm7
+ paddb mm1, mm5
+ paddb mm2, mm6
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+x264_deblock_v_chroma_intra_mmxext:
+ CHROMA_V_START
+
+ movq mm0, [rax] ; p1
+ movq mm1, [rax+rsi] ; p0
+ movq mm2, [rdi] ; q0
+ movq mm3, [rdi+rsi] ; q1
+
+ CHROMA_INTRA_BODY
+
+ movq [rax+rsi], mm1 ; p0'
+ movq [rdi], mm2 ; q0'
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+; same as above on a vertical edge: transpose in, filter, transpose out
+x264_deblock_h_chroma_intra_mmxext:
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
+ CHROMA_INTRA_BODY
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
+ ret
+
x264_dct_function_t dctf;
x264_csp_function_t csp;
x264_quant_function_t quantf;
+ x264_deblock_function_t loopf;
/* vlc table for decoding purpose only */
x264_vlc_table_t *x264_coeff_token_lookup[5];
return a;
}
-static inline void deblocking_filter_edgev( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+/* Core luma deblocker shared by the vertical/horizontal wrappers below.
+ * xstride = distance between samples across the edge, ystride = along it.
+ * 16 samples are filtered as 4 groups of 4; tc0[i] < 0 skips group i
+ * (deblock_edge encodes bS==0 as tc0 = -1 for luma). */
+static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
 int i, d;
- const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
- const int alpha = i_alpha_table[i_index_a];
- const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
-
- for( i = 0; i < 4; i++ )
- {
- if( bS[i] == 0 )
- {
- pix += 4 * i_pix_stride;
+ for( i = 0; i < 4; i++ ) {
+ if( tc0[i] < 0 ) {
+ pix += 4*ystride;
 continue;
 }
-
- if( bS[i] < 4 )
- {
- const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
-
- /* 4px edge length */
- for( d = 0; d < 4; d++ )
- {
- const int p0 = pix[-1];
- const int p1 = pix[-2];
- const int p2 = pix[-3];
- const int q0 = pix[0];
- const int q1 = pix[1];
- const int q2 = pix[2];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
- {
- int tc = tc0;
- int i_delta;
-
- if( abs( p2 - p0 ) < beta )
- {
- pix[-2] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
- tc++;
- }
- if( abs( q2 - q0 ) < beta )
- {
- pix[1] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
- tc++;
- }
-
- i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
- pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
+ for( d = 0; d < 4; d++ ) {
+ const int p2 = pix[-3*xstride];
+ const int p1 = pix[-2*xstride];
+ const int p0 = pix[-1*xstride];
+ const int q0 = pix[ 0*xstride];
+ const int q1 = pix[ 1*xstride];
+ const int q2 = pix[ 2*xstride];
+
+ if( X264_ABS( p0 - q0 ) < alpha &&
+ X264_ABS( p1 - p0 ) < beta &&
+ X264_ABS( q1 - q0 ) < beta ) {
+
+ int tc = tc0[i];
+ int delta;
+
+ /* p1' — each side filter also widens the p0/q0 clip range */
+ if( X264_ABS( p2 - p0 ) < beta ) {
+ pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
+ tc++;
+ }
- pix += i_pix_stride;
- }
- }
- else
- {
- /* 4px edge length */
- for( d = 0; d < 4; d++ )
- {
- const int p0 = pix[-1];
- const int p1 = pix[-2];
- const int p2 = pix[-3];
-
- const int q0 = pix[0];
- const int q1 = pix[1];
- const int q2 = pix[2];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
- {
- if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
- {
- if( abs( p2 - p0 ) < beta )
- {
- const int p3 = pix[-4];
- /* p0', p1', p2' */
- pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
- pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
- pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
- }
- else
- {
- /* p0' */
- pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- }
- if( abs( q2 - q0 ) < beta )
- {
- const int q3 = pix[3];
- /* q0', q1', q2' */
- pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
- pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
- pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
- }
- else
- {
- /* q0' */
- pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
- }
- else
- {
- /* p0', q0' */
- pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
+ /* q1' */
+ if( X264_ABS( q2 - q0 ) < beta ) {
+ pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
+ tc++;
+ }
- pix += i_pix_stride;
+
+ delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+ pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */
 }
+ pix += ystride;
 }
 }
}
+/* v = filter across a horizontal edge: neighbouring samples are `stride` apart */
+static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
+}
+/* h = filter across a vertical edge: neighbouring samples are 1 apart */
+static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
+}
-static inline void deblocking_filter_edgecv( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+/* Core chroma deblocker: 8 samples as 4 groups of 2.
+ * tc0[i] <= 0 skips group i: deblock_edge passes tc+1 for chroma, so a
+ * bS==0 group arrives here as 0. */
+static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
 int i, d;
- const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
- const int alpha = i_alpha_table[i_index_a];
- const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
-
- for( i = 0; i < 4; i++ )
- {
- if( bS[i] == 0 )
- {
- pix += 2 * i_pix_stride;
+ for( i = 0; i < 4; i++ ) {
+ const int tc = tc0[i];
+ if( tc <= 0 ) {
+ pix += 2*ystride;
 continue;
 }
-
- if( bS[i] < 4 )
- {
- const int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1;
- /* 2px edge length (because we use same bS than the one for luma) */
- for( d = 0; d < 2; d++ )
- {
- const int p0 = pix[-1];
- const int p1 = pix[-2];
- const int q0 = pix[0];
- const int q1 = pix[1];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
- {
- const int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-
- pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
- pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
- }
- pix += i_pix_stride;
- }
- }
- else
- {
- /* 2px edge length (because we use same bS than the one for luma) */
- for( d = 0; d < 2; d++ )
- {
- const int p0 = pix[-1];
- const int p1 = pix[-2];
- const int q0 = pix[0];
- const int q1 = pix[1];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
- {
- pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
- pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
- }
- pix += i_pix_stride;
+ /* 2px edge length (because we use the same bS as luma) */
+ for( d = 0; d < 2; d++ ) {
+ const int p1 = pix[-2*xstride];
+ const int p0 = pix[-1*xstride];
+ const int q0 = pix[ 0*xstride];
+ const int q1 = pix[ 1*xstride];
+
+ if( X264_ABS( p0 - q0 ) < alpha &&
+ X264_ABS( p1 - p0 ) < beta &&
+ X264_ABS( q1 - q0 ) < beta ) {
+
+ int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+ pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */
+ }
+ pix += ystride;
 }
 }
}
+/* v = horizontal edge, h = vertical edge (see the luma wrappers) */
+static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
+}
+static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
+}
-static inline void deblocking_filter_edgeh( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+/* Strong (bS==4 / intra) luma filter over all 16 samples, no tc clipping.
+ * Uses the multi-tap strong filters when |p0-q0| < (alpha>>2)+2, otherwise
+ * only p0/q0 get the light 3-tap filter. */
+static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
- int i, d;
- const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
- const int alpha = i_alpha_table[i_index_a];
- const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
-
- int i_pix_next = i_pix_stride;
-
- for( i = 0; i < 4; i++ )
- {
- if( bS[i] == 0 )
- {
- pix += 4;
- continue;
- }
-
- if( bS[i] < 4 )
- {
- const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
- /* 4px edge length */
- for( d = 0; d < 4; d++ )
- {
- const int p0 = pix[-i_pix_next];
- const int p1 = pix[-2*i_pix_next];
- const int p2 = pix[-3*i_pix_next];
- const int q0 = pix[0];
- const int q1 = pix[1*i_pix_next];
- const int q2 = pix[2*i_pix_next];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
+ int d;
+ for( d = 0; d < 16; d++ ) {
+ const int p2 = pix[-3*xstride];
+ const int p1 = pix[-2*xstride];
+ const int p0 = pix[-1*xstride];
+ const int q0 = pix[ 0*xstride];
+ const int q1 = pix[ 1*xstride];
+ const int q2 = pix[ 2*xstride];
+
+ if( X264_ABS( p0 - q0 ) < alpha &&
+ X264_ABS( p1 - p0 ) < beta &&
+ X264_ABS( q1 - q0 ) < beta ) {
+
+ if(X264_ABS( p0 - q0 ) < ((alpha >> 2) + 2) ){
+ if( X264_ABS( p2 - p0 ) < beta)
 {
- int tc = tc0;
- int i_delta;
-
- if( abs( p2 - p0 ) < beta )
- {
- pix[-2*i_pix_next] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
- tc++;
- }
- if( abs( q2 - q0 ) < beta )
- {
- pix[i_pix_next] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
- tc++;
- }
-
- i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-i_pix_next] = clip_uint8( p0 + i_delta ); /* p0' */
- pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
+ const int p3 = pix[-4*xstride];
+ /* p0', p1', p2' */
+ pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+ pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+ pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+ } else {
+ /* p0' */
+ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 }
- pix++;
- }
- }
- else
- {
- /* 4px edge length */
- for( d = 0; d < 4; d++ )
- {
- const int p0 = pix[-i_pix_next];
- const int p1 = pix[-2*i_pix_next];
- const int p2 = pix[-3*i_pix_next];
- const int q0 = pix[0];
- const int q1 = pix[1*i_pix_next];
- const int q2 = pix[2*i_pix_next];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
+ if( X264_ABS( q2 - q0 ) < beta)
 {
- const int p3 = pix[-4*i_pix_next];
- const int q3 = pix[ 3*i_pix_next];
-
- if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
- {
- if( abs( p2 - p0 ) < beta )
- {
- /* p0', p1', p2' */
- pix[-1*i_pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
- pix[-2*i_pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
- pix[-3*i_pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
- }
- else
- {
- /* p0' */
- pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- }
- if( abs( q2 - q0 ) < beta )
- {
- /* q0', q1', q2' */
- pix[0*i_pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
- pix[1*i_pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
- pix[2*i_pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
- }
- else
- {
- /* q0' */
- pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
- }
- else
- {
- /* p0' */
- pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- /* q0' */
- pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
+ const int q3 = pix[3*xstride];
+ /* q0', q1', q2' */
+ pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+ pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+ pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+ } else {
+ /* q0' */
+ pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 }
- pix++;
+ }else{
+ /* p0', q0' */
+ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 }
-
 }
+ pix += ystride;
 }
}
+/* v = horizontal edge, h = vertical edge (see the inter luma wrappers) */
+static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+{
+ deblock_luma_intra_c( pix, stride, 1, alpha, beta );
+}
+static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+{
+ deblock_luma_intra_c( pix, 1, stride, alpha, beta );
+}
-static inline void deblocking_filter_edgech( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
-{
- int i, d;
- const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
- const int alpha = i_alpha_table[i_index_a];
- const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
-
- int i_pix_next = i_pix_stride;
-
- for( i = 0; i < 4; i++ )
- {
- if( bS[i] == 0 )
- {
- pix += 2;
- continue;
+/* Strong (bS==4 / intra) chroma filter over all 8 samples: only p0/q0 change */
+static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
+{
+ int d;
+ for( d = 0; d < 8; d++ ) {
+ const int p1 = pix[-2*xstride];
+ const int p0 = pix[-1*xstride];
+ const int q0 = pix[ 0*xstride];
+ const int q1 = pix[ 1*xstride];
+
+ if( X264_ABS( p0 - q0 ) < alpha &&
+ X264_ABS( p1 - p0 ) < beta &&
+ X264_ABS( q1 - q0 ) < beta ) {
+
+ pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
+ pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
 }
- if( bS[i] < 4 )
- {
- int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1;
- /* 2px edge length (see deblocking_filter_edgecv) */
- for( d = 0; d < 2; d++ )
- {
- const int p0 = pix[-1*i_pix_next];
- const int p1 = pix[-2*i_pix_next];
- const int q0 = pix[0];
- const int q1 = pix[1*i_pix_next];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
- {
- int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-i_pix_next] = clip_uint8( p0 + i_delta ); /* p0' */
- pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
- }
- pix++;
- }
- }
- else
- {
- /* 2px edge length (see deblocking_filter_edgecv) */
- for( d = 0; d < 2; d++ )
- {
- const int p0 = pix[-1*i_pix_next];
- const int p1 = pix[-2*i_pix_next];
- const int q0 = pix[0];
- const int q1 = pix[1*i_pix_next];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta )
- {
- pix[-i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
- pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
- }
- pix++;
- }
- }
+ pix += ystride;
+ }
+}
+/* v = horizontal edge, h = vertical edge */
+static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+{
+ deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
+}
+static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+{
+ deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
+}
+
+/* Dispatch one edge to the normal (bS<4) or strong/intra (bS==4) filter.
+ * tc[i] encodes "skip" for bS==0 groups: -1 for luma, 0 for chroma
+ * (b_chroma adds 1 to every tc, matching the old chroma filter's tc0+1).
+ * NOTE(review): only bS[0] selects the path — assumes bS==4 applies to the
+ * whole edge; confirm against the bS computation in the caller. */
+static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4], int i_qp, int b_chroma,
+ x264_deblock_inter_t pf_inter, x264_deblock_intra_t pf_intra )
+{
+ int i;
+ const int index_a = x264_clip3( i_qp + h->sh.i_alpha_c0_offset, 0, 51 );
+ const int alpha = i_alpha_table[index_a];
+ const int beta = i_beta_table[x264_clip3( i_qp + h->sh.i_beta_offset, 0, 51 )];
+
+ if( bS[0] < 4 ) {
+ int8_t tc[4];
+ for(i=0; i<4; i++)
+ tc[i] = (bS[i] ? i_tc0_table[index_a][bS[i] - 1] : -1) + b_chroma;
+ pf_inter( pix, i_stride, alpha, beta, tc );
+ } else {
+ pf_intra( pix, i_stride, alpha, beta );
+ }
+}
/* vertical edge */
if( !b_8x8_transform || !(i_edge & 1) )
{
- deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge],
- h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1);
+ deblock_edge( h, &h->fdec->plane[0][16*mb_y * h->fdec->i_stride[0] + 16*mb_x + 4*i_edge],
+ h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1, 0,
+ h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );
}
if( !(i_edge & 1) )
{
/* U/V planes */
int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
- deblocking_filter_edgecv( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2],
- h->fdec->i_stride[1], bS, i_qpc );
- deblocking_filter_edgecv( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2],
- h->fdec->i_stride[2], bS, i_qpc );
+ deblock_edge( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+2*i_edge],
+ h->fdec->i_stride[1], bS, i_qpc, 1,
+ h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
+ deblock_edge( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+2*i_edge],
+ h->fdec->i_stride[2], bS, i_qpc, 1,
+ h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
}
}
else
/* horizontal edge */
if( !b_8x8_transform || !(i_edge & 1) )
{
- deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x],
- h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 );
+ deblock_edge( h, &h->fdec->plane[0][(16*mb_y + 4*i_edge) * h->fdec->i_stride[0] + 16*mb_x],
+ h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1, 0,
+ h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );
}
/* U/V planes */
if( !(i_edge & 1) )
{
int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
- deblocking_filter_edgech( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2*h->fdec->i_stride[1]],
- h->fdec->i_stride[1], bS, i_qpc );
- deblocking_filter_edgech( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2*h->fdec->i_stride[2]],
- h->fdec->i_stride[2], bS, i_qpc );
+ deblock_edge( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+2*i_edge*h->fdec->i_stride[1]],
+ h->fdec->i_stride[1], bS, i_qpc, 1,
+ h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
+ deblock_edge( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+2*i_edge*h->fdec->i_stride[2]],
+ h->fdec->i_stride[2], bS, i_qpc, 1,
+ h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
}
}
}
}
}
+/* prototypes for the asm implementations in common/{i386,amd64}/deblock-a.asm */
+void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
+
+#ifdef ARCH_X86_64
+void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+#else
+void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+
+/* the mmx routine filters 8 pixels (2 tc0 groups) per call, so run both halves */
+void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
+ x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
+}
+#endif
+/* Populate the deblock function table: C references first, then cpu-specific
+ * overrides (mmxext chroma everywhere; luma via sse2 on x86-64, mmxext on x86). */
+void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
+{
+ pf->deblock_v_luma = deblock_v_luma_c;
+ pf->deblock_h_luma = deblock_h_luma_c;
+ pf->deblock_v_chroma = deblock_v_chroma_c;
+ pf->deblock_h_chroma = deblock_h_chroma_c;
+ pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
+ pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
+ pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
+ pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
+
+ if( cpu&X264_CPU_MMXEXT )
+ {
+ pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
+ pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
+ pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
+ pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
+#ifdef ARCH_X86_64
+ if( cpu&X264_CPU_SSE2 )
+ {
+ pf->deblock_v_luma = x264_deblock_v_luma_sse2;
+ pf->deblock_h_luma = x264_deblock_h_luma_sse2;
+ }
+#else
+ pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
+ pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
+#endif
+ }
+}
#ifndef _FRAME_H
#define _FRAME_H 1
+#include <inttypes.h>
+
typedef struct
{
/* */
} x264_frame_t;
+/* deblocking function pointers: inter variants take per-group tc0 thresholds,
+ * intra (strong) variants take only alpha/beta.
+ * _v_ = filter across a horizontal edge, _h_ = across a vertical edge. */
+typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
+typedef struct
+{
+ x264_deblock_inter_t deblock_v_luma;
+ x264_deblock_inter_t deblock_h_luma;
+ x264_deblock_inter_t deblock_v_chroma;
+ x264_deblock_inter_t deblock_h_chroma;
+ x264_deblock_intra_t deblock_v_luma_intra;
+ x264_deblock_intra_t deblock_h_luma_intra;
+ x264_deblock_intra_t deblock_v_chroma_intra;
+ x264_deblock_intra_t deblock_h_chroma_intra;
+} x264_deblock_function_t;
+
 x264_frame_t *x264_frame_new( x264_t *h );
 void x264_frame_delete( x264_frame_t *frame );
 void x264_frame_filter( int cpu, x264_frame_t *frame );
 void x264_frame_init_lowres( int cpu, x264_frame_t *frame );
+void x264_deblock_init( int cpu, x264_deblock_function_t *pf );
+
#endif
--- /dev/null
+;*****************************************************************************
+;* deblock-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+; declare a global symbol, adding the platform's underscore prefix when needed
+%macro cglobal 1
+ %ifdef PREFIX
+ global _%1
+ %define %1 _%1
+ %else
+ global %1
+ %endif
+%endmacro
+
+SECTION .rodata align=16
+pb_01: times 16 db 0x01 ; LSB mask, used for the pavgb rounding corrections
+pb_3f: times 16 db 0x3f ; masks the bits shifted in by psrlw when doing bytewise >>2
+pb_ff: times 16 db 0xff ; -1 bytes; pcmpgtb against this tests tc0 >= 0
+
+SECTION .text
+cglobal x264_deblock_v8_luma_mmxext
+cglobal x264_deblock_h_luma_mmxext
+cglobal x264_deblock_v_chroma_mmxext
+cglobal x264_deblock_h_chroma_mmxext
+cglobal x264_deblock_v_chroma_intra_mmxext
+cglobal x264_deblock_h_chroma_intra_mmxext
+
+; expands to [base],...,[base+7*stride]  (base3 must equal base+3*stride)
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base+stride], [base+stride*2], [base3], \
+ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 bytes in %1..%8
+; out: 4 rows of 8 bytes in mm0..mm3
+; Transposes via byte->word->dword interleaves: each half of the rows is
+; interleaved separately, then the halves are combined with dword unpacks.
+%macro TRANSPOSE4x8_LOAD 8
+ movd mm0, %1
+ movd mm2, %2
+ movd mm1, %3
+ movd mm3, %4
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+ movq mm2, mm0
+ punpcklwd mm0, mm1
+ punpckhwd mm2, mm1 ; mm0/mm2 = rows 1-4 interleaved to words
+
+ movd mm4, %5
+ movd mm6, %6
+ movd mm5, %7
+ movd mm7, %8
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+ movq mm6, mm4
+ punpcklwd mm4, mm5
+ punpckhwd mm6, mm5 ; mm4/mm6 = rows 5-8 interleaved to words
+
+ movq mm1, mm0
+ movq mm3, mm2
+ punpckldq mm0, mm4
+ punpckhdq mm1, mm4
+ punpckldq mm2, mm6
+ punpckhdq mm3, mm6 ; combine halves: mm0..mm3 = transposed columns
+%endmacro
+
+; in: 4 rows of 8 bytes in mm0..mm3
+; out: 8 rows of 4 bytes in %1..%8
+; Inverse of TRANSPOSE4x8_LOAD: the high dwords are copied aside first
+; (mm4..mm6), then each half is re-interleaved and stored 4 bytes at a time.
+%macro TRANSPOSE8x4_STORE 8
+ movq mm4, mm0
+ movq mm5, mm1
+ movq mm6, mm2
+ punpckhdq mm4, mm4 ; save high halves of mm0..mm2 before they are clobbered
+ punpckhdq mm5, mm5
+ punpckhdq mm6, mm6
+
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm1, mm2
+ movd %1, mm0
+ punpckhdq mm0, mm0
+ movd %2, mm0
+ movd %3, mm1
+ punpckhdq mm1, mm1
+ movd %4, mm1 ; rows 1-4 stored
+
+ punpckhdq mm3, mm3
+ punpcklbw mm4, mm5
+ punpcklbw mm6, mm3
+ movq mm5, mm4
+ punpcklwd mm4, mm6
+ punpckhwd mm5, mm6
+ movd %5, mm4
+ punpckhdq mm4, mm4
+ movd %6, mm4
+ movd %7, mm5
+ punpckhdq mm5, mm5
+ movd %8, mm5 ; rows 5-8 stored
+%endmacro
+
+; interleave %2 with %3 at granularity %1 (bw/wd/dq):
+; %2 = low-half interleave, %4 = high-half interleave (%4 is a scratch reg)
+%macro SBUTTERFLY 4
+ movq %4, %2
+ punpckl%1 %2, %3
+ punpckh%1 %4, %3
+%endmacro
+
+; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
+; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
+; Output rows are spaced 16 bytes apart so two 8x8 transposes can be
+; interleaved into one 16-pel-wide buffer (second pass writes at %9+8).
+; [%9+0x10] is used as scratch mid-transpose before its final value lands.
+%macro TRANSPOSE6x8_MEM 9
+ movq mm0, %1
+ movq mm1, %3
+ movq mm2, %5
+ movq mm3, %7
+ SBUTTERFLY bw, mm0, %2, mm4
+ SBUTTERFLY bw, mm1, %4, mm5
+ SBUTTERFLY bw, mm2, %6, mm6
+ movq [%9+0x10], mm5 ; spill: all 8 mmx regs are live at this point
+ SBUTTERFLY bw, mm3, %8, mm7
+ SBUTTERFLY wd, mm0, mm1, mm5
+ SBUTTERFLY wd, mm2, mm3, mm1
+ punpckhdq mm0, mm2
+ movq [%9+0x00], mm0
+ SBUTTERFLY wd, mm4, [%9+0x10], mm3 ; reload the spilled half
+ SBUTTERFLY wd, mm6, mm7, mm2
+ SBUTTERFLY dq, mm4, mm6, mm0
+ SBUTTERFLY dq, mm5, mm1, mm7
+ punpckldq mm3, mm2
+ movq [%9+0x10], mm5
+ movq [%9+0x20], mm7
+ movq [%9+0x30], mm4
+ movq [%9+0x40], mm0
+ movq [%9+0x50], mm3
+%endmacro
+
+; out: %4 = |%1-%2|>%3  (per byte: zero if |%1-%2| <= %3, nonzero otherwise)
+; clobbers: %5
+; |a-b| is built from the two saturating differences: (a-b)|(b-a).
+%macro DIFF_GT_MMX 5
+ movq %5, %2
+ movq %4, %1
+ psubusb %5, %1 ; max(%2-%1, 0)
+ psubusb %4, %2 ; max(%1-%2, 0)
+ por %4, %5 ; = |%1-%2| (one operand is always 0)
+ psubusb %4, %3 ; nonzero iff |%1-%2| > %3
+%endmacro
+
+; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
+; out: mm5=beta-1, mm7=mask (0xff per byte where all three edge conditions
+;      |p0-q0|<=alpha-1, |p1-p0|<=beta-1, |q1-q0|<=beta-1 hold, else 0)
+; clobbers: mm4,mm6
+%macro LOAD_MASK_MMX 2
+ movd mm4, %1
+ movd mm5, %2
+ pshufw mm4, mm4, 0 ; broadcast to all 4 words
+ pshufw mm5, mm5, 0
+ packuswb mm4, mm4 ; 8x alpha-1
+ packuswb mm5, mm5 ; 8x beta-1
+ DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
+ DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
+ por mm7, mm4
+ DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
+ por mm7, mm4
+ pxor mm6, mm6
+ pcmpeqb mm7, mm6 ; invert: 0xff where no threshold exceeded
+%endmacro
+
+; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
+; out: mm1=p0' mm2=q0'
+; clobbers: mm0,3-6
+; Computes delta = ((q0-p0)<<2 + (p1-q1) + 4)>>3 bit-exactly using pavgb
+; averages plus xor-based rounding corrections, then applies it to p0/q0
+; with the clip to +/-tc done via saturating subs against mm7.
+%macro DEBLOCK_P0_Q0_MMX 0
+ ; a = q0^p0^((p1-q1)>>2)
+ movq mm4, mm0
+ psubb mm4, mm3
+ psrlw mm4, 2 ; bytewise >>2 (cross-byte garbage cancels in the xors)
+ pxor mm4, mm1
+ pxor mm4, mm2
+ ; b = p0^(q1>>2)
+ psrlw mm3, 2
+ pand mm3, [pb_3f] ; clear bits shifted in from the neighboring byte
+ movq mm5, mm1
+ pxor mm5, mm3
+ ; c = q0^(p1>>2)
+ psrlw mm0, 2
+ pand mm0, [pb_3f]
+ movq mm6, mm2
+ pxor mm6, mm0
+ ; d = (c^b) & ~(b^a) & 1
+ pxor mm6, mm5
+ pxor mm5, mm4
+ pandn mm5, mm6
+ pand mm5, [pb_01]
+ ; delta = (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3
+ ; = (avg(q0, p1>>2) + (d&a))
+ ; - (avg(p0, q1>>2) + (d^(d&a)))
+ pavgb mm0, mm2
+ pand mm4, mm5
+ paddusb mm0, mm4
+ pavgb mm3, mm1
+ pxor mm4, mm5
+ paddusb mm3, mm4
+ ; p0 += clip(delta, -tc0, tc0)
+ ; q0 -= clip(delta, -tc0, tc0)
+ movq mm4, mm0
+ psubusb mm0, mm3 ; max(delta, 0)
+ psubusb mm3, mm4 ; max(-delta, 0)
+ pminub mm0, mm7 ; clip to tc (mm7 already zeroed where mask is off)
+ pminub mm3, mm7
+ paddusb mm1, mm0
+ paddusb mm2, mm3
+ psubusb mm1, mm3
+ psubusb mm2, mm0
+%endmacro
+
+; in: mm1=p0 mm2=q0
+; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
+; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+; clobbers: q2, tmp, tc0
+; (also used for the p1 update with the p-side registers/memory operands)
+%macro LUMA_Q1_MMX 6
+ movq %6, mm1
+ pavgb %6, mm2 ; (p0+q0+1)>>1
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pxor %6, %3
+ pand %6, [pb_01] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1  (undo double rounding)
+ movq %6, %1
+ psubusb %6, %5 ; q1-tc0
+ paddusb %5, %1 ; q1+tc0
+ pmaxub %2, %6 ; clip low
+ pminub %2, %5 ; clip high
+ movq %4, %2
+%endmacro
+
+
+SECTION .text
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+; Inter luma filter on one horizontal edge, 8 pels wide (half a macroblock;
+; the C wrapper calls this twice for a full 16-pel edge).
+x264_deblock_v8_luma_mmxext:
+ push edi
+ push esi
+ mov edi, [esp+12] ; pix
+ mov esi, [esp+16] ; stride
+ mov edx, [esp+20] ; alpha
+ mov ecx, [esp+24] ; beta
+ dec edx
+ dec ecx
+ mov eax, edi
+ sub eax, esi
+ sub eax, esi
+ sub eax, esi ; pix-3*stride
+ sub esp, 16 ; locals: [esp+0]=mask, [esp+8]=tc
+
+ movq mm0, [eax+esi] ; p1
+ movq mm1, [eax+2*esi] ; p0
+ movq mm2, [edi] ; q0
+ movq mm3, [edi+esi] ; q1
+ LOAD_MASK_MMX edx, ecx
+
+ mov ecx, [esp+44] ; tc0, use only the low 16 bits
+ movd mm4, [ecx]
+ punpcklbw mm4, mm4
+ punpcklbw mm4, mm4 ; tc = 4x tc0[1], 4x tc0[0]
+ movq [esp+8], mm4 ; tc
+ pcmpgtb mm4, [pb_ff] ; 0xff where tc0 >= 0 (tc0 = -1 means skip)
+ pand mm4, mm7
+ movq [esp+0], mm4 ; mask
+
+ movq mm3, [eax] ; p2
+ DIFF_GT_MMX mm1, mm3, mm5, mm6, mm7 ; |p2-p0| > beta-1
+ pandn mm6, mm4
+ pcmpeqb mm6, mm4
+ pand mm6, mm4 ; mm6 = 0xff where mask set and |p2-p0| <= beta-1
+ pand mm4, [esp+8] ; tc
+ movq mm7, [pb_01]
+ pand mm7, mm6 ; tc++ where p1 is also filtered
+ pand mm6, mm4
+ paddb mm7, mm4
+ LUMA_Q1_MMX mm0, mm3, [eax], [eax+esi], mm6, mm4 ; update p1
+
+ movq mm4, [edi+2*esi] ; q2
+ DIFF_GT_MMX mm2, mm4, mm5, mm6, mm3 ; |q2-q0| > beta-1
+ movq mm5, [esp+0] ; mask
+ pandn mm6, mm5
+ pcmpeqb mm6, mm5
+ pand mm6, mm5 ; mm6 = 0xff where mask set and |q2-q0| <= beta-1
+ movq mm5, [esp+8] ; tc
+ pand mm5, mm6
+ pand mm6, [pb_01]
+ paddb mm7, mm6 ; tc++ where q1 is also filtered
+ movq mm3, [edi+esi]
+ LUMA_Q1_MMX mm3, mm4, [edi+2*esi], [edi+esi], mm5, mm6 ; update q1
+
+ DEBLOCK_P0_Q0_MMX
+ movq [eax+2*esi], mm1 ; store p0'
+ movq [edi], mm2 ; store q0'
+
+ add esp, 16
+ pop esi
+ pop edi
+ ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+; Horizontal-edge filter implemented by transposing the 6 relevant columns
+; into a temp buffer, running the vertical filter on it, and transposing
+; the 4 modified rows back.
+x264_deblock_h_luma_mmxext:
+ push ebx
+ push ebp
+ mov eax, [esp+12] ; pix
+ mov ebx, [esp+16] ; stride
+ lea ebp, [ebx+ebx*2] ; 3*stride
+ sub eax, 4
+ lea ecx, [eax+ebp]
+ sub esp, 96 ; 6 rows x 16 pels of transpose buffer
+%define pix_tmp esp
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp
+ lea eax, [eax+ebx*8]
+ lea ecx, [ecx+ebx*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp+8
+
+ ; vertical filter
+ ; [esp+124] tracks esp, so each push below reads the next original arg
+ push dword [esp+124] ; tc0
+ push dword [esp+124] ; beta
+ push dword [esp+124] ; alpha
+ push dword 16 ; stride of the temp buffer
+ push dword pix_tmp ; pushes current esp = pix_tmp-16
+ add dword [esp], 0x40 ; pix_tmp+0x30
+ call x264_deblock_v8_luma_mmxext
+
+ add dword [esp ], 8 ; pix_tmp+0x38 (second 8-pel half)
+ add dword [esp+16], 2 ; tc0+2
+ call x264_deblock_v8_luma_mmxext
+ add esp, 20 ; pop the 5 call args
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ mov eax, [esp+108] ; pix
+ sub eax, 2
+ lea ecx, [eax+ebp]
+
+ movq mm0, [pix_tmp+0x10]
+ movq mm1, [pix_tmp+0x20]
+ movq mm2, [pix_tmp+0x30]
+ movq mm3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp)
+
+ lea eax, [eax+ebx*8]
+ lea ecx, [ecx+ebx*8]
+ movq mm0, [pix_tmp+0x18]
+ movq mm1, [pix_tmp+0x28]
+ movq mm2, [pix_tmp+0x38]
+ movq mm3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp)
+
+ add esp, 96
+ pop ebp
+ pop ebx
+ ret
+
+
+; common prologue for the vertical chroma filters:
+; loads args, decrements alpha/beta, sets eax = pix-2*stride (p1 row)
+%macro CHROMA_V_START 0
+ push edi
+ push esi
+ mov edi, [esp+12] ; pix
+ mov esi, [esp+16] ; stride
+ mov edx, [esp+20] ; alpha
+ mov ecx, [esp+24] ; beta
+ dec edx
+ dec ecx
+ mov eax, edi
+ sub eax, esi
+ sub eax, esi ; eax = pix-2*stride
+%endmacro
+
+; common prologue for the horizontal chroma filters:
+; loads args, decrements alpha/beta, sets ebp = 3*stride,
+; eax = pix-2 (first 4 rows), edi = pix-2+3*stride (for PASS8ROWS)
+%macro CHROMA_H_START 0
+ push edi
+ push esi
+ push ebp
+ mov edi, [esp+16] ; pix
+ mov esi, [esp+20] ; stride
+ mov edx, [esp+24] ; alpha
+ mov ecx, [esp+28] ; beta
+ dec edx
+ dec ecx
+ sub edi, 2
+ mov ebp, esi
+ add ebp, esi
+ add ebp, esi ; ebp = 3*stride
+ mov eax, edi
+ add edi, ebp
+%endmacro
+
+; common epilogue for the chroma filters (pairs with CHROMA_V_START;
+; CHROMA_H_START users pop ebp themselves first)
+%macro CHROMA_END 0
+ pop esi
+ pop edi
+ ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+; Inter chroma filter on a horizontal edge: only p0/q0 are modified.
+x264_deblock_v_chroma_mmxext:
+ CHROMA_V_START
+ push ebx
+ mov ebx, [esp+32] ; tc0
+
+ movq mm0, [eax] ; p1
+ movq mm1, [eax+esi] ; p0
+ movq mm2, [edi] ; q0
+ movq mm3, [edi+esi] ; q1
+
+ LOAD_MASK_MMX edx, ecx
+ movd mm6, [ebx]
+ punpcklbw mm6, mm6 ; tc = 2x each of tc0[0..3]
+ pand mm7, mm6 ; tc & mask
+ DEBLOCK_P0_Q0_MMX
+
+ movq [eax+esi], mm1 ; store p0'
+ movq [edi], mm2 ; store q0'
+
+ pop ebx
+ CHROMA_END
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+; Inter chroma filter on a vertical edge: transpose in, filter, transpose out.
+x264_deblock_h_chroma_mmxext:
+ CHROMA_H_START
+ push ebx
+ mov ebx, [esp+36] ; tc0
+ sub esp, 16 ; spill space for the unmodified p1/q1
+
+ TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp)
+ movq [esp+8], mm0 ; save p1 (DEBLOCK_P0_Q0_MMX clobbers mm0/mm3)
+ movq [esp+0], mm3 ; save q1
+
+ LOAD_MASK_MMX edx, ecx
+ movd mm6, [ebx]
+ punpcklbw mm6, mm6 ; tc = 2x each of tc0[0..3]
+ pand mm7, mm6 ; tc & mask
+ DEBLOCK_P0_Q0_MMX
+
+ movq mm0, [esp+8] ; restore p1/q1 for the store-back transpose
+ movq mm3, [esp+0]
+ TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp)
+
+ add esp, 16
+ pop ebx
+ pop ebp
+ CHROMA_END
+
+
+; in: %1=p0 %2=p1 %3=q1
+; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
+; Built from two pavgb's; the pb_01 correction removes the extra rounding
+; of the inner average so the result is bit-exact.
+%macro CHROMA_INTRA_P0 3
+ movq mm4, %1
+ pxor mm4, %3
+ pand mm4, [pb_01] ; mm4 = (p0^q1)&1
+ pavgb %1, %3
+ psubusb %1, mm4
+ pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
+%endmacro
+
+; intra chroma filter core: computes new p0/q0, then uses the edge mask
+; (mm7 from LOAD_MASK_MMX) to keep the original bytes where the filter
+; conditions fail. in: mm0=p1 mm1=p0 mm2=q0 mm3=q1; out: mm1=p0' mm2=q0'
+%macro CHROMA_INTRA_BODY 0
+ LOAD_MASK_MMX edx, ecx
+ movq mm5, mm1 ; keep original p0
+ movq mm6, mm2 ; keep original q0
+ CHROMA_INTRA_P0 mm1, mm0, mm3
+ CHROMA_INTRA_P0 mm2, mm3, mm0
+ psubb mm1, mm5 ; select filtered vs original per the mask:
+ psubb mm2, mm6 ; new = old + ((new-old) & mask)
+ pand mm1, mm7
+ pand mm2, mm7
+ paddb mm1, mm5
+ paddb mm2, mm6
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+; Intra chroma filter on a horizontal edge (no tc0; only p0/q0 change).
+x264_deblock_v_chroma_intra_mmxext:
+ CHROMA_V_START
+ movq mm0, [eax] ; p1
+ movq mm1, [eax+esi] ; p0
+ movq mm2, [edi] ; q0
+ movq mm3, [edi+esi] ; q1
+ CHROMA_INTRA_BODY
+ movq [eax+esi], mm1 ; store p0'
+ movq [edi], mm2 ; store q0'
+ CHROMA_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+; Intra chroma filter on a vertical edge: transpose in, filter, transpose out.
+x264_deblock_h_chroma_intra_mmxext:
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp)
+ CHROMA_INTRA_BODY
+ TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp)
+ pop ebp
+ CHROMA_END
+
x264_mc_init( h->param.cpu, &h->mc );
x264_csp_init( h->param.cpu, h->param.i_csp, &h->csp );
x264_quant_init( h, h->param.cpu, &h->quantf );
+ x264_deblock_init( h->param.cpu, &h->loopf );
memcpy( h->pixf.mbcmp,
( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd,
return ret;
}
+/* Compare the asm deblocking filters against the C reference over a sweep
+ * of alpha/beta/tc values and two random pixel distributions. Returns the
+ * accumulated failure count via the report mechanism. */
+static int check_deblock( int cpu_ref, int cpu_new )
+{
+ x264_deblock_function_t db_c;
+ x264_deblock_function_t db_ref;
+ x264_deblock_function_t db_a;
+ int ret = 0, ok = 1, used_asm = 0;
+ int alphas[36], betas[36];
+ int8_t tcs[36][4];
+ int a, c, i, j;
+
+ x264_deblock_init( 0, &db_c );
+ x264_deblock_init( cpu_ref, &db_ref );
+ x264_deblock_init( cpu_new, &db_a );
+
+ /* not exactly the real values of a,b,tc but close enough */
+ a = 255; c = 250;
+ for( i = 35; i >= 0; i-- )
+ {
+ alphas[i] = a;
+ betas[i] = (i+1)/2;
+ tcs[i][0] = tcs[i][2] = (c+6)/10;
+ tcs[i][1] = tcs[i][3] = (c+9)/20;
+ a = a*9/10;
+ c = c*9/10;
+ }
+
+#define TEST_DEBLOCK( name, ... ) \
+ for( i = 0; i < 36; i++ ) \
+ { \
+ for( j = 0; j < 1024; j++ ) \
+ /* two distributions of random to exercise different failure modes */\
+ buf1[j] = rand() & (i&1 ? 0xf : 0xff ); \
+ memcpy( buf3, buf1, 1024 ); \
+ memcpy( buf4, buf1, 1024 ); \
+ if( db_a.name != db_ref.name ) \
+ { \
+ used_asm = 1; \
+ db_c.name( &buf3[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ db_a.name( &buf4[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ if( memcmp( buf3, buf4, 1024 ) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
+ break; \
+ } \
+ } \
+ }
+
+ TEST_DEBLOCK( deblock_h_luma, tcs[i] );
+ TEST_DEBLOCK( deblock_v_luma, tcs[i] );
+ TEST_DEBLOCK( deblock_h_chroma, tcs[i] );
+ TEST_DEBLOCK( deblock_v_chroma, tcs[i] );
+ TEST_DEBLOCK( deblock_h_luma_intra );
+ TEST_DEBLOCK( deblock_v_luma_intra );
+ TEST_DEBLOCK( deblock_h_chroma_intra );
+ TEST_DEBLOCK( deblock_v_chroma_intra );
+
+ report( "deblock :" );
+
+ return ret;
+}
+
+
static int check_quant( int cpu_ref, int cpu_new )
{
x264_quant_function_t qf_c;
return check_pixel( cpu_ref, cpu_new )
+ check_dct( cpu_ref, cpu_new )
+ check_mc( cpu_ref, cpu_new )
+ + check_deblock( cpu_ref, cpu_new )
+ check_quant( cpu_ref, cpu_new );
}