From: Loren Merritt Date: Wed, 26 Oct 2005 08:38:11 +0000 (+0000) Subject: mmx deblocking. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=109ae085288c0068e2f40bfffd41070bd25dfa8b;p=libx264 mmx deblocking. 2.5x faster deblocking functions, 1-4% overall. git-svn-id: svn://svn.videolan.org/x264/trunk@341 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/Makefile b/Makefile index 91085dd9..fb7b85c9 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,8 @@ SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \ common/i386/pixel-a.asm common/i386/mc-a.asm \ common/i386/mc-a2.asm common/i386/predict-a.asm \ - common/i386/pixel-sse2.asm common/i386/quant-a.asm + common/i386/pixel-sse2.asm common/i386/quant-a.asm \ + common/i386/deblock-a.asm OBJASM = $(ASMSRC:%.asm=%.o) endif @@ -31,7 +32,8 @@ SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \ common/amd64/pixel-a.asm common/amd64/mc-a.asm \ common/amd64/mc-a2.asm common/amd64/predict-a.asm \ - common/amd64/pixel-sse2.asm common/amd64/quant-a.asm + common/amd64/pixel-sse2.asm common/amd64/quant-a.asm \ + common/amd64/deblock-a.asm OBJASM = $(ASMSRC:%.asm=%.o) ASFLAGS += -Icommon/amd64 endif diff --git a/common/amd64/deblock-a.asm b/common/amd64/deblock-a.asm new file mode 100644 index 00000000..47053f15 --- /dev/null +++ b/common/amd64/deblock-a.asm @@ -0,0 +1,494 @@ +;***************************************************************************** +;* deblock-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005 x264 project +;* +;* Authors: Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +BITS 64 + +%include "amd64inc.asm" + +SECTION .rodata align=16 +pb_01: times 16 db 0x01 +pb_3f: times 16 db 0x3f +pb_ff: times 16 db 0xff + +SECTION .text +cglobal x264_deblock_v_luma_sse2 +cglobal x264_deblock_h_luma_sse2 +cglobal x264_deblock_v_chroma_mmxext +cglobal x264_deblock_h_chroma_mmxext +cglobal x264_deblock_v_chroma_intra_mmxext +cglobal x264_deblock_h_chroma_intra_mmxext + +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base+stride], [base+stride*2], [base3], \ + [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] + +; in: 8 rows of 4 bytes in %1..%8 +; out: 4 rows of 8 bytes in mm0..mm3 +%macro TRANSPOSE4x8_LOAD 8 + movd mm0, %1 + movd mm2, %2 + movd mm1, %3 + movd mm3, %4 + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + movq mm2, mm0 + punpcklwd mm0, mm1 + punpckhwd mm2, mm1 + + movd mm4, %5 + movd mm6, %6 + movd mm5, %7 + movd mm7, %8 + punpcklbw mm4, mm6 + punpcklbw mm5, mm7 + movq mm6, mm4 + punpcklwd mm4, mm5 + punpckhwd mm6, mm5 + + movq mm1, mm0 + movq mm3, mm2 + punpckldq mm0, mm4 + punpckhdq mm1, mm4 + punpckldq mm2, mm6 + punpckhdq mm3, mm6 +%endmacro + +; in: 4 rows of 8 bytes in mm0..mm3 +; out: 8 rows of 4 bytes in %1..%8 +%macro TRANSPOSE8x4_STORE 8 + movq mm4, mm0 + movq mm5, mm1 + movq mm6, mm2 + punpckhdq mm4, mm4 + punpckhdq mm5, mm5 + punpckhdq mm6, mm6 + + punpcklbw mm0, mm1 + punpcklbw mm2, mm3 + movq mm1, mm0 + punpcklwd mm0, mm2 + punpckhwd mm1, mm2 + movd %1, mm0 + punpckhdq mm0, mm0 + movd %2, mm0 + movd %3, mm1 + punpckhdq mm1, mm1 + movd %4, mm1 + + punpckhdq mm3, mm3 + punpcklbw mm4, mm5 + punpcklbw mm6, mm3 + movq mm5, mm4 + punpcklwd mm4, mm6 + punpckhwd mm5, mm6 + movd %5, mm4 + punpckhdq mm4, mm4 + movd %6, mm4 + movd %7, mm5 + punpckhdq mm5, mm5 + movd %8, mm5 +%endmacro + +%macro SBUTTERFLY 4 + movq %4, %2 + punpckl%1 %2, %3 + punpckh%1 %4, %3 +%endmacro + +; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 +; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] +%macro TRANSPOSE6x8_MEM 9 + movq mm0, %1 + movq mm1, %3 + movq mm2, %5 + movq mm3, %7 + SBUTTERFLY bw, mm0, %2, mm4 + SBUTTERFLY bw, mm1, %4, mm5 + SBUTTERFLY bw, mm2, %6, mm6 + movq [%9+0x10], mm5 + SBUTTERFLY bw, mm3, %8, mm7 + SBUTTERFLY wd, mm0, mm1, mm5 + SBUTTERFLY wd, mm2, mm3, mm1 + punpckhdq mm0, mm2 + movq [%9+0x00], mm0 + SBUTTERFLY wd, mm4, [%9+0x10], mm3 + SBUTTERFLY wd, mm6, mm7, mm2 + SBUTTERFLY dq, mm4, mm6, mm0 + SBUTTERFLY dq, mm5, mm1, mm7 + punpckldq mm3, mm2 + movq [%9+0x10], mm5 + movq [%9+0x20], mm7 + movq [%9+0x30], mm4 + movq [%9+0x40], mm0 + movq [%9+0x50], mm3 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT 6 + mov%1 %6, %3 + mov%1 %5, %2 + psubusb %6, %2 + psubusb %5, %3 + por %5, %6 + psubusb %5, %4 +%endmacro +%macro DIFF_GT_MMX 5 + DIFF_GT q, %1, %2, %3, %4, %5 +%endmacro +%macro DIFF_GT_SSE2 5 + DIFF_GT dqa, %1, %2, %3, %4, %5 +%endmacro + +; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1 +; out: mm5=beta-1, mm7=mask +; clobbers: mm4,mm6 +%macro LOAD_MASK_MMX 2 + movd mm4, %1 + movd mm5, %2 + pshufw mm4, mm4, 0 + pshufw mm5, mm5, 0 + packuswb mm4, mm4 ; 8x alpha-1 + packuswb mm5, mm5 ; 8x beta-1 + DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1 + DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1 + por mm7, mm4 + DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1 + por mm7, mm4 + pxor mm6, mm6 + pcmpeqb mm7, mm6 +%endmacro +%macro LOAD_MASK_SSE2 2 + movd xmm4, %1 + movd xmm5, %2 + pshuflw xmm4, xmm4, 0 + pshuflw xmm5, xmm5, 0 + punpcklqdq xmm4, xmm4 + punpcklqdq xmm5, xmm5 + packuswb xmm4, xmm4 ; 16x alpha-1 + packuswb xmm5, xmm5 ; 16x beta-1 + DIFF_GT_SSE2 xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1 + DIFF_GT_SSE2 xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1 + por xmm7, xmm4 + DIFF_GT_SSE2 xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1 + por xmm7, xmm4 + pxor xmm6, xmm6 + pcmpeqb xmm7, xmm6 +%endmacro + +; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) +; out: mm1=p0' mm2=q0' +; clobbers: mm0,3-6 +%macro DEBLOCK_P0_Q0 2 + ; a = q0^p0^((p1-q1)>>2) + mov%1 %2m4, %2m0 + psubb %2m4, %2m3 + psrlw %2m4, 2 + pxor %2m4, %2m1 + pxor %2m4, %2m2 + ; b = p0^(q1>>2) + psrlw %2m3, 2 + pand %2m3, [pb_3f GLOBAL] + mov%1 %2m5, %2m1 + pxor %2m5, %2m3 + ; c = q0^(p1>>2) + psrlw %2m0, 2 + pand %2m0, [pb_3f GLOBAL] + mov%1 %2m6, %2m2 + pxor %2m6, %2m0 + ; d = (c^b) & ~(b^a) & 1 + pxor %2m6, %2m5 + pxor %2m5, %2m4 + pandn %2m5, %2m6 + pand %2m5, [pb_01 GLOBAL] + ; delta = (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3 + ; = (avg(q0, p1>>2) + (d&a)) + ; - (avg(p0, q1>>2) + (d^(d&a))) + pavgb %2m0, %2m2 + pand %2m4, %2m5 + paddusb %2m0, %2m4 + pavgb %2m3, %2m1 + pxor %2m4, %2m5 + paddusb %2m3, %2m4 + ; p0 += clip(delta, -tc0, tc0) + ; q0 -= clip(delta, -tc0, tc0) + mov%1 %2m4, %2m0 + psubusb %2m0, %2m3 + psubusb %2m3, %2m4 + pminub %2m0, %2m7 + pminub %2m3, %2m7 + paddusb %2m1, %2m0 + paddusb %2m2, %2m3 + psubusb %2m1, %2m3 + psubusb %2m2, %2m0 +%endmacro +%macro DEBLOCK_P0_Q0_MMX 0 + DEBLOCK_P0_Q0 q, m +%endmacro +%macro DEBLOCK_P0_Q0_SSE2 0 + DEBLOCK_P0_Q0 dqa, xm +%endmacro + +; in: mm1=p0 mm2=q0 +; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp +; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) +; clobbers: q2, tmp, tc0 +%macro LUMA_Q1_SSE2 6 + movdqa %6, xmm1 + pavgb %6, xmm2 + pavgb %2, %6 ; avg(p2,avg(p0,q0)) + pxor %6, %3 + pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 + movdqa %6, %1 + psubusb %6, %5 + paddusb %5, %1 + pmaxub %2, %6 + pminub %2, %5 + movdqa %4, %2 
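+    ; pavgb computes (a+b+1)>>1, so the two averages above would round up
+    ; twice; subtracting (q2^avg(p0,q0))&1 removes the second carry, giving
+    ; exactly (q2+((p0+q0+1)>>1))>>1 before the clip. Scalar sketch of what
+    ; this macro computes (variable names illustrative only):
+    ;     t  = ( p0 + q0 + 1 ) >> 1;
+    ;     f  = ( q2 + t ) >> 1;
+    ;     q1 = x264_clip3( f, q1 - tc0, q1 + tc0 );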
+%endmacro + + +SECTION .text +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_v_luma_sse2: + ; rdi = pix + movsxd rsi, esi ; stride + dec edx ; alpha-1 + dec ecx ; beta-1 + movd xmm8, [r8] ; tc0 + mov r8, rdi + sub r8, rsi + sub r8, rsi + sub r8, rsi ; pix-3*stride + + movdqa xmm0, [r8+rsi] ; p1 + movdqa xmm1, [r8+2*rsi] ; p0 + movdqa xmm2, [rdi] ; q0 + movdqa xmm3, [rdi+rsi] ; q1 + LOAD_MASK_SSE2 edx, ecx + + punpcklbw xmm8, xmm8 + punpcklbw xmm8, xmm8 ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] + movdqa xmm9, [pb_ff GLOBAL] + pcmpeqb xmm9, xmm8 + pandn xmm9, xmm7 + pand xmm8, xmm9 + + movdqa xmm3, [r8] ; p2 + DIFF_GT_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1 + pandn xmm6, xmm9 + pcmpeqb xmm6, xmm9 + pand xmm6, xmm9 + movdqa xmm7, [pb_01 GLOBAL] + pand xmm7, xmm6 + pand xmm6, xmm8 + paddb xmm7, xmm8 + LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4 + + movdqa xmm4, [rdi+2*rsi] ; q2 + DIFF_GT_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1 + pandn xmm6, xmm9 + pcmpeqb xmm6, xmm9 + pand xmm6, xmm9 + pand xmm8, xmm6 + pand xmm6, [pb_01 GLOBAL] + paddb xmm7, xmm6 + movdqa xmm3, [rdi+rsi] + LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6 + + DEBLOCK_P0_Q0_SSE2 + movdqa [r8+2*rsi], xmm1 + movdqa [rdi], xmm2 + + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_h_luma_sse2: + movsxd r10, esi + lea r11, [r10+r10*2] + lea rax, [rdi-4] + lea r9, [rdi-4+r11] + %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp + lea rax, [rax+r10*8] + lea r9, [r9 +r10*8] + TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 + + ; vertical filter + ; alpha, beta, tc0 are still in edx, ecx, r8 + ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + lea rdi, [pix_tmp+0x30] + mov esi, 0x10 + call x264_deblock_v_luma_sse2 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + add rax, 2 + add r9, 2 + movq mm0, [pix_tmp+0x18] + movq mm1, [pix_tmp+0x28] + movq mm2, [pix_tmp+0x38] + movq mm3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + + shl r10, 3 + sub rax, r10 + sub r9, r10 + shr r10, 3 + movq mm0, [pix_tmp+0x10] + movq mm1, [pix_tmp+0x20] + movq mm2, [pix_tmp+0x30] + movq mm3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + + ret + + +%macro CHROMA_V_START 0 + ; rdi = pix + movsxd rsi, esi ; stride + dec edx ; alpha-1 + dec ecx ; beta-1 + mov rax, rdi + sub rax, rsi + sub rax, rsi +%endmacro + +%macro CHROMA_H_START 0 + movsxd rsi, esi + dec edx + dec ecx + sub rdi, 2 + lea r9, [rsi+rsi*2] + mov rax, rdi + add rdi, r9 +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_v_chroma_mmxext: + CHROMA_V_START + + movq mm0, [rax] + movq mm1, 
[rax+rsi] + movq mm2, [rdi] + movq mm3, [rdi+rsi] + + LOAD_MASK_MMX edx, ecx + movd mm6, [r8] ; tc0 + punpcklbw mm6, mm6 + pand mm7, mm6 + DEBLOCK_P0_Q0_MMX + + movq [rax+rsi], mm1 + movq [rdi], mm2 + ret + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_h_chroma_mmxext: + CHROMA_H_START + + TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9) + movq [rsp-8], mm0 + movq [rsp-16], mm3 + + LOAD_MASK_MMX edx, ecx + movd mm6, [r8] ; tc0 + punpcklbw mm6, mm6 + pand mm7, mm6 + DEBLOCK_P0_Q0_MMX + + movq mm0, [rsp-8] + movq mm3, [rsp-16] + TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9) + ret + + +; in: %1=p0 %2=p1 %3=q1 +; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 +%macro CHROMA_INTRA_P0 3 + movq mm4, %1 + pxor mm4, %3 + pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1 + pavgb %1, %3 + psubusb %1, mm4 + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) +%endmacro + +%macro CHROMA_INTRA_BODY 0 + LOAD_MASK_MMX edx, ecx + movq mm5, mm1 + movq mm6, mm2 + CHROMA_INTRA_P0 mm1, mm0, mm3 + CHROMA_INTRA_P0 mm2, mm3, mm0 + psubb mm1, mm5 + psubb mm2, mm6 + pand mm1, mm7 + pand mm2, mm7 + paddb mm1, mm5 + paddb mm2, mm6 +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +x264_deblock_v_chroma_intra_mmxext: + CHROMA_V_START + + movq mm0, [rax] + movq mm1, [rax+rsi] + movq mm2, [rdi] + movq mm3, [rdi+rsi] + + CHROMA_INTRA_BODY + + movq [rax+rsi], mm1 + movq [rdi], mm2 + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +x264_deblock_h_chroma_intra_mmxext: + CHROMA_H_START + TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9) + CHROMA_INTRA_BODY + TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9) + ret + diff --git a/common/common.h b/common/common.h index 89d6411d..07acc789 100644 --- a/common/common.h +++ b/common/common.h @@ -497,6 +497,7 @@ struct x264_t x264_dct_function_t dctf; x264_csp_function_t csp; x264_quant_function_t quantf; + x264_deblock_function_t loopf; /* vlc table for decoding purpose only */ x264_vlc_table_t *x264_coeff_token_lookup[5]; diff --git a/common/frame.c b/common/frame.c index 68879721..d44d1845 100644 --- a/common/frame.c +++ b/common/frame.c @@ -293,355 +293,191 @@ static inline int clip_uint8( int a ) return a; } -static inline void deblocking_filter_edgev( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP ) +static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) { int i, d; - const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 ); - const int alpha = i_alpha_table[i_index_a]; - const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )]; - - for( i = 0; i < 4; i++ ) - { - if( bS[i] == 0 ) - { - pix += 4 * i_pix_stride; + for( i = 0; i < 4; i++ ) { + if( tc0[i] < 0 ) { + pix += 4*ystride; continue; } - - if( bS[i] < 4 ) - { - const int tc0 = i_tc0_table[i_index_a][bS[i] - 1]; - - /* 4px edge length */ - for( d = 0; d < 4; d++ 
) - { - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int p2 = pix[-3]; - const int q0 = pix[0]; - const int q1 = pix[1]; - const int q2 = pix[2]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) - { - int tc = tc0; - int i_delta; - - if( abs( p2 - p0 ) < beta ) - { - pix[-2] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 ); - tc++; - } - if( abs( q2 - q0 ) < beta ) - { - pix[1] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 ); - tc++; - } - - i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ + for( d = 0; d < 4; d++ ) { + const int p2 = pix[-3*xstride]; + const int p1 = pix[-2*xstride]; + const int p0 = pix[-1*xstride]; + const int q0 = pix[ 0*xstride]; + const int q1 = pix[ 1*xstride]; + const int q2 = pix[ 2*xstride]; + + if( X264_ABS( p0 - q0 ) < alpha && + X264_ABS( p1 - p0 ) < beta && + X264_ABS( q1 - q0 ) < beta ) { + + int tc = tc0[i]; + int delta; + + if( X264_ABS( p2 - p0 ) < beta ) { + pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] ); + tc++; } - pix += i_pix_stride; - } - } - else - { - /* 4px edge length */ - for( d = 0; d < 4; d++ ) - { - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int p2 = pix[-3]; - - const int q0 = pix[0]; - const int q1 = pix[1]; - const int q2 = pix[2]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) - { - if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) ) - { - if( abs( p2 - p0 ) < beta ) - { - const int p3 = pix[-4]; - /* p0', p1', p2' */ - pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; - pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; - pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; - } - else - { - /* p0' */ - pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - } - if( abs( q2 - q0 ) < beta ) - { - const int q3 = pix[3]; - /* q0', q1', q2' */ - pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; - pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; - pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; - } - else - { - /* q0' */ - pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - } - else - { - /* p0', q0' */ - pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } + if( X264_ABS( q2 - q0 ) < beta ) { + pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] ); + tc++; } - pix += i_pix_stride; + + delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */ + pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */ } + pix += ystride; } } } +static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + deblock_luma_c( pix, stride, 1, alpha, beta, tc0 ); +} +static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + deblock_luma_c( pix, 1, stride, alpha, beta, tc0 ); +} -static inline void deblocking_filter_edgecv( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP ) +static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) { int i, d; - const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 ); - const int alpha = i_alpha_table[i_index_a]; - const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )]; - - for( i = 0; i < 4; i++ ) - { - if( bS[i] 
== 0 ) - { - pix += 2 * i_pix_stride; + for( i = 0; i < 4; i++ ) { + const int tc = tc0[i]; + if( tc <= 0 ) { + pix += 2*ystride; continue; } - - if( bS[i] < 4 ) - { - const int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1; - /* 2px edge length (because we use same bS than the one for luma) */ - for( d = 0; d < 2; d++ ) - { - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int q0 = pix[0]; - const int q1 = pix[1]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) - { - const int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - - pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ - } - pix += i_pix_stride; - } - } - else - { - /* 2px edge length (because we use same bS than the one for luma) */ - for( d = 0; d < 2; d++ ) - { - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int q0 = pix[0]; - const int q1 = pix[1]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) - { - pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ - pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ - } - pix += i_pix_stride; + for( d = 0; d < 2; d++ ) { + const int p1 = pix[-2*xstride]; + const int p0 = pix[-1*xstride]; + const int q0 = pix[ 0*xstride]; + const int q1 = pix[ 1*xstride]; + + if( X264_ABS( p0 - q0 ) < alpha && + X264_ABS( p1 - p0 ) < beta && + X264_ABS( q1 - q0 ) < beta ) { + + int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */ + pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */ } + pix += ystride; } } } +static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 ); +} +static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 ); +} -static inline void deblocking_filter_edgeh( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP ) +static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) { - int i, d; - const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 ); - const int alpha = i_alpha_table[i_index_a]; - const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )]; - - int i_pix_next = i_pix_stride; - - for( i = 0; i < 4; i++ ) - { - if( bS[i] == 0 ) - { - pix += 4; - continue; - } - - if( bS[i] < 4 ) - { - const int tc0 = i_tc0_table[i_index_a][bS[i] - 1]; - /* 4px edge length */ - for( d = 0; d < 4; d++ ) - { - const int p0 = pix[-i_pix_next]; - const int p1 = pix[-2*i_pix_next]; - const int p2 = pix[-3*i_pix_next]; - const int q0 = pix[0]; - const int q1 = pix[1*i_pix_next]; - const int q2 = pix[2*i_pix_next]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) + int d; + for( d = 0; d < 16; d++ ) { + const int p2 = pix[-3*xstride]; + const int p1 = pix[-2*xstride]; + const int p0 = pix[-1*xstride]; + const int q0 = pix[ 0*xstride]; + const int q1 = pix[ 1*xstride]; + const int q2 = pix[ 2*xstride]; + + if( X264_ABS( p0 - q0 ) < alpha && + X264_ABS( p1 - p0 ) < beta && + X264_ABS( q1 - q0 ) < beta ) { + + if(X264_ABS( p0 - q0 ) < ((alpha >> 2) + 2) ){ + if( X264_ABS( p2 - p0 ) < beta) { - int tc = tc0; - int i_delta; - - if( abs( p2 - p0 ) < beta ) - { - pix[-2*i_pix_next] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, 
tc0 ); - tc++; - } - if( abs( q2 - q0 ) < beta ) - { - pix[i_pix_next] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 ); - tc++; - } - - i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-i_pix_next] = clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ + const int p3 = pix[-4*xstride]; + /* p0', p1', p2' */ + pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; + pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; + pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; + } else { + /* p0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; } - pix++; - } - } - else - { - /* 4px edge length */ - for( d = 0; d < 4; d++ ) - { - const int p0 = pix[-i_pix_next]; - const int p1 = pix[-2*i_pix_next]; - const int p2 = pix[-3*i_pix_next]; - const int q0 = pix[0]; - const int q1 = pix[1*i_pix_next]; - const int q2 = pix[2*i_pix_next]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) + if( X264_ABS( q2 - q0 ) < beta) { - const int p3 = pix[-4*i_pix_next]; - const int q3 = pix[ 3*i_pix_next]; - - if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) ) - { - if( abs( p2 - p0 ) < beta ) - { - /* p0', p1', p2' */ - pix[-1*i_pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; - pix[-2*i_pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; - pix[-3*i_pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; - } - else - { - /* p0' */ - pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - } - if( abs( q2 - q0 ) < beta ) - { - /* q0', q1', q2' */ - pix[0*i_pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; - pix[1*i_pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; - pix[2*i_pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; - } - else - { - /* q0' */ - pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - } - else - { - /* p0' */ - pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - /* q0' */ - pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } + const int q3 = pix[3*xstride]; + /* q0', q1', q2' */ + pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; + pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; + pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; + } else { + /* q0' */ + pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; } - pix++; + }else{ + /* p0', q0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; } - } + pix += ystride; } } +static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) +{ + deblock_luma_intra_c( pix, stride, 1, alpha, beta ); +} +static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) +{ + deblock_luma_intra_c( pix, 1, stride, alpha, beta ); +} -static inline void deblocking_filter_edgech( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP ) -{ - int i, d; - const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 ); - const int alpha = i_alpha_table[i_index_a]; - const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )]; - - int i_pix_next = i_pix_stride; - - for( i = 0; i < 4; i++ ) - { - if( bS[i] == 0 ) - { - pix += 2; - continue; +static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) +{ + int d; + for( d = 0; d < 8; d++ ) { + const int p1 = pix[-2*xstride]; + const int p0 = pix[-1*xstride]; + const int q0 = pix[ 0*xstride]; + const int q1 = pix[ 1*xstride]; + + if( X264_ABS( p0 - q0 ) < alpha && + X264_ABS( p1 - p0 ) < beta && + 
X264_ABS( q1 - q0 ) < beta ) { + + pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */ + pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } - if( bS[i] < 4 ) - { - int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1; - /* 2px edge length (see deblocking_filter_edgecv) */ - for( d = 0; d < 2; d++ ) - { - const int p0 = pix[-1*i_pix_next]; - const int p1 = pix[-2*i_pix_next]; - const int q0 = pix[0]; - const int q1 = pix[1*i_pix_next]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) - { - int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-i_pix_next] = clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ - } - pix++; - } - } - else - { - /* 2px edge length (see deblocking_filter_edgecv) */ - for( d = 0; d < 2; d++ ) - { - const int p0 = pix[-1*i_pix_next]; - const int p1 = pix[-2*i_pix_next]; - const int q0 = pix[0]; - const int q1 = pix[1*i_pix_next]; - - if( abs( p0 - q0 ) < alpha && - abs( p1 - p0 ) < beta && - abs( q1 - q0 ) < beta ) - { - pix[-i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ - pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ - } - pix++; - } - } + pix += ystride; + } +} +static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) +{ + deblock_chroma_intra_c( pix, stride, 1, alpha, beta ); +} +static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) +{ + deblock_chroma_intra_c( pix, 1, stride, alpha, beta ); +} + +static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4], int i_qp, int b_chroma, + x264_deblock_inter_t pf_inter, x264_deblock_intra_t pf_intra ) +{ + int i; + const int index_a = x264_clip3( i_qp + h->sh.i_alpha_c0_offset, 0, 51 ); + const int alpha = i_alpha_table[index_a]; + const int beta = i_beta_table[x264_clip3( i_qp + h->sh.i_beta_offset, 0, 51 )]; + + if( bS[0] < 4 ) { + int8_t tc[4]; + for(i=0; i<4; i++) + tc[i] = (bS[i] ? 
i_tc0_table[index_a][bS[i] - 1] : -1) + b_chroma; + pf_inter( pix, i_stride, alpha, beta, tc ); + } else { + pf_intra( pix, i_stride, alpha, beta ); } } @@ -748,18 +584,21 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type ) /* vertical edge */ if( !b_8x8_transform || !(i_edge & 1) ) { - deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge], - h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1); + deblock_edge( h, &h->fdec->plane[0][16*mb_y * h->fdec->i_stride[0] + 16*mb_x + 4*i_edge], + h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1, 0, + h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra ); } if( !(i_edge & 1) ) { /* U/V planes */ int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1; - deblocking_filter_edgecv( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2], - h->fdec->i_stride[1], bS, i_qpc ); - deblocking_filter_edgecv( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2], - h->fdec->i_stride[2], bS, i_qpc ); + deblock_edge( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+2*i_edge], + h->fdec->i_stride[1], bS, i_qpc, 1, + h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra ); + deblock_edge( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+2*i_edge], + h->fdec->i_stride[2], bS, i_qpc, 1, + h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra ); } } else @@ -767,18 +606,21 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type ) /* horizontal edge */ if( !b_8x8_transform || !(i_edge & 1) ) { - deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x], - h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 ); + deblock_edge( h, &h->fdec->plane[0][(16*mb_y + 4*i_edge) * h->fdec->i_stride[0] + 16*mb_x], + h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1, 0, + h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra ); } /* U/V planes */ if( !(i_edge & 1) ) { int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1; - deblocking_filter_edgech( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2*h->fdec->i_stride[1]], - h->fdec->i_stride[1], bS, i_qpc ); - deblocking_filter_edgech( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2*h->fdec->i_stride[2]], - h->fdec->i_stride[2], bS, i_qpc ); + deblock_edge( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+2*i_edge*h->fdec->i_stride[1]], + h->fdec->i_stride[1], bS, i_qpc, 1, + h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra ); + deblock_edge( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+2*i_edge*h->fdec->i_stride[2]], + h->fdec->i_stride[2], bS, i_qpc, 1, + h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra ); } } } @@ -794,6 +636,53 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type ) } } +void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); + +#ifdef ARCH_X86_64 +void x264_deblock_v_luma_sse2( uint8_t *pix, int 
stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+#else
+void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+
+void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+    x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0 );
+    x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
+}
+#endif
+void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
+{
+    pf->deblock_v_luma = deblock_v_luma_c;
+    pf->deblock_h_luma = deblock_h_luma_c;
+    pf->deblock_v_chroma = deblock_v_chroma_c;
+    pf->deblock_h_chroma = deblock_h_chroma_c;
+    pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
+    pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
+    pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
+    pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
+
+    if( cpu&X264_CPU_MMXEXT )
+    {
+        pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
+        pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
+        pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
+        pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
+#ifdef ARCH_X86_64
+        if( cpu&X264_CPU_SSE2 )
+        {
+            pf->deblock_v_luma = x264_deblock_v_luma_sse2;
+            pf->deblock_h_luma = x264_deblock_h_luma_sse2;
+        }
+#else
+        pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
+        pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
+#endif
+    }
+}
diff --git a/common/frame.h b/common/frame.h
index 60512f30..59fd9f62 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -24,6 +24,8 @@
 #ifndef _FRAME_H
 #define _FRAME_H 1
 
+#include <stdint.h>
+
 typedef struct
 {
     /* */
@@ -64,6 +66,20 @@ typedef struct
 
 } x264_frame_t;
 
+typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
+typedef struct
+{
+    x264_deblock_inter_t deblock_v_luma;
+    x264_deblock_inter_t deblock_h_luma;
+    x264_deblock_inter_t deblock_v_chroma;
+    x264_deblock_inter_t deblock_h_chroma;
+    x264_deblock_intra_t deblock_v_luma_intra;
+    x264_deblock_intra_t deblock_h_luma_intra;
+    x264_deblock_intra_t deblock_v_chroma_intra;
+    x264_deblock_intra_t deblock_h_chroma_intra;
+} x264_deblock_function_t;
+
 x264_frame_t *x264_frame_new( x264_t *h );
 void x264_frame_delete( x264_frame_t *frame );
@@ -79,4 +95,6 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
 void x264_frame_filter( int cpu, x264_frame_t *frame );
 void x264_frame_init_lowres( int cpu, x264_frame_t *frame );
 
+void x264_deblock_init( int cpu, x264_deblock_function_t *pf );
+
 #endif
diff --git a/common/i386/deblock-a.asm b/common/i386/deblock-a.asm
new file mode 100644
index 00000000..64d72c22
--- /dev/null
+++ b/common/i386/deblock-a.asm
@@ -0,0 +1,527 @@
+;*****************************************************************************
+;* deblock-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;*
+;* Authors: Loren Merritt
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +BITS 32 + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +SECTION .rodata align=16 +pb_01: times 16 db 0x01 +pb_3f: times 16 db 0x3f +pb_ff: times 16 db 0xff + +SECTION .text +cglobal x264_deblock_v8_luma_mmxext +cglobal x264_deblock_h_luma_mmxext +cglobal x264_deblock_v_chroma_mmxext +cglobal x264_deblock_h_chroma_mmxext +cglobal x264_deblock_v_chroma_intra_mmxext +cglobal x264_deblock_h_chroma_intra_mmxext + +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base+stride], [base+stride*2], [base3], \ + [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] + +; in: 8 rows of 4 bytes in %1..%8 +; out: 4 rows of 8 bytes in mm0..mm3 +%macro TRANSPOSE4x8_LOAD 8 + movd mm0, %1 + movd mm2, %2 + movd mm1, %3 + movd mm3, %4 + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + movq mm2, mm0 + punpcklwd mm0, mm1 + punpckhwd mm2, mm1 + + movd mm4, %5 + movd mm6, %6 + movd mm5, %7 + movd mm7, %8 + punpcklbw mm4, mm6 + punpcklbw mm5, mm7 + movq mm6, mm4 + punpcklwd mm4, mm5 + punpckhwd mm6, mm5 + + movq mm1, mm0 + movq mm3, mm2 + punpckldq mm0, mm4 + punpckhdq mm1, mm4 + punpckldq mm2, mm6 + punpckhdq mm3, mm6 +%endmacro + +; in: 4 rows of 8 bytes in mm0..mm3 +; out: 8 rows of 4 bytes in %1..%8 +%macro TRANSPOSE8x4_STORE 8 + movq mm4, mm0 + movq mm5, mm1 + movq mm6, mm2 + punpckhdq mm4, mm4 + punpckhdq mm5, mm5 + punpckhdq mm6, mm6 + + punpcklbw mm0, mm1 + punpcklbw mm2, mm3 + movq mm1, mm0 + punpcklwd mm0, mm2 + punpckhwd mm1, mm2 + movd %1, mm0 + punpckhdq mm0, mm0 + movd %2, mm0 + movd %3, mm1 + punpckhdq mm1, mm1 + movd %4, mm1 + + punpckhdq mm3, mm3 + punpcklbw mm4, mm5 + punpcklbw mm6, mm3 + movq mm5, mm4 + punpcklwd mm4, mm6 + punpckhwd mm5, mm6 + movd %5, mm4 + punpckhdq mm4, mm4 + movd %6, mm4 + movd %7, mm5 + punpckhdq mm5, mm5 + movd %8, mm5 +%endmacro + +%macro SBUTTERFLY 4 + movq %4, %2 + punpckl%1 %2, %3 + punpckh%1 %4, %3 +%endmacro + +; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 +; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] +%macro TRANSPOSE6x8_MEM 9 + movq mm0, %1 + movq mm1, %3 + movq mm2, %5 + movq mm3, %7 + SBUTTERFLY bw, mm0, %2, mm4 + SBUTTERFLY bw, mm1, %4, mm5 + SBUTTERFLY bw, mm2, %6, mm6 + movq [%9+0x10], mm5 + SBUTTERFLY bw, mm3, %8, mm7 + SBUTTERFLY wd, mm0, mm1, mm5 + SBUTTERFLY wd, mm2, mm3, mm1 + punpckhdq mm0, mm2 + movq [%9+0x00], mm0 + SBUTTERFLY wd, mm4, [%9+0x10], mm3 + SBUTTERFLY wd, mm6, mm7, mm2 + SBUTTERFLY dq, mm4, mm6, mm0 + SBUTTERFLY dq, mm5, mm1, mm7 + punpckldq mm3, mm2 + movq [%9+0x10], mm5 + movq [%9+0x20], mm7 + movq [%9+0x30], mm4 + movq [%9+0x40], mm0 + movq [%9+0x50], mm3 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT_MMX 5 + movq %5, %2 + movq %4, %1 + psubusb %5, %1 + psubusb %4, %2 + por %4, %5 + psubusb %4, %3 +%endmacro + +; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1 +; out: mm5=beta-1, mm7=mask +; clobbers: mm4,mm6 +%macro LOAD_MASK_MMX 2 + movd mm4, %1 + movd mm5, %2 + pshufw mm4, mm4, 0 + pshufw mm5, mm5, 0 + packuswb mm4, mm4 ; 8x alpha-1 + packuswb mm5, mm5 ; 8x beta-1 + DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1 + DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1 + por mm7, mm4 + DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1 + por mm7, mm4 + pxor mm6, mm6 + pcmpeqb mm7, mm6 +%endmacro + +; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) +; out: mm1=p0' mm2=q0' +; clobbers: mm0,3-6 +%macro DEBLOCK_P0_Q0_MMX 0 + ; a = q0^p0^((p1-q1)>>2) + movq mm4, mm0 + psubb mm4, mm3 + psrlw mm4, 2 + pxor mm4, mm1 + pxor mm4, mm2 + ; b = p0^(q1>>2) + psrlw mm3, 2 + pand mm3, [pb_3f] + movq mm5, mm1 + pxor mm5, mm3 + ; c = q0^(p1>>2) + psrlw mm0, 2 + pand mm0, [pb_3f] + movq mm6, mm2 + pxor mm6, mm0 + ; d = (c^b) & ~(b^a) & 1 + pxor mm6, mm5 + pxor mm5, mm4 + pandn mm5, mm6 + pand mm5, [pb_01] + ; delta = (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3 + ; = (avg(q0, p1>>2) + (d&a)) + ; - (avg(p0, q1>>2) + (d^(d&a))) + pavgb mm0, mm2 + pand mm4, mm5 + paddusb mm0, mm4 + pavgb mm3, mm1 + pxor mm4, mm5 + paddusb mm3, mm4 + ; p0 += clip(delta, -tc0, tc0) + ; q0 -= clip(delta, -tc0, tc0) + movq mm4, mm0 + psubusb mm0, mm3 + psubusb mm3, mm4 + pminub mm0, mm7 + pminub mm3, mm7 + paddusb mm1, mm0 + paddusb mm2, mm3 + psubusb mm1, mm3 + psubusb mm2, mm0 +%endmacro + +; in: mm1=p0 mm2=q0 +; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp +; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) +; clobbers: q2, tmp, tc0 +%macro LUMA_Q1_MMX 6 + movq %6, mm1 + pavgb %6, mm2 + pavgb %2, %6 ; avg(p2,avg(p0,q0)) + pxor %6, %3 + pand %6, [pb_01] ; (p2^avg(p0,q0))&1 + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 + movq %6, %1 + psubusb %6, %5 + paddusb %5, %1 + pmaxub %2, %6 + pminub %2, %5 + movq %4, %2 +%endmacro + + +SECTION .text + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_v8_luma_mmxext: + push edi + push esi + mov edi, [esp+12] ; pix + mov esi, [esp+16] ; stride + mov edx, [esp+20] ; alpha + mov ecx, [esp+24] ; beta + dec edx + dec ecx + mov eax, edi + sub eax, esi + sub eax, esi + sub eax, esi ; pix-3*stride + sub esp, 16 + + movq mm0, [eax+esi] ; p1 + movq mm1, [eax+2*esi] ; p0 + movq mm2, [edi] ; q0 + movq mm3, [edi+esi] ; q1 + LOAD_MASK_MMX edx, ecx + + mov ecx, [esp+44] ; tc0, use only the low 16 bits + movd mm4, [ecx] + punpcklbw mm4, mm4 + punpcklbw mm4, mm4 ; tc = 4x tc0[1], 4x tc0[0] + movq 
[esp+8], mm4 ; tc + pcmpgtb mm4, [pb_ff] + pand mm4, mm7 + movq [esp+0], mm4 ; mask + + movq mm3, [eax] ; p2 + DIFF_GT_MMX mm1, mm3, mm5, mm6, mm7 ; |p2-p0| > beta-1 + pandn mm6, mm4 + pcmpeqb mm6, mm4 + pand mm6, mm4 + pand mm4, [esp+8] ; tc + movq mm7, [pb_01] + pand mm7, mm6 + pand mm6, mm4 + paddb mm7, mm4 + LUMA_Q1_MMX mm0, mm3, [eax], [eax+esi], mm6, mm4 + + movq mm4, [edi+2*esi] ; q2 + DIFF_GT_MMX mm2, mm4, mm5, mm6, mm3 ; |q2-q0| > beta-1 + movq mm5, [esp+0] ; mask + pandn mm6, mm5 + pcmpeqb mm6, mm5 + pand mm6, mm5 + movq mm5, [esp+8] ; tc + pand mm5, mm6 + pand mm6, [pb_01] + paddb mm7, mm6 + movq mm3, [edi+esi] + LUMA_Q1_MMX mm3, mm4, [edi+2*esi], [edi+esi], mm5, mm6 + + DEBLOCK_P0_Q0_MMX + movq [eax+2*esi], mm1 + movq [edi], mm2 + + add esp, 16 + pop esi + pop edi + ret + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_h_luma_mmxext: + push ebx + push ebp + mov eax, [esp+12] ; pix + mov ebx, [esp+16] ; stride + lea ebp, [ebx+ebx*2] + sub eax, 4 + lea ecx, [eax+ebp] + sub esp, 96 +%define pix_tmp esp + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp + lea eax, [eax+ebx*8] + lea ecx, [ecx+ebx*8] + TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp+8 + + ; vertical filter + push dword [esp+124] ; tc0 + push dword [esp+124] ; beta + push dword [esp+124] ; alpha + push dword 16 + push dword pix_tmp + add dword [esp], 0x40 ; pix_tmp+0x30 + call x264_deblock_v8_luma_mmxext + + add dword [esp ], 8 ; pix_tmp+0x38 + add dword [esp+16], 2 ; tc0+2 + call x264_deblock_v8_luma_mmxext + add esp, 20 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + mov eax, [esp+108] ; pix + sub eax, 2 + lea ecx, [eax+ebp] + + movq mm0, [pix_tmp+0x10] + movq mm1, [pix_tmp+0x20] + movq mm2, [pix_tmp+0x30] + movq mm3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp) + + lea eax, [eax+ebx*8] + lea ecx, [ecx+ebx*8] + movq mm0, [pix_tmp+0x18] + movq mm1, [pix_tmp+0x28] + movq mm2, [pix_tmp+0x38] + movq mm3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp) + + add esp, 96 + pop ebp + pop ebx + ret + + +%macro CHROMA_V_START 0 + push edi + push esi + mov edi, [esp+12] ; pix + mov esi, [esp+16] ; stride + mov edx, [esp+20] ; alpha + mov ecx, [esp+24] ; beta + dec edx + dec ecx + mov eax, edi + sub eax, esi + sub eax, esi +%endmacro + +%macro CHROMA_H_START 0 + push edi + push esi + push ebp + mov edi, [esp+16] + mov esi, [esp+20] + mov edx, [esp+24] + mov ecx, [esp+28] + dec edx + dec ecx + sub edi, 2 + mov ebp, esi + add ebp, esi + add ebp, esi + mov eax, edi + add edi, ebp +%endmacro + +%macro CHROMA_END 0 + pop esi + pop edi + ret +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_v_chroma_mmxext: + CHROMA_V_START + push ebx + mov ebx, [esp+32] ; tc0 + + movq mm0, [eax] + movq mm1, [eax+esi] + movq mm2, [edi] + movq mm3, [edi+esi] + + LOAD_MASK_MMX edx, ecx + movd mm6, [ebx] + punpcklbw mm6, mm6 + pand mm7, mm6 + DEBLOCK_P0_Q0_MMX + + movq [eax+esi], mm1 + movq [edi], mm2 + + pop ebx + CHROMA_END + + +ALIGN 16 
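+; The horizontal chroma filter below reuses the vertical kernel via a
+; transpose: TRANSPOSE4x8_LOAD reads p1,p0,q0,q1 from eight rows across the
+; vertical edge into mm0..mm3, DEBLOCK_P0_Q0_MMX then runs unchanged, and
+; TRANSPOSE8x4_STORE writes the rows back. mm0/mm3 are spilled to the stack
+; around the kernel because it clobbers them; the inter filter only rewrites
+; p0/q0, so the saved p1/q1 go back to memory unmodified.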
+;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +x264_deblock_h_chroma_mmxext: + CHROMA_H_START + push ebx + mov ebx, [esp+36] ; tc0 + sub esp, 16 + + TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp) + movq [esp+8], mm0 + movq [esp+0], mm3 + + LOAD_MASK_MMX edx, ecx + movd mm6, [ebx] + punpcklbw mm6, mm6 + pand mm7, mm6 + DEBLOCK_P0_Q0_MMX + + movq mm0, [esp+8] + movq mm3, [esp+0] + TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp) + + add esp, 16 + pop ebx + pop ebp + CHROMA_END + + +; in: %1=p0 %2=p1 %3=q1 +; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 +%macro CHROMA_INTRA_P0 3 + movq mm4, %1 + pxor mm4, %3 + pand mm4, [pb_01] ; mm4 = (p0^q1)&1 + pavgb %1, %3 + psubusb %1, mm4 + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) +%endmacro + +%macro CHROMA_INTRA_BODY 0 + LOAD_MASK_MMX edx, ecx + movq mm5, mm1 + movq mm6, mm2 + CHROMA_INTRA_P0 mm1, mm0, mm3 + CHROMA_INTRA_P0 mm2, mm3, mm0 + psubb mm1, mm5 + psubb mm2, mm6 + pand mm1, mm7 + pand mm2, mm7 + paddb mm1, mm5 + paddb mm2, mm6 +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +x264_deblock_v_chroma_intra_mmxext: + CHROMA_V_START + movq mm0, [eax] + movq mm1, [eax+esi] + movq mm2, [edi] + movq mm3, [edi+esi] + CHROMA_INTRA_BODY + movq [eax+esi], mm1 + movq [edi], mm2 + CHROMA_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +x264_deblock_h_chroma_intra_mmxext: + CHROMA_H_START + TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp) + CHROMA_INTRA_BODY + TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp) + pop ebp + CHROMA_END + diff --git a/encoder/encoder.c b/encoder/encoder.c index 43964e27..6f83aa5f 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -566,6 +566,7 @@ x264_t *x264_encoder_open ( x264_param_t *param ) x264_mc_init( h->param.cpu, &h->mc ); x264_csp_init( h->param.cpu, h->param.i_csp, &h->csp ); x264_quant_init( h, h->param.cpu, &h->quantf ); + x264_deblock_init( h->param.cpu, &h->loopf ); memcpy( h->pixf.mbcmp, ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd, diff --git a/tools/checkasm.c b/tools/checkasm.c index 4b621d96..a91086a0 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -296,6 +296,68 @@ static int check_mc( int cpu_ref, int cpu_new ) return ret; } +static int check_deblock( int cpu_ref, int cpu_new ) +{ + x264_deblock_function_t db_c; + x264_deblock_function_t db_ref; + x264_deblock_function_t db_a; + int ret = 0, ok = 1, used_asm = 0; + int alphas[36], betas[36]; + int8_t tcs[36][4]; + int a, c, i, j; + + x264_deblock_init( 0, &db_c ); + x264_deblock_init( cpu_ref, &db_ref ); + x264_deblock_init( cpu_new, &db_a ); + + /* not exactly the real values of a,b,tc but close enough */ + a = 255; c = 250; + for( i = 35; i >= 0; i-- ) + { + alphas[i] = a; + betas[i] = (i+1)/2; + tcs[i][0] = tcs[i][2] = (c+6)/10; + tcs[i][1] = tcs[i][3] = (c+9)/20; + a = a*9/10; + c = c*9/10; + } + +#define TEST_DEBLOCK( name, ... 
) \
+    for( i = 0; i < 36; i++ ) \
+    { \
+        for( j = 0; j < 1024; j++ ) \
+            /* two distributions of random values to exercise different failure modes */ \
+            buf1[j] = rand() & (i&1 ? 0xf : 0xff ); \
+        memcpy( buf3, buf1, 1024 ); \
+        memcpy( buf4, buf1, 1024 ); \
+        if( db_a.name != db_ref.name ) \
+        { \
+            used_asm = 1; \
+            db_c.name( &buf3[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            db_a.name( &buf4[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            if( memcmp( buf3, buf4, 1024 ) ) \
+            { \
+                ok = 0; \
+                fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
+                break; \
+            } \
+        } \
+    }
+
+    TEST_DEBLOCK( deblock_h_luma, tcs[i] );
+    TEST_DEBLOCK( deblock_v_luma, tcs[i] );
+    TEST_DEBLOCK( deblock_h_chroma, tcs[i] );
+    TEST_DEBLOCK( deblock_v_chroma, tcs[i] );
+    TEST_DEBLOCK( deblock_h_luma_intra );
+    TEST_DEBLOCK( deblock_v_luma_intra );
+    TEST_DEBLOCK( deblock_h_chroma_intra );
+    TEST_DEBLOCK( deblock_v_chroma_intra );
+
+    report( "deblock :" );
+
+    return ret;
+}
+
 static int check_quant( int cpu_ref, int cpu_new )
 {
     x264_quant_function_t qf_c;
@@ -368,6 +430,7 @@ int check_all( int cpu_ref, int cpu_new )
     return check_pixel( cpu_ref, cpu_new )
          + check_dct( cpu_ref, cpu_new )
          + check_mc( cpu_ref, cpu_new )
+         + check_deblock( cpu_ref, cpu_new )
          + check_quant( cpu_ref, cpu_new );
 }
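
A minimal caller sketch (not part of the patch; the buffer geometry and the
alpha/beta/tc0 values are illustrative only), showing how the new dispatch
table is driven, the same way x264_encoder_open and checkasm use it:

#include <stdint.h>
#include <string.h>
#include "common/frame.h"   /* assumes compilation inside the x264 tree */

static void deblock_example( void )
{
    x264_deblock_function_t loopf;
    uint8_t pix[16*32];               /* 16 rows at stride 32 */
    int8_t  tc0[4] = { 1, 1, 1, 1 };  /* clip limit per 4-pixel group */

    memset( pix, 0x80, sizeof(pix) );
    x264_deblock_init( 0, &loopf );   /* 0 selects the C versions; pass
                                       * X264_CPU_MMXEXT (|X264_CPU_SSE2 on
                                       * x86-64) for the asm ones, whose SSE2
                                       * luma filter uses movdqa and thus
                                       * needs 16-byte aligned pix/stride */

    /* pix points at the q0 row; p0..p2 live above it in memory.
     * alpha=15, beta=6 stand in for the QP-indexed table lookups. */
    loopf.deblock_v_luma( &pix[8*32], 32, 15, 6, tc0 );
}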