Auto-prefix global constants with x264_ in cextern.
Eliminate x264_ prefix from asm files; automate it in cglobal (see the sketch below).
Deduplicate asm constants wherever possible to save data cache (move them to a new const-a.asm).
Remove x264_emms() entirely on non-x86 (don't even call an empty function).
Add cextern_naked for a non-prefixed cextern (used in checkasm).
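
For illustration, a minimal sketch of how the prefixing can be automated with
NASM macros (this is not the actual x86inc.asm implementation; the _sketch-suffixed
names are placeholders): cglobal/cextern %xdefine the bare name to its
x264_-prefixed form, so every later reference to the bare name picks up the
prefix automatically, while cextern_naked declares the symbol unmodified.

    %macro cglobal_sketch 1
        %xdefine %1 x264_ %+ %1   ; later uses of the bare name expand to x264_<name>
        global %1
    %1:
    %endmacro

    %macro cextern_sketch 1
        %xdefine %1 x264_ %+ %1   ; asm keeps referencing the unprefixed name
        extern %1
    %endmacro

    %macro cextern_naked_sketch 1 ; no prefix, e.g. for checkasm-only symbols
        extern %1
    %endmacro

With the real macros behaving along these lines, "cglobal cabac_encode_decision_asm"
below still emits the symbol x264_cabac_encode_decision_asm, and
"cextern cabac_range_lps" resolves to x264_cabac_range_lps, which is why the asm
files can drop the explicit prefix everywhere.
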
# MMX/SSE optims
ifneq ($(AS),)
-X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
- pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
+X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
+ mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
cpu-a.asm dct-32.asm
X86SRC = $(X86SRC0:%=common/x86/%)
*****************************************************************************/
#include "common.h"
-#include "cpu.h"
#include <stdarg.h>
#include <ctype.h>
#include "dct.h"
#include "cabac.h"
#include "quant.h"
+#include "cpu.h"
/****************************************************************************
* General functions
#endif
-#ifndef HAVE_MMX
-void x264_emms( void )
-{
-}
-#endif
-
-
int x264_cpu_num_processors( void )
{
#if !defined(HAVE_PTHREAD)
uint32_t x264_cpu_detect( void );
int x264_cpu_num_processors( void );
-void x264_emms( void );
+void x264_cpu_emms( void );
+void x264_cpu_sfence( void );
+#ifdef HAVE_MMX
+#define x264_emms() x264_cpu_emms()
+#else
+#define x264_emms()
+#endif
+#define x264_sfence x264_cpu_sfence
void x264_cpu_mask_misalign_sse( void );
/* kluge:
%include "x86inc.asm"
-SECTION_RODATA
-
SECTION .text
-cextern x264_cabac_range_lps
-cextern x264_cabac_transition
-cextern x264_cabac_renorm_shift
+cextern cabac_range_lps
+cextern cabac_transition
+cextern cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
%ifdef WIN64
%endif
%endmacro
-cglobal x264_cabac_encode_decision_asm, 0,7
+cglobal cabac_encode_decision_asm, 0,7
movifnidn t0, r0mp
movifnidn t1d, r1m
mov t5d, [t0+cb.range]
mov t3d, t5d
shr t5d, 6
movifnidn t2d, r2m
- LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t6*4
- LOAD_GLOBAL t4d, x264_cabac_transition, t2, t6*2
+ LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t6*4
+ LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
shr t6d, 6
sub t3d, t5d
cmp t6d, t2d
cmovne t3d, t5d
cmovne t6d, t7d
mov [t0+cb.state+t1], t4b
-;x264_cabac_encode_renorm
+;cabac_encode_renorm
mov t4d, t3d
shr t3d, 3
- LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
+ LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [t0+cb.queue]
mov [t0+cb.range], t4d
cmp t3d, 8
jl .update_queue_low
-;x264_cabac_putbyte
+;cabac_putbyte
; alive: t0=cb t3=queue t6=low
%ifdef WIN64
DECLARE_REG_TMP 3,4,1,0,2,5,6,10
--- /dev/null
+++ b/common/x86/const-a.asm
+;*****************************************************************************
+;* const-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2010 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Fiona Glaser <fiona@x264.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+const pb_01, times 8 db 0,1
+const pb_0, times 16 db 0
+const pb_a1, times 16 db 0xa1
+const pb_1, times 16 db 1
+const pb_3, times 16 db 3
+const hsub_mul, times 8 db 1, -1
+const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+
+const pw_1, times 8 dw 1
+const pw_2, times 8 dw 2
+const pw_4, times 8 dw 4
+const pw_8, times 8 dw 8
+const pw_16, times 8 dw 16
+const pw_32, times 8 dw 32
+const pw_64, times 8 dw 64
+const pw_32_0, times 4 dw 32,
+ times 4 dw 0
+const pw_8000, times 8 dw 0x8000
+const pw_3fff, times 8 dw 0x3fff
+
+const pd_1, times 4 dd 1
+const pd_128, times 4 dd 128
+const pw_00ff, times 8 dw 0x00ff
+const pw_ff00, times 8 dw 0xff00
+
+const pb_reverse, db 7, 6, 5, 4, 3, 2, 1, 0
+const sw_64, dd 64
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid, 5,7
+cglobal cpu_cpuid, 5,7
push rbx
mov r11, r1
mov r10, r2
%else
;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid_test( void )
+; int cpu_cpuid_test( void )
; return 0 if unsupported
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid_test
+cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
ret
;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid, 0,6
+cglobal cpu_cpuid, 0,6
mov eax, r0m
cpuid
mov esi, r1m
RET
;-----------------------------------------------------------------------------
-; void x264_stack_align( void (*func)(void*), void *arg );
+; void stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
-cglobal x264_stack_align
+cglobal stack_align
push ebp
mov ebp, esp
sub esp, 8
%endif
;-----------------------------------------------------------------------------
-; void x264_emms( void )
+; void cpu_emms( void )
;-----------------------------------------------------------------------------
-cglobal x264_emms
+cglobal cpu_emms
emms
ret
;-----------------------------------------------------------------------------
-; void x264_cpu_mask_misalign_sse(void)
+; void cpu_sfence( void )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_mask_misalign_sse
+cglobal cpu_sfence
+ sfence
+ ret
+
+;-----------------------------------------------------------------------------
+; void cpu_mask_misalign_sse( void )
+;-----------------------------------------------------------------------------
+cglobal cpu_mask_misalign_sse
sub rsp, 4
stmxcsr [rsp]
or dword [rsp], 1<<17
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-
-pw_32: times 8 dw 32
-hsub_mul: times 8 db 1, -1
-
SECTION .text
+cextern pw_32
+cextern hsub_mul
+
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
%endmacro
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_mmx, 3,3
-global x264_sub8x8_dct8_mmx.skip_prologue
+cglobal sub8x8_dct8_mmx, 3,3
+global sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
call load_diff_4x8_mmx
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_mmx, 2,2
-global x264_add8x8_idct8_mmx.skip_prologue
+cglobal add8x8_idct8_mmx, 2,2
+global add8x8_idct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
add word [r1], 32
INIT_XMM
%macro DCT_SUB8 1
-cglobal x264_sub8x8_dct_%1, 3,3
+cglobal sub8x8_dct_%1, 3,3
add r2, 4*FDEC_STRIDE
-global x264_sub8x8_dct_%1.skip_prologue
+global sub8x8_dct_%1.skip_prologue
.skip_prologue:
%ifnidn %1, sse2
mova m7, [hsub_mul]
ret
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_%1, 3,3
+cglobal sub8x8_dct8_%1, 3,3
add r2, 4*FDEC_STRIDE
-global x264_sub8x8_dct8_%1.skip_prologue
+global sub8x8_dct8_%1.skip_prologue
.skip_prologue:
%ifidn %1, sse2
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct_sse2, 2,2
+cglobal add8x8_idct_sse2, 2,2
add r0, 4*FDEC_STRIDE
-global x264_add8x8_idct_sse2.skip_prologue
+global add8x8_idct_sse2.skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
ret
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2, 2,2
+cglobal add8x8_idct8_sse2, 2,2
add r0, 4*FDEC_STRIDE
-global x264_add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-pw_32: times 8 dw 32
-hsub_mul: times 8 db 1, -1
-
SECTION .text
+
+cextern pw_32
+cextern hsub_mul
INIT_XMM
%macro DCT8_1D 10
%endmacro
%macro DCT_SUB8 1
-cglobal x264_sub8x8_dct_%1, 3,3,11
+cglobal sub8x8_dct_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul]
call .skip_prologue
RET
%endif
-global x264_sub8x8_dct_%1.skip_prologue
+global sub8x8_dct_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
ret
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_%1, 3,3,11
+cglobal sub8x8_dct8_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul]
call .skip_prologue
RET
%endif
-global x264_sub8x8_dct8_%1.skip_prologue
+global sub8x8_dct8_%1.skip_prologue
.skip_prologue:
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2, 2,2,11
+cglobal add8x8_idct8_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global x264_add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
ret
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct_sse2, 2,2,11
+cglobal add8x8_idct_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global x264_add8x8_idct_sse2.skip_prologue
+global add8x8_idct_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
%endmacro
SECTION_RODATA
-pw_32_0: times 4 dw 32
- times 4 dw 0
-pw_32: times 8 dw 32
-pw_8000: times 8 dw 0x8000
-hsub_mul: times 8 db 1, -1
-
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
SECTION .text
+cextern pw_32_0
+cextern pw_32
+cextern pw_8000
+cextern hsub_mul
+cextern pb_1
+cextern pw_1
+
%macro WALSH4_1D 5
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_dct4x4dc_mmx( int16_t d[4][4] )
+; void dct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1
+cglobal dct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
RET
;-----------------------------------------------------------------------------
-; void x264_idct4x4dc_mmx( int16_t d[4][4] )
+; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_idct4x4dc_mmx, 1,1
+cglobal idct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
-; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_%1, 3,3
+cglobal sub4x4_dct_%1, 3,3
%ifidn %1, mmx
.skip_prologue:
LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
SUB_DCT4 ssse3
;-----------------------------------------------------------------------------
-; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
+; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2
+cglobal add4x4_idct_mmx, 2,2
pxor m7, m7
.skip_prologue:
movq m1, [r1+ 8]
RET
INIT_XMM
-cglobal x264_add4x4_idct_sse4, 2,2,6
+cglobal add4x4_idct_sse4, 2,2,6
mova m0, [r1+0x00] ; row1/row0
mova m2, [r1+0x10] ; row3/row2
mova m1, m0 ; row1/row0
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
cglobal %1, 2,2,11
%endmacro
%ifndef ARCH_X86_64
-SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
-ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
-
-cextern x264_sub8x8_dct8_mmx.skip_prologue
-cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
+ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
+ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
+
+cextern sub8x8_dct8_mmx.skip_prologue
+cextern add8x8_idct8_mmx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%endif
INIT_XMM
-cextern x264_sub8x8_dct_sse2.skip_prologue
-cextern x264_sub8x8_dct_ssse3.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
-cextern x264_add8x8_idct_sse2.skip_prologue
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
+cextern sub8x8_dct_sse2.skip_prologue
+cextern sub8x8_dct_ssse3.skip_prologue
+SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
+cextern add8x8_idct_sse2.skip_prologue
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
-cextern x264_sub8x8_dct8_sse2.skip_prologue
-cextern x264_add8x8_idct8_sse2.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct8_sse2.skip_prologue
+cextern add8x8_idct8_sse2.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
-cextern x264_sub8x8_dct8_ssse3.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct8_ssse3.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
;-----------------------------------------------------------------------------
movq [%3+FDEC_STRIDE*3], %1
%endmacro
-cglobal x264_add8x8_idct_dc_mmx, 2,2
+cglobal add8x8_idct_dc_mmx, 2,2
movq mm0, [r1]
pxor mm1, mm1
add r0, FDEC_STRIDE*4
ADD_DC mm2, mm3, r0
RET
-cglobal x264_add8x8_idct_dc_ssse3, 2,2
+cglobal add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
pxor xmm1, xmm1
add r0, FDEC_STRIDE*4
movhps [r0+FDEC_STRIDE* 3], xmm5
RET
-cglobal x264_add16x16_idct_dc_mmx, 2,3
+cglobal add16x16_idct_dc_mmx, 2,3
mov r2, 4
.loop:
movq mm0, [r1]
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
-cglobal x264_add16x16_idct_dc_sse2, 2,2,8
+cglobal add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
IDCT_DC_STORE 0, xmm2, xmm3
ret
-cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
+cglobal add16x16_idct_dc_ssse3, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
%endmacro
INIT_MMX
-cglobal x264_sub8x8_dct_dc_mmxext, 3,3
+cglobal sub8x8_dct_dc_mmxext, 3,3
DCTDC_2ROW_MMX m0, m4, 0
DCTDC_2ROW_MMX m5, m6, 2
paddw m0, m5
%endif
%endmacro
-cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+cglobal sub8x8_dct_dc_sse2, 3,3,8
pxor m7, m7
DCTDC_2ROW_SSE2 0, 0, m4
DCTDC_2ROW_SSE2 2, 1, m4
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
-cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
+cglobal zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
SCAN_8x8 ssse3
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
+cglobal zigzag_scan_8x8_frame_mmxext, 2,2
movq mm0, [r1]
movq mm1, [r1+2*8]
movq mm2, [r1+2*14]
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
+cglobal zigzag_scan_4x4_frame_mmx, 2,2
movq mm0, [r1]
movq mm1, [r1+8]
movq mm2, [r1+16]
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
+cglobal zigzag_scan_4x4_frame_ssse3, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
pshufb xmm1, [pb_scan4frameb]
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
+cglobal zigzag_scan_4x4_field_mmxext, 2,3
pshufw mm0, [r1+4], 0xd2
movq mm1, [r1+16]
movq mm2, [r1+24]
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
-cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
+cglobal zigzag_scan_8x8_field_mmxext, 2,3
movq mm0, [r1+2*0] ; 03 02 01 00
movq mm1, [r1+2*4] ; 07 06 05 04
movq mm2, [r1+2*8] ; 11 10 09 08
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
+; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
%else
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
%endif
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
ZIGZAG_SUB_4x4 ac, field
;-----------------------------------------------------------------------------
-; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
+; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
%macro INTERLEAVE 1
%endmacro
INIT_MMX
-cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
+cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
INTERLEAVE 0
INTERLEAVE 8
INTERLEAVE 16
%endmacro
INIT_XMM
-cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8
INTERLEAVE_XMM 0
INTERLEAVE_XMM 16
packsswb m2, m3
%include "x86inc.asm"
-SECTION_RODATA
-pb_00: times 16 db 0x00
-pb_01: times 16 db 0x01
-pb_03: times 16 db 0x03
-pb_a1: times 16 db 0xa1
-
SECTION .text
+cextern pb_0
+cextern pb_1
+cextern pb_3
+cextern pb_a1
+
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
%macro DEBLOCK_P0_Q0 0
mova m5, m1
pxor m5, m2 ; p0^q0
- pand m5, [pb_01] ; (p0^q0)&1
+ pand m5, [pb_1] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
pavgb m3, m0 ; (p1 - q1 + 256)>>1
- pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+ pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pxor m4, m1
pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
pavgb %6, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
- pand %6, [pb_01] ; (p2^avg(p0,q0))&1
+ pand %6, [pb_1] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal x264_deblock_v_luma_sse2, 5,5,10
+cglobal deblock_v_luma_sse2, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_deblock_h_luma_sse2, 5,7
+cglobal deblock_h_luma_sse2, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea r6, [r0-4]
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+ ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%ifdef WIN64
mov [rsp+0x20], r4
%endif
- call x264_deblock_v_luma_sse2
+ call deblock_v_luma_sse2
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add r6, 2
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
-; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_%1, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_%1, 0,5
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
PUSH dword r2m
PUSH dword 16
PUSH dword r0
- call x264_deblock_%2_luma_%1
+ call deblock_%2_luma_%1
%ifidn %2, v8
add dword [esp ], 8 ; pix_tmp+0x38
add dword [esp+16], 2 ; tc0+2
- call x264_deblock_%2_luma_%1
+ call deblock_%2_luma_%1
%endif
ADD esp, 20
mova t3, t2
mova t4, t2
psrlw t2, 1
- pavgb t2, mpb_00
+ pavgb t2, mpb_0
pxor t2, t0
- pand t2, mpb_01
+ pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
mova t1, p2
psubb t2, q1
paddb t3, t3
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
- pand t2, mpb_01
+ pand t2, mpb_1
psubb t1, t2
pavgb t1, p1
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
psrlw t3, 2
- pavgb t3, mpb_00
+ pavgb t3, mpb_0
pxor t3, t1
- pand t3, mpb_01
+ pand t3, mpb_1
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
mova t3, p0
mova t2, p0
pxor t3, q1
pavgb t2, q1
- pand t3, mpb_01
+ pand t3, mpb_1
psubb t2, t3
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
paddb t2, t2
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
psrlw t2, 2
- pavgb t2, mpb_00
+ pavgb t2, mpb_0
pxor t2, t1
- pand t2, mpb_01
+ pand t2, mpb_1
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
pxor t0, p1
%define mask0 m12
%define mask1p m13
%define mask1q [rsp-24]
- %define mpb_00 m14
- %define mpb_01 m15
+ %define mpb_0 m14
+ %define mpb_1 m15
%else
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
%define p2 [r4+r1]
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
- %define mpb_00 [pb_00]
- %define mpb_01 [pb_01]
+ %define mpb_0 [pb_0]
+ %define mpb_1 [pb_1]
%endif
;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
mova q0, [r0]
mova q1, [r0+r1]
%ifdef ARCH_X86_64
- pxor mpb_00, mpb_00
- mova mpb_01, [pb_01]
+ pxor mpb_0, mpb_0
+ mova mpb_1, [pb_1]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
- pavgb t5, mpb_00
- pavgb t5, mpb_01 ; alpha/4+1
+ pavgb t5, mpb_0
+ pavgb t5, mpb_1 ; alpha/4+1
movdqa p2, [r4+r1]
movdqa q2, [r0+2*r1]
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
- pavgb m4, [pb_00]
- pavgb m4, [pb_01] ; alpha/4+1
+ pavgb m4, [pb_0]
+ pavgb m4, [pb_1] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
lea r6, [r0-4]
lea r0, [pix_tmp+0x40]
mov r1, 0x10
- call x264_deblock_v_luma_intra_%1
+ call deblock_v_luma_intra_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r5, [r6+r11]
add rsp, 0x88
RET
%else
-cglobal x264_deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_%1, 2,4
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
PUSH dword r2m
PUSH dword 16
PUSH r0
- call x264_deblock_%2_luma_intra_%1
+ call deblock_%2_luma_intra_%1
%ifidn %2, v8
add dword [rsp], 8 ; pix_tmp+8
- call x264_deblock_%2_luma_intra_%1
+ call deblock_%2_luma_intra_%1
%endif
ADD esp, 16
%define t6 r6
;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_mmxext, 5,6
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
%define buf0 [rsp-24]
%define buf1 [rsp-16]
%macro CHROMA_INTRA_P0 3
movq m4, %1
pxor m4, %3
- pand m4, [pb_01] ; m4 = (p0^q1)&1
+ pand m4, [pb_1] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%define t6 r5
;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_mmxext, 4,5
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_mmxext, 4,6
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body_mmxext
SECTION_RODATA 32
ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
-pw_1: times 8 dw 1
-pw_4: times 8 dw 4
-pw_8: times 8 dw 8
-pw_32: times 8 dw 32
-pw_64: times 8 dw 64
-sw_64: dd 64
SECTION .text
+cextern pw_1
+cextern pw_4
+cextern pw_8
+cextern pw_32
+cextern pw_64
+cextern sw_64
+
;=============================================================================
; implicit weighted biprediction
;=============================================================================
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
+; int pixel_avg_weight_w16( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2-3 0
-cglobal x264_pixel_avg_weight_w%2_%1
+cglobal pixel_avg_weight_w%2_%1
BIWEIGHT_START
AVG_START %3
%if %2==8 && mmsize==16
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
-%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
+%define pixel_avg_weight_w4_sse2 pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8, 7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%endrep
%endmacro
-
-;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src,int i_src_stride, x264_weight_t *weight,int h)
+;-----------------------------------------------------------------------------
+;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
+;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
%define NUMREGS 6
%endif
%macro WEIGHTER 2
- cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
+ cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
WEIGHT_START %1
LOAD_HEIGHT
.loop:
%endrep
%endmacro
-;void x264_mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, x264_weight_t *w, int h )
+;-----------------------------------------------------------------------------
+;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h )
+;-----------------------------------------------------------------------------
%macro OFFSET 3
- cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+ cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
mova m2, [r4]
LOAD_HEIGHT
.loop:
;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
+; void pixel_avg_4x4( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
%macro AVGH 3
-cglobal x264_pixel_avg_%1x%2_%3
+cglobal pixel_avg_%1x%2_%3
mov eax, %2
cmp dword r6m, 32
- jne x264_pixel_avg_weight_w%1_%3
+ jne pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
test dword r4m, 15
- jz x264_pixel_avg_w%1_sse2
+ jz pixel_avg_w%1_sse2
%endif
- jmp x264_pixel_avg_w%1_mmxext
+ jmp pixel_avg_w%1_mmxext
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
-; int height, int weight );
+; void pixel_avg_w4( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
+; int height, int weight );
;-----------------------------------------------------------------------------
%macro AVG_END 0
%endmacro
INIT_MMX
-AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
+AVG_FUNC pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext
-AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
+AVG_FUNC pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8, mmxext
AVGH 8, 4, mmxext
-cglobal x264_pixel_avg_w16_mmxext
+cglobal pixel_avg_w16_mmxext
AVG_START
movq mm0, [t2 ]
movq mm1, [t2+8]
AVGH 16, 8, mmxext
INIT_XMM
-AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
+AVG_FUNC pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
AVGH 16, 8, sse2
AVGH 8, 16, sse2
;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src_stride,
-; uint8_t *src2, int height );
+; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src_stride,
+; uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
-cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
AVG2_W8 8, movq
%macro AVG2_W16 2
-cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
AVG2_W16 12, movd
AVG2_W16 16, movq
-cglobal x264_pixel_avg2_w20_mmxext, 6,7
+cglobal pixel_avg2_w20_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w16_sse2, 6,7
+cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
REP_RET
%macro AVG2_W20 1
-cglobal x264_pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
-cglobal x264_pixel_avg2_w%1_cache%2_%3
+cglobal pixel_avg2_w%1_cache%2_%3
mov eax, r2m
and eax, 0x1f|(%2>>1)
cmp eax, (32-%1)|(%2>>1)
- jle x264_pixel_avg2_w%1_%3
+ jle pixel_avg2_w%1_%3
;w12 isn't needed because w16 is just as fast if there's no cacheline split
%if %1 == 12
- jmp x264_pixel_avg2_w16_cache_mmxext
+ jmp pixel_avg2_w16_cache_mmxext
%else
- jmp x264_pixel_avg2_w%1_cache_mmxext
+ jmp pixel_avg2_w%1_cache_mmxext
%endif
%endmacro
%2 [r0+%1], mm0
%endmacro
-x264_pixel_avg2_w8_cache_mmxext:
+pixel_avg2_w8_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
add r2, r3
jg .height_loop
REP_RET
-x264_pixel_avg2_w16_cache_mmxext:
+pixel_avg2_w16_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
jg .height_loop
REP_RET
-x264_pixel_avg2_w20_cache_mmxext:
+pixel_avg2_w20_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
rep ret
%endmacro
-cglobal x264_pixel_avg2_w16_cache64_ssse3
+cglobal pixel_avg2_w16_cache64_ssse3
mov eax, r2m
and eax, 0x3f
cmp eax, 0x30
- jle x264_pixel_avg2_w16_sse2
+ jle pixel_avg2_w16_sse2
PROLOGUE 6,7
lea r6, [r4+r2]
and r4, ~0xf
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
+; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w4_mmx, 4,6
+cglobal mc_copy_w4_mmx, 4,6
cmp dword r4m, 4
lea r5, [r3*3]
lea r4, [r1*3]
COPY4 movd, movd, r4, r5
RET
-cglobal x264_mc_copy_w8_mmx, 5,7
+cglobal mc_copy_w8_mmx, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
jg .height_loop
REP_RET
-cglobal x264_mc_copy_w16_mmx, 5,7
+cglobal mc_copy_w16_mmx, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
REP_RET
%endmacro
-COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
+COPY_W16_SSE2 mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
-COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
-COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
+COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
+COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
; FIXME assumes 64 byte cachelines
;-----------------------------------------------------------------------------
-; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( uint8_t *pix_y, int stride_y,
+; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
-cglobal x264_prefetch_fenc_mmxext, 5,5
+cglobal prefetch_fenc_mmxext, 5,5
mov eax, r4d
and eax, 3
imul eax, r1d
RET
%else
-cglobal x264_prefetch_fenc_mmxext
+cglobal prefetch_fenc_mmxext
mov r2, [esp+20]
mov r1, [esp+8]
mov r0, [esp+4]
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
-cglobal x264_prefetch_ref_mmxext, 3,3
+cglobal prefetch_ref_mmxext, 3,3
dec r2d
and r2d, r1d
lea r0, [r0+r2*8+64]
%endmacro
;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride,
-; int dx, int dy,
-; int width, int height )
+; void mc_chroma( uint8_t *dst, int dst_stride,
+; uint8_t *src, int src_stride,
+; int dx, int dy,
+; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 1-2 0
-cglobal x264_mc_chroma_%1
+cglobal mc_chroma_%1
%if mmsize == 16
cmp dword r6m, 4
- jle x264_mc_chroma_mmxext
+ jle mc_chroma_mmxext
%endif
PROLOGUE 0,6,%2
MC_CHROMA_START
%macro MC_CHROMA_SSSE3 2
INIT_MMX
-cglobal x264_mc_chroma_ssse3%1, 0,6,%2
+cglobal mc_chroma_ssse3%1, 0,6,%2
MC_CHROMA_START
and r4d, 7
and r5d, 7
filt_mul51: times 8 db -5, 1
hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-pw_1: times 8 dw 1
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-pd_128: times 4 dd 128
-pw_0x3fff: times 4 dw 0x3fff
-
SECTION .text
+cextern pw_1
+cextern pw_16
+cextern pw_32
+cextern pd_128
+cextern pw_3fff
+
%macro LOAD_ADD 4
movh %4, %3
movh %1, %2
%macro HPEL_V 1-2 0
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6,%2
+cglobal hpel_filter_v_%1, 5,6,%2
%ifdef WIN64
movsxd r4, r4d
%endif
HPEL_V mmxext
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_mmxext, 3,3
+cglobal hpel_filter_c_mmxext, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
REP_RET
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_mmxext, 3,3
+cglobal hpel_filter_h_mmxext, 3,3
add r0, r2
add r1, r2
neg r2
%macro HPEL_C 1
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3,9
+cglobal hpel_filter_c_%1, 3,3,9
add r0, r2
lea r1, [r1+r2*2]
neg r2
%endmacro
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3,8
+cglobal hpel_filter_h_sse2, 3,3,8
add r0, r2
add r1, r2
neg r2
%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_ssse3, 3,3
+cglobal hpel_filter_h_ssse3, 3,3
add r0, r2
add r1, r2
neg r2
%macro HPEL 1
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-; uint8_t *src, int stride, int width, int height)
+; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_%1, 7,7,16
+cglobal hpel_filter_%1, 7,7,16
%ifdef WIN64
movsxd r4, r4d
movsxd r5, r5d
HPEL ssse3
%endif
-cglobal x264_sfence
- sfence
- ret
-
%undef movntq
%undef movntps
%undef sfence
;-----------------------------------------------------------------------------
-; void x264_plane_copy_core_mmxext( uint8_t *dst, int i_dst,
-; uint8_t *src, int i_src, int w, int h)
+; void plane_copy_core( uint8_t *dst, int i_dst,
+; uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
-cglobal x264_plane_copy_core_mmxext, 6,7
+cglobal plane_copy_core_mmxext, 6,7
movsxdifnidn r1, r1d
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
; memzero SSE will fail for non-mod128.
;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_mmx, 3,3
+cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
jz .copy32
sub r2d, 16
REP_RET
;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_sse2, 3,3
+cglobal memcpy_aligned_sse2, 3,3
test r2d, 16
jz .copy32
sub r2d, 16
REP_RET
;-----------------------------------------------------------------------------
-; void *x264_memzero_aligned( void *dst, size_t n );
+; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 1
-cglobal x264_memzero_aligned_%1, 2,2
+cglobal memzero_aligned_%1, 2,2
add r0, r1
neg r1
pxor m0, m0
;-----------------------------------------------------------------------------
-; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
-cglobal x264_integral_init4h_sse4, 3,4
+cglobal integral_init4h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
jl .loop
REP_RET
-cglobal x264_integral_init8h_sse4, 3,4
+cglobal integral_init8h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
-; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+; void integral_init8v( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
-cglobal x264_integral_init8v_%1, 3,3
+cglobal integral_init8v_%1, 3,3
shl r1, 1
add r0, r1
lea r2, [r0+r1*8]
INTEGRAL_INIT_8V sse2
;-----------------------------------------------------------------------------
-; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_integral_init4v_mmx, 3,5
+cglobal integral_init4v_mmx, 3,5
shl r2, 1
lea r3, [r0+r2*4]
lea r4, [r0+r2*8]
REP_RET
INIT_XMM
-cglobal x264_integral_init4v_sse2, 3,5
+cglobal integral_init4v_sse2, 3,5
shl r2, 1
add r0, r2
add r1, r2
jl .loop
REP_RET
-cglobal x264_integral_init4v_ssse3, 3,5
+cglobal integral_init4v_ssse3, 3,5
shl r2, 1
add r0, r2
add r1, r2
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal x264_frame_init_lowres_core_%1, 6,7,%2
+cglobal frame_init_lowres_core_%1, 6,7,%2
%ifdef WIN64
movsxd r5, r5d
%endif
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;-----------------------------------------------------------------------------
-cglobal x264_mbtree_propagate_cost_sse2, 6,6
+cglobal mbtree_propagate_cost_sse2, 6,6
shl r5d, 1
lea r0, [r0+r5*2]
add r1, r5
psrld xmm0, 8 ; intra*invq>>8
movq xmm3, [r3+r5] ; inter
movq xmm1, [r1+r5] ; prop
- pand xmm3, [pw_0x3fff]
+ pand xmm3, [pw_3fff]
punpcklwd xmm1, xmm5
punpcklwd xmm3, xmm5
paddd xmm0, xmm1 ; prop + (intra*invq>>8)
DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
#define MC_WEIGHT(w,type) \
- extern void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+ void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
#define MC_WEIGHT_OFFSET(w,type) \
- extern void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
- extern void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
MC_WEIGHT(w,type)
MC_WEIGHT_OFFSET( 4, mmxext )
#undef MC_OFFSET
#undef MC_WEIGHT
-extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
-extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_ref_mmxext( uint8_t *, int, int );
+void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-extern void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
-extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
-extern void x264_memzero_aligned_mmx( void * dst, int n );
-extern void x264_memzero_aligned_sse2( void * dst, int n );
-extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
-extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
-extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
+void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
+void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
+void x264_memzero_aligned_mmx( void * dst, int n );
+void x264_memzero_aligned_sse2( void * dst, int n );
+void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
+void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len );
#define LOWRES(cpu) \
-extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
- int src_stride, int dst_stride, int width, int height );
+void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
+ int src_stride, int dst_stride, int width, int height );
LOWRES(mmxext)
LOWRES(cache32_mmxext)
LOWRES(sse2)
LOWRES(ssse3)
#define PIXEL_AVG_W(width,cpu)\
-extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
-void x264_sfence( void );\
static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
int stride, int width, int height, int16_t *buf )\
{\
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_mmxext
+cglobal pixel_sa8d_8x8_internal_mmxext
push r0
push r2
sub esp, 0x74
%endmacro
;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_mmxext
+cglobal intra_sa8d_x3_8x8_core_mmxext
mov eax, [esp+4]
mov ecx, [esp+8]
sub esp, 0x70
;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_mmxext
+cglobal pixel_ssim_4x4x2_core_mmxext
push ebx
push edi
mov ebx, [esp+16]
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-pw_1: times 8 dw 1
-pw_00ff: times 8 dw 0xff
-ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
-ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
+SECTION_RODATA 32
mask_ff: times 16 db 0xff
times 16 db 0
+ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
+ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1
-hsub_mul: times 8 db 1, -1
hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
hmul_8p: times 8 db 1
times 4 db 1, -1
SECTION .text
+cextern pw_1
+cextern pw_00ff
+
+cextern hsub_mul
+
%macro HADDD 2 ; sum junk
%if mmsize == 16
movhlps %2, %1
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3-4 0
%if %1 != %2
%else
%assign function_align 16
%endif
-cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+cglobal pixel_ssd_%1x%2_%3, 0,0,0
mov al, %1*%2/mmsize/2
%if %1 != %2
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_pixel_var_16x16_mmxext, 2,3
+cglobal pixel_var_16x16_mmxext, 2,3
VAR_START 0
VAR_2ROW 8, 16
VAR_END
-cglobal x264_pixel_var_8x8_mmxext, 2,3
+cglobal pixel_var_8x8_mmxext, 2,3
VAR_START 0
VAR_2ROW r1, 4
VAR_END
INIT_XMM
-cglobal x264_pixel_var_16x16_sse2, 2,3,8
+cglobal pixel_var_16x16_sse2, 2,3,8
VAR_START 1
mov r2d, 8
.loop:
jg .loop
VAR_END
-cglobal x264_pixel_var_8x8_sse2, 2,4,8
+cglobal pixel_var_8x8_sse2, 2,4,8
VAR_START 1
mov r2d, 2
lea r3, [r1*3]
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+; int pixel_var2_8x8( uint8_t *, int, uint8_t *, int, int * )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
INIT_MMX
-cglobal x264_pixel_var2_8x8_mmxext, 5,6
+cglobal pixel_var2_8x8_mmxext, 5,6
VAR_START 0
mov r5d, 8
.loop:
%endif
INIT_XMM
-cglobal x264_pixel_var2_8x8_sse2, 5,6,8
+cglobal pixel_var2_8x8_sse2, 5,6,8
VAR_START 1
mov r5d, 4
.loop:
VAR2_END
RET
-cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
+cglobal pixel_var2_8x8_ssse3, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_pixel_satd_16x4_internal_mmxext
+cglobal pixel_satd_16x4_internal_mmxext
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
paddw m0, m1
ret
-cglobal x264_pixel_satd_8x8_internal_mmxext
+cglobal pixel_satd_8x8_internal_mmxext
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 1
paddw m0, m2
paddw m0, m1
-x264_pixel_satd_8x4_internal_mmxext:
+pixel_satd_8x4_internal_mmxext:
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
paddw m0, m1
ret
-cglobal x264_pixel_satd_16x16_mmxext, 4,6
+cglobal pixel_satd_16x16_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
%rep 3
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endrep
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
HADDUW m0, m1
movd eax, m0
RET
-cglobal x264_pixel_satd_16x8_mmxext, 4,6
+cglobal pixel_satd_16x8_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_8x16_mmxext, 4,6
+cglobal pixel_satd_8x16_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- call x264_pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_8x8_mmxext, 4,6
+cglobal pixel_satd_8x8_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_8x4_mmxext, 4,6
+cglobal pixel_satd_8x4_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_8x4_internal_mmxext
+ call pixel_satd_8x4_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_4x8_mmxext, 4,6
+cglobal pixel_satd_4x8_mmxext, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 0
paddw m0, m1
SATD_END_MMX
-cglobal x264_pixel_satd_4x4_mmxext, 4,6
+cglobal pixel_satd_4x4_mmxext, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
INIT_XMM
%ifnidn %1, sse2
-cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
+cglobal pixel_satd_4x4_%1, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
RET
%endif
-cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
+cglobal pixel_satd_4x8_%1, 4, 6, 8
SATD_START_MMX
%ifnidn %1, sse2
mova m7, [hmul_4p]
movd eax, m6
RET
-cglobal x264_pixel_satd_8x8_internal_%1
+cglobal pixel_satd_8x8_internal_%1
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
-x264_pixel_satd_8x4_internal_%1:
+pixel_satd_8x4_internal_%1:
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
ret
%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
-cglobal x264_pixel_satd_16x4_internal_%1
+cglobal pixel_satd_16x4_internal_%1
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
ret
-cglobal x264_pixel_satd_16x8_%1, 4,6,12
+cglobal pixel_satd_16x8_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
mova m7, [pw_00ff]
%endif
- jmp x264_pixel_satd_16x8_internal_%1
+ jmp pixel_satd_16x8_internal_%1
-cglobal x264_pixel_satd_16x16_%1, 4,6,12
+cglobal pixel_satd_16x16_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
mova m7, [pw_00ff]
%endif
- call x264_pixel_satd_16x4_internal_%1
- call x264_pixel_satd_16x4_internal_%1
-x264_pixel_satd_16x8_internal_%1:
- call x264_pixel_satd_16x4_internal_%1
- call x264_pixel_satd_16x4_internal_%1
+ call pixel_satd_16x4_internal_%1
+ call pixel_satd_16x4_internal_%1
+pixel_satd_16x8_internal_%1:
+ call pixel_satd_16x4_internal_%1
+ call pixel_satd_16x4_internal_%1
SATD_END_SSE2 %1, m10
%else
-cglobal x264_pixel_satd_16x8_%1, 4,6,8
+cglobal pixel_satd_16x8_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
BACKUP_POINTERS
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
-cglobal x264_pixel_satd_16x16_%1, 4,6,8
+cglobal pixel_satd_16x16_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
BACKUP_POINTERS
- call x264_pixel_satd_8x8_internal_%1
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
- call x264_pixel_satd_8x8_internal_%1
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
%endif
-cglobal x264_pixel_satd_8x16_%1, 4,6,8
+cglobal pixel_satd_8x16_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
- call x264_pixel_satd_8x8_internal_%1
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
-cglobal x264_pixel_satd_8x8_%1, 4,6,8
+cglobal pixel_satd_8x8_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
-cglobal x264_pixel_satd_8x4_%1, 4,6,8
+cglobal pixel_satd_8x4_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
- call x264_pixel_satd_8x4_internal_%1
+ call pixel_satd_8x4_internal_%1
SATD_END_SSE2 %1, m6
%endmacro ; SATDS_SSE2
%macro SA8D 1
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
lea r10, [r0+4*r1]
lea r11, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
paddw m0, m1
paddw m0, m2
paddw m0, m8
- SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
+ SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
ret
-cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
+cglobal pixel_sa8d_8x8_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
mova m7, [hmul_8p]
%endif
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
HADDW m0, m1
movd eax, m0
add eax, 1
shr eax, 1
RET
-cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
+cglobal pixel_sa8d_16x16_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
mova m7, [hmul_8p]
%endif
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
+ call pixel_sa8d_8x8_internal_%1 ; pix[0]
add r2, 8
add r0, 8
mova m10, m0
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
+ call pixel_sa8d_8x8_internal_%1 ; pix[8]
lea r2, [r2+8*r3]
lea r0, [r0+8*r1]
paddusw m10, m0
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
+ call pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
sub r2, 8
sub r0, 8
paddusw m10, m0
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
+ call pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
paddusw m0, m10
HADDUW m0, m1
movd eax, m0
%else ; ARCH_X86_32
%ifnidn %1, mmxext
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
ret
%endif ; ifndef mmxext
-cglobal x264_pixel_sa8d_8x8_%1, 4,7
+cglobal pixel_sa8d_8x8_%1, 4,7
mov r6, esp
and esp, ~15
sub esp, 48
lea r4, [3*r1]
lea r5, [3*r3]
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
HADDW m0, m1
movd eax, m0
add eax, 1
mov esp, r6
RET
-cglobal x264_pixel_sa8d_16x16_%1, 4,7
+cglobal pixel_sa8d_16x16_%1, 4,7
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [3*r1]
lea r5, [3*r3]
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
%ifidn %1, mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
mova [esp+48], m0
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8
add r2, 8
paddusw m0, [esp+48]
mova [esp+48], m0
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
%ifidn %1, mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddusw m0, [esp+48]
%endif
mova [esp+64-mmsize], m0
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
paddusw m0, [esp+64-mmsize]
%if mmsize == 16
HADDUW m0, m1
%ifdef ARCH_X86_64
INIT_XMM
;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
+cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
; in: r0 = fenc
; out: m0..m3 = hadamard coefs
INIT_MMX
-cglobal x264_hadamard_load
+cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
punpcklbw m2, m7
punpcklbw m3, m7
HADAMARD4_2D 0, 1, 2, 3, 4
- SAVE_MM_PERMUTATION x264_hadamard_load
+ SAVE_MM_PERMUTATION hadamard_load
ret
%macro SCALAR_SUMSUB 4
%macro INTRA_SATDS_MMX 1
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_4x4_%1, 2,6
+cglobal intra_satd_x3_4x4_%1, 2,6
%ifdef ARCH_X86_64
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
%define t0 r2
%endif
- call x264_hadamard_load
+ call hadamard_load
SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
mov t0d, r0d
SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
%endif
;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_16x16_%1, 0,7
+cglobal intra_satd_x3_16x16_%1, 0,7
%ifdef ARCH_X86_64
%assign stack_pad 88
%else
.loop_y:
xor r4d, r4d
.loop_x:
- call x264_hadamard_load
+ call hadamard_load
SUM3x4 %1
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
RET
;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_8x8c_%1, 0,6
+cglobal intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, 72
%define sums rsp+48 ; size 24
.loop_y:
xor r4d, r4d
.loop_x:
- call x264_hadamard_load
+ call hadamard_load
SUM3x4 %1
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
-cglobal x264_hadamard_ac_4x4_mmxext
+cglobal hadamard_ac_4x4_mmxext
movh m0, [r0]
movh m1, [r0+r1]
movh m2, [r0+r1*2]
paddw m0, m1
paddw m2, m3
paddw m0, m2
- SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_4x4_mmxext
ret
-cglobal x264_hadamard_ac_2x2max_mmxext
+cglobal hadamard_ac_2x2max_mmxext
mova m0, [r3+0x00]
mova m1, [r3+0x20]
mova m2, [r3+0x40]
HADAMARD 0, max, 1, 3, 4, 5
paddw m7, m0
paddw m7, m1
- SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmxext
ret
-cglobal x264_hadamard_ac_8x8_mmxext
+cglobal hadamard_ac_8x8_mmxext
mova m6, [mask_ac4]
pxor m7, m7
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
add r0, 4
add r3, 32
mova m5, m0
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
lea r0, [r0+4*r1]
add r3, 64
paddw m5, m0
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
sub r0, 4
sub r3, 32
paddw m5, m0
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
paddw m5, m0
sub r3, 40
mova [rsp+gprsize+8], m5 ; save satd
%rep 3
- call x264_hadamard_ac_2x2max_mmxext
+ call hadamard_ac_2x2max_mmxext
%endrep
mova m0, [r3+0x00]
mova m1, [r3+0x20]
paddw m6, m7
mova [rsp+gprsize], m6 ; save sa8d
SWAP m0, m6
- SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext
ret
%macro HADAMARD_AC_WXH_MMX 2
-cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
+cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
sub rsp, 16+128+pad
lea r2, [r1*3]
lea r3, [rsp+16]
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 16
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%endif
%if %1==16
neg ysub
sub rsp, 16
lea r0, [r0+ysub*4+8]
neg ysub
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 16
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%endif
%endif
mova m1, [rsp+0x08]
INIT_XMM
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
-cglobal x264_hadamard_ac_8x8_%1
+cglobal hadamard_ac_8x8_%1
%ifdef ARCH_X86_64
%define spill0 m8
%define spill1 m9
paddw m2, m4
paddw m0, m2
mova [rsp+gprsize+16], m0 ; save sa8d
- SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
+ SAVE_MM_PERMUTATION hadamard_ac_8x8_%1
ret
HADAMARD_AC_WXH_SSE2 16, 16, %1
HADAMARD_AC_WXH_SSE2 8, 8, %1
%endmacro ; HADAMARD_AC_SSE2
-; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
+; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 3
-cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
+cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
sub rsp, 48+pad
lea r2, [r1*3]
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 32
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%endif
%if %1==16
neg ysub
sub rsp, 32
lea r0, [r0+ysub*4+8]
neg ysub
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 32
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%endif
%endif
mova m1, [rsp+0x20]
; instantiate satds
%ifndef ARCH_X86_64
-cextern x264_pixel_sa8d_8x8_internal_mmxext
+cextern pixel_sa8d_8x8_internal_mmxext
SA8D mmxext
%endif
;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
%macro SSIM_ITER 1
paddd m3, m6
%endmacro
-cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
+cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
pxor m0, m0
SSIM_ITER 0
SSIM_ITER 1
RET
;-----------------------------------------------------------------------------
-; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
+; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_end4_sse2, 3,3,7
+cglobal pixel_ssim_end4_sse2, 3,3,7
movdqa m0, [r0+ 0]
movdqa m1, [r0+16]
movdqa m2, [r0+32]
%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
-; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
+; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ads4_mmxext, 4,7
+cglobal pixel_ads4_mmxext, 4,7
movq mm6, [r0]
movq mm4, [r0+8]
pshufw mm7, mm6, 0
movd [t0], mm1
ADS_END 1
-cglobal x264_pixel_ads2_mmxext, 4,7
+cglobal pixel_ads2_mmxext, 4,7
movq mm6, [r0]
pshufw mm5, r6m, 0
pshufw mm7, mm6, 0
movd [t0], mm4
ADS_END 1
-cglobal x264_pixel_ads1_mmxext, 4,7
+cglobal pixel_ads1_mmxext, 4,7
pshufw mm7, [r0], 0
pshufw mm6, r6m, 0
ADS_START 2
ADS_END 2
%macro ADS_SSE2 1
-cglobal x264_pixel_ads4_%1, 4,7,12
+cglobal pixel_ads4_%1, 4,7,12
movdqa xmm4, [r0]
pshuflw xmm7, xmm4, 0
pshuflw xmm6, xmm4, 0xAA
%endif ; ARCH
ADS_END 2
-cglobal x264_pixel_ads2_%1, 4,7,8
+cglobal pixel_ads2_%1, 4,7,8
movq xmm6, [r0]
movd xmm5, r6m
pshuflw xmm7, xmm6, 0
movq [t0], xmm1
ADS_END 2
-cglobal x264_pixel_ads1_%1, 4,7,8
+cglobal pixel_ads1_%1, 4,7,8
movd xmm7, [r0]
movd xmm6, r6m
pshuflw xmm7, xmm7, 0
%define ABS1 ABS1_SSSE3
ADS_SSE2 ssse3
-; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
; int nmv=0, i, j;
; *(uint32_t*)(masks+width) = 0;
; }
; return nmv;
; }
-cglobal x264_pixel_ads_mvs, 0,7,0
+cglobal pixel_ads_mvs, 0,7,0
ads_mvs:
%ifdef ARCH_X86_64
; mvs = r4
%include "x86inc.asm"
%include "x86util.asm"
+SECTION_RODATA
+
+pw_76543210:
+pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
+pb_00s_ff: times 8 db 0
+pb_0s_ff: times 7 db 0
+ db 0xff
+
+SECTION .text
+
+cextern pb_1
+cextern pb_3
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_ff00
+cextern pb_reverse
+
%macro STORE8x8 2
add r0, 4*FDEC_STRIDE
movq [r0 + -4*FDEC_STRIDE], %1
movdqa [r0 + 3*FDEC_STRIDE], %1
%endmacro
-SECTION_RODATA
-
-ALIGN 16
-pb_1: times 16 db 1
-pb_3: times 16 db 3
-pw_2: times 4 dw 2
-pw_4: times 4 dw 4
-pw_8: times 8 dw 8
-pw_76543210:
-pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
-pb_00s_ff: times 8 db 0
-pb_0s_ff: times 7 db 0
- db 0xff
-pw_ff00: times 8 dw 0xff00
-pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS0 6
%endmacro
;-----------------------------------------------------------------------------
-; void predict_4x4_ddl_mmxext( uint8_t *src )
+; void predict_4x4_ddl( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_ddl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
RET
;-----------------------------------------------------------------------------
-; void predict_4x4_ddr_mmxext( uint8_t *src )
+; void predict_4x4_ddr( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4 1
cglobal predict_4x4_ddr_%1, 1,1
PREDICT_4x4 ssse3
;-----------------------------------------------------------------------------
-; void predict_4x4_hu_mmxext( uint8_t *src )
+; void predict_4x4_hu( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hu_mmxext, 1,1
movq mm0, [r0+0*FDEC_STRIDE-8]
RET
;-----------------------------------------------------------------------------
-; void predict_4x4_vl_mmxext( uint8_t *src )
+; void predict_4x4_vl( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
PREDICT_FILTER ssse3
;-----------------------------------------------------------------------------
-; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_v( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_v_mmxext, 2,2
movq mm0, [r1+16]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+; void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
;-----------------------------------------------------------------------------
INIT_MMX
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
cglobal predict_8x8_dc_mmxext, 2,2
pxor mm0, mm0
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc_top( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
%macro PRED8x8_DC 2
cglobal %1, 2,2
; functions if we know sse2 is available.
;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_mmxext, 2,2
movq mm5, [r1+16]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr_mmxext, 2,2
movq mm1, [r1+7]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%define PALIGNR PALIGNR_MMX
cglobal predict_8x8_hu_mmxext, 2,2
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr_core( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; fills only some pixels:
RET
;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
%endif ; !ARCH_X86_64
;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_sse2, 2,2
movdqa xmm3, [r1+16]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr_sse2, 2,2
movdqu xmm3, [r1+8]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl_sse2, 2,2
movdqa xmm4, [r1+16]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr_sse2, 2,2,7
movdqu xmm0, [r1+8]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%define PALIGNR PALIGNR_MMX
cglobal predict_8x8_hd_mmxext, 2,2
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 1
cglobal predict_8x8_hd_%1, 2,2
%define PALIGNR PALIGNR_MMX
;-----------------------------------------------------------------------------
-; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HU 1
cglobal predict_8x8_hu_%1, 2,2
PREDICT_8x8_HU ssse3
;-----------------------------------------------------------------------------
-; void predict_8x8c_v_mmx( uint8_t *src )
+; void predict_8x8c_v( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_v_mmx, 1,1
movq mm0, [r0 - FDEC_STRIDE]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8c_h_mmxext( uint8_t *src )
+; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PRED_8x8C_H 1
PRED_8x8C_H ssse3
;-----------------------------------------------------------------------------
-; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
+; void predict_8x8c_dc_core( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_dc_core_mmxext, 1,1
movq mm0, [r0 - FDEC_STRIDE]
RET
;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_sse2, 1,1
RET
;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_sse2, 1,2,8
movd xmm0, r1m
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_v_mmx( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_mmx, 1,2
movq mm0, [r0 - FDEC_STRIDE]
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_v_sse2( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_sse2, 1,1
movdqa xmm0, [r0 - FDEC_STRIDE]
RET
;-----------------------------------------------------------------------------
-; void predict_16x16_h_mmxext( uint8_t *src )
+; void predict_16x16_h( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PRED_16x16_H 1
PRED_16x16_H ssse3
;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 2
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC_SSE2 2
#include "predict.h"
#include "pixel.h"
-extern void predict_16x16_v_mmx( uint8_t *src );
-extern void predict_16x16_h_mmxext( uint8_t *src );
-extern void predict_16x16_h_ssse3( uint8_t *src );
-extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_top_mmxext( uint8_t *src );
-extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
-extern void predict_8x8c_dc_top_mmxext( uint8_t *src );
-extern void predict_8x8c_v_mmx( uint8_t *src );
-extern void predict_8x8c_h_mmxext( uint8_t *src );
-extern void predict_8x8c_h_ssse3( uint8_t *src );
-extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_4x4_ddl_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_mmxext( uint8_t *src );
-extern void predict_4x4_vl_mmxext( uint8_t *src );
-extern void predict_4x4_vr_mmxext( uint8_t *src );
-extern void predict_4x4_vr_ssse3( uint8_t *src );
-extern void predict_4x4_hd_mmxext( uint8_t *src );
-extern void predict_4x4_hd_ssse3( uint8_t *src );
-extern void predict_4x4_dc_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_ssse3( uint8_t *src );
-extern void predict_4x4_hu_mmxext( uint8_t *src );
-extern void predict_16x16_dc_top_sse2( uint8_t *src );
-extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_v_sse2( uint8_t *src );
-extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_16x16_v_mmx( uint8_t *src );
+ void x264_predict_16x16_h_mmxext( uint8_t *src );
+ void x264_predict_16x16_h_ssse3( uint8_t *src );
+ void x264_predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_top_mmxext( uint8_t *src );
+ void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
+ void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
+ void x264_predict_8x8c_v_mmx( uint8_t *src );
+ void x264_predict_8x8c_h_mmxext( uint8_t *src );
+ void x264_predict_8x8c_h_ssse3( uint8_t *src );
+ void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+ void x264_predict_8x8_filter_ssse3( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+ void x264_predict_4x4_ddl_mmxext( uint8_t *src );
+ void x264_predict_4x4_ddr_mmxext( uint8_t *src );
+ void x264_predict_4x4_vl_mmxext( uint8_t *src );
+ void x264_predict_4x4_vr_mmxext( uint8_t *src );
+ void x264_predict_4x4_vr_ssse3( uint8_t *src );
+ void x264_predict_4x4_hd_mmxext( uint8_t *src );
+ void x264_predict_4x4_hd_ssse3( uint8_t *src );
+ void x264_predict_4x4_dc_mmxext( uint8_t *src );
+ void x264_predict_4x4_ddr_ssse3( uint8_t *src );
+ void x264_predict_4x4_hu_mmxext( uint8_t *src );
+ void x264_predict_16x16_dc_top_sse2( uint8_t *src );
+ void x264_predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_v_sse2( uint8_t *src );
+ void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\
#define PREDICT_16x16_P(name)\
-static void predict_16x16_p_##name( uint8_t *src )\
+static void x264_predict_16x16_p_##name( uint8_t *src )\
{\
int a, b, c;\
int H = 0;\
b = ( 5 * H + 32 ) >> 6;\
c = ( 5 * V + 32 ) >> 6;\
i00 = a - b * 7 - c * 7 + 16;\
- predict_16x16_p_core_##name( src, i00, b, c );\
+ x264_predict_16x16_p_core_##name( src, i00, b, c );\
}
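As a reading aid, each instantiation below generates one static wrapper per cpu type; e.g. PREDICT_16x16_P( sse2 ) expands to roughly the following (the edge sums and the declarations of V and i00 are elided here, just as in the hunk above):

    static void x264_predict_16x16_p_sse2( uint8_t *src )
    {
        int a, b, c;
        int H = 0;
        /* ... V, i00 and the H/V edge sums as in the macro body above ... */
        b = ( 5 * H + 32 ) >> 6;
        c = ( 5 * V + 32 ) >> 6;
        i00 = a - b * 7 - c * 7 + 16;
        x264_predict_16x16_p_core_sse2( src, i00, b, c );
    }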
#ifndef ARCH_X86_64
PREDICT_16x16_P( sse2 )
#ifdef __GNUC__
-static void predict_16x16_p_ssse3( uint8_t *src )
+static void x264_predict_16x16_p_ssse3( uint8_t *src )
{
int a, b, c, i00;
int H, V;
b = ( 5 * H + 32 ) >> 6;
c = ( 5 * V + 32 ) >> 6;
i00 = a - b * 7 - c * 7 + 16;
- predict_16x16_p_core_sse2( src, i00, b, c );
+ x264_predict_16x16_p_core_sse2( src, i00, b, c );
}
#endif
#define PREDICT_8x8_P(name)\
-static void predict_8x8c_p_##name( uint8_t *src )\
+static void x264_predict_8x8c_p_##name( uint8_t *src )\
{\
int a, b, c;\
int H = 0;\
b = ( 17 * H + 16 ) >> 5;\
c = ( 17 * V + 16 ) >> 5;\
i00 = a -3*b -3*c + 16;\
- predict_8x8c_p_core_##name( src, i00, b, c );\
+ x264_predict_8x8c_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
PREDICT_8x8_P( sse2 )
#ifdef __GNUC__
-static void predict_8x8c_p_ssse3( uint8_t *src )
+static void x264_predict_8x8c_p_ssse3( uint8_t *src )
{
int a, b, c, i00;
int H, V;
b = ( 17 * H + 16 ) >> 5;
c = ( 17 * V + 16 ) >> 5;
i00 = a -3*b -3*c + 16;
- predict_8x8c_p_core_sse2( src, i00, b, c );
+ x264_predict_8x8c_p_core_sse2( src, i00, b, c );
}
#endif
#define PREDICT_16x16_DC(name)\
-static void predict_16x16_dc_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_##name( uint8_t *src )\
{\
uint32_t dc=16;\
int i;\
dc += src[-1 + i * FDEC_STRIDE];\
dc += src[-1 + (i+1) * FDEC_STRIDE];\
}\
- predict_16x16_dc_core_##name( src, dc );\
+ x264_predict_16x16_dc_core_##name( src, dc );\
}
PREDICT_16x16_DC( mmxext )
PREDICT_16x16_DC( sse2 )
#define PREDICT_16x16_DC_LEFT(name)\
-static void predict_16x16_dc_left_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_left_##name( uint8_t *src )\
{\
uint32_t dc=8;\
int i;\
dc += src[-1 + i * FDEC_STRIDE];\
dc += src[-1 + (i+1) * FDEC_STRIDE];\
}\
- predict_16x16_dc_left_core_##name( src, dc>>4 );\
+ x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
}
PREDICT_16x16_DC_LEFT( mmxext )
PREDICT_16x16_DC_LEFT( sse2 )
-static void predict_8x8c_dc_mmxext( uint8_t *src )
+static void x264_predict_8x8c_dc_mmxext( uint8_t *src )
{
int s2 = 4
+ src[-1 + 0*FDEC_STRIDE]
+ src[-1 + 6*FDEC_STRIDE]
+ src[-1 + 7*FDEC_STRIDE];
- predict_8x8c_dc_core_mmxext( src, s2, s3 );
+ x264_predict_8x8c_dc_core_mmxext( src, s2, s3 );
}
#ifdef ARCH_X86_64
-static void predict_8x8c_dc_left( uint8_t *src )
+static void x264_predict_8x8c_dc_left( uint8_t *src )
{
int y;
uint32_t s0 = 0, s1 = 0;
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
#ifndef ARCH_X86_64
-static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
+static void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
{
- predict_8x8_vr_core_mmxext( src, edge );
+ x264_predict_8x8_vr_core_mmxext( src, edge );
{
PREDICT_8x8_LOAD_TOPLEFT
PREDICT_8x8_LOAD_LEFT
{
if( !(cpu&X264_CPU_MMX) )
return;
- pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
+ pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
- pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
- pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_mmxext;
+ pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmxext;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmxext;
+ pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmxext;
#ifndef ARCH_X86_64
- pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmxext;
#endif
- pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
- pf[I_PRED_16x16_V] = predict_16x16_v_sse2;
+ pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
+ pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
- pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
- pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_sse2;
- pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
+ pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
#ifdef __GNUC__
- pf[I_PRED_16x16_P] = predict_16x16_p_ssse3;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
#endif
}
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
+ pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
#endif
- pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top_mmxext;
- pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmxext;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmxext;
#ifndef ARCH_X86_64
- pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_mmxext;
#endif
- pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
#ifdef __GNUC__
- pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
#endif
}
{
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
- pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
- pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
- pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
- pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
- pf[I_PRED_8x8_HD] = predict_8x8_hd_mmxext;
- *predict_8x8_filter = predict_8x8_filter_mmxext;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext;
+ pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext;
+ pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmxext;
+ pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmxext;
+ pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmxext;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmxext;
+ *predict_8x8_filter = x264_predict_8x8_filter_mmxext;
#ifdef ARCH_X86
- pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
- pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
- pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext;
- pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext;
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmxext;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmxext;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmxext;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmxext;
#endif
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2;
- pf[I_PRED_8x8_VL] = predict_8x8_vl_sse2;
- pf[I_PRED_8x8_VR] = predict_8x8_vr_sse2;
- pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2;
- pf[I_PRED_8x8_HD] = predict_8x8_hd_sse2;
- pf[I_PRED_8x8_HU] = predict_8x8_hu_sse2;
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_8x8_HD] = predict_8x8_hd_ssse3;
- pf[I_PRED_8x8_HU] = predict_8x8_hu_ssse3;
- *predict_8x8_filter = predict_8x8_filter_ssse3;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
+ *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
}
void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
{
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_4x4_VR] = predict_4x4_vr_mmxext;
- pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
- pf[I_PRED_4x4_VL] = predict_4x4_vl_mmxext;
- pf[I_PRED_4x4_DC] = predict_4x4_dc_mmxext;
- pf[I_PRED_4x4_DDR] = predict_4x4_ddr_mmxext;
- pf[I_PRED_4x4_HD] = predict_4x4_hd_mmxext;
- pf[I_PRED_4x4_HU] = predict_4x4_hu_mmxext;
+ pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
+ pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
+ pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmxext;
+ pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmxext;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
+ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmxext;
+ pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmxext;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_4x4_DDR] = predict_4x4_ddr_ssse3;
- pf[I_PRED_4x4_VR] = predict_4x4_vr_ssse3;
- pf[I_PRED_4x4_HD] = predict_4x4_hd_ssse3;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
+ pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
+ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
}
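A minimal usage sketch of the table filled above (illustrative only; it assumes, as elsewhere in x264, that pf[] was first populated with the portable C predictors, since this init only overrides entries the detected CPU can accelerate):

    x264_predict_t pf4[12];                /* void (*)( uint8_t *src ) per the project headers */
    /* ... pf4[] filled with the C predictors ... */
    x264_predict_4x4_init_mmx( cpu, pf4 ); /* cpu = X264_CPU_* flag word */
    pf4[I_PRED_4x4_DC]( dst );             /* dst points into an FDEC_STRIDE-strided block */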
%include "x86util.asm"
SECTION_RODATA
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
-pd_1: times 4 dd 1
-pb_01: times 8 db 0, 1
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
SECTION .text
+cextern pb_1
+cextern pw_1
+cextern pd_1
+cextern pb_01
+
%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
movd m7, r2m ; bias
%endmacro
;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
+; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2-3 0
cglobal %1, 1,1,%3
%endmacro
;-----------------------------------------------------------------------------
-; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
%define QUANT_DC_START QUANT_DC_START_MMX
-QUANT_DC x264_quant_2x2_dc_mmxext, 1
+QUANT_DC quant_2x2_dc_mmxext, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
-QUANT_DC x264_quant_4x4_dc_mmxext, 4
-QUANT_AC x264_quant_4x4_mmx, 4
-QUANT_AC x264_quant_8x8_mmx, 16
+QUANT_DC quant_4x4_dc_mmxext, 4
+QUANT_AC quant_4x4_mmx, 4
+QUANT_AC quant_8x8_mmx, 16
%endif
INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse2, 2, 8
-QUANT_AC x264_quant_4x4_sse2, 2
-QUANT_AC x264_quant_8x8_sse2, 8
+QUANT_DC quant_4x4_dc_sse2, 2, 8
+QUANT_AC quant_4x4_sse2, 2
+QUANT_AC quant_8x8_sse2, 8
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
-QUANT_DC x264_quant_4x4_dc_ssse3, 2, 8
-QUANT_AC x264_quant_4x4_ssse3, 2
-QUANT_AC x264_quant_8x8_ssse3, 8
+QUANT_DC quant_4x4_dc_ssse3, 2, 8
+QUANT_AC quant_4x4_ssse3, 2
+QUANT_AC quant_8x8_ssse3, 8
INIT_MMX
-QUANT_DC x264_quant_2x2_dc_ssse3, 1
+QUANT_DC quant_2x2_dc_ssse3, 1
%define QUANT_END QUANT_END_SSE4
;Not faster on Conroe, so only used in SSE4 versions
%define QUANT_DC_START QUANT_DC_START_SSSE3
INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse4, 2, 8
-QUANT_AC x264_quant_4x4_sse4, 2
-QUANT_AC x264_quant_8x8_sse4, 8
+QUANT_DC quant_4x4_dc_sse4, 2, 8
+QUANT_AC quant_4x4_sse4, 2
+QUANT_AC quant_8x8_sse4, 8
%endmacro
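The C side continues to refer to these entry points by their x264_-prefixed symbols even though the asm now names them without the prefix; based on the signature comment above, the corresponding prototype would look like this (sketch, not part of this hunk):

    int x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );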
;-----------------------------------------------------------------------------
-; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
-cglobal x264_dequant_%2x%2_%1, 0,3
+cglobal dequant_%2x%2_%1, 0,3
.skip_prologue:
DEQUANT_START %3+2, %3
psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
-cglobal x264_dequant_%2x%2_flat16_%1, 0,3
+cglobal dequant_%2x%2_flat16_%1, 0,3
movifnidn t2d, r2m
%if %2 == 8
cmp t2d, 12
- jl x264_dequant_%2x%2_%1.skip_prologue
+ jl dequant_%2x%2_%1.skip_prologue
sub t2d, 12
%endif
imul t0d, t2d, 0x2b
DEQUANT sse2, 8, 6, 2
%macro DEQUANT_DC 1
-cglobal x264_dequant_4x4dc_%1, 0,3
+cglobal dequant_4x4dc_%1, 0,3
DEQUANT_START 6, 6
.lshift:
DEQUANT_DC sse2
;-----------------------------------------------------------------------------
-; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 1-2 0
-cglobal x264_denoise_dct_%1, 4,5,%2
+cglobal denoise_dct_%1, 4,5,%2
movzx r4d, word [r0] ; backup DC coefficient
pxor m6, m6
.loop:
;-----------------------------------------------------------------------------
-; int x264_decimate_score( int16_t *dct )
+; int decimate_score( int16_t *dct )
;-----------------------------------------------------------------------------
%macro DECIMATE_MASK_SSE2 6
or %2, %6
%endmacro
-cextern x264_decimate_table4
-cextern x264_decimate_table8
+cextern decimate_table4
+cextern decimate_table8
%macro DECIMATE4x4 2
;A LUT is faster than bsf on AMD processors, and no slower on Intel
;This is not true for score64.
-cglobal x264_decimate_score%1_%2, 1,3
+cglobal decimate_score%1_%2, 1,3
%ifdef PIC
- lea r10, [x264_decimate_table4]
+ lea r10, [decimate_table4]
lea r11, [decimate_mask_table4]
%define table r10
%define mask_table r11
%else
- %define table x264_decimate_table4
+ %define table decimate_table4
%define mask_table decimate_mask_table4
%endif
DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
%macro DECIMATE8x8 1
%ifdef ARCH_X86_64
-cglobal x264_decimate_score64_%1, 1,4
+cglobal decimate_score64_%1, 1,4
%ifdef PIC
- lea r10, [x264_decimate_table8]
+ lea r10, [decimate_table8]
%define table r10
%else
- %define table x264_decimate_table8
+ %define table decimate_table8
%endif
mova m5, [pb_1]
DECIMATE_MASK r1d, eax, r0, m5, %1, null
%else ; ARCH
%ifidn %1, mmxext
-cglobal x264_decimate_score64_%1, 1,6
+cglobal decimate_score64_%1, 1,6
%else
-cglobal x264_decimate_score64_%1, 1,5
+cglobal decimate_score64_%1, 1,5
%endif
mova m7, [pb_1]
DECIMATE_MASK r3, r2, r0, m7, %1, r5
je .largerun
shrd r3, r4, cl
shr r4, cl
- add r0b, byte [x264_decimate_table8 + ecx]
+ add r0b, byte [decimate_table8 + ecx]
shrd r3, r4, 1
shr r4, 1
cmp r0, 6 ;score64's threshold is never higher than 6
DECIMATE8x8 ssse3
;-----------------------------------------------------------------------------
-; int x264_coeff_last( int16_t *dct )
+; int coeff_last( int16_t *dct )
;-----------------------------------------------------------------------------
%macro LAST_MASK_SSE2 2-3
%macro COEFF_LAST4 1
%ifdef ARCH_X86_64
-cglobal x264_coeff_last4_%1, 1,1
+cglobal coeff_last4_%1, 1,1
LAST rax, [r0], 0x3f
shr eax, 4
RET
%else
-cglobal x264_coeff_last4_%1, 0,3
+cglobal coeff_last4_%1, 0,3
mov edx, r0mp
mov eax, [edx+4]
xor ecx, ecx
COEFF_LAST4 mmxext_lzcnt
%macro COEFF_LAST 1
-cglobal x264_coeff_last15_%1, 1,3
+cglobal coeff_last15_%1, 1,3
pxor m2, m2
LAST_MASK r1d, r0-2, r2d
xor r1d, 0xffff
dec eax
RET
-cglobal x264_coeff_last16_%1, 1,3
+cglobal coeff_last16_%1, 1,3
pxor m2, m2
LAST_MASK r1d, r0, r2d
xor r1d, 0xffff
RET
%ifndef ARCH_X86_64
-cglobal x264_coeff_last64_%1, 1, 5-mmsize/16
+cglobal coeff_last64_%1, 1, 5-mmsize/16
pxor m2, m2
LAST_MASK r2d, r0+64, r4d
LAST_MASK r3d, r0+96, r4d
add eax, 32
RET
%else
-cglobal x264_coeff_last64_%1, 1,4
+cglobal coeff_last64_%1, 1,4
pxor m2, m2
LAST_MASK_SSE2 r1d, r0
LAST_MASK_SSE2 r2d, r0+32
COEFF_LAST sse2_lzcnt
;-----------------------------------------------------------------------------
-; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
+; int coeff_level_run( int16_t *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
%macro LAST_MASK4_MMX 2-3
%endif
%macro COEFF_LEVELRUN 2
-cglobal x264_coeff_level_run%2_%1,0,7
+cglobal coeff_level_run%2_%1,0,7
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-pb_3: times 16 db 3
-pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
-pw_8: times 4 dw 8
-sw_64: dd 64
-
SECTION .text
+cextern pb_3
+cextern pb_shuf8x8c
+cextern pw_8
+cextern sw_6
+cextern sw_64
+
;=============================================================================
; SAD MMX
;=============================================================================
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SAD 2
-cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
+cglobal pixel_sad_%1x%2_mmxext, 4,4
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
%macro SAD_W16 1
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_%1, 4,4,8
+cglobal pixel_sad_16x16_%1, 4,4,8
movdqu m0, [r2]
movdqu m1, [r2+r3]
lea r2, [r2+2*r3]
SAD_END_SSE2
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_%1, 4,4
+cglobal pixel_sad_16x8_%1, 4,4
movdqu m0, [r2]
movdqu m2, [r2+r3]
lea r2, [r2+2*r3]
%endmacro
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
-cglobal x264_pixel_sad_8x16_sse2, 4,4
+cglobal pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 0
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
RET
;-----------------------------------------------------------------------------
-; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
-cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
+cglobal intra_sad_x3_4x4_mmxext, 3,3
pxor mm7, mm7
movd mm0, [r1-FDEC_STRIDE]
movd mm1, [r0+FENC_STRIDE*0]
RET
;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8 ( uint8_t *fenc, uint8_t edge[33], int res[3]);
+; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
;-----------------------------------------------------------------------------
;m0 = DC
%endmacro
INIT_MMX
-cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
+cglobal intra_sad_x3_8x8_mmxext, 3,3
movq m7, [r1+7]
pxor m0, m0
movq m6, [r1+16] ;V prediction
RET
;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
%macro INTRA_SAD_HV_ITER 2
%endmacro
%macro INTRA_SAD_8x8C 1
-cglobal x264_intra_sad_x3_8x8c_%1, 3,3
+cglobal intra_sad_x3_8x8c_%1, 3,3
movq m6, [r1 - FDEC_STRIDE]
add r1, FDEC_STRIDE*4
%ifidn %1,ssse3
;-----------------------------------------------------------------------------
-; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
%macro INTRA_SAD16 1-2 0
-cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
+cglobal intra_sad_x3_16x16_%1,3,5,%2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1-FDEC_STRIDE+0]
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
+cglobal pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
+cglobal pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
%endmacro
%macro SAD_X_SSE2_MISALIGN 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
+cglobal pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
%endmacro
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
-cglobal x264_pixel_sad_16x%2_cache64_%1
+cglobal pixel_sad_16x%2_cache64_%1
mov eax, r2m
and eax, 0x37
cmp eax, 0x30
- jle x264_pixel_sad_16x%2_sse2
+ jle pixel_sad_16x%2_sse2
PROLOGUE 4,6
mov r4d, r2d
and r4d, 15
mov eax, r2m
and eax, 0x17|%1|(%4>>1)
cmp eax, 0x10|%1|(%4>>1)
- jle x264_pixel_sad_%1x%2_mmxext
+ jle pixel_sad_%1x%2_mmxext
and eax, 7
shl eax, 3
movd mm6, [sw_64]
%endmacro
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_16x%1_cache%2_mmxext
+cglobal pixel_sad_16x%1_cache%2_mmxext
SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
movq mm1, [r2]
%endmacro
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_8x%1_cache%2_mmxext
+cglobal pixel_sad_8x%1_cache%2_mmxext
SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
movq mm1, [r2+8]
%endmacro
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
+cglobal pixel_sad_x3_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
- jmp x264_pixel_sad_x3_%1x%2_%4
+ jmp pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
PROLOGUE 6,7
mov r3, r4
mov r10, r0
mov r11, r5
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
%ifdef WIN64
mov r2, [rsp]
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
%ifdef WIN64
mov r2, [rsp+8]
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
%ifdef WIN64
add rsp, 24
push dword [esp+16]
push dword 16
push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+32]
mov [edi], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+36]
mov [edi+4], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [edi+8], eax
add esp, 16
pop edi
%endmacro
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
+cglobal pixel_sad_x4_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
CHECK_SPLIT r4m, %1, %3
- jmp x264_pixel_sad_x4_%1x%2_%4
+ jmp pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
PROLOGUE 6,7
mov r1, FENC_STRIDE
mov r3, r5
mov r10, r0
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
%ifdef WIN64
mov r2, [rsp]
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
%ifdef WIN64
mov r2, [rsp+8]
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
%ifdef WIN64
mov r2, [rsp+16]
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+12], eax
%ifdef WIN64
add rsp, 24
push dword [esp+16]
push dword 16
push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+32]
mov [edi], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+36]
mov [edi+4], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+40]
mov [edi+8], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [edi+12], eax
add esp, 16
pop edi
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .
+%define program_name x264
+
%ifdef ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
%define WIN64
; Symbol prefix for C linkage
%macro cglobal 1-2+
- %xdefine %1 mangle(%1)
+ %xdefine %1 mangle(program_name %+ _ %+ %1)
%xdefine %1.skip_prologue %1 %+ .skip_prologue
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%endmacro
%macro cextern 1
+ %xdefine %1 mangle(program_name %+ _ %+ %1)
+ extern %1
+%endmacro
+
+;like cextern, but without the prefix
+%macro cextern_naked 1
%xdefine %1 mangle(%1)
extern %1
%endmacro
+%macro const 2+
+ %xdefine %1 mangle(program_name %+ _ %+ %1)
+ global %1
+ %1: %2
+%endmacro
+
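To make the renames throughout this patch concrete: with program_name defined as x264, the macros above splice the prefix back in at assembly time, so the externally visible symbols are unchanged. A rough sketch of the expansions (mangle() may additionally prepend an underscore on some output formats):

cglobal pixel_sad_16x16_mmxext   ; defines the symbol x264_pixel_sad_16x16_mmxext
cextern cabac_range_lps          ; references x264_cabac_range_lps
const   pw_8, times 4 dw 8       ; emits a global x264_pw_8 data label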
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
#include <unistd.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
#include <math.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "set.h"
#include "analyse.h"
* # of bframes + # of threads.
*/
#include "common/common.h"
-#include "common/cpu.h"
#include "analyse.h"
static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count )
#include <math.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "ratecontrol.h"
#include "me.h"
#include <math.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"
SECTION .text
-cextern puts
+cextern_naked puts
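checkasm calls the C library's puts, which must keep its unprefixed name; the auto-prefixing cextern would mangle it into a symbol that does not exist, hence the new cextern_naked. A sketch of the two expansions (ignoring any leading underscore mangle() may add):

cextern       puts   ; would declare "extern x264_puts", which does not exist
cextern_naked puts   ; declares "extern puts", the actual libc function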
; max number of args used by any x264 asm function.
; (max_args % 4) must equal 3 for stack alignment
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
-cglobal x264_checkasm_call, 4,7,16
+cglobal checkasm_call, 4,7,16
sub rsp, max_args*8
%assign stack_offset stack_offset+max_args*8
mov r6, r0
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
-cglobal x264_checkasm_call, 1,7
+cglobal checkasm_call, 1,7
mov r3, n3
mov r4, n4
mov r5, n5
;-----------------------------------------------------------------------------
; int x264_stack_pagealign( int (*func)(), int align )
;-----------------------------------------------------------------------------
-cglobal x264_stack_pagealign, 2,2
+cglobal stack_pagealign, 2,2
push rbp
mov rbp, rsp
and rsp, ~0xfff