INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm
endif
CODEC_EXPORTS-yes += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
third_party/x86inc/x86inc.asm \
vpx_config.asm \
vpx_ports/x86_abi_support.asm \
+ vpx_dsp/x86/bitdepth_conversion_sse2.asm \
vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
psllw m0, 2
psllw m1, 2
-%if CONFIG_VP9_HIGHBITDEPTH
- ; sign extension
- mova m2, m0
- mova m3, m1
- punpcklwd m0, m0
- punpcklwd m1, m1
- punpckhwd m2, m2
- punpckhwd m3, m3
- psrad m0, 16
- psrad m1, 16
- psrad m2, 16
- psrad m3, 16
- mova [outputq], m0
- mova [outputq + 16], m2
- mova [outputq + 32], m1
- mova [outputq + 48], m3
-%else
- mova [outputq], m0
- mova [outputq + 16], m1
-%endif
+ STORE_TRAN_LOW 0, outputq, 0, 2, 3
+ STORE_TRAN_LOW 1, outputq, 1, 2, 3
RET
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
+DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h
+# This file is included in libs.mk. Including it here would cause it to be
+# compiled into an object. Even as an empty file, this would create an
+# executable section on the stack.
+#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM)
+
# bit reader
DSP_SRCS-yes += prob.h
DSP_SRCS-yes += prob.c
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
-DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
SWAP 7, 9
%endmacro
-%if CONFIG_VP9_HIGHBITDEPTH
-; store %1 to outputq + %2
-; uses m8-m10 as scratch registers
-%macro STORE_TRAN_LOW 2
- pxor m8, m8
- mova m9, m%1
- mova m10, m%1
- pcmpgtw m8, m%1
- punpcklwd m9, m8
- punpckhwd m10, m8
- mova [outputq + %2], m9
- mova [outputq + %2 + 16], m10
-%endmacro
-%endif
INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 11, input, stride, output
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
HMD8_1D
-%if CONFIG_VP9_HIGHBITDEPTH
- STORE_TRAN_LOW 0, 0
- STORE_TRAN_LOW 1, 32
- STORE_TRAN_LOW 2, 64
- STORE_TRAN_LOW 3, 96
- STORE_TRAN_LOW 4, 128
- STORE_TRAN_LOW 5, 160
- STORE_TRAN_LOW 6, 192
- STORE_TRAN_LOW 7, 224
-%else
- mova [outputq + 0], m0
- mova [outputq + 16], m1
- mova [outputq + 32], m2
- mova [outputq + 48], m3
- mova [outputq + 64], m4
- mova [outputq + 80], m5
- mova [outputq + 96], m6
- mova [outputq + 112], m7
-%endif
+ STORE_TRAN_LOW 0, outputq, 0, 8, 9
+ STORE_TRAN_LOW 1, outputq, 1, 8, 9
+ STORE_TRAN_LOW 2, outputq, 2, 8, 9
+ STORE_TRAN_LOW 3, outputq, 3, 8, 9
+ STORE_TRAN_LOW 4, outputq, 4, 8, 9
+ STORE_TRAN_LOW 5, outputq, 5, 8, 9
+ STORE_TRAN_LOW 6, outputq, 6, 8, 9
+ STORE_TRAN_LOW 7, outputq, 7, 8, 9
RET
%endif
--- /dev/null
+;
+; Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded, so it cannot be included twice. Because this
+; file will be used in conjunction with x86_abi_support.asm or x86inc.asm, it
+; must be included after those files.
+
+; Increment register %1 by sizeof(tran_low_t) * 8 bytes, i.e. step past eight
+; coefficients. %1 is modified in place.
+; The step is 32 bytes when CONFIG_VP9_HIGHBITDEPTH is set and 16 bytes
+; otherwise. Implemented with add, so the flags are modified.
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+ add %1, 32
+%else
+ add %1, 16
+%endif
+%endmacro
+
+; Increment %1 by sizeof(tran_low_t) * %2 bytes, i.e. step past %2
+; coefficients. %1 is modified in place.
+; The scale is 4 bytes per coefficient when CONFIG_VP9_HIGHBITDEPTH is set and
+; 2 bytes otherwise. Implemented with lea, which leaves the flags untouched.
+; NOTE: %2 is substituted textually in front of the multiplication. Products
+; such as 8*32 are fine; parenthesize sums or differences at the call site.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea %1, [%1 + %2 * 4]
+%else
+ lea %1, [%1 + %2 * 2]
+%endif
+%endmacro
+
+; Load the coefficients at %2 + %3 into m%1 as 16 bit values.
+; %1 is the destination register number, %2 the base address.
+; %3 is the offset in units of eight coefficients (one full register of 16 bit
+; values when used with INIT_XMM), not bytes. It is expected to be a constant
+; and may be an expression: it is parenthesized before being scaled.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits (packssdw, i.e. with signed saturation).
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova m%1, [%2 + (%3) * 32]
+ packssdw m%1, [%2 + (%3) * 32 + 16]
+%else
+ mova m%1, [%2 + (%3) * 16]
+%endif
+%endmacro
+
+; Store m%1 to %2 + %3.
+; %1 is the source register number, %2 the base address.
+; %3 is the offset in units of eight coefficients (one full register of 16 bit
+; values when used with INIT_XMM), not bytes. It is expected to be a constant
+; and may be an expression: it is parenthesized before being scaled.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; Uses m%4 and m%5 as scratch registers for high bit depth. In that
+; configuration m%1 is overwritten as well (it ends up holding the upper half
+; of the sign extended values), so do not rely on its contents afterwards.
+%macro STORE_TRAN_LOW 5
+%if CONFIG_VP9_HIGHBITDEPTH
+ pxor m%4, m%4
+ mova m%5, m%1
+ pcmpgtw m%4, m%1 ; sign mask: all ones for every negative word
+ punpcklwd m%5, m%4 ; low words interleaved with their sign mask
+ punpckhwd m%1, m%4 ; high words interleaved with their sign mask
+ mova [%2 + (%3) * 32 + 0], m%5
+ mova [%2 + (%3) * 32 + 16], m%1
+%else
+ mova [%2 + (%3) * 16], m%1
+%endif
+%endmacro
/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION_RODATA
lea r3, [2 * strideq]
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
-%endif
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 1
+ LOAD_TRAN_LOW 2, inputq, 2
+ LOAD_TRAN_LOW 3, inputq, 3
punpcklwd m0, m1
punpcklwd m2, m3
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 4
+ LOAD_TRAN_LOW 2, r3, 8
+ LOAD_TRAN_LOW 3, r3, 12
+ LOAD_TRAN_LOW 4, r3, 16
+ LOAD_TRAN_LOW 5, r3, 20
+ LOAD_TRAN_LOW 6, r3, 24
+ LOAD_TRAN_LOW 7, r3, 28
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
mov r7, 2
idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 4
+ LOAD_TRAN_LOW 2, r3, 8
+ LOAD_TRAN_LOW 3, r3, 12
+ LOAD_TRAN_LOW 4, r3, 16
+ LOAD_TRAN_LOW 5, r3, 20
+ LOAD_TRAN_LOW 6, r3, 24
+ LOAD_TRAN_LOW 7, r3, 28
+
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
mova [r4 + 0], m0
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
+ INCREMENT_TRAN_LOW r3
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose
IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
+ INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
dec r6
jnz idct32x32_135
mov r7, 4
idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 4
+ LOAD_TRAN_LOW 2, r3, 8
+ LOAD_TRAN_LOW 3, r3, 12
+ LOAD_TRAN_LOW 4, r3, 16
+ LOAD_TRAN_LOW 5, r3, 20
+ LOAD_TRAN_LOW 6, r3, 24
+ LOAD_TRAN_LOW 7, r3, 28
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
+ INCREMENT_TRAN_LOW r3
add r4, 16 * 8
dec r7
jne idct32x32_1024_transpose
IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
+ INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
dec r6
jnz idct32x32_1024
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
-%endif
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 1
psraw m0, 2
psraw m1, 2
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,