From: Yunqing Wang
Date: Tue, 25 Oct 2011 19:14:16 +0000 (-0400)
Subject: Multiple-resolution encoder
X-Git-Tag: v1.0.0~64
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=aa7335e610b961626f77130bc99b24de1031601d;p=libvpx

Multiple-resolution encoder

The example encoder down-samples the input video frames a number of times,
then encodes the down-sampled frames and outputs one bitstream per
resolution. Arbitrary down-sampling factors are supported, and the factor
can differ for each encoding level.

For example, the encoder can be tested as follows.

1. Configure with multi-resolution encoding enabled:
   ../libvpx/configure --target=x86-linux-gcc --disable-codecs --enable-vp8 --enable-runtime_cpu_detect --enable-debug --disable-install-docs --enable-error-concealment --enable-multi-res-encoding
2. Run make.
3. Encode. If the input video is 1280x720, run:
   ./vp8_multi_resolution_encoder 1280 720 input.yuv 1.ivf 2.ivf 3.ivf 1
   (Output: 1.ivf (1280x720); 2.ivf (640x360); 3.ivf (320x180). The last
   parameter is set to 1/0 to show/not show PSNR.)
4. Decode:
   ./simple_decoder 1.ivf 1.yuv
   ./simple_decoder 2.ivf 2.yuv
   ./simple_decoder 3.ivf 3.yuv
5. View video:
   mplayer 1.yuv -demuxer rawvideo -rawvideo w=1280:h=720 -loop 0 -fps 30
   mplayer 2.yuv -demuxer rawvideo -rawvideo w=640:h=360 -loop 0 -fps 30
   mplayer 3.yuv -demuxer rawvideo -rawvideo w=320:h=180 -loop 0 -fps 30

Encoding parameters such as target bitrate and frame rate can be modified
in vp8_multi_resolution_encoder.c.

Modified the API accordingly. John helped a lot with that. Thanks!

Change-Id: I03be9a51167eddf94399f92d269599fb3f3d54f5
---

diff --git a/configure b/configure
index cca94a24c..363687a11 100755
--- a/configure
+++ b/configure
@@ -35,7 +35,7 @@ Advanced options:
   ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
   ${toggle_mem_tracker}           track memory usage
   ${toggle_postproc}              postprocessing
-  ${toggle_multithread}           multithreaded encoding and decoding.
+  ${toggle_multithread}           multithreaded encoding and decoding
   ${toggle_spatial_resampling}    spatial sampling (scaling) support
   ${toggle_realtime_only}         enable this option while building for real-time encoding
   ${toggle_error_concealment}     enable this option to get a decoder which is able to conceal losses
@@ -44,6 +44,7 @@ Advanced options:
   ${toggle_static}                static library support
   ${toggle_small}                 favor smaller size over speed
   ${toggle_postproc_visualizer}   macro block / block level visualizers
+  ${toggle_multi_res_encoding}    enable multiple-resolution encoding

 Codecs:
   Codecs can be selectively enabled or disabled individually, or by family:
@@ -262,6 +263,7 @@ CONFIG_LIST="
     postproc_visualizer
     os_support
     unit_tests
+    multi_res_encoding
 "
 CMDLINE_SELECT="
     extra_warnings
@@ -304,6 +306,7 @@ CMDLINE_SELECT="
     small
     postproc_visualizer
     unit_tests
+    multi_res_encoding
 "

 process_cmdline() {

diff --git a/examples.mk b/examples.mk
index 8088d3217..1f7dcc171 100644
--- a/examples.mk
+++ b/examples.mk
@@ -96,6 +96,16 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
 vp8cx_set_ref.GUID                 = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION          = VP8 set encoder reference frame

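As a quick editorial illustration (not part of the patch itself) of the arithmetic behind the 1280x720 example in the commit message above, the sketch below assumes a fixed three-level setup with a down-sampling factor of 2 between levels; the helper is hypothetical and only prints the per-level resolutions:

#include <stdio.h>

/* Illustrative sketch only: derive the frame size at each encoding level
 * from per-level down-sampling factors, as the multi-resolution example
 * encoder does. The three-level, factor-of-2 setup mirrors the
 * 1.ivf/2.ivf/3.ivf example above; it is not taken from
 * vp8_multi_resolution_encoder.c. */
int main(void) {
  int width = 1280, height = 720;   /* input resolution */
  int factor[3] = {1, 2, 2};        /* down-sampling factor applied before each level */
  int level;

  for (level = 0; level < 3; ++level) {
    width /= factor[level];
    height /= factor[level];
    printf("level %d: %dx%d\n", level, width, height);
  }
  return 0;                         /* prints 1280x720, 640x360, 320x180 */
}
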
+# C file is provided, not generated automatically.
+GEN_EXAMPLES-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS \
+                        += third_party/libyuv/include/libyuv/basic_types.h \
+                           third_party/libyuv/include/libyuv/cpu_id.h \
+                           third_party/libyuv/include/libyuv/scale.h \
+                           third_party/libyuv/source/scale.c \
+                           third_party/libyuv/source/cpu_id.c
+vp8_multi_resolution_encoder.GUID          = 04f8738e-63c8-423b-90fa-7c2703a374de
+vp8_multi_resolution_encoder.DESCRIPTION   = VP8 Multiple-resolution Encoding

 # Handle extra library flags depending on codec configuration

diff --git a/third_party/libyuv/README.webm b/third_party/libyuv/README.webm
new file mode 100644
index 000000000..32766be27
--- /dev/null
+++ b/third_party/libyuv/README.webm
@@ -0,0 +1,17 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 90
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling
+functionality.
+
+The optimized scaler in libyuv is used in the multiple-resolution encoder
+example, which down-samples the original input video (e.g. 1280x720) a number
+of times in order to encode multiple resolution bit streams.
+
+Local Modifications:
+Modified the original scaler code from C++ to C to fit in our current build
+system. This is a temporary solution, and will be improved later.
\ No newline at end of file

diff --git a/third_party/libyuv/include/libyuv/basic_types.h b/third_party/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 000000000..87f8bd2de
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,68 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include <stddef.h>  // for NULL, size_t
+
+#ifndef WIN32
+#include <stdint.h>  // for uintptr_t
+#endif
+
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef __int64 int64;
+#else
+typedef long long int64;
+#endif /* COMPILER_MSVC */
+typedef int int32;
+typedef short int16;
+typedef char int8;
+
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UI64
+#endif
+#define INT64_F "I64"
+#else
+typedef unsigned long long uint64;
+//typedef long long int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## ULL
+#endif
+#define INT64_F "ll"
+#endif /* COMPILER_MSVC */
+typedef unsigned int uint32;
+typedef unsigned short uint16;
+typedef unsigned char uint8;
+#endif  // INT_TYPES_DEFINED
+
+// Detect whether the compiler targets x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif + +#define IS_ALIGNED(p, a) (0==((uintptr_t)(p) & ((a)-1))) +#define ALIGNP(p, t) \ + ((uint8*)((((uintptr_t)(p) + \ + ((t)-1)) & ~((t)-1)))) + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 000000000..8ebafe9b5 --- /dev/null +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ + +//namespace libyuv { + +// These flags are only valid on x86 processors +static const int kCpuHasSSE2 = 1; +static const int kCpuHasSSSE3 = 2; + +// SIMD support on ARM processors +static const int kCpuHasNEON = 4; + +// Detect CPU has SSE2 etc. +int TestCpuFlag(int flag); + +// For testing, allow CPU flags to be disabled. +void MaskCpuFlagsForTest(int enable_flags); + +//} // namespace libyuv + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h new file mode 100644 index 000000000..5b2d364ad --- /dev/null +++ b/third_party/libyuv/include/libyuv/scale.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +//namespace libyuv { + +// Supported filtering +typedef enum { + kFilterNone = 0, // Point sample; Fastest + kFilterBilinear = 1, // Faster than box, but lower quality scaling down. + kFilterBox = 2 // Highest quality +}FilterMode; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering); + +// Legacy API +// If dst_height_offset is non-zero, the image is offset by that many pixels +// and stretched to (dst_height - dst_height_offset * 2) pixels high, +// instead of dst_height. 
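Before the legacy Scale_1()/Scale_2() wrappers that follow, a brief usage sketch of the I420Scale() entry point declared above (editorial, not part of the header): it down-scales one tightly packed I420 frame from 1280x720 to 640x360 with box filtering. The helper name and the fixed dimensions are illustrative assumptions.

#include "third_party/libyuv/include/libyuv/scale.h"

/* Sketch: scale a packed I420 frame from 1280x720 down to 640x360.
 * For tightly packed planes the luma stride equals the width and the
 * chroma strides equal half the width. Returns I420Scale()'s result
 * (0 on success). */
static int scale_720p_to_360p(const uint8 *src, uint8 *dst) {
  const int sw = 1280, sh = 720;    /* source dimensions */
  const int dw = 640,  dh = 360;    /* destination dimensions */
  const uint8 *src_y = src;
  const uint8 *src_u = src + sw * sh;
  const uint8 *src_v = src_u + (sw / 2) * (sh / 2);
  uint8 *dst_y = dst;
  uint8 *dst_u = dst + dw * dh;
  uint8 *dst_v = dst_u + (dw / 2) * (dh / 2);

  return I420Scale(src_y, sw, src_u, sw / 2, src_v, sw / 2, sw, sh,
                   dst_y, dw, dst_u, dw / 2, dst_v, dw / 2, dw, dh,
                   kFilterBox);
}
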
+int Scale_1(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_height_offset, + int interpolate); + +// Same, but specified src terms of each plane location and stride. +int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate); + +// For testing, allow disabling of optimizations. +void SetUseReferenceImpl(int use); + +//} // namespace libyuv + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/third_party/libyuv/source/cpu_id.c b/third_party/libyuv/source/cpu_id.c new file mode 100644 index 000000000..e3b66f21d --- /dev/null +++ b/third_party/libyuv/source/cpu_id.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86 + +#ifdef _MSC_VER +#include +#endif + +// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. +#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#elif defined(__i386__) || defined(__x86_64__) +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#endif + +//namespace libyuv { + +// CPU detect function for SIMD instruction sets. +static int cpu_info_initialized_ = 0; +static int cpu_info_ = 0; + +// Global lock for cpu initialization. +static void InitCpuFlags() { +#ifdef CPU_X86 + int cpu_info[4]; + __cpuid(cpu_info, 1); + cpu_info_ = (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | + (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0); +#elif defined(__ARM_NEON__) + // gcc -mfpu=neon defines __ARM_NEON__ + // if code is specifically built for Neon-only, enable the flag. + cpu_info_ |= kCpuHasNEON; +#else + cpu_info_ = 0; +#endif + cpu_info_initialized_ = 1; +} + +void MaskCpuFlagsForTest(int enable_flags) { + InitCpuFlags(); + cpu_info_ &= enable_flags; +} + +int TestCpuFlag(int flag) { + if (!cpu_info_initialized_) { + InitCpuFlags(); + } + return cpu_info_ & flag ? 1 : 0; +} + +//} // namespace libyuv diff --git a/third_party/libyuv/source/row.h b/third_party/libyuv/source/row.h new file mode 100644 index 000000000..0486fe23a --- /dev/null +++ b/third_party/libyuv/source/row.h @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBYUV_SOURCE_ROW_H_ +#define LIBYUV_SOURCE_ROW_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#define kMaxStride (2048 * 4) +//#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR) +#define YUV_DISABLE_ASM +#endif + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_FASTCONVERTYUVTOARGBROW_NEON +void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOBGRAROW_NEON +void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOABGRROW_NEON +void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#endif + +// The following are available on all x86 platforms +#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(YUV_DISABLE_ASM) +#define HAS_ABGRTOARGBROW_SSSE3 +#define HAS_BGRATOARGBROW_SSSE3 +#define HAS_BG24TOARGBROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOUVROW_SSSE3 +#define HAS_RAWTOUVROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_FASTCONVERTYTOARGBROW_SSE2 +#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3 +#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3 +#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3 +#define HAS_REVERSE_ROW_SSSE3 +#endif + +// The following are available on Neon platforms +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_REVERSE_ROW_NEON +#endif + +//extern "C" { + +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) +#define HASRGB24TOYROW_SSSE3 +#endif +#ifdef HASRGB24TOYROW_SSSE3 +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#ifdef HAS_REVERSE_ROW_SSSE3 +void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width); +#endif +#ifdef HAS_REVERSE_ROW_NEON +void ReverseRow_NEON(const uint8* src, uint8* dst, int width); +#endif +void ReverseRow_C(const uint8* src, uint8* dst, int width); + +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void 
ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +#ifdef HAS_BG24TOARGBROW_SSSE3 +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +#endif +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +#endif +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) signed char vec8[16]; +typedef __declspec(align(16)) unsigned char uvec8[16]; +typedef __declspec(align(16)) signed short vec16[8]; +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef signed char __attribute__((vector_size(16))) vec8; +typedef unsigned char __attribute__((vector_size(16))) uvec8; +typedef signed short __attribute__((vector_size(16))) vec16; +#endif + +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); + +void FastConvertYUVToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_C(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 +void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + 
uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); +#endif + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +#endif + +#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#endif + +//} // extern "C" + +#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c new file mode 100644 index 000000000..02ffdac65 --- /dev/null +++ b/third_party/libyuv/source/scale.c @@ -0,0 +1,3914 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/scale.h" + +#include +#include + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/source/row.h" +#if defined(_MSC_VER) +#define ALIGN16(var) __declspec(align(16)) var +#else +#define ALIGN16(var) var __attribute__((aligned(16))) +#endif + +// Note: A Neon reference manual +// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html +// Note: Some SSE2 reference manuals +// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf + +//namespace libyuv { + +// Set the following flag to true to revert to only +// using the reference implementation ScalePlaneBox(), and +// NOT the optimized versions. Useful for debugging and +// when comparing the quality of the resulting YUV planes +// as produced by the optimized and non-optimized versions. + +static int use_reference_impl_ = 0; + +void SetUseReferenceImpl(int use) { + use_reference_impl_ = use; +} + +// TODO: The preprocessor definitions for Win64 are not right in build system. +// Disable optimized code for now. +#define YUV_DISABLE_ASM + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "add %1, %0 \n" // change the stride to row 2 pointer + "1: \n" + "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment + "vld1.u8 {q2,q3}, [%1]! 
\n" // load row 2 and post increment + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define HAS_SCALEROWDOWN4_NEON +static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {d0, d1}, [%0]! \n" + "vtrn.u8 d1, d0 \n" + "vshrn.u16 d0, q0, #8 \n" + "vst1.u32 {d0[1]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data + "vld1.u8 {q1}, [r4]! \n" + "vld1.u8 {q2}, [r5]! \n" + "vld1.u8 {q3}, [%3]! \n" + + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + + "vpaddl.u16 q0, q0 \n" + + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + + "vmovn.u16 d0, q0 \n" + "vst1.u32 {d0[0]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vmov d2, d3 \n" // order needs to be d0, d1, d2 + "vst3.u8 {d0, d1, d2}, [%1]! \n" + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! 
\n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +const uint8 shuf38[16] __attribute__ ((aligned(16))) = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +const uint8 shuf38_2[16] __attribute__ ((aligned(16))) = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +static void ScaleRowDown38_NEON(const uint8* src_ptr, int, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u8 {q3}, [%3] \n" + "1: \n" + "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.u8 {d4}, [%1]! \n" + "vst1.u32 {d5[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(shuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "vld1.u8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2), // %5 + "r"(mult38_div9) // %6 + : "r4", "q0", "q1", "q2", "q3", "q8", "q9", + "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. 
This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +/** + * SSE2 downscalers with interpolation. + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ + +// Constants for SSE2 code +#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ + !defined(YUV_DISABLE_ASM) +#if defined(_MSC_VER) +#define TALIGN16(t, var) __declspec(align(16)) t _ ## var +#elif defined(OSX) && defined(__i386__) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#else +#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#endif + +// Offsets for source bytes 0 to 9 +TALIGN16(const uint8, shuf0[16]) = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +TALIGN16(const uint8, shuf1[16]) = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +TALIGN16(const uint8, shuf2[16]) = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +TALIGN16(const uint8, shuf01[16]) = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +TALIGN16(const uint8, shuf11[16]) = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+TALIGN16(const uint8, shuf21[16]) = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +TALIGN16(const uint8, madd01[16]) = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +TALIGN16(const uint8, madd11[16]) = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +TALIGN16(const uint8, madd21[16]) = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +TALIGN16(const int16, round34[8]) = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +TALIGN16(const uint8, shuf38a[16]) = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +TALIGN16(const uint8, shuf38b[16]) = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +TALIGN16(const uint8, shufac0[16]) = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +TALIGN16(const uint8, shufac3[16]) = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +TALIGN16(const uint16, scaleac3[8]) = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +TALIGN16(const uint8, shufab0[16]) = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +TALIGN16(const uint8, shufab1[16]) = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +TALIGN16(const uint8, shufab2[16]) = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +TALIGN16(const uint16, scaleab2[8]) = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +#endif + +#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) + +#define HAS_SCALEROWDOWN2_SSE2 +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + ret + } +} +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
+__declspec(naked) +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + pop esi + ret + } +} + +#define HAS_SCALEROWDOWN4_SSE2 +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + lea edx, [ebx + ebx * 2] // src_stride * 3 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN8_SSE2 +// Point samples 32 pixels to 4 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
+__declspec(naked) +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes + psrlq xmm5, 56 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 // 32->16 + packuswb xmm0, xmm0 // 16->8 + packuswb xmm0, xmm0 // 8->4 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +// Blends 32x8 rectangle to 4x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. +__declspec(naked) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + lea edx, [ebx + ebx * 2] // src_stride * 3 + pxor xmm7, xmm7 + + wloop: + movdqa xmm0, [esi] // average 8 rows to 1 + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea ebp, [esi + ebx * 4] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, [ebp] + movdqa xmm3, [ebp + 16] + movdqa xmm4, [ebp + ebx] + movdqa xmm5, [ebp + ebx + 16] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + movdqa xmm4, [ebp + ebx * 2] + movdqa xmm5, [ebp + ebx * 2 + 16] + movdqa xmm6, [ebp + edx] + pavgb xmm4, xmm6 + movdqa xmm6, [ebp + edx + 16] + pavgb xmm5, xmm6 + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + psadbw xmm0, xmm7 // average 32 pixels to 4 + psadbw xmm1, xmm7 + pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 + pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx + por xmm0, xmm1 // -> 3201 + psrlw xmm0, 3 + packuswb xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN34_SSSE3 +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm3, _shuf0 + movdqa xmm4, _shuf1 + movdqa xmm5, _shuf2 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edi], xmm0 + movq qword ptr [edi + 8], xmm1 + movq qword ptr [edi + 16], xmm2 + lea edi, [edi + 24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. 
+ +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 round34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN38_SSSE3 +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shuf38a + movdqa xmm5, _shuf38b + + xloop: + movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 + lea esi, [esi + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + movq 
qword ptr [edi], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edi + 8], xmm1 + lea edi, [edi + 12] + sub ecx, 12 + ja xloop + + popad + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufac0 + movdqa xmm5, _shufac3 + movdqa xmm6, _scaleac3 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 + movdqa xmm2, [esi + edx] + movhlps xmm1, xmm0 + movhlps xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + movdqa xmm2, [esi + edx * 2] + lea esi, [esi + 16] + movhlps xmm3, xmm2 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + + movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + pshufb xmm2, xmm4 + + movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + pshufb xmm3, xmm5 + paddusw xmm2, xmm3 + + pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 + packuswb xmm2, xmm2 + + movd [edi], xmm2 // write 6 pixels + pextrw eax, xmm2, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufab0 + movdqa xmm5, _shufab1 + movdqa xmm6, _shufab2 + movdqa xmm7, _scaleab2 + + xloop: + movdqa xmm2, [esi] // average 2 rows into xmm2 + pavgb xmm2, [esi + edx] + lea esi, [esi + 16] + + movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 + pshufb xmm0, xmm4 + movdqa xmm1, xmm2 + pshufb xmm1, xmm5 + paddusw xmm0, xmm1 + pshufb xmm2, xmm6 + paddusw xmm0, xmm2 + + pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 + packuswb xmm0, xmm0 + + movd [edi], xmm0 // write 6 pixels + pextrw eax, xmm0, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +#define HAS_SCALEADDROWS_SSE2 + +// Reads 8xN bytes and produces 16 shorts at a time. 
+__declspec(naked) +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + mov ebx, [esp + 32 + 20] // height + pxor xmm5, xmm5 + dec ebx + + xloop: + // first row + movdqa xmm2, [esi] + lea eax, [esi + edx] + movhlps xmm3, xmm2 + mov ebp, ebx + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + // sum remaining rows + yloop: + movdqa xmm0, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movhlps xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + paddusw xmm2, xmm0 // sum 16 words + paddusw xmm3, xmm1 + sub ebp, 1 + ja yloop + + movdqa [edi], xmm2 + movdqa [edi + 16], xmm3 + lea edi, [edi + 32] + lea esi, [esi + 16] + + sub ecx, 16 + ja xloop + + popad + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. +#define HAS_SCALEFILTERROWS_SSE2 +__declspec(naked) +static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + movd xmm6, eax // xmm6 = y fraction + punpcklwd xmm6, xmm6 + pshufd xmm6, xmm6, 0 + neg eax // xmm5 = 256 - y fraction + add eax, 256 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm2, xmm7 + punpckhbw xmm1, xmm7 + punpckhbw xmm3, xmm7 + pmullw xmm0, xmm5 // scale row 0 + pmullw xmm1, xmm5 + pmullw xmm2, xmm6 // scale row 1 + pmullw xmm3, xmm6 + paddusw xmm0, xmm2 // sum rows + paddusw xmm1, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 
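For reference (an editorial sketch, not part of the patch), the SSE2 routine above and the SSSE3 routine that follows implement the same per-pixel blend as this scalar version, where source_y_fraction (0..255) is the weight given to the second source row; the function name is illustrative:

/* Scalar reference for the bilinear row filter: blend two source rows
 * into one destination row. uint8 comes from libyuv/basic_types.h. */
static void ScaleFilterRows_Ref(uint8* dst_ptr, const uint8* src_ptr,
                                int src_stride, int dst_width,
                                int source_y_fraction) {
  int y1_fraction = source_y_fraction;      /* weight of row 1, 0..255 */
  int y0_fraction = 256 - y1_fraction;      /* weight of row 0 */
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
                          src_ptr1[x] * y1_fraction) >> 8);
  }
  /* Like the assembly versions, replicate the last byte one past the end. */
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];
}
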
+#define HAS_SCALEFILTERROWS_SSSE3 +__declspec(naked) +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + shr eax, 1 + mov ah,al + neg al + add al, 128 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + movdqa xmm1, _round34 + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _madd21 + + wloop: + movdqa xmm0, [eax] // pixels 0..7 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax+8] // pixels 8..15 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+8], xmm0 + movdqa xmm0, [eax+16] // pixels 16..23 + lea eax, [eax+32] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xmm7 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + ja wloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) + +// GCC versions of row functions are verbatim conversions from Visual C. 
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt +#define HAS_SCALEROWDOWN2_SSE2 +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +); +} + +#define HAS_SCALEROWDOWN4_SSE2 +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t temp = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea (%4,%4,2),%3 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%4,2),%%xmm2 \n" + "movdqa 0x10(%0,%4,2),%%xmm3 \n" + "movdqa (%0,%3,1),%%xmm4 \n" + "movdqa 0x10(%0,%3,1),%%xmm5 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(temp) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__x86_64__) + , "xmm6", "xmm7" +#endif +); +} + +#define HAS_SCALEROWDOWN8_SSE2 +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, 
int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlq $0x38,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +#if defined(__i386__) +void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown8Int_SSE2 \n" +"_ScaleRowDown8Int_SSE2: \n" +#else + ".global ScaleRowDown8Int_SSE2 \n" +"ScaleRowDown8Int_SSE2: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "lea (%ebx,%ebx,2),%edx \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "movdqa (%esi,%ebx,1),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,1),%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa (%esi,%ebx,2),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,2),%xmm3 \n" + "movdqa (%esi,%edx,1),%xmm4 \n" + "movdqa 0x10(%esi,%edx,1),%xmm5 \n" + "lea (%esi,%ebx,4),%ebp \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa 0x0(%ebp),%xmm2 \n" + "movdqa 0x10(%ebp),%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n" + "movdqa 0x0(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm4 \n" + "movdqa 0x10(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "psadbw %xmm7,%xmm0 \n" + "psadbw %xmm7,%xmm1 \n" + "pshufd $0xd8,%xmm0,%xmm0 \n" + "pshufd $0x8d,%xmm1,%xmm1 \n" + "por %xmm1,%xmm0 \n" + "psrlw $0x3,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "lea 0x4(%edi),%edi \n" + "sub $0x4,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// fpic is used for magiccam plugin +#if !defined(__PIC__) +#define HAS_SCALEROWDOWN34_SSSE3 +void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown34_SSSE3 \n" +"_ScaleRowDown34_SSSE3: \n" +#else + ".global ScaleRowDown34_SSSE3 \n" +"ScaleRowDown34_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf0,%xmm3 \n" + "movdqa _shuf1,%xmm4 \n" + "movdqa _shuf2,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm2 \n" + "lea 0x20(%esi),%esi \n" + "movdqa %xmm2,%xmm1 \n" + "palignr $0x8,%xmm0,%xmm1 \n" + "pshufb %xmm3,%xmm0 \n" + "pshufb %xmm4,%xmm1 \n" + "pshufb %xmm5,%xmm2 \n" + "movq %xmm0,(%edi) \n" + "movq %xmm1,0x8(%edi) \n" + "movq %xmm2,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown34_1_Int_SSSE3 \n" +"_ScaleRowDown34_1_Int_SSSE3: \n" +#else + ".global ScaleRowDown34_1_Int_SSSE3 \n" +"ScaleRowDown34_1_Int_SSSE3: \n" +#endif + 
"pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + + "popa \n" + "ret \n" +); + +void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown34_0_Int_SSSE3 \n" +"_ScaleRowDown34_0_Int_SSSE3: \n" +#else + ".global ScaleRowDown34_0_Int_SSSE3 \n" +"ScaleRowDown34_0_Int_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp,1),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +#define HAS_SCALEROWDOWN38_SSSE3 +void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown38_SSSE3 \n" +"_ScaleRowDown38_SSSE3: \n" +#else + ".global ScaleRowDown38_SSSE3 \n" +"ScaleRowDown38_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf38a ,%xmm4 \n" + "movdqa _shuf38b ,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pshufb %xmm4,%xmm0 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusb %xmm1,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movhlps %xmm0,%xmm1 \n" + "movd %xmm1,0x8(%edi) \n" + "lea 0xc(%edi),%edi \n" + "sub 
$0xc,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown38_3_Int_SSSE3 \n" +"_ScaleRowDown38_3_Int_SSSE3: \n" +#else + ".global ScaleRowDown38_3_Int_SSSE3 \n" +"ScaleRowDown38_3_Int_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufac0,%xmm4 \n" + "movdqa _shufac3,%xmm5 \n" + "movdqa _scaleac3,%xmm6 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "movhlps %xmm0,%xmm1 \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm1 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa (%esi,%edx,2),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "pshufb %xmm4,%xmm2 \n" + "movdqa %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "pshufb %xmm5,%xmm3 \n" + "paddusw %xmm3,%xmm2 \n" + "pmulhuw %xmm6,%xmm2 \n" + "packuswb %xmm2,%xmm2 \n" + "movd %xmm2,(%edi) \n" + "pextrw $0x2,%xmm2,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown38_2_Int_SSSE3 \n" +"_ScaleRowDown38_2_Int_SSSE3: \n" +#else + ".global ScaleRowDown38_2_Int_SSSE3 \n" +"ScaleRowDown38_2_Int_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufab0,%xmm4 \n" + "movdqa _shufab1,%xmm5 \n" + "movdqa _shufab2,%xmm6 \n" + "movdqa _scaleab2,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "pavgb (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm2,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa %xmm2,%xmm1 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusw %xmm1,%xmm0 \n" + "pshufb %xmm6,%xmm2 \n" + "paddusw %xmm2,%xmm0 \n" + "pmulhuw %xmm7,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "pextrw $0x2,%xmm0,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); +#endif // __PIC__ + +#define HAS_SCALEADDROWS_SSE2 +void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleAddRows_SSE2 \n" +"_ScaleAddRows_SSE2: \n" +#else + ".global ScaleAddRows_SSE2 \n" +"ScaleAddRows_SSE2: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "mov 0x34(%esp),%ebx \n" + "pxor %xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "lea (%esi,%edx,1),%eax \n" + "movhlps %xmm2,%xmm3 \n" + "lea -0x1(%ebx),%ebp \n" + "punpcklbw %xmm5,%xmm2 \n" + "punpcklbw %xmm5,%xmm3 \n" + +"2:" + "movdqa (%eax),%xmm0 \n" + "lea (%eax,%edx,1),%eax \n" + "movhlps %xmm0,%xmm1 \n" + "punpcklbw %xmm5,%xmm0 \n" + "punpcklbw %xmm5,%xmm1 \n" + "paddusw 
%xmm0,%xmm2 \n" + "paddusw %xmm1,%xmm3 \n" + "sub $0x1,%ebp \n" + "ja 2b \n" + + "movdqa %xmm2,(%edi) \n" + "movdqa %xmm3,0x10(%edi) \n" + "lea 0x20(%edi),%edi \n" + "lea 0x10(%esi),%esi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSE2 \n" +"_ScaleFilterRows_SSE2: \n" +#else + ".global ScaleFilterRows_SSE2 \n" +"ScaleFilterRows_SSE2: \n" +#endif + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "movd %eax,%xmm6 \n" + "punpcklwd %xmm6,%xmm6 \n" + "pshufd $0x0,%xmm6,%xmm6 \n" + "neg %eax \n" + "add $0x100,%eax \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "movdqa %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpckhbw %xmm7,%xmm1 \n" + "punpckhbw %xmm7,%xmm3 \n" + "pmullw %xmm5,%xmm0 \n" + "pmullw %xmm5,%xmm1 \n" + "pmullw %xmm6,%xmm2 \n" + "pmullw %xmm6,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "psrlw $0x8,%xmm0 \n" + "psrlw $0x8,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSSE3 \n" +"_ScaleFilterRows_SSSE3: \n" +#else + ".global ScaleFilterRows_SSSE3 \n" +"ScaleFilterRows_SSSE3: \n" +#endif + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "shr %eax \n" + "mov %al,%ah \n" + "neg %al \n" + "add $0x80,%al \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "punpcklbw %xmm2,%xmm0 \n" + "punpckhbw %xmm2,%xmm1 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "pmaddubsw %xmm5,%xmm1 \n" + "psrlw $0x7,%xmm0 \n" + "psrlw $0x7,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +#elif defined(__x86_64__) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "lea (%3,%3,2),%%r10 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "movdqa 0x10(%0,%3,2),%%xmm3 \n" + "movdqa (%0,%%r10,1),%%xmm4 \n" + "movdqa 0x10(%0,%%r10,1),%%xmm5 \n" + "lea (%0,%3,4),%%r11 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa 0x0(%%r11),%%xmm2 \n" + "movdqa 0x10(%%r11),%%xmm3 \n" + "movdqa 0x0(%%r11,%3,1),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,1),%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "movdqa 0x0(%%r11,%3,2),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,2),%%xmm5 \n" + "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm4 \n" + "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psadbw %%xmm7,%%xmm0 \n" + "psadbw %%xmm7,%%xmm1 \n" + "pshufd $0xd8,%%xmm0,%%xmm0 \n" + "pshufd $0x8d,%%xmm1,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "psrlw $0x3,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "r10", "r11", "xmm6", "xmm7" +); +} + +#define HAS_SCALEROWDOWN34_SSSE3 +static void ScaleRowDown34_SSSE3(const uint8* 
src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%3),%%xmm3 \n" + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf0), // %3 + "r"(_shuf1), // %4 + "r"(_shuf2) // %5 + : "memory", "cc" +); +} + +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3,1),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 
\n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +#define HAS_SCALEROWDOWN38_SSSE3 +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%3),%%xmm4 \n" + "movdqa (%4),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf38a), // %3 + "r"(_shuf38b) // %4 + : "memory", "cc" +); +} + +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm1 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "pshufb %%xmm5,%%xmm3 \n" + "paddusw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,(%1) \n" + "pextrw $0x2,%%xmm2,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufac0), // %4 + "r"(_shufac3), // %5 + "r"(_scaleac3) // %6 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "movdqa (%7),%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm2 \n" + "pavgb (%0,%3,1),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusw %%xmm1,%%xmm0 \n" + "pshufb %%xmm6,%%xmm2 \n" + "paddusw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "pextrw $0x2,%%xmm0,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufab0), // %4 + "r"(_shufab1), // %5 + "r"(_shufab2), // %6 + "r"(_scaleab2) // %7 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +#define HAS_SCALEADDROWS_SSE2 +static void ScaleAddRows_SSE2(const 
uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm2 \n" + "lea (%0,%4,1),%%r10 \n" + "movhlps %%xmm2,%%xmm3 \n" + "lea -0x1(%3),%%r11 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + +"2:" + "movdqa (%%r10),%%xmm0 \n" + "lea (%%r10,%4,1),%%r10 \n" + "movhlps %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "paddusw %%xmm0,%%xmm2 \n" + "paddusw %%xmm1,%%xmm3 \n" + "sub $0x1,%%r11 \n" + "ja 2b \n" + + "movdqa %%xmm2,(%1) \n" + "movdqa %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width), // %2 + "+r"(src_height) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "r10", "r11" +); +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +static void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile( + "mov %3,%%eax \n" + "movd %%eax,%%xmm6 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "neg %%eax \n" + "add $0x100,%%eax \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm7,%%xmm7 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "punpckhbw %%xmm7,%%xmm3 \n" + "pmullw %%xmm5,%%xmm0 \n" + "pmullw %%xmm5,%%xmm1 \n" + "pmullw %%xmm6,%%xmm2 \n" + "pmullw %%xmm6,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax", "xmm6", "xmm7" + ); + } + return; +} + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile( + "mov %3,%%eax \n" + "shr %%eax \n" + "mov %%al,%%ah \n" + "neg %%al \n" + "add $0x80,%%al \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax" + ); + } + return; +} +#endif +#endif + +// CPU agnostic row functions +static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 2; + } +} + +static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + + src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; + src_ptr += 2; + } +} + +static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 4; + } +} + +static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + + src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + + src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + + src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + + 8) >> 4; + src_ptr += 4; + } +} + +// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. +// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. +// The following 2 lines cause error on Windows. 
+//static const int kMaxOutputWidth = 640; +//static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; +#define kMaxOutputWidth 640 +#define kMaxRow12 1280 + +static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 8; + } +} + +// Note calling code checks width is less than max and if not +// uses ScaleRowDown8_C instead. +static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + ALIGN16(uint8 src_row[kMaxRow12 * 2]); + assert(dst_width <= kMaxOutputWidth); + ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); + ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, + src_row + kMaxOutputWidth, + dst_width * 2); + ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); +} + +static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + uint8* dend; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst + dst_width; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } while (dst < dend); +} + +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +#if defined(HAS_SCALEFILTERROWS_SSE2) +// Filter row to 3/4 +static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + uint8* dend; + const uint8* s; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst_ptr + dst_width; + s = src_ptr; + do { + dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; + dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; + dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; + dst_ptr += 3; + s += 4; + } while (dst_ptr < dend); +} +#endif + +static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx) { + int x = 0; + int j; + for (j = 0; j < dst_width; ++j) { + int xi = x >> 16; + int xf1 = x & 0xffff; + int xf0 = 65536 - xf1; + + *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; + x += dx; + } +} + +//Not work on Windows +//static const int kMaxInputWidth = 2560; +#define 
kMaxInputWidth 2560 +#if defined(HAS_SCALEFILTERROWS_SSE2) +#define HAS_SCALEROWDOWN34_SSE2 +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} +#endif + +static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + + src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// C version 8x2 -> 8x1 +static void ScaleFilterRows_C(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction; + int y0_fraction; + const uint8* src_ptr1; + uint8* end; + assert(dst_width > 0); + y1_fraction = source_y_fraction; + y0_fraction = 256 - y1_fraction; + src_ptr1 = src_ptr + src_stride; + end = dst_ptr + dst_width; + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] 
= (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr < end); + dst_ptr[0] = dst_ptr[-1]; +} + +void ScaleAddRows_C(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int x,y; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + int sum = 0; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + dst_ptr[x] = sum; + } +} + +/** + * Scale plane, 1/2 + * + * This is an optimized version for scaling down a plane to 1/2 of + * its original size. + * + */ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + } else +#endif +/* TODO: Force to call C version all the time in ordert to get matching results + * in multi-resolution encoder example. + */ +#if 0 //defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + } else +#endif + { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 1); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/4 + * + * This is an optimized version for scaling down a plane to 1/4 of + * its original size. + */ +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 4)); + assert(IS_ALIGNED(src_height, 4)); + +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 4)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + } else +#endif + { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 2); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/8 + * + * This is an optimized version for scaling down a plane to 1/8 + * of its original size. 
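 * For example, a 1280x720 luma plane comes out as 160x90; the asserts below
 * require both source dimensions to be multiples of 8.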
+ * + */ +static void ScalePlaneDown8(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 8)); + assert(IS_ALIGNED(src_height, 8)); + +#if defined(HAS_SCALEROWDOWN8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; + } else +#endif + { + ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? + ScaleRowDown8Int_C : ScaleRowDown8_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 3); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane down, 3/4 + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + } + } else +#endif +#if defined(HAS_SCALEROWDOWN34_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && + filtering) { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; + } else +#endif + { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 1: + ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 2: + ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + dst_ptr, dst_width); + break; + } + ++src_row; + src_ptr += src_stride; + dst_ptr += dst_stride; + if (src_row >= 3) { + src_ptr += src_stride; + src_row = 0; + } + } + } +} + +/** + * Scale plane, 3/8 + * + * This is an optimized version for scaling down a plane to 3/8 + * of its original size. 
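 * For example, 1280x720 comes out as 480x270; dst_width must be a multiple
 * of 3 (see the assert below).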
+ * + * Reduces 16x3 to 6x1 + */ +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; + } + } else +#endif + { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + case 1: + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + ++src_row; + break; + + case 2: + ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + src_row = 0; + break; + } + dst_ptr += dst_stride; + } + } +} + +__inline static uint32 SumBox(int iboxwidth, int iboxheight, + int src_stride, const uint8* src_ptr) { + int x, y; + uint32 sum; + assert(iboxwidth > 0); + assert(iboxheight > 0); + sum = 0u; + for (y = 0; y < iboxheight; ++y) { + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static void ScalePlaneBoxRow(int dst_width, int boxheight, + int dx, int src_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum; + int x; + assert(iboxwidth > 0); + sum = 0u; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int scaletbl[2]; + int minboxwidth = (dx >> 16); + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + { + int *scaleptr = scaletbl - minboxwidth; + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + 
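    /* Exact-ratio path: every output pixel covers the same boxwidth source
     * columns, so the divide by (boxwidth * boxheight) is hoisted out of the
     * loop as the 16.16 reciprocal 'scaleval' and applied here with a
     * multiply and shift. */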
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +/** + * Scale plane down to any dimensions, with interpolation. + * (boxfilter). + * + * Same method as SimpleScale, which is fixed point, outputting + * one pixel of destination using fixed point (16.16) to step + * through source, sampling a box of pixel with simple + * averaging. + */ +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int dx, dy; + assert(dst_width > 0); + assert(dst_height > 0); + dy = (src_height << 16) / dst_height; + dx = (src_width << 16) / dst_width; + if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || + dst_height * 2 > src_height) { + uint8* dst = dst_ptr; + int dy = (src_height << 16) / dst_height; + int dx = (src_width << 16) / dst_width; + int y = 0; + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + const uint8* const src = src_ptr + iy * src_stride; + int boxheight; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScalePlaneBoxRow(dst_width, boxheight, + dx, src_stride, + src, dst); + + dst += dst_stride; + } + } else { + ALIGN16(uint16 row[kMaxInputWidth]); + void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height); + void (*ScaleAddCols)(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr); +#if defined(HAS_SCALEADDROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleAddRows = ScaleAddRows_SSE2; + } else +#endif + { + ScaleAddRows = ScaleAddRows_C; + } + if (dx & 0xffff) { + ScaleAddCols = ScaleAddCols2_C; + } else { + ScaleAddCols = ScaleAddCols1_C; + } + + { + int y = 0; + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + const uint8* const src = src_ptr + iy * src_stride; + int boxheight; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScaleAddRows(src, src_stride, row, src_width, boxheight); + ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); + dst_ptr += dst_stride; + } + } + } +} + +/** + * Scale plane to/from any dimensions, with interpolation. + */ +static void ScalePlaneBilinearSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i, j; + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + int dy = (src_height << 16) / dst_height; + int maxx = ((src_width - 1) << 16) - 1; + int maxy = ((src_height - 1) << 16) - 1; + int y = (dst_height < src_height) ? 32768 : + (src_height << 16) / dst_height - 32768; + for (i = 0; i < dst_height; ++i) { + int cy = (y < 0) ? 0 : y; + int yi = cy >> 16; + int yf = cy & 0xffff; + const uint8* const src = src_ptr + yi * src_stride; + int x = (dst_width < src_width) ? 32768 : + (src_width << 16) / dst_width - 32768; + for (j = 0; j < dst_width; ++j) { + int cx = (x < 0) ? 
0 : x; + int xi = cx >> 16; + int xf = cx & 0xffff; + int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; + int r1 = (src[xi + src_stride] * (65536 - xf) + + src[xi + src_stride + 1] * xf) >> 16; + *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; + x += dx; + if (x > maxx) + x = maxx; + } + dst += dst_stride - dst_width; + y += dy; + if (y > maxy) + y = maxy; + } +} + +/** + * Scale plane to/from any dimensions, with bilinear + * interpolation. + */ +static void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int dy; + int dx; + assert(dst_width > 0); + assert(dst_height > 0); + dy = (src_height << 16) / dst_height; + dx = (src_width << 16) / dst_width; + if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { + ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + + } else { + ALIGN16(uint8 row[kMaxInputWidth + 1]); + void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, + int dst_width, int source_y_fraction); + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx); +#if defined(HAS_SCALEFILTERROWS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSSE3; + } else +#endif +#if defined(HAS_SCALEFILTERROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSE2; + } else +#endif + { + ScaleFilterRows = ScaleFilterRows_C; + } + ScaleFilterCols = ScaleFilterCols_C; + + { + int y = 0; + int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + int fy = (y >> 8) & 255; + const uint8* const src = src_ptr + iy * src_stride; + ScaleFilterRows(row, src, src_stride, src_width, fy); + ScaleFilterCols(dst_ptr, row, dst_width, dx); + dst_ptr += dst_stride; + y += dy; + if (y > maxy) { + y = maxy; + } + } + } + } +} + +/** + * Scale plane to/from any dimensions, without interpolation. + * Fixed point math is used for performance: The upper 16 bits + * of x and dx is the integer part of the source position and + * the lower 16 bits are the fixed decimal part. + */ +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + int y; + for (y = 0; y < dst_height; ++y) { + const uint8* const src = src_ptr + (y * src_height / dst_height) * + src_stride; + // TODO(fbarchard): Round X coordinate by setting x=0x8000. + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + *dst++ = src[x >> 16]; + x += dx; + } + dst += dst_stride - dst_width; + } +} + +/** + * Scale plane to/from any dimensions. 
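 * Without filtering this falls back to nearest-neighbour sampling
 * (ScalePlaneSimple); with filtering it uses the bilinear path.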
+ */ +static void ScalePlaneAnySize(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + // fall back to non-optimized version + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Scale plane down, any size + * + * This is an optimized version for scaling down a plane to any size. + * The current implementation is ~10 times faster compared to the + * reference implementation for e.g. XGA->LowResPAL + * + */ +static void ScalePlaneDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { + // between 1/2x and 1x use bilinear + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Copy plane, no scaling + * + * This simply copies the given plane without scaling. + * The current implementation is ~115 times faster + * compared to the reference implementation. + * + */ +static void CopyPlane(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + if (src_stride == src_width && dst_stride == dst_width) { + // All contiguous, so can use REALLY fast path. + memcpy(dst_ptr, src_ptr, src_width * src_height); + } else { + // Not all contiguous; must copy scanlines individually + const uint8* src = src_ptr; + uint8* dst = dst_ptr; + int i; + for (i = 0; i < src_height; ++i) { + memcpy(dst, src, src_width); + dst += dst_stride; + src += src_stride; + } + } +} + +static void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering, int use_ref) { + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + } else if (dst_width <= src_width && dst_height <= src_height) { + // Scale down. + if (use_ref) { + // For testing, allow the optimized versions to be disabled. + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + // 3/8 rounded up for odd sized chroma height. 
+ } else if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { + // optimized, 1/8 + ScalePlaneDown8(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else { + // Arbitrary downsample + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } + } else { + // Arbitrary scale up and/or down. + ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } +} + +/** + * Scale a plane. + * + * This function in turn calls a scaling function + * suitable for handling the desired resolutions. + * + */ + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. + if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + { + int halfsrc_width = (src_width + 1) >> 1; + int halfsrc_height = (src_height + 1) >> 1; + int halfdst_width = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, + dst_u, dst_stride_u, halfdst_width, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, + dst_v, dst_stride_v, halfdst_width, halfoheight, + filtering, use_reference_impl_); + } + return 0; +} + +int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate) { + int halfsrc_width; + int halfsrc_height; + int halfdst_width; + int halfoheight; + FilterMode filtering; + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. 
+ if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + halfsrc_width = (src_width + 1) >> 1; + halfsrc_height = (src_height + 1) >> 1; + halfdst_width = (dst_width + 1) >> 1; + halfoheight = (dst_height + 1) >> 1; + filtering = interpolate ? kFilterBox : kFilterNone; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, + dst_u, dst_stride_u, halfdst_width, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, + dst_v, dst_stride_v, halfdst_width, halfoheight, + filtering, use_reference_impl_); + return 0; +} + +int Scale_1(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int ooffset, + int interpolate) { + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 || + ooffset >= dst_height) { + return -1; + } + ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. + { + int halfsrc_width = (src_width + 1) >> 1; + int halfsrc_height = (src_height + 1) >> 1; + int halfdst_width = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + int aheight = dst_height - ooffset * 2; // actual output height + const uint8* const iyptr = src; + uint8* oyptr = dst + ooffset * dst_width; + const uint8* const iuptr = src + src_width * src_height; + uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width; + const uint8* const ivptr = src + src_width * src_height + + halfsrc_width * halfsrc_height; + uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight + + (ooffset >> 1) * halfdst_width; + return Scale_2(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width, + src_width, src_height, oyptr, ouptr, ovptr, dst_width, + halfdst_width, halfdst_width, dst_width, aheight, interpolate); + } +} + +//} // namespace libyuv diff --git a/usage.dox b/usage.dox index 0db080b00..9370e428f 100644 --- a/usage.dox +++ b/usage.dox @@ -82,6 +82,7 @@ The available initialization methods are: \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif + \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif diff --git a/usage_cx.dox b/usage_cx.dox index 980a03461..62f3e450b 100644 --- a/usage_cx.dox +++ b/usage_cx.dox @@ -1,6 +1,6 @@ /*! \page usage_encode Encode - The vpx_codec_encode() function is at the core of the decode loop. It + The vpx_codec_encode() function is at the core of the encode loop. It processes raw images passed by the application, producing packets of compressed data. The deadline parameter controls the amount of time in microseconds the encoder should spend working on the frame. 
For @@ -10,5 +10,4 @@ \ref samples - */ diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index a90c1c0b6..91e90e2a6 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -170,6 +170,18 @@ typedef struct union b_mode_info bmi[16]; } MODE_INFO; +#if CONFIG_MULTI_RES_ENCODING +/* The information needed to be stored for higher-resolution encoder */ +typedef struct +{ + MB_PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame; + int_mv mv; + //union b_mode_info bmi[16]; + int dissim; // dissimilarity level of the macroblock +} LOWER_RES_INFO; +#endif + typedef struct { short *qcoeff; diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 3f04dab4c..37fa5a0cd 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -17,6 +17,7 @@ extern "C" { #endif +#include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" @@ -207,6 +208,19 @@ extern "C" unsigned int periodicity; unsigned int layer_id[MAX_PERIODICITY]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of total resolutions encoded */ + unsigned int mr_total_resolutions; + + /* Current encoder ID */ + unsigned int mr_encoder_id; + + /* Down-sampling factor */ + vpx_rational_t mr_down_sampling_factor; + + /* Memory location to store low-resolution encoder's mode info */ + void* mr_low_res_mode_info; +#endif } VP8_CONFIG; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 8ec9e27c9..0927f51cf 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -49,8 +49,8 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, int count); void vp8_build_block_offsets(MACROBLOCK *x); void vp8_setup_block_ptrs(MACROBLOCK *x); -int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset); -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset, int mb_row, int mb_col); +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col); static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ); #ifdef MODE_STATS @@ -475,14 +475,14 @@ void encode_mb_row(VP8_COMP *cpi, if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp); + *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp, mb_row, mb_col); #ifdef MODE_STATS y_modes[xd->mbmi.mode] ++; #endif } else { - *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset); + *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); #ifdef MODE_STATS inter_y_modes[xd->mbmi.mode] ++; @@ -1142,7 +1142,7 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ) #endif } -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col) { int rate; @@ -1182,7 +1182,8 @@ extern void vp8_fix_contexts(MACROBLOCKD *x); int vp8cx_encode_inter_macroblock ( VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset + int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col ) { MACROBLOCKD *const xd = &x->e_mbd; @@ -1230,8 +1231,25 @@ int vp8cx_encode_inter_macroblock } else + { +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id == 0) + { + /* Lowest-resolution encoding */ + vp8_pick_inter_mode(cpi, x, 
recon_yoffset, recon_uvoffset, &rate, + &distortion, &intra_error); + + }else + { + /* Higher-resolution encoding */ + vp8_mr_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, + &distortion, &intra_error, mb_row, mb_col); + } +#else vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error); +#endif + } cpi->prediction_error += distortion; cpi->intra_error += intra_error; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index c1a0ea7bf..9d963832a 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -9,6 +9,7 @@ */ +#include "onyx_int.h" #include "mcomp.h" #include "vpx_mem/vpx_mem.h" #include "vpx_config.h" @@ -182,8 +183,6 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost #define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best -#define MIN(x,y) (((x)<(y))?(x):(y)) -#define MAX(x,y) (((x)>(y))?(x):(y)) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -331,8 +330,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #undef IFMVCV #undef ERR #undef CHECK_BETTER -#undef MIN -#undef MAX + int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -854,6 +852,8 @@ int vp8_hex_search int k = -1; int all_in; int best_site = -1; + int hex_range = 127; + int dia_range = 8; int_mv fcenter_mv; fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; @@ -873,6 +873,18 @@ int vp8_hex_search in_what_stride, 0x7fffffff) + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); +#if CONFIG_MULTI_RES_ENCODING + /* Lower search range based on prediction info */ + if (search_param >= 6) goto cal_neighbors; + else if (search_param >= 5) hex_range = 4; + else if (search_param >= 4) hex_range = 6; + else if (search_param >= 3) hex_range = 15; + else if (search_param >= 2) hex_range = 31; + else if (search_param >= 1) hex_range = 63; + + dia_range = 8; +#endif + // hex search //j=0 CHECK_BOUNDS(2) @@ -909,7 +921,7 @@ int vp8_hex_search k = best_site; } - for (j = 1; j < 127; j++) + for (j = 1; j < hex_range; j++) { best_site = -1; CHECK_BOUNDS(2) @@ -951,7 +963,7 @@ int vp8_hex_search // check 4 1-away neighbors cal_neighbors: - for (j = 0; j < 32; j++) + for (j = 0; j < dia_range; j++) { best_site = -1; CHECK_BOUNDS(1) diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c new file mode 100644 index 000000000..7a62a06ec --- /dev/null +++ b/vp8/encoder/mr_dissim.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include +#include "vpx_config.h" +#include "onyx_int.h" +#include "mr_dissim.h" +#include "vpx_mem/vpx_mem.h" +#include "rdopt.h" + +void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) +{ + int low_res_w; + + /* Support arbitrary down-sampling factor */ + unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den + + cpi->oxcf.mr_down_sampling_factor.num - 1; + + low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num; + cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4); +} + +#define GET_MV(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + cnt++; \ +} + +#define GET_MV_SIGN(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \ + != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \ + { \ + mvx[cnt] *= -1; \ + mvy[cnt] *= -1; \ + } \ + cnt++; \ +} + +void vp8_cal_dissimilarity(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + /* Note: The first row & first column in mip are outside the frame, which + * were initialized to all 0.(ref_frame, mode, mv...) + * Their ref_frame = 0 means they won't be counted in the following + * calculation. + */ + if (cpi->oxcf.mr_total_resolutions >1 + && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1)) + { + /* Store info for show/no-show frames for supporting alt_ref. + * If parent frame is alt_ref, child has one too. + */ + if(cm->frame_type != KEY_FRAME) + { + int mb_row; + int mb_col; + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip + cm->mode_info_stride; + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++) + { + tmp++; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++) + { + int dissim = INT_MAX; + + if(tmp->mbmi.ref_frame !=INTRA_FRAME) + { + int mvx[8]; + int mvy[8]; + int mmvx; + int mmvy; + int cnt=0; + const MODE_INFO *here = tmp; + const MODE_INFO *above = here - cm->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + const MODE_INFO *aboveright = NULL; + const MODE_INFO *right = NULL; + const MODE_INFO *belowleft = NULL; + const MODE_INFO *below = NULL; + const MODE_INFO *belowright = NULL; + + /* If alternate reference frame is used, we have to + * check sign of MV. 
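 * A neighbouring MB whose reference frame has the opposite sign bias
 * (i.e. it points in the other temporal direction, as an alt-ref
 * typically does) gets its motion vector negated by GET_MV_SIGN below,
 * so that all gathered vectors are measured in the same direction
 * before the motion-vector spread is computed.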
*/ + if(cpi->oxcf.play_alternate) + { + /* Gather mv of neighboring MBs */ + GET_MV_SIGN(above) + GET_MV_SIGN(left) + GET_MV_SIGN(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV_SIGN(right) + GET_MV_SIGN(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV_SIGN(below) + GET_MV_SIGN(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV_SIGN(belowright) + } + }else + { + /* No alt_ref and gather mv of neighboring MBs */ + GET_MV(above) + GET_MV(left) + GET_MV(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV(right) + GET_MV(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV(below) + GET_MV(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV(belowright) + } + } + + if (cnt > 0) + { + int max_mvx = mvx[0]; + int min_mvx = mvx[0]; + int max_mvy = mvy[0]; + int min_mvy = mvy[0]; + int i; + + if (cnt > 1) + { + for (i=1; i< cnt; i++) + { + if (mvx[i] > max_mvx) max_mvx = mvx[i]; + else if (mvx[i] < min_mvx) min_mvx = mvx[i]; + if (mvy[i] > max_mvy) max_mvy = mvy[i]; + else if (mvy[i] < min_mvy) min_mvy = mvy[i]; + } + } + + mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row), + abs(max_mvx - here->mbmi.mv.as_mv.row)); + mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col), + abs(max_mvy - here->mbmi.mv.as_mv.col)); + dissim = MAX(mmvx, mmvy); + } + } + + /* Store mode info for next resolution encoding */ + store_mode_info->mode = tmp->mbmi.mode; + store_mode_info->ref_frame = tmp->mbmi.ref_frame; + store_mode_info->mv.as_int = tmp->mbmi.mv.as_int; + store_mode_info->dissim = dissim; + tmp++; + store_mode_info++; + } + } + } + } +} diff --git a/vp8/encoder/mr_dissim.h b/vp8/encoder/mr_dissim.h new file mode 100644 index 000000000..3d2c2035f --- /dev/null +++ b/vp8/encoder/mr_dissim.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_MR_DISSIM_H +#define __INC_MR_DISSIM_H +#include "vpx_config.h" + +extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi); +extern void vp8_cal_dissimilarity(VP8_COMP *cpi); + +#endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 146384b13..d40f009fb 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -36,6 +36,9 @@ #if ARCH_ARM #include "vpx_ports/arm.h" #endif +#if CONFIG_MULTI_RES_ENCODING +#include "mr_dissim.h" +#endif #include #include @@ -2234,6 +2237,13 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vp8_loop_filter_init(cm); cpi->common.error.setjmp = 0; + +#if CONFIG_MULTI_RES_ENCODING + /* Calculate # of MBs in a row in lower-resolution level image. */ + if (cpi->oxcf.mr_encoder_id > 0) + vp8_cal_low_res_mb_cols(cpi); +#endif + return (VP8_PTR) cpi; } @@ -4338,13 +4348,20 @@ static void encode_frame_to_data_rate IF_RTCD(&cpi->rtcd.variance)); } - // This frame's MVs are saved and will be used in next frame's MV prediction. 
- // Last frame has one more line(add to bottom) and one more column(add to right) than cm->mip. The edge elements are initialized to 0. - if(cm->show_frame) //do not save for altref frame + /* This frame's MVs are saved and will be used in next frame's MV predictor. + * Last frame has one more line(add to bottom) and one more column(add to + * right) than cm->mip. The edge elements are initialized to 0. + */ +#if CONFIG_MULTI_RES_ENCODING + if(!cpi->oxcf.mr_encoder_id && cm->show_frame) +#else + if(cm->show_frame) /* do not save for altref frame */ +#endif { int mb_row; int mb_col; - MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays. + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip; if(cm->frame_type != KEY_FRAME) { @@ -4363,6 +4380,10 @@ static void encode_frame_to_data_rate } } +#if CONFIG_MULTI_RES_ENCODING + vp8_cal_dissimilarity(cpi); +#endif + // Update the GF useage maps. // This is done after completing the compression of a frame when all // modes etc. are finalized but before loop filter diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 202f61471..ca36c85af 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -58,6 +58,9 @@ #define MAX_PERIODICITY 16 +#define MAX(x,y) (((x)>(y))?(x):(y)) +#define MIN(x,y) (((x)<(y))?(x):(y)) + typedef struct { int kf_indicated; @@ -679,6 +682,11 @@ typedef struct VP8_COMP double total_ssimg_v_in_layer[MAX_LAYERS]; double total_ssimg_all_in_layer[MAX_LAYERS]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of MBs per row at lower-resolution level */ + int mr_low_res_mb_cols; +#endif + } VP8_COMP; void control_data_rate(VP8_COMP *cpi); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index f92618fae..d9c89750b 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -703,6 +703,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, if (cpi->sf.search_method == HEX) { +#if CONFIG_MULTI_RES_ENCODING + /* TODO: In higher-res pick_inter_mode, step_param is used to + * modify hex search range. Here, set step_param to 0 not to + * change the behavior in lowest-resolution encoder. + * Will improve it later. + */ + step_param = 0; +#endif bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); @@ -949,3 +957,568 @@ void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_) *rate_ = best_rate; } + +#if CONFIG_MULTI_RES_ENCODING +void vp8_mr_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col) +{ + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO best_mbmode; + + int_mv best_ref_mv; + int_mv mode_mv[MB_MODE_COUNT]; + MB_PREDICTION_MODE this_mode; + int num00; + int mdcounts[4]; + int best_rd = INT_MAX; // 1 << 30; + int best_intra_rd = INT_MAX; + int mode_index; + int rate; + int rate2; + int distortion2; + int bestsme; + int best_mode_index = 0; + unsigned int sse = INT_MAX, best_sse = INT_MAX; + + int_mv mvp; + int_mv nearest_mv[4]; + int_mv near_mv[4]; + int_mv frame_best_ref_mv[4]; + int MDCounts[4][4]; + unsigned char *y_buffer[4]; + unsigned char *u_buffer[4]; + unsigned char *v_buffer[4]; + int skip_mode[4] = {0, 0, 0, 0}; + int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, + when Speed >= 15, no sub-pixel search. 
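                                  (have_subp_search is consulted further down:
                                  when it is zero, the NEWMV distortion is
                                  re-measured with get_inter_mbpred_error
                                  instead of being taken from the sub-pixel
                                  search.)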
*/ + int lfdone=0, gfdone=0, afdone=0; + + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + unsigned int parent_mb_index; + //unsigned int parent_mb_index = map_640x480_to_320x240[mb_row][mb_col]; + int dissim; + int parent_ref_frame; + int_mv parent_ref_mv; + MB_PREDICTION_MODE parent_mode; + + /* Consider different down_sampling_factor. */ + { + /* TODO: Removed the loop that supports special down_sampling_factor + * such as 2, 4, 8. Will revisit it if needed. + * Should also try using a look-up table to see if it helps + * performance. */ + int round = cpi->oxcf.mr_down_sampling_factor.num/2; + int parent_mb_row, parent_mb_col; + + parent_mb_row = (mb_row*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_col = (mb_col*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col; + } + + /* Read lower-resolution mode & motion result from memory.*/ + parent_ref_frame = store_mode_info[parent_mb_index].ref_frame; + parent_mode = store_mode_info[parent_mb_index].mode; + dissim = store_mode_info[parent_mb_index].dissim; + + /* For highest-resolution encoder, adjust dissim value. Lower its quality + * for good performance. */ + if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1)) + dissim>>=1; + + if(parent_ref_frame != INTRA_FRAME) + { + /* Consider different down_sampling_factor. + * The result can be rounded to be more precise, but it takes more time. + */ + //int round = cpi->oxcf.mr_down_sampling_factor.den/2; + parent_ref_mv.as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + parent_ref_mv.as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + + vp8_clamp_mv2(&parent_ref_mv, xd); + } + + vpx_memset(mode_mv, 0, sizeof(mode_mv)); + vpx_memset(nearest_mv, 0, sizeof(nearest_mv)); + vpx_memset(near_mv, 0, sizeof(near_mv)); + vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); + + cpi->mbs_tested_so_far++; + + *returnintra = INT_MAX; + x->skip = 0; + + x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; + + // if we encode a new mv this is important + // find the best new motion vector + for (mode_index = 0; mode_index < MAX_MODES; mode_index++) + { + int frame_cost; + int this_rd = INT_MAX; + + if (best_rd <= cpi->rd_threshes[mode_index]) + continue; + + /* If parent MB is intra, child MB is intra. */ + if (!parent_ref_frame && vp8_ref_frame_order[mode_index]) + continue; + + /* If parent MB is inter, and it is unlikely there are multiple objects + * in parent MB, we use parent ref frame as child MB's ref frame. */ + if (parent_ref_frame && dissim < 8 + && parent_ref_frame != vp8_ref_frame_order[mode_index]) + continue; + + x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; + + if(x->e_mbd.mode_info_context->mbmi.ref_frame) + { + if(x->e_mbd.mode_info_context->mbmi.ref_frame==LAST_FRAME && !lfdone) + { + // set up all the refframe dependent pointers. 
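                /* Setup is lazy: the lfdone/gfdone/afdone flags ensure that
                 * the near/nearest MVs, best reference MV, mode context
                 * counts and reconstruction buffer pointers for a reference
                 * frame are computed only once, the first time a mode using
                 * that reference is examined; references that are not
                 * available are flagged in skip_mode[] instead. */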
+ //if (x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME + //&& (cpi->ref_frame_flags & VP8_LAST_FLAG)) + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; + + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, + &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME], + &frame_best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], + LAST_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset; + u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset; + v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset; + } + else + skip_mode[LAST_FRAME] = 1; + + lfdone = 1; + } + + if(x->e_mbd.mode_info_context->mbmi.ref_frame==GOLDEN_FRAME && !gfdone) + { + //if (x->e_mbd.mode_info_context->mbmi.ref_frame == GOLDEN_FRAME + //&& (cpi->ref_frame_flags & VP8_GOLD_FLAG)) + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx]; + + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, + &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME], + &frame_best_ref_mv[GOLDEN_FRAME],MDCounts[GOLDEN_FRAME], + GOLDEN_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset; + u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset; + v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset; + } + else + skip_mode[GOLDEN_FRAME] = 1; + + gfdone = 1; + } + + if(x->e_mbd.mode_info_context->mbmi.ref_frame==ALTREF_FRAME && !afdone) + { + //if (x->e_mbd.mode_info_context->mbmi.ref_frame == ALTREF_FRAME + //&& (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active)) + if (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active) + { + YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx]; + + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, + &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME], + &frame_best_ref_mv[ALTREF_FRAME],MDCounts[ALTREF_FRAME], + ALTREF_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset; + u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset; + v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset; + } + else + skip_mode[ALTREF_FRAME] = 1; + + afdone = 1; + } + + if (skip_mode[x->e_mbd.mode_info_context->mbmi.ref_frame]) + continue; + + x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; + mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; + best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; + memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts)); + + if (vp8_mode_order[mode_index] == NEARESTMV && mode_mv[NEARESTMV].as_int ==0) + continue; + if (vp8_mode_order[mode_index] == NEARMV && mode_mv[NEARMV].as_int ==0) + continue; + + if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV + && best_ref_mv.as_int==0) //&& dissim==0 + continue; + else if(vp8_mode_order[mode_index] == NEWMV && dissim==0 + && best_ref_mv.as_int==parent_ref_mv.as_int) + continue; + } + + // Check to see if the testing frequency for this mode is at its max + // If so then prevent it from being tested and increase 
the threshold for its testing + if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1)) + { + //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] ) + if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])) + { + // Increase the threshold for coding this mode to make it less likely to be chosen + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + + continue; + } + } + + // We have now reached the point where we are going to test the current + //mode so increment the counter for the number of times it has been tested + cpi->mode_test_hit_counts[mode_index] ++; + + rate2 = 0; + distortion2 = 0; + + this_mode = vp8_mode_order[mode_index]; + + x->e_mbd.mode_info_context->mbmi.mode = this_mode; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + + // Work out the cost assosciated with selecting the reference frame + frame_cost = + x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame]; + rate2 += frame_cost; + + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative + if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) + { + if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) + continue; + } + + switch (this_mode) + { + case B_PRED: + // Pass best so far to pick_intra4x4mby_modes to use as breakout + distortion2 = best_sse; + pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2); + + if (distortion2 == INT_MAX) + { + this_rd = INT_MAX; + } + else + { + rate2 += rate; + distortion2 = VARIANCE_INVOKE + (&cpi->rtcd.variance, var16x16)( + *(b->base_src), b->src_stride, + x->e_mbd.predictor, 16, &sse); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) + { + best_intra_rd = this_rd; + *returnintra = distortion2; + } + } + + break; + + case DC_PRED: + case V_PRED: + case H_PRED: + case TM_PRED: + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); + distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) + (*(b->base_src), b->src_stride, + x->e_mbd.predictor, 16, &sse); + rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) + { + best_intra_rd = this_rd; + *returnintra = distortion2; + } + break; + + case NEWMV: + { + int thissme; + int step_param; + int further_steps; + int n = 0; + int sadpb = x->sadperbit16; + int_mv mvp_full; + + int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + + ((best_ref_mv.as_mv.col & 7)?1:0); + int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + + ((best_ref_mv.as_mv.row & 7)?1:0); + int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 
3 : 2) : 1; + int diff_mv = MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row), + abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)); + + // Further step/diamond searches as necessary + step_param = cpi->sf.first_step + speed_adjust; //sf->first_step = 1; for -6 step_param =3; + + // Use parent MV as predictor. Adjust search range accordingly. + mvp.as_int = parent_ref_mv.as_int; + mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3; + mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3; + + if(dissim <=32) step_param += 3; + else if(dissim <=128) step_param += 2; + else step_param += 1; + + if(dissim >2 || diff_mv >4) + { + /* Get intersection of UMV window and valid MV window to + * reduce # of checks in diamond search. */ + if (x->mv_col_min < col_min ) + x->mv_col_min = col_min; + if (x->mv_col_max > col_max ) + x->mv_col_max = col_max; + if (x->mv_row_min < row_min ) + x->mv_row_min = row_min; + if (x->mv_row_max > row_max ) + x->mv_row_max = row_max; + + further_steps = (cpi->Speed >= 8)? + 0: (cpi->sf.max_step_search_steps - 1 - step_param); + + if (cpi->sf.search_method == HEX) + { + bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, + sadpb, &cpi->fn_ptr[BLOCK_16X16], + x->mvsadcost, x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv, + step_param, sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + + // Further step/diamond searches as necessary + n = 0; + //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + n = num00; + num00 = 0; + + while (n < further_steps) + { + n++; + + if (num00) + num00--; + else + { + thissme = + cpi->diamond_search_sad(x, b, d, &mvp_full, + &d->bmi.mv, + step_param + n, + sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + } + } + } + } + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + }else + { + d->bmi.mv.as_int = mvp_full.as_int; + mode_mv[NEWMV].as_int = mvp_full.as_int; + } + + // This is not needed. + //if (bestsme < INT_MAX) + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv, + x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], + cpi->mb.mvcost, + &distortion2,&sse); + + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + + // mv cost; + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); + } + + case NEARESTMV: + case NEARMV: + // Trap vectors that reach beyond the UMV borders + // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops + // through to this point because of the lack of break statements + // in the previous two cases. 
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || + ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || + ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) + continue; + + case ZEROMV: + rate2 += vp8_cost_mv_ref(this_mode, mdcounts); + x->e_mbd.mode_info_context->mbmi.mv.as_int = + mode_mv[this_mode].as_int; + + if((this_mode != NEWMV) || + !(have_subp_search) || cpi->common.full_pixel==1) + distortion2 = get_inter_mbpred_error(x, + &cpi->fn_ptr[BLOCK_16X16], + &sse, mode_mv[this_mode]); + + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (cpi->active_map_enabled && x->active_ptr[0] == 0) + { + x->skip = 1; + } + else if (sse < x->encode_breakout) + { + // Check u and v to make sure skip is ok + int sse2 = 0; + + sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance)); + + if (sse2 * 2 < x->encode_breakout) + x->skip = 1; + else + x->skip = 0; + } + + break; + default: + break; + } + + if (this_rd < best_rd || x->skip) + { + // Note index of best mode + best_mode_index = mode_index; + + *returnrate = rate2; + *returndistortion = distortion2; + best_sse = sse; + best_rd = this_rd; + vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO)); + + // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time + cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around. + else + { + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + + if (x->skip) + break; + } + + // Reduce the activation RD thresholds for the best choice mode + if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) + { + int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3); + + cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT; + cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index]; + } + + + { + int this_rdbin = (*returndistortion >> 7); + + if (this_rdbin >= 1024) + { + this_rdbin = 1023; + } + + cpi->error_bins[this_rdbin] ++; + } + + if (cpi->is_src_frame_alt_ref && + (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) + { + x->e_mbd.mode_info_context->mbmi.mode = ZEROMV; + x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME; + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = + (cpi->common.mb_no_coeff_skip) ? 
1 : 0; + x->e_mbd.mode_info_context->mbmi.partitioning = 0; + + return; + } + + /* set to the best mb mode */ + vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); + + if (best_mbmode.mode <= B_PRED) + { + /* set mode_info_context->mbmi.uv_mode */ + pick_intra_mbuv_mode(x); + } + + update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]); +} +#endif diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h index 1c5d6a6e6..df6042fd5 100644 --- a/vp8/encoder/pickinter.h +++ b/vp8/encoder/pickinter.h @@ -14,6 +14,16 @@ #include "vpx_config.h" #include "vp8/common/onyxc_int.h" -extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); +extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra); extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate); + +#if CONFIG_MULTI_RES_ENCODING +extern void vp8_mr_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, + int recon_yoffset, int recon_uvoffset, + int *returnrate, int *returndistortion, + int *returnintra, int mb_row, int mb_col); +#endif + #endif diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 379ffe0a8..62a2cfc53 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1463,57 +1463,6 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, return bsi.segment_rd; } -static void insertsortmv(int arr[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp; - - temp = arr[i]; - - for ( k = i; k >j; k--) - arr[k] = arr[k - 1] ; - - arr[j] = temp ; - } - } - } -} - -static void insertsortsad(int arr[],int idx[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp, tempi; - - temp = arr[i]; - tempi = idx[i]; - - for ( k = i; k >j; k--) - { - arr[k] = arr[k - 1] ; - idx[k] = idx[k - 1]; - } - - arr[j] = temp ; - idx[j] = tempi; - } - } - } -} - //The improved MV prediction void vp8_mv_pred ( diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index 95134cb81..5ee869903 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -14,6 +14,57 @@ #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) +static void insertsortmv(int arr[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp; + + temp = arr[i]; + + for ( k = i; k >j; k--) + arr[k] = arr[k - 1] ; + + arr[j] = temp ; + } + } + } +} + +static void insertsortsad(int arr[],int idx[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp, tempi; + + temp = arr[i]; + tempi = idx[i]; + + for ( k = i; k >j; k--) + { + arr[k] = arr[k - 1] ; + idx[k] = idx[k - 1]; + } + + arr[j] = temp ; + idx[j] = tempi; + } + } + } +} + extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate); diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 4f21e1456..5bb6b4099 100644 --- 
a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -264,7 +264,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, vpx_codec_enc_cfg_t cfg, - struct vp8_extracfg vp8_cfg) + struct vp8_extracfg vp8_cfg, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { oxcf->multi_threaded = cfg.g_threads; oxcf->Version = cfg.g_profile; @@ -355,6 +356,17 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id)); } +#if CONFIG_MULTI_RES_ENCODING + if(mr_cfg) + { + oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions; + oxcf->mr_encoder_id = mr_cfg->mr_encoder_id; + oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num; + oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den; + oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info; + } +#endif + //oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; //strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); @@ -432,7 +444,7 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, if (!res) { ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } @@ -498,14 +510,38 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, if (!res) { ctx->vp8_cfg = xcfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } return res; #undef MAP } -static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) + +static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc) +{ + vpx_codec_err_t res = 0; + +#if CONFIG_MULTI_RES_ENCODING + int mb_rows = ((cfg->g_w + 15) >>4); + int mb_cols = ((cfg->g_h + 15) >>4); + + *mem_loc = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_INFO)); + if(!(*mem_loc)) + { + free(*mem_loc); + res = VPX_CODEC_MEM_ERROR; + } + else + res = VPX_CODEC_OK; +#endif + + return res; +} + +static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { vpx_codec_err_t res = VPX_DEC_OK; struct vpx_codec_alg_priv *priv; @@ -570,9 +606,16 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) if (!res) { + if(mr_cfg) + ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions; + else + ctx->priv->enc.total_encoders = 1; + set_vp8e_config(&ctx->priv->alg_priv->oxcf, ctx->priv->alg_priv->cfg, - ctx->priv->alg_priv->vp8_cfg); + ctx->priv->alg_priv->vp8_cfg, + mr_cfg); + optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf); if (!optr) @@ -587,6 +630,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) { +#if CONFIG_MULTI_RES_ENCODING + /* Free multi-encoder shared memory */ + if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions-1)) + free(ctx->oxcf.mr_low_res_mode_info); +#endif free(ctx->cx_data); vp8_remove_compressor(&ctx->cpi); @@ -1223,6 +1271,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; @@ -1307,5 +1356,6 @@ vpx_codec_iface_t vpx_enc_vp8_algo = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index cdfcd2142..54bdb8568 100644 --- a/vp8/vp8_dx_iface.c +++ 
b/vp8/vp8_dx_iface.c @@ -181,9 +181,11 @@ static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) /* nothing to clean up */ } -static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) +static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; + (void) data; /* This function only allocates space for the vpx_codec_alg_priv_t * structure. More memory may be required at the time the stream @@ -564,7 +566,7 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, if (done && !res) { vp8_finalize_mmaps(ctx->priv->alg_priv); - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); } return res; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index b71a54aea..2d99981f5 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -86,6 +86,8 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c VP8_CX_SRCS-yes += encoder/temporal_filter.h +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c diff --git a/vp8_multi_resolution_encoder.c b/vp8_multi_resolution_encoder.c new file mode 100644 index 000000000..732f96e38 --- /dev/null +++ b/vp8_multi_resolution_encoder.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * This is an example demonstrating multi-resolution encoding in VP8. + * High-resolution input video is down-sampled to lower-resolutions. The + * encoder then encodes the video and outputs multiple bitstreams with + * different resolutions. + */ +#include +#include +#include +#include +#include "math.h" +#define VPX_CODEC_DISABLE_COMPAT 1 +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx_ports/mem_ops.h" +#define interface (vpx_codec_vp8_cx()) +#define fourcc 0x30385056 + +#define IVF_FILE_HDR_SZ (32) +#define IVF_FRAME_HDR_SZ (12) + +/* + * The input video frame is downsampled several times to generate a multi-level + * hierarchical structure. NUM_ENCODERS is defined as the number of encoding + * levels required. For example, if the size of input video is 1280x720, + * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3 + * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and + * 320x180(level 2) respectively. + */ +#define NUM_ENCODERS 3 + +/* This example uses the scaler function in libyuv. */ +#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "third_party/libyuv/include/libyuv/scale.h" +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +static double vp8_mse2psnr(double Samples, double Peak, double Mse) +{ + double psnr; + + if ((double)Mse > 0.0) + psnr = 10.0 * log10(Peak * Peak * Samples / Mse); + else + psnr = 60; // Limit to prevent / 0 + + if (psnr > 60) + psnr = 60; + + return psnr; +} + +static void die(const char *fmt, ...) 
{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + if(fmt[strlen(fmt)-1] != '\n') + printf("\n"); + exit(EXIT_FAILURE); +} + +static void die_codec(vpx_codec_ctx_t *ctx, const char *s) { + const char *detail = vpx_codec_error_detail(ctx); + + printf("%s: %s\n", s, vpx_codec_error(ctx)); + if(detail) + printf(" %s\n",detail); + exit(EXIT_FAILURE); +} + +static int read_frame(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + + to_read = img->w*img->h*3/2; + nbytes = fread(img->planes[0], 1, to_read, f); + if(nbytes != to_read) { + res = 0; + if(nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + } + return res; +} + +static void write_ivf_file_header(FILE *outfile, + const vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + char header[32]; + + if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) + return; + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header+4, 0); /* version */ + mem_put_le16(header+6, 32); /* headersize */ + mem_put_le32(header+8, fourcc); /* headersize */ + mem_put_le16(header+12, cfg->g_w); /* width */ + mem_put_le16(header+14, cfg->g_h); /* height */ + mem_put_le32(header+16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header+20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header+24, frame_cnt); /* length */ + mem_put_le32(header+28, 0); /* unused */ + + if(fwrite(header, 1, 32, outfile)); +} + +static void write_ivf_frame_header(FILE *outfile, + const vpx_codec_cx_pkt_t *pkt) +{ + char header[12]; + vpx_codec_pts_t pts; + + if(pkt->kind != VPX_CODEC_CX_FRAME_PKT) + return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header+4, pts&0xFFFFFFFF); + mem_put_le32(header+8, pts >> 32); + + if(fwrite(header, 1, 12, outfile)); +} + +int main(int argc, char **argv) +{ + FILE *infile, *outfile[NUM_ENCODERS]; + vpx_codec_ctx_t codec[NUM_ENCODERS]; + vpx_codec_enc_cfg_t cfg[NUM_ENCODERS]; + vpx_codec_pts_t frame_cnt = 0; + vpx_image_t raw[NUM_ENCODERS]; + vpx_codec_err_t res[NUM_ENCODERS]; + + int i; + long width; + long height; + int frame_avail; + int got_data; + int flags = 0; + + /*Currently, only realtime mode is supported in multi-resolution encoding.*/ + int arg_deadline = VPX_DL_REALTIME; + + /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you + don't need to know PSNR, which will skip PSNR calculation and save + encoding time. */ + int show_psnr = 0; + uint64_t psnr_sse_total[NUM_ENCODERS] = {0}; + uint64_t psnr_samples_total[NUM_ENCODERS] = {0}; + double psnr_totals[NUM_ENCODERS][4] = {{0,0}}; + int psnr_count[NUM_ENCODERS] = {0}; + + /* Set the required target bitrates for each resolution level. */ + unsigned int target_bitrate[NUM_ENCODERS]={1400, 500, 100}; + /* Enter the frame rate of the input video */ + int framerate = 30; + /* Set down-sampling factor for each resolution level. + dsf[0] controls down sampling from level 0 to level 1; + dsf[1] controls down sampling from level 1 to level 2; + dsf[2] is not used. 
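       For example, with the default factors below ({2, 1} at each step) and a
       1280x720 input, the per-level sizes computed later in this file work out
       to 640x360 for level 1 and 320x180 for level 2, since each level's width
       and height are derived as ceil(previous_size * den / num).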
*/ + vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}}; + + if(argc!= (5+NUM_ENCODERS)) + die("Usage: %s \n", + argv[0]); + + printf("Using %s\n",vpx_codec_iface_name(interface)); + + width = strtol(argv[1], NULL, 0); + height = strtol(argv[2], NULL, 0); + + if(width < 16 || width%2 || height <16 || height%2) + die("Invalid resolution: %ldx%ld", width, height); + + /* Open input video file for encoding */ + if(!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading", argv[3]); + + /* Open output file for each encoder to output bitstreams */ + for (i=0; i< NUM_ENCODERS; i++) + { + if(!(outfile[i] = fopen(argv[i+4], "wb"))) + die("Failed to open %s for writing", argv[i+4]); + } + + show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0); + + /* Populate default encoder configuration */ + for (i=0; i< NUM_ENCODERS; i++) + { + res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0); + if(res[i]) { + printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i])); + return EXIT_FAILURE; + } + } + + /* + * Update the default configuration according to needs of the application. + */ + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].g_threads = 1; /* number of threads used */ + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 4; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 98; + cfg[0].rc_overshoot_pct = 100; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + //cfg[0].rc_dropframe_thresh = 10; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + /* Disable automatic keyframe placement */ + //cfg[0].kf_mode = VPX_KF_DISABLED; + cfg[0].kf_min_dist = cfg[0].kf_max_dist = 1000; + + cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + /* Other-resolution encoder settings */ + for (i=1; i< NUM_ENCODERS; i++) + { + memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t)); + + cfg[i].g_threads = 1; /* number of threads used */ + cfg[i].rc_target_bitrate = target_bitrate[i]; + + /* Note: Width & height of other-resolution encoders are calculated + * from the highest-resolution encoder's size and the corresponding + * down_sampling_factor. + */ + { + unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1; + unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1; + cfg[i].g_w = iw/dsf[i-1].num; + cfg[i].g_h = ih/dsf[i-1].num; + } + + /* Make width & height to be multiplier of 2. */ + // Should support odd size ??? + if((cfg[i].g_w)%2)cfg[i].g_w++; + if((cfg[i].g_h)%2)cfg[i].g_h++; + } + + /* Allocate image for each encoder */ + for (i=0; i< NUM_ENCODERS; i++) + if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 1)) + die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + + for (i=0; i< NUM_ENCODERS; i++) + write_ivf_file_header(outfile[i], &cfg[i], 0); + + /* Initialize multi-encoder */ + if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS, + (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0])) + die_codec(&codec[0], "Failed to initialize encoder"); + + /* The extra encoding configuration parameters can be set as follows. 
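       A minimal illustrative sketch of the pattern (the control IDs are the
       standard ones declared in vpx/vp8cx.h; the values shown are example
       settings, not requirements):

           int speed = -6;
           for (i = 0; i < NUM_ENCODERS; i++)
               if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
                   die_codec(&codec[i], "Failed to set cpu_used");

       Controls such as VP8E_SET_STATIC_THRESHOLD or VP8E_SET_NOISE_SENSITIVITY
       can be applied to each encoder instance in the same way.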
+    /* Allocate an input image for each encoder */
+    for (i=0; i< NUM_ENCODERS; i++)
+        if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 1))
+            die("Failed to allocate %dx%d image", cfg[i].g_w, cfg[i].g_h);
+
+    for (i=0; i< NUM_ENCODERS; i++)
+        write_ivf_file_header(outfile[i], &cfg[i], 0);
+
+    /* Initialize the multi-encoder */
+    if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
+                                (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
+        die_codec(&codec[0], "Failed to initialize encoder");
+
+    /* The extra encoding configuration parameters can be set as follows. */
+    /* Set encoding speed */
+    for ( i=0; i< NUM_ENCODERS; i++)
+    {
+        int speed = -6;   /* real-time speed setting; adjust as needed */
+        if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
+            die_codec(&codec[i], "Failed to set cpu_used");
+    }
+
+    frame_avail = 1;
+    got_data = 0;
+
+    while(frame_avail || got_data)
+    {
+        vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
+        const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
+
+        flags = 0;
+        frame_avail = read_frame(infile, &raw[0]);
+
+        if(frame_avail)
+        {
+            /* Scale the image down a number of times by the down-sampling factor */
+            for ( i=1; i< NUM_ENCODERS; i++)
+            {
+                int src_uvwidth = (raw[i-1].d_w + 1) >> 1;
+                int src_uvheight = (raw[i-1].d_h + 1) >> 1;
+                const unsigned char* src_y = raw[i-1].planes[VPX_PLANE_Y];
+                const unsigned char* src_u = raw[i-1].planes[VPX_PLANE_Y]
+                                             + raw[i-1].d_w*raw[i-1].d_h;
+                const unsigned char* src_v = raw[i-1].planes[VPX_PLANE_Y]
+                                             + raw[i-1].d_w*raw[i-1].d_h
+                                             + src_uvwidth*src_uvheight;
+                int dst_uvwidth = (raw[i].d_w + 1) >> 1;
+                int dst_uvheight = (raw[i].d_h + 1) >> 1;
+                unsigned char* dst_y = raw[i].planes[VPX_PLANE_Y];
+                unsigned char* dst_u = raw[i].planes[VPX_PLANE_Y]
+                                       + raw[i].d_w*raw[i].d_h;
+                unsigned char* dst_v = raw[i].planes[VPX_PLANE_Y]
+                                       + raw[i].d_w*raw[i].d_h
+                                       + dst_uvwidth*dst_uvheight;
+
+                /* FilterMode 1 or 2 gives better psnr than FilterMode 0. */
+                I420Scale(src_y, raw[i-1].d_w, src_u, src_uvwidth, src_v,
+                          src_uvwidth, raw[i-1].d_w, raw[i-1].d_h,
+                          dst_y, raw[i].d_w, dst_u, dst_uvwidth,
+                          dst_v, dst_uvwidth, raw[i].d_w, raw[i].d_h, 1);
+            }
+        }
+
+        /* Encode each frame at all resolution levels */
+        if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+                            frame_cnt, 1, flags, arg_deadline))
+            die_codec(&codec[0], "Failed to encode frame");
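+        /* Note: this is the only vpx_codec_encode() call per input frame.
+         * Because the contexts were created with vpx_codec_enc_init_multi(),
+         * the library walks codec[0..NUM_ENCODERS-1] internally, lowest
+         * resolution first, treating &raw[0] as an array of per-level images,
+         * so each context below still has its own packets to drain.
+         */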
"K":"."); + fflush(stdout); + } + } + frame_cnt++; + } + printf("\n"); + + fclose(infile); + + for (i=0; i< NUM_ENCODERS; i++) + { + printf("Processed %ld frames.\n",(long int)frame_cnt-1); + + /* Calculate PSNR and print it out */ + if ( (show_psnr) && (psnr_count[i]>0) ) + { + int j; + double ovpsnr = vp8_mse2psnr(psnr_samples_total[i], 255.0, + psnr_sse_total[i]); + + fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i); + + fprintf(stderr, " %.3lf", ovpsnr); + for (j = 0; j < 4; j++) + { + fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]); + } + } + + if(vpx_codec_destroy(&codec[i])) + die_codec(&codec[i], "Failed to destroy codec"); + + /* Try to rewrite the file header with the actual frame count */ + if(!fseek(outfile[i], 0, SEEK_SET)) + write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1); + fclose(outfile[i]); + + vpx_img_free(&raw[i]); + } + + return EXIT_SUCCESS; +} diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index a1ff1921e..0703d6a4f 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -56,9 +56,10 @@ * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_INTERNAL_ABI_VERSION (3) /**<\hideinitializer*/ +#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; +typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; /*!\brief init function pointer prototype * @@ -73,7 +74,8 @@ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; * \retval #VPX_CODEC_MEM_ERROR * Memory operation failed. */ -typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx); +typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data); /*!\brief destroy function pointer prototype * @@ -264,6 +266,10 @@ typedef vpx_fixed_buf_t * typedef vpx_image_t * (*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_codec_err_t +(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc); + /*!\brief usage configuration mapping * * This structure stores the mapping between usage identifiers and @@ -309,8 +315,9 @@ struct vpx_codec_iface vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ - vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ } enc; }; @@ -353,9 +360,21 @@ struct vpx_codec_priv unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; } enc; }; +/* + * Multi-resolution encoding internal configuration + */ +struct vpx_codec_priv_enc_mr_cfg +{ + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void* mr_low_res_mode_info; +}; + #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ static typ id##__value(va_list args) {return va_arg(args, typ);} \ diff --git a/vpx/src/vpx_decoder.c 
diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 5d31c2c49..59a783dd9 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -56,7 +56,7 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
 
     if (!(flags & VPX_CODEC_USE_XMA))
     {
-        res = ctx->iface->init(ctx);
+        res = ctx->iface->init(ctx, NULL);
 
         if (res)
         {
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 5e86835ea..bddad23ec 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -51,7 +51,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
         ctx->priv = NULL;
         ctx->init_flags = flags;
         ctx->config.enc = cfg;
-        res = ctx->iface->init(ctx);
+        res = ctx->iface->init(ctx, NULL);
 
         if (res)
         {
@@ -66,6 +66,85 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
 
     return SAVE_STATUS(ctx, res);
 }
 
+vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx,
+                                             vpx_codec_iface_t *iface,
+                                             vpx_codec_enc_cfg_t *cfg,
+                                             int num_enc,
+                                             vpx_codec_flags_t flags,
+                                             vpx_rational_t *dsf,
+                                             int ver)
+{
+    vpx_codec_err_t res = 0;
+
+    if (ver != VPX_ENCODER_ABI_VERSION)
+        res = VPX_CODEC_ABI_MISMATCH;
+    else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1))
+        res = VPX_CODEC_INVALID_PARAM;
+    else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION)
+        res = VPX_CODEC_ABI_MISMATCH;
+    else if (!(iface->caps & VPX_CODEC_CAP_ENCODER))
+        res = VPX_CODEC_INCAPABLE;
+    else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA))
+        res = VPX_CODEC_INCAPABLE;
+    else if ((flags & VPX_CODEC_USE_PSNR)
+             && !(iface->caps & VPX_CODEC_CAP_PSNR))
+        res = VPX_CODEC_INCAPABLE;
+    else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+             && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION))
+        res = VPX_CODEC_INCAPABLE;
+    else
+    {
+        int i;
+        void *mem_loc = NULL;
+
+        if(!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc)))
+        {
+            for (i = 0; i < num_enc; i++)
+            {
+                vpx_codec_priv_enc_mr_cfg_t mr_cfg;
+
+                /* Validate down-sampling factor. */
+                if(dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
+                   dsf->den > dsf->num)
+                {
+                    res = VPX_CODEC_INVALID_PARAM;
+                    break;
+                }
+
+                mr_cfg.mr_low_res_mode_info = mem_loc;
+                mr_cfg.mr_total_resolutions = num_enc;
+                mr_cfg.mr_encoder_id = num_enc-1-i;
+                mr_cfg.mr_down_sampling_factor.num = dsf->num;
+                mr_cfg.mr_down_sampling_factor.den = dsf->den;
+
+                ctx->iface = iface;
+                ctx->name = iface->name;
+                ctx->priv = NULL;
+                ctx->init_flags = flags;
+                ctx->config.enc = cfg;
+                res = ctx->iface->init(ctx, &mr_cfg);
+
+                if (res)
+                {
+                    ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+                    vpx_codec_destroy(ctx);
+                }
+
+                if (ctx->priv)
+                    ctx->priv->iface = ctx->iface;
+
+                if (res)
+                    break;
+
+                ctx++;
+                cfg++;
+                dsf++;
+            }
+        }
+    }
+
+    return SAVE_STATUS(ctx, res);
+}
 
 
 vpx_codec_err_t  vpx_codec_enc_config_default(vpx_codec_iface_t    *iface,
@@ -123,7 +202,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx,
                                  vpx_enc_frame_flags_t flags,
                                  unsigned long deadline)
 {
-    vpx_codec_err_t res;
+    vpx_codec_err_t res = 0;
 
     if (!ctx || (img && !duration))
         res = VPX_CODEC_INVALID_PARAM;
@@ -136,9 +215,36 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx,
         /* Execute in a normalized floating point environment, if the platform
          * requires it.
          */
+        unsigned int num_enc = ctx->priv->enc.total_encoders;
+
         FLOATING_POINT_INIT();
-        res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
-                                     duration, flags, deadline);
+
+        if (num_enc == 1)
+            res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+                                         duration, flags, deadline);
+        else
+        {
+            /* Multi-resolution encoding:
+             * Encode the levels in reverse order. For example,
+             * if mr_total_resolutions = 3, first encode level 2,
+             * then encode level 1, and finally encode level 0.
+             */
+            int i;
+
+            ctx += num_enc - 1;
+            if (img) img += num_enc - 1;
+
+            for (i = num_enc-1; i >= 0; i--)
+            {
+                if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+                                                  duration, flags, deadline)))
+                    break;
+
+                ctx--;
+                if (img) img--;
+            }
+        }
+
         FLOATING_POINT_RESTORE();
     }
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 87ab20c75..885ca229f 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -634,7 +634,6 @@ extern "C" {
          * then ts_layer_id = (0,1,0,1,0,1,0,1).
          */
         unsigned int           ts_layer_id[MAX_PERIODICITY];
-
     } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
 
 
@@ -675,6 +674,48 @@ extern "C" {
         vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION)
 
 
+    /*!\brief Initialize a multi-encoder instance
+     *
+     * Initializes a multi-encoder context using the given interface.
+     * Applications should call the vpx_codec_enc_init_multi convenience macro
+     * instead of this function directly, to ensure that the ABI version number
+     * parameter is properly initialized.
+     *
+     * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags
+     * parameter), the storage pointed to by the cfg parameter must be
+     * kept readable and stable until all memory maps have been set.
+     *
+     * \param[in]    ctx     Pointer to this instance's context.
+     * \param[in]    iface   Pointer to the algorithm interface to use.
+     * \param[in]    cfg     Configuration to use, if known. May be NULL.
+     * \param[in]    num_enc Total number of encoders.
+     * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+     * \param[in]    dsf     Pointer to down-sampling factors.
+     * \param[in]    ver     ABI version number. Must be set to
+     *                       VPX_ENCODER_ABI_VERSION
+     * \retval #VPX_CODEC_OK
+     *     The encoder algorithm initialized.
+     * \retval #VPX_CODEC_MEM_ERROR
+     *     Memory allocation failed.
+     */
+    vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx,
+                                                 vpx_codec_iface_t *iface,
+                                                 vpx_codec_enc_cfg_t *cfg,
+                                                 int num_enc,
+                                                 vpx_codec_flags_t flags,
+                                                 vpx_rational_t *dsf,
+                                                 int ver);
+
+
+    /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver()
+     *
+     * Ensures the ABI version parameter is properly set.
+     */
+#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
+    vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \
+                                 VPX_ENCODER_ABI_VERSION)
+
+
     /*!\brief Get a default configuration
      *
      * Initializes an encoder configuration structure with default values. Supports
@@ -780,7 +821,6 @@ extern "C" {
                                      vpx_enc_frame_flags_t     flags,
                                      unsigned long             deadline);
 
-
     /*!\brief Set compressed data output buffer
      *
      * Sets the buffer that the codec should output the compressed data