From: Deb Mukherjee Date: Sat, 17 May 2014 01:52:01 +0000 (-0700) Subject: Updates libyuv to version 1005 X-Git-Tag: v1.4.0~1529^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=47031c0a54b2054c98066919dbd1e71c60b3c4f4;p=libvpx Updates libyuv to version 1005 Also adds compile check and a libyuv configure flag Change-Id: Ib9f0f4a71c4083e6f0aea7b5a5d175531ef0f66b --- diff --git a/configure b/configure index cc4ca029f..b6d645ae5 100755 --- a/configure +++ b/configure @@ -52,6 +52,7 @@ Advanced options: ${toggle_multi_res_encoding} enable multiple-resolution encoding ${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser ${toggle_webm_io} enable input from and output to WebM container + ${toggle_libyuv} enable libyuv Codecs: Codecs can be selectively enabled or disabled individually, or by family: @@ -315,6 +316,7 @@ CONFIG_LIST=" os_support unit_tests webm_io + libyuv decode_perf_tests multi_res_encoding temporal_denoising @@ -368,6 +370,7 @@ CMDLINE_SELECT=" postproc_visualizer unit_tests webm_io + libyuv decode_perf_tests multi_res_encoding temporal_denoising @@ -709,9 +712,11 @@ process_toolchain() { *-vs*) soft_enable unit_tests soft_enable webm_io + soft_enable libyuv ;; *-android-*) soft_enable webm_io + soft_enable libyuv # GTestLog must be modified to use Android logging utilities. ;; *-darwin-*) @@ -727,6 +732,9 @@ int z; EOF check_cxx "$@" < // for NULL, size_t -#if !(defined(_MSC_VER) && (_MSC_VER < 1600)) +#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600)) +#include // for uintptr_t on x86 +#else #include // for uintptr_t #endif +#ifndef GG_LONGLONG #ifndef INT_TYPES_DEFINED #define INT_TYPES_DEFINED #ifdef COMPILER_MSVC @@ -30,9 +33,9 @@ typedef __int64 int64; #endif #define INT64_F "I64" #else // COMPILER_MSVC -#ifdef __LP64__ -typedef unsigned long uint64; -typedef long int64; +#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long uint64; // NOLINT +typedef long int64; // NOLINT #ifndef INT64_C #define INT64_C(x) x ## L #endif @@ -40,9 +43,9 @@ typedef long int64; #define UINT64_C(x) x ## UL #endif #define INT64_F "l" -#else // __LP64__ -typedef unsigned long long uint64; -typedef long long int64; +#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long long uint64; // NOLINT +typedef long long int64; // NOLINT #ifndef INT64_C #define INT64_C(x) x ## LL #endif @@ -54,20 +57,62 @@ typedef long long int64; #endif // COMPILER_MSVC typedef unsigned int uint32; typedef int int32; -typedef unsigned short uint16; -typedef short int16; +typedef unsigned short uint16; // NOLINT +typedef short int16; // NOLINT typedef unsigned char uint8; -typedef char int8; +typedef signed char int8; #endif // INT_TYPES_DEFINED +#endif // GG_LONGLONG // Detect compiler is for x86 or x64. #if defined(__x86_64__) || defined(_M_X64) || \ defined(__i386__) || defined(_M_IX86) #define CPU_X86 1 #endif +// Detect compiler is for ARM. +#if defined(__arm__) || defined(_M_ARM) +#define CPU_ARM 1 +#endif +#ifndef ALIGNP +#ifdef __cplusplus +#define ALIGNP(p, t) \ + (reinterpret_cast(((reinterpret_cast(p) + \ + ((t) - 1)) & ~((t) - 1)))) +#else #define ALIGNP(p, t) \ - ((uint8*)((((uintptr_t)(p) + \ - ((t)-1)) & ~((t)-1)))) + ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ +#endif +#endif + +#if !defined(LIBYUV_API) +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllexport) +#elif defined(LIBYUV_USING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllimport) +#else +#define LIBYUV_API +#endif // LIBYUV_BUILDING_SHARED_LIBRARY +#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__ ((visibility ("default"))) +#else +#define LIBYUV_API +#endif // __GNUC__ +#endif // LIBYUV_API + +#define LIBYUV_BOOL int +#define LIBYUV_FALSE 0 +#define LIBYUV_TRUE 1 + +// Visual C x86 or GCC little endian. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LIBYUV_LITTLE_ENDIAN +#endif -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h index 4a53b5bef..fd6276b49 100644 --- a/third_party/libyuv/include/libyuv/cpu_id.h +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -1,49 +1,81 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may + * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT #define INCLUDE_LIBYUV_CPU_ID_H_ +#include "basic_types.h" + #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// These flags are only valid on x86 processors -static const int kCpuHasSSE2 = 1; -static const int kCpuHasSSSE3 = 2; +// TODO(fbarchard): Consider overlapping bits for different architectures. +// Internal flag to indicate cpuid requires initialization. +#define kCpuInit 0x1 + +// These flags are only valid on ARM processors. +static const int kCpuHasARM = 0x2; +static const int kCpuHasNEON = 0x4; +// 0x8 reserved for future ARM flag. + +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x20; +static const int kCpuHasSSSE3 = 0x40; +static const int kCpuHasSSE41 = 0x80; +static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasAVX = 0x200; +static const int kCpuHasAVX2 = 0x400; +static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. -// These flags are only valid on ARM processors -static const int kCpuHasNEON = 4; +// These flags are only valid on MIPS processors. +static const int kCpuHasMIPS = 0x10000; +static const int kCpuHasMIPS_DSP = 0x20000; +static const int kCpuHasMIPS_DSPR2 = 0x40000; -// Internal flag to indicate cpuid is initialized. -static const int kCpuInitialized = 8; +// Internal function used to auto-init. +LIBYUV_API +int InitCpuFlags(void); + +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name); // Detect CPU has SSE2 etc. -// test_flag parameter should be one of kCpuHas constants above +// Test_flag parameter should be one of kCpuHas constants above. // returns non-zero if instruction set is detected static __inline int TestCpuFlag(int test_flag) { - extern int cpu_info_; - extern int InitCpuFlags(); - return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag; + LIBYUV_API extern int cpu_info_; + return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag; } // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -// -1 to enable all cpu specific optimizations. -// 0 to disable all cpu specific optimizations. +// MaskCpuFlags(-1) to enable all cpu specific optimizations. +// MaskCpuFlags(0) to disable all cpu specific optimizations. +LIBYUV_API void MaskCpuFlags(int enable_flags); +// Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); + #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CPU_ID_H_ +#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h new file mode 100644 index 000000000..43f8df36d --- /dev/null +++ b/third_party/libyuv/include/libyuv/planar_functions.h @@ -0,0 +1,439 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "basic_types.h" + +// TODO(fbarchard): Remove the following headers includes. +// #include "convert.h" +// #include "convert_argb.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy a plane of data. +LIBYUV_API +void CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height); + +// Set a plane of data to a 32 bit value. +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value); + +// Copy I400. Supports inverting. +LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + + +// Copy I422 to I422. +#define I422ToI422 I422Copy +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I444 to I444. +#define I444ToI444 I444Copy +LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define I420ToI420Mirror I420Mirror + +// I420 mirror. +LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Alias +#define I400ToI400Mirror I400Mirror + +// I400 mirror. A single plane is mirrored horizontally. +// Pass negative height to achieve 180 degree rotation. +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define ARGBToARGBMirror ARGBMirror + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert NV21 to RGB565. +LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, int width, int height, + int value_y, int value_u, int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height, uint32 value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. +// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma_rgb_table, + int width, int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4rd row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be dirived using software such as 'R'. + +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int x, int y, int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Add ARGB image with ARGB image. Saturates to 255. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I422 to UYVY. +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert unattentuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert MJPG to ARGB. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, int dw, int dh); + +// Internal function - do not call directly. +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height); + +// Blur ARGB image. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius); + +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value); + +// Interpolate between two ARGB images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 +// and 255 means 1% src_argb0 and 99% src_argb1. +// Internally uses ARGBScale bilinear filtering. +// Caveat: This function will write up to 16 bytes beyond the end of dst_argb. +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation); + +#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) +#define LIBYUV_DISABLE_X86 +#endif + +// Row functions for copying a pixels from a source with a slope to a row +// of destination. Useful for scaling, rotation, mirror, texture mapping. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +#define HAS_ARGBAFFINEROW_SSE2 +#endif // LIBYUV_DISABLE_X86 + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +// shuffler is 16 bytes and must be aligned. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height); + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. +LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h new file mode 100644 index 000000000..daf5a45e1 --- /dev/null +++ b/third_party/libyuv/include/libyuv/row.h @@ -0,0 +1,1704 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROW_H_ + +#include // For malloc. + +#include "basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#ifdef __cplusplus +#define align_buffer_64(var, size) \ + uint8* var##_mem = reinterpret_cast(malloc((size) + 63)); \ + uint8* var = reinterpret_cast \ + ((reinterpret_cast(var##_mem) + 63) & ~63) +#else +#define align_buffer_64(var, size) \ + uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ + uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ +#endif + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + +#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) +#define LIBYUV_DISABLE_X86 +#endif +// True if compiling for SSSE3 as a requirement. +#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) +#define LIBYUV_SSSE3_ONLY +#endif + +// Enable for NaCL pepper 33 for bundle and AVX2 support. +// #define NEW_BINUTILS + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSSE3 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 + +// Conversions: +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGB1555TOARGBROW_SSE2 +#define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSSE3 +#define HAS_ARGBTOARGB1555ROW_SSE2 +#define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTOBAYERGGROW_SSE2 +#define HAS_ARGBTOBAYERROW_SSSE3 +#define HAS_ARGBTORAWROW_SSSE3 +#define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTOUV422ROW_SSSE3 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS +#define HAS_COPYROW_SSE2 +#define HAS_COPYROW_X86 +#define HAS_HALFROW_SSE2 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOARGB4444ROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#define HAS_I422TOBGRAROW_SSSE3 +#define HAS_I422TORAWROW_SSSE3 +#define HAS_I422TORGB24ROW_SSSE3 +#define HAS_I422TORGB565ROW_SSSE3 +#define HAS_I422TORGBAROW_SSSE3 +#define HAS_I422TOUYVYROW_SSE2 +#define HAS_I422TOYUY2ROW_SSE2 +#define HAS_I444TOARGBROW_SSSE3 +#define HAS_MERGEUVROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORROW_UV_SSSE3 +#define HAS_MIRRORUVROW_SSSE3 +#define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB565ROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB565ROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RGB565TOARGBROW_SSE2 +#define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 +#define HAS_SETROW_X86 +#define HAS_SPLITUVROW_SSE2 +#define HAS_UYVYTOARGBROW_SSSE3 +#define HAS_UYVYTOUV422ROW_SSE2 +#define HAS_UYVYTOUVROW_SSE2 +#define HAS_UYVYTOYROW_SSE2 +#define HAS_YTOARGBROW_SSE2 +#define HAS_YUY2TOARGBROW_SSSE3 +#define HAS_YUY2TOUV422ROW_SSE2 +#define HAS_YUY2TOUVROW_SSE2 +#define HAS_YUY2TOYROW_SSE2 +#endif + +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +// Effects: +#define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBCOPYALPHAROW_AVX2 +#define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#endif + +// The following are require VS2012. +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_ARGBTOUVROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#define HAS_HALFROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 +#define HAS_MERGEUVROW_AVX2 +#define HAS_MIRRORROW_AVX2 +#define HAS_SPLITUVROW_AVX2 +#define HAS_UYVYTOUV422ROW_AVX2 +#define HAS_UYVYTOUVROW_AVX2 +#define HAS_UYVYTOYROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 +#define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOYROW_AVX2 + +// Effects: +#define HAS_ARGBADDROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBMULTIPLYROW_AVX2 +#define HAS_ARGBSUBTRACTROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 +#endif // defined(VISUALC_HAS_AVX2) + +// The following are Yasm x86 only: +// TODO(fbarchard): Port AVX2 to inline. +#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) + (defined(_M_IX86) || defined(_M_X64) || \ + defined(__x86_64__) || defined(__i386__)) +#define HAS_MERGEUVROW_AVX2 +#define HAS_MERGEUVROW_MMX +#define HAS_SPLITUVROW_AVX2 +#define HAS_SPLITUVROW_MMX +#define HAS_UYVYTOYROW_AVX2 +#define HAS_UYVYTOYROW_MMX +#define HAS_YUY2TOYROW_AVX2 +#define HAS_YUY2TOYROW_MMX +#endif + +// The following are disabled when SSSE3 is available: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_SSSE3_ONLY) +#define HAS_ARGBBLENDROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_ARGB1555TOARGBROW_NEON +#define HAS_ARGB1555TOUVROW_NEON +#define HAS_ARGB1555TOYROW_NEON +#define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGB4444TOUVROW_NEON +#define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBTOARGB1555ROW_NEON +#define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOUV411ROW_NEON +#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOUV444ROW_NEON +#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOUVJROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOYJROW_NEON +#define HAS_BGRATOUVROW_NEON +#define HAS_BGRATOYROW_NEON +#define HAS_COPYROW_NEON +#define HAS_HALFROW_NEON +#define HAS_I400TOARGBROW_NEON +#define HAS_I411TOARGBROW_NEON +#define HAS_I422TOABGRROW_NEON +#define HAS_I422TOARGB1555ROW_NEON +#define HAS_I422TOARGB4444ROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TOBGRAROW_NEON +#define HAS_I422TORAWROW_NEON +#define HAS_I422TORGB24ROW_NEON +#define HAS_I422TORGB565ROW_NEON +#define HAS_I422TORGBAROW_NEON +#define HAS_I422TOUYVYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I444TOARGBROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB565ROW_NEON +#define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RGB565TOARGBROW_NEON +#define HAS_RGB565TOUVROW_NEON +#define HAS_RGB565TOYROW_NEON +#define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_SETROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_UYVYTOARGBROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YTOARGBROW_NEON +#define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON + +// Effects: +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELYROW_NEON +#define HAS_INTERPOLATEROW_NEON +// TODO(fbarchard): Investigate neon unittest failure. +// #define HAS_ARGBCOLORMATRIXROW_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) +#define HAS_COPYROW_MIPS +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_I422TOABGRROW_MIPS_DSPR2 +#define HAS_I422TOARGBROW_MIPS_DSPR2 +#define HAS_I422TOBGRAROW_MIPS_DSPR2 +#define HAS_INTERPOLATEROWS_MIPS_DSPR2 +#define HAS_MIRRORROW_MIPS_DSPR2 +#define HAS_MIRRORUVROW_MIPS_DSPR2 +#define HAS_SPLITUVROW_MIPS_DSPR2 +#endif +#endif + +#if defined(_MSC_VER) && !defined(__CLR_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) int16 vec16[8]; +typedef __declspec(align(16)) int32 vec32[4]; +typedef __declspec(align(16)) int8 vec8[16]; +typedef __declspec(align(16)) uint16 uvec16[8]; +typedef __declspec(align(16)) uint32 uvec32[4]; +typedef __declspec(align(16)) uint8 uvec8[16]; +typedef __declspec(align(32)) int16 lvec16[16]; +typedef __declspec(align(32)) int32 lvec32[8]; +typedef __declspec(align(32)) int8 lvec8[32]; +typedef __declspec(align(32)) uint16 ulvec16[16]; +typedef __declspec(align(32)) uint32 ulvec32[8]; +typedef __declspec(align(32)) uint8 ulvec8[32]; + +#elif defined(__GNUC__) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef int16 __attribute__((vector_size(16))) vec16; +typedef int32 __attribute__((vector_size(16))) vec32; +typedef int8 __attribute__((vector_size(16))) vec8; +typedef uint16 __attribute__((vector_size(16))) uvec16; +typedef uint32 __attribute__((vector_size(16))) uvec32; +typedef uint8 __attribute__((vector_size(16))) uvec8; +#else +#define SIMD_ALIGNED(var) var +typedef int16 vec16[8]; +typedef int32 vec32[4]; +typedef int8 vec8[16]; +typedef uint16 uvec16[8]; +typedef uint32 uvec32[4]; +typedef uint8 uvec8[16]; +#endif + +#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif + +// NaCL macros for GCC x86 and x64. + +// TODO(nfullagar): When pepper_33 toolchain is distributed, default to +// NEW_BINUTILS and remove all BUNDLEALIGN occurances. +#if defined(__native_client__) +#define LABELALIGN ".p2align 5\n" +#else +#define LABELALIGN ".p2align 2\n" +#endif +#if defined(__native_client__) && defined(__x86_64__) +#if defined(NEW_BINUTILS) +#define BUNDLELOCK ".bundle_lock\n" +#define BUNDLEUNLOCK ".bundle_unlock\n" +#define BUNDLEALIGN "\n" +#else +#define BUNDLELOCK "\n" +#define BUNDLEUNLOCK "\n" +#define BUNDLEALIGN ".p2align 5\n" +#endif +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" +#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" \ + BUNDLEUNLOCK +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" \ + BUNDLEUNLOCK +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%" #arg "\n" \ + BUNDLEUNLOCK +#else +#define BUNDLEALIGN "\n" +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMMOVESTRING(s, d) +#define MEMSTORESTRING(reg, d) +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#endif + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width); +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width); +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_rgb565, + int width); +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); + +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); + +void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width); +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width); +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width); +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV422Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width); +void MirrorRow_C(const uint8* src, uint8* dst, int width); + +void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, + uint8* dst_v, int pix); +void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, + uint8* dst_uv, int width); +void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); + +void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_ERMS(const uint8* src, uint8* dst, int count); +void CopyRow_X86(const uint8* src, uint8* dst, int count); +void CopyRow_NEON(const uint8* src, uint8* dst, int count); +void CopyRow_MIPS(const uint8* src, uint8* dst, int count); +void CopyRow_C(const uint8* src, uint8* dst, int count); + +void CopyRow_16_C(const uint16* src, uint16* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); + +void SetRow_X86(uint8* dst, uint32 v32, int count); +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow_NEON(uint8* dst, uint32 v32, int count); +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow_C(uint8* dst, uint32 v32, int count); +void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, + int height); + +// ARGBShufflers for BGRAToARGB etc. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); + +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); + +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); + +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void YToARGBRow_C(const uint8* src_y, + uint8* dst_argb, + int width); +void I422ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +// RGB24/RAW are unaligned. +void I422ToRGB24Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); + +void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +// RGB24/RAW are unaligned. +void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void YToARGBRow_SSE2(const uint8* src_y, + uint8* dst_argb, + int width); +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width); +void YToARGBRow_Any_SSE2(const uint8* src_y, + uint8* dst_argb, + int width); +void YToARGBRow_Any_NEON(const uint8* src_y, + uint8* dst_argb, + int width); + +// ARGB preattenuated alpha blend. +void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB multiply images. Same API as Blend, but these require +// pointer and width alignment for SSE2. +void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB add images. +void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB subtract images. Same API as Blend, but these require +// pointer and width alignment for SSE2. +void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + +void I444ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB4444Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); + +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix); +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix); +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_NEON(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + +void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); +void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); +void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); + +void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix); + +void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); + +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_Any_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_Any_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); + +// Effects related row functions. +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + int width); + +// Inverse table for unattenuate, shared by C and SSE2. +extern const uint32 fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBSepiaRow_C(uint8* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); +void ARGBSepiaRow_NEON(uint8* dst_argb, int width); + +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); + +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); + +// Used for blur. +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); + +// Used for I420Scale, ARGBScale, and ARGBInterpolate. +void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); + +// Sobel images. +void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width); +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h index 35d0ff55c..973d46457 100644 --- a/third_party/libyuv/include/libyuv/scale.h +++ b/third_party/libyuv/include/libyuv/scale.h @@ -1,30 +1,45 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may + * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_H_ +#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT #define INCLUDE_LIBYUV_SCALE_H_ -#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// Supported filtering -typedef enum { - kFilterNone = 0, // Point sample; Fastest - kFilterBilinear = 1, // Faster than box, but lower quality scaling down. - kFilterBox = 2 // Highest quality +// Supported filtering. +typedef enum FilterMode { + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. } FilterModeEnum; +// Scale a YUV plane. +LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering); + +void ScalePlane_16(const uint16* src, int src_stride, + int src_width, int src_height, + uint16* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering); + // Scales a YUV 4:2:0 image from the src width and height to the // dst width and height. // If filtering is kFilterNone, a simple nearest-neighbor algorithm is @@ -35,6 +50,7 @@ typedef enum { // quality image, at further expense of speed. // Returns 0 if successful. +LIBYUV_API int I420Scale(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -43,28 +59,44 @@ int I420Scale(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int dst_width, int dst_height, - FilterModeEnum filtering); + enum FilterMode filtering); -// Legacy API. Deprecated +LIBYUV_API +int I420Scale_16(const uint16* src_y, int src_stride_y, + const uint16* src_u, int src_stride_u, + const uint16* src_v, int src_stride_v, + int src_width, int src_height, + uint16* dst_y, int dst_stride_y, + uint16* dst_u, int dst_stride_u, + uint16* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering); + +#ifdef __cplusplus +// Legacy API. Deprecated. +LIBYUV_API int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int src_stride_y, int src_stride_u, int src_stride_v, int src_width, int src_height, uint8* dst_y, uint8* dst_u, uint8* dst_v, int dst_stride_y, int dst_stride_u, int dst_stride_v, int dst_width, int dst_height, - int interpolate); + LIBYUV_BOOL interpolate); -// Legacy API. Deprecated -int ScaleOffset(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int dst_yoffset, - int interpolate); +// Legacy API. Deprecated. +LIBYUV_API +int ScaleOffset(const uint8* src_i420, int src_width, int src_height, + uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset, + LIBYUV_BOOL interpolate); -// For testing, allow disabling of optimizations. -void SetUseReferenceImpl(int use); +// For testing, allow disabling of specialized scalers. +LIBYUV_API +void SetUseReferenceImpl(LIBYUV_BOOL use); +#endif // __cplusplus #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_H_ +#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h new file mode 100644 index 000000000..5d91f8f76 --- /dev/null +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -0,0 +1,341 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) +#define LIBYUV_DISABLE_X86 +#endif + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN4_SSE2 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEADDROWS_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#define HAS_FIXEDDIV_X86 +#define HAS_FIXEDDIV1_X86 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_SCALEROWDOWN2_MIPS_DSPR2 +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering); + +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering); + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div); +int FixedDiv1_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#else +#define FixedDiv FixedDiv_C +#define FixedDiv1 FixedDiv1_C +#endif + +// Compute slope values for stepping. +void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy); + +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int); +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int, int); +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height); +void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint32* dst_ptr, int src_width, int src_height); +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int); +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height); +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +// Row functions. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +// ScaleRowDown2Box also used by planar functions +// NEON downscalers with interpolation. + +// Note - not static due to reuse in convert for 444 to 420. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT diff --git a/third_party/libyuv/source/cpu_id.c b/third_party/libyuv/source/cpu_id.c deleted file mode 100644 index fccf3dd44..000000000 --- a/third_party/libyuv/source/cpu_id.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "third_party/libyuv/include/libyuv/cpu_id.h" - -#ifdef _MSC_VER -#include -#endif -#ifdef __ANDROID__ -#include -#endif - -#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86 - -// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. -#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) - ); -} -#elif defined(__i386__) || defined(__x86_64__) -static inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) - ); -} -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// CPU detect function for SIMD instruction sets. -int cpu_info_ = 0; - -int InitCpuFlags() { -#ifdef CPU_X86 - int cpu_info[4]; - __cpuid(cpu_info, 1); - cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) | - (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | - kCpuInitialized; -#elif defined(__ANDROID__) && defined(__ARM_NEON__) - uint64_t features = android_getCpuFeatures(); - cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) | - kCpuInitialized; -#elif defined(__ARM_NEON__) - // gcc -mfpu=neon defines __ARM_NEON__ - // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags - // to disable Neon on devices that do not have it. - cpu_info_ = kCpuHasNEON | kCpuInitialized; -#else - cpu_info_ = kCpuInitialized; -#endif - return cpu_info_; -} - -void MaskCpuFlags(int enable_flags) { - InitCpuFlags(); - cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc new file mode 100644 index 000000000..520cfe510 --- /dev/null +++ b/third_party/libyuv/source/cpu_id.cc @@ -0,0 +1,283 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +#ifdef _MSC_VER +#include // For __cpuidex() +#endif +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(__native_client__) && defined(_M_X64) && \ + defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#include // For _xgetbv() +#endif + +#if !defined(__native_client__) +#include // For getenv() +#endif + +// For ArmCpuCaps() but unittested on all platforms +#include +#include + +#include "third_party/libyuv/include/libyuv/basic_types.h" // For CPU_X86 + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// For functions that use the stack and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#define SAFEBUFFERS __declspec(safebuffers) +#else +#define SAFEBUFFERS +#endif + +// Low level cpuid for X86. Returns zeros on other CPUs. +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(_M_IX86) || defined(_M_X64) || \ + defined(__i386__) || defined(__x86_64__)) +LIBYUV_API +void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +#if defined(_MSC_VER) +#if (_MSC_FULL_VER >= 160040219) + __cpuidex((int*)(cpu_info), info_eax, info_ecx); +#elif defined(_M_IX86) + __asm { + mov eax, info_eax + mov ecx, info_ecx + mov edi, cpu_info + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + } +#else + if (info_ecx == 0) { + __cpuid((int*)(cpu_info), info_eax); + } else { + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + } +#endif +#else // defined(_MSC_VER) + uint32 info_ebx, info_edx; + asm volatile ( // NOLINT +#if defined( __i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D" (info_ebx), +#else + "cpuid \n" + : "=b" (info_ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + cpu_info[0] = info_eax; + cpu_info[1] = info_ebx; + cpu_info[2] = info_ecx; + cpu_info[3] = info_edx; +#endif // defined(_MSC_VER) +} + +#if !defined(__native_client__) +#define HAS_XGETBV +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int TestOsSaveYmm() { + uint32 xcr0 = 0u; +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) + xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. +#elif defined(_M_IX86) + __asm { + xor ecx, ecx // xcr 0 + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. + mov xcr0, eax + } +#elif defined(__i386__) || defined(__x86_64__) + asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); +#endif // defined(_MSC_VER) + return((xcr0 & 6) == 6); // Is ymm saved? +} +#endif // !defined(__native_client__) +#else +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// based on libvpx arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API SAFEBUFFERS +int ArmCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // Assume Neon if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasNEON; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + } + } + fclose(f); + return 0; +} + +#if defined(__mips__) && defined(__linux__) +static int MipsCpuCaps(const char* search_string) { + char cpuinfo_line[512]; + const char* file_name = "/proc/cpuinfo"; + FILE* f = fopen(file_name, "r"); + if (!f) { + // Assume DSP if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasMIPS_DSP; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + return kCpuHasMIPS_DSP; + } + } + fclose(f); + return 0; +} +#endif + +// CPU detect function for SIMD instruction sets. +LIBYUV_API +int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. + +// Test environment variable for disabling CPU features. Any non-zero value +// to disable. Zero ignored to make it easy to set the variable on/off. +#if !defined(__native_client__) && !defined(_M_ARM) + +static LIBYUV_BOOL TestEnv(const char* name) { + const char* var = getenv(name); + if (var) { + if (var[0] != '0') { + return LIBYUV_TRUE; + } + } + return LIBYUV_FALSE; +} +#else // nacl does not support getenv(). +static LIBYUV_BOOL TestEnv(const char*) { + return LIBYUV_FALSE; +} +#endif + +LIBYUV_API SAFEBUFFERS +int InitCpuFlags(void) { +#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) + + uint32 cpu_info1[4] = { 0, 0, 0, 0 }; + uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + CpuId(1, 0, cpu_info1); + CpuId(7, 0, cpu_info7); + cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + kCpuHasX86; +#ifdef HAS_XGETBV + if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave + TestOsSaveYmm()) { // Saves YMM. + cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + kCpuHasAVX; + } +#endif + // Environment variable overrides for testing. + if (TestEnv("LIBYUV_DISABLE_X86")) { + cpu_info_ &= ~kCpuHasX86; + } + if (TestEnv("LIBYUV_DISABLE_SSE2")) { + cpu_info_ &= ~kCpuHasSSE2; + } + if (TestEnv("LIBYUV_DISABLE_SSSE3")) { + cpu_info_ &= ~kCpuHasSSSE3; + } + if (TestEnv("LIBYUV_DISABLE_SSE41")) { + cpu_info_ &= ~kCpuHasSSE41; + } + if (TestEnv("LIBYUV_DISABLE_SSE42")) { + cpu_info_ &= ~kCpuHasSSE42; + } + if (TestEnv("LIBYUV_DISABLE_AVX")) { + cpu_info_ &= ~kCpuHasAVX; + } + if (TestEnv("LIBYUV_DISABLE_AVX2")) { + cpu_info_ &= ~kCpuHasAVX2; + } + if (TestEnv("LIBYUV_DISABLE_ERMS")) { + cpu_info_ &= ~kCpuHasERMS; + } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info_ &= ~kCpuHasFMA3; + } +#elif defined(__mips__) && defined(__linux__) + // Linux mips parse text file for dsp detect. + cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. +#if defined(__mips_dspr2) + cpu_info_ |= kCpuHasMIPS_DSPR2; +#endif + cpu_info_ |= kCpuHasMIPS; + + if (getenv("LIBYUV_DISABLE_MIPS")) { + cpu_info_ &= ~kCpuHasMIPS; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSP")) { + cpu_info_ &= ~kCpuHasMIPS_DSP; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { + cpu_info_ &= ~kCpuHasMIPS_DSPR2; + } +#elif defined(__arm__) +// gcc -mfpu=neon defines __ARM_NEON__ +// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. +// For Linux, /proc/cpuinfo can be tested but without that assume Neon. +#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) + cpu_info_ = kCpuHasNEON; +#else + // Linux arm parse text file for neon detect. + cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); +#endif + cpu_info_ |= kCpuHasARM; + if (TestEnv("LIBYUV_DISABLE_NEON")) { + cpu_info_ &= ~kCpuHasNEON; + } +#endif // __arm__ + if (TestEnv("LIBYUV_DISABLE_ASM")) { + cpu_info_ = 0; + } + return cpu_info_; +} + +LIBYUV_API +void MaskCpuFlags(int enable_flags) { + cpu_info_ = InitCpuFlags() & enable_flags; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc new file mode 100644 index 000000000..68b8f46e4 --- /dev/null +++ b/third_party/libyuv/source/planar_functions.cc @@ -0,0 +1,2287 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/planar_functions.h" + +#include // for memset() + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "third_party/libyuv/include/libyuv/mjpeg_decoder.h" +#endif +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy a plane of data +LIBYUV_API +void CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + // Coalesce rows. + if (src_stride_y == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + // Coalesce rows. + if (src_stride_y == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_COPYROW_16_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_16_X86; + } +#endif +#if defined(HAS_COPYROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_16_SSE2; + } +#endif +#if defined(HAS_COPYROW_16_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_16_ERMS; + } +#endif +#if defined(HAS_COPYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_NEON; + } +#endif +#if defined(HAS_COPYROW_16_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_16_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I422. +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I444. +LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I400. +LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Convert I420 to I400. +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror a plane of data. +void MirrorPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + MirrorRow = MirrorRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = + YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUV422Row)(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = + UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUV422Row = UYVYToUV422Row_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width >= 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Mirror I400 with optional flipping +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror I420 with optional flipping +LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + +#if defined(HAS_ARGBMIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBMirrorRow = ARGBMirrorRow_SSSE3; + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } +#endif +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + ARGBMirrorRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Get a blender that optimized for the CPU, alignment and pixel count. +// As there are 6 blenders to choose from, the caller should try to use +// the same blend function for all pixels if possible. +LIBYUV_API +ARGBBlendRow GetARGBBlend() { + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = ARGBBlendRow_C; +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + return ARGBBlendRow; + } +#endif +#if defined(HAS_ARGBBLENDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBBlendRow = ARGBBlendRow_SSE2; + } +#endif +#if defined(HAS_ARGBBLENDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBBlendRow = ARGBBlendRow_NEON; + } +#endif + return ARGBBlendRow; +} + +// Alpha Blend 2 ARGB images and store to destination. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = GetARGBBlend(); + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } + + for (y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply 2 ARGB images and store to destination. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBMultiplyRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } + } +#endif + + // Multiply plane + for (y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Add 2 ARGB images and store to destination. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBAddRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_SSE2; + } +#endif +#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBAddRow = ARGBAddRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBAddRow = ARGBAddRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBAddRow = ARGBAddRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_NEON; + } + } +#endif + + // Add plane + for (y = 0; y < height; ++y) { + ARGBAddRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Subtract 2 ARGB images and store to destination. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBSubtractRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif + + // Subtract plane + for (y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + int y; + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; + if (!src_y || !src_u || !src_v || + !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_bgra == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; + } +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#elif defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } + } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || + !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; + } +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#elif defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || + !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_rgba == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; + } +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#elif defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV12ToRGB565Row)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; + } + } +#elif defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB565. +LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV21ToRGB565Row)(const uint8* y_buf, + const uint8* src_vu, + uint8* rgb_buf, + int width) = NV21ToRGB565Row_C; + if (!src_y || !src_vu || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV21TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; + } + } +#elif defined(HAS_NV21TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value) { + int y; + uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); + void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C; + // Coalesce rows. + if (dst_stride_y == width) { + width *= height; + height = 1; + dst_stride_y = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + SetRow = SetRow_NEON; + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + SetRow(dst_y, v32, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, + int width, int height, + int value_y, int value_u, int value_v) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8* start_y = dst_y + y * dst_stride_y + x; + uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || + width <= 0 || height <= 0 || + x < 0 || y < 0 || + value_y < 0 || value_y > 255 || + value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height, + uint32 value) { + if (!dst_argb || + width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + dst_argb += dst_y * dst_stride_argb + dst_x * 4; + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); + return 0; + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); + return 0; + } +#endif + ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height); + return 0; +} + +// Convert unattentuated ARGB to preattenuated ARGB. +// An unattenutated ARGB alpha blend uses the formula +// p = a * f + (1 - a) * b +// where +// p is output pixel +// f is foreground pixel +// b is background pixel +// a is alpha value from foreground pixel +// An preattenutated ARGB alpha blend uses the formula +// p = f + (1 - a) * b +// where +// f is foreground pixel premultiplied by alpha + +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBAttenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBUnattenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBUNATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBUNATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; + } + } +#endif +// TODO(fbarchard): Neon version. + + for (y = 0; y < height; ++y) { + ARGBUnattenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB to Grayed ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#elif defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#elif defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBSEPIAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSepiaRow = ARGBSepiaRow_SSSE3; + } +#elif defined(HAS_ARGBSEPIAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBSepiaRow(dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height) { + int y; + void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; + } +#elif defined(HAS_ARGBCOLORMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. +LIBYUV_API +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int dst_x, int dst_y, int width, int height) { + SIMD_ALIGNED(int8 matrix_argb[16]); + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. + matrix_argb[0] = matrix_rgb[0] / 2; + matrix_argb[1] = matrix_rgb[1] / 2; + matrix_argb[2] = matrix_rgb[2] / 2; + matrix_argb[3] = matrix_rgb[3] / 2; + matrix_argb[4] = matrix_rgb[4] / 2; + matrix_argb[5] = matrix_rgb[5] / 2; + matrix_argb[6] = matrix_rgb[6] / 2; + matrix_argb[7] = matrix_rgb[7] / 2; + matrix_argb[8] = matrix_rgb[8] / 2; + matrix_argb[9] = matrix_rgb[9] / 2; + matrix_argb[10] = matrix_rgb[10] / 2; + matrix_argb[11] = matrix_rgb[11] / 2; + matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; + matrix_argb[15] = 64; // 1.0 + + return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, + dst, dst_stride_argb, + &matrix_argb[0], width, height); +} + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = ARGBColorTableRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBColorTableRow = ARGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = RGBColorTableRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_RGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + RGBColorTableRow = RGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + RGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// ARGBQuantize is used to posterize art. +// e.g. rgb / qvalue * qvalue + qvalue / 2 +// But the low levels implement efficiently with 3 parameters, and could be +// used for other high level operations. +// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; +// where scale is 1 / interval_size as a fixed point value. +// The divide is replaces with a multiply by reciprocal fixed point multiply. +// Caveat - although SSE2 saturates, the C function does not and should be used +// with care if doing anything but quantization. +LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) = ARGBQuantizeRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || + interval_size < 1 || interval_size > 255) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBQUANTIZEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBQuantizeRow = ARGBQuantizeRow_SSE2; + } +#elif defined(HAS_ARGBQUANTIZEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); + dst += dst_stride_argb; + } + return 0; +} + +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height) { + int y; + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + int32* previous_cumsum = dst_cumsum; + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + for (y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Blur ARGB image. +// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius) { + int y; + void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; + int32* cumsum_bot_row; + int32* max_cumsum_bot_row; + int32* cumsum_top_row; + + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. + ARGBComputeCumulativeSum(src_argb, src_stride_argb, + dst_cumsum, dst_stride32_cumsum, + width, radius); + + src_argb = src_argb + radius * src_stride_argb; + cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + cumsum_top_row = &dst_cumsum[0]; + + for (y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + int n; + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. + if ((y + radius) < height) { + const int32* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + n = (width - 1) - radius - x + 1; + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value) { + int y; + void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, + int width, uint32 value) = ARGBShadeRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHADEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#elif defined(HAS_ARGBSHADEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation) { + int y; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && + IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && + IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 && + IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) && + IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, + width * 4, interpolation); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height) { + int y; + void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, + const uint8* shuffler, int pix) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + // Coalesce rows. + if (src_stride_bgra == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHUFFLEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Sobel ARGB effect. +static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + void (*SobelRow)(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst, int width)) { + int y; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerGGRow_C; + void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobely, int width) = + SobelXRow_C; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // ARGBToBayer used to select G channel from ARGB. +#if defined(HAS_ARGBTOBAYERGGROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerGGRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerGGRow_NEON; + } + } +#endif +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; + } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } +#endif +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; + } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } +#endif + { + // 3 rows with edges before/after. + const int kRowSize = (width + kEdge + 15) & ~15; + align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + uint8* row_sobelx = rows; + uint8* row_sobely = rows + kRowSize; + uint8* row_y = rows + kRowSize * 2; + + // Convert first row. + uint8* row_y0 = row_y + kEdge; + uint8* row_y1 = row_y0 + kRowSize; + uint8* row_y2 = row_y1 + kRowSize; + ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); + row_y0[-1] = row_y0[0]; + memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. + ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); + row_y1[-1] = row_y1[0]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); + + for (y = 0; y < height; ++y) { + // Convert next row of ARGB to Y. + if (y < (height - 1)) { + src_argb += src_stride_argb; + } + ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); + row_y2[-1] = row_y2[0]; + row_y2[width] = row_y2[width - 1]; + + SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); + SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); + SobelRow(row_sobelx, row_sobely, dst_argb, width); + + // Cycle thru circular queue of 3 row_y buffers. + { + uint8* row_yt = row_y0; + row_y0 = row_y1; + row_y1 = row_y2; + row_y2 = row_yt; + } + + dst_argb += dst_stride_argb; + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelRow = SobelRow_SSE2; + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, + width, height, SobelToPlaneRow); +} + +// SobelXY ARGB effect. +// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. +LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height) { + int y; + void (*ARGBPolynomialRow)(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) = ARGBPolynomialRow_C; + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; + } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma, + int width, int height) { + int y; + void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, const uint32 lumacoeff) = + ARGBLumaColorTableRow_C; + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBCopyAlphaRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = + ARGBCopyYToAlphaRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row.h b/third_party/libyuv/source/row.h deleted file mode 100644 index eabe18094..000000000 --- a/third_party/libyuv/source/row.h +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef LIBYUV_SOURCE_ROW_H_ -#define LIBYUV_SOURCE_ROW_H_ - -#include "third_party/libyuv/include/libyuv/basic_types.h" - -#define kMaxStride (2048 * 4) -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) - -#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR) -#define YUV_DISABLE_ASM -#endif - -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) -#define HAS_FASTCONVERTYUVTOARGBROW_NEON -void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#define HAS_FASTCONVERTYUVTOBGRAROW_NEON -void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#define HAS_FASTCONVERTYUVTOABGRROW_NEON -void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#endif - -// The following are available on all x86 platforms -#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ - !defined(YUV_DISABLE_ASM) -#define HAS_ABGRTOARGBROW_SSSE3 -#define HAS_BGRATOARGBROW_SSSE3 -#define HAS_BG24TOARGBROW_SSSE3 -#define HAS_RAWTOARGBROW_SSSE3 -#define HAS_RGB24TOYROW_SSSE3 -#define HAS_RAWTOYROW_SSSE3 -#define HAS_RGB24TOUVROW_SSSE3 -#define HAS_RAWTOUVROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#define HAS_ABGRTOUVROW_SSSE3 -#define HAS_I400TOARGBROW_SSE2 -#define HAS_FASTCONVERTYTOARGBROW_SSE2 -#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3 -#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3 -#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3 -#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3 -#define HAS_REVERSE_ROW_SSSE3 -#endif - -// The following are available on Neon platforms -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) -#define HAS_REVERSE_ROW_NEON -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#ifdef HAS_ARGBTOYROW_SSSE3 -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -#endif -#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) -#define HASRGB24TOYROW_SSSE3 -#endif -#ifdef HASRGB24TOYROW_SSSE3 -void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -#endif -#ifdef HAS_REVERSE_ROW_SSSE3 -void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width); -#endif -#ifdef HAS_REVERSE_ROW_NEON -void ReverseRow_NEON(const uint8* src, uint8* dst, int width); -#endif -void ReverseRow_C(const uint8* src, uint8* dst, int width); - -void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); - -#ifdef HAS_BG24TOARGBROW_SSSE3 -void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); -void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); -void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); -void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); -#endif -void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); -void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); -void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); -void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); - -#ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); -#endif -void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); - -#if defined(_MSC_VER) -#define SIMD_ALIGNED(var) __declspec(align(16)) var -typedef __declspec(align(16)) signed char vec8[16]; -typedef __declspec(align(16)) unsigned char uvec8[16]; -typedef __declspec(align(16)) signed short vec16[8]; -#else // __GNUC__ -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -typedef signed char __attribute__((vector_size(16))) vec8; -typedef unsigned char __attribute__((vector_size(16))) uvec8; -typedef signed short __attribute__((vector_size(16))) vec16; -#endif - -//extern "C" -SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); -//extern "C" -SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); -//extern "C" -SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); - -void FastConvertYUVToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToBGRARow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToABGRRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYToARGBRow_C(const uint8* y_buf, - uint8* rgb_buf, - int width); - -#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 -void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width); -#endif - -#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 -void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -#endif - -#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 -void FastConvertYToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width); - -#endif - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc new file mode 100644 index 000000000..27a0de119 --- /dev/null +++ b/third_party/libyuv/source/row_any.cc @@ -0,0 +1,542 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels. +// TODO(fbarchard): Consider 'any' functions handling odd alignment. +// YUV to RGB does multiple of 8 with SIMD and remainder with C. +#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* u_buf, \ + const uint8* v_buf, \ + uint8* rgb_buf, \ + int width) { \ + int n = width & ~MASK; \ + I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \ + I420TORGB_C(y_buf + n, \ + u_buf + (n >> UV_SHIFT), \ + v_buf + (n >> UV_SHIFT), \ + rgb_buf + n * BPP, width & MASK); \ + } + +#ifdef HAS_I422TOARGBROW_SSSE3 +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, + 0, 4, 7) +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, + 1, 4, 7) +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, + 2, 4, 7) +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, + 1, 4, 7) +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, + 1, 4, 7) +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, + 1, 4, 7) +// I422ToRGB565Row_SSSE3 is unaligned. +YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C, + 1, 2, 7) +YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C, + 1, 2, 7) +YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C, + 1, 2, 7) +// I422ToRGB24Row_SSSE3 is unaligned. +YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7) +YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7) +YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15) +YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) +#endif // HAS_I422TOARGBROW_SSSE3 +#ifdef HAS_I422TOARGBROW_AVX2 +YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) +#endif // HAS_I422TOARGBROW_AVX2 +#ifdef HAS_I422TOARGBROW_NEON +YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7) +YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7) +YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7) +YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7) +YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7) +YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7) +YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7) +YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7) +YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, + 1, 2, 7) +YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, + 1, 2, 7) +YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) +YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) +YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) +#endif // HAS_I422TOARGBROW_NEON +#undef YANY + +// Wrappers to handle odd width +#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* uv_buf, \ + uint8* rgb_buf, \ + int width) { \ + int n = width & ~7; \ + NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \ + NV12TORGB_C(y_buf + n, \ + uv_buf + (n >> UV_SHIFT), \ + rgb_buf + n * BPP, width & 7); \ + } + +#ifdef HAS_NV12TOARGBROW_SSSE3 +NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, + 0, 4) +NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, + 0, 4) +#endif // HAS_NV12TOARGBROW_SSSE3 +#ifdef HAS_NV12TOARGBROW_NEON +NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4) +NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4) +#endif // HAS_NV12TOARGBROW_NEON +#ifdef HAS_NV12TORGB565ROW_SSSE3 +NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C, + 0, 2) +NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C, + 0, 2) +#endif // HAS_NV12TORGB565ROW_SSSE3 +#ifdef HAS_NV12TORGB565ROW_NEON +NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2) +NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) +#endif // HAS_NV12TORGB565ROW_NEON +#undef NVANY + +#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ + void NAMEANY(const uint8* src, \ + uint8* dst, \ + int width) { \ + int n = width & ~MASK; \ + ARGBTORGB_SIMD(src, dst, n); \ + ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \ + } + +#if defined(HAS_ARGBTORGB24ROW_SSSE3) +RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C, + 15, 4, 3) +RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C, + 15, 4, 3) +RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C, + 3, 4, 2) +RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, + 3, 4, 2) +RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, + 3, 4, 2) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) +RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, + 7, 1, 4) +#endif +#if defined(HAS_YTOARGBROW_SSE2) +RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, + 7, 1, 4) +RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, + 15, 2, 4) +RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C, + 15, 2, 4) +// These require alignment on ARGB, so C is used for remainder. +RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C, + 15, 3, 4) +RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, + 15, 3, 4) +RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C, + 7, 2, 4) +RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C, + 7, 2, 4) +RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, + 7, 2, 4) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) +RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3) +RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C, + 7, 4, 2) +RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C, + 7, 4, 2) +RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C, + 7, 4, 2) +RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, + 7, 1, 4) +RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, + 7, 1, 4) +RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, + 7, 2, 4) +RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, + 7, 2, 4) +#endif +#undef RGBANY + +// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. +#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ + void NAMEANY(const uint8* src, \ + uint8* dst, uint32 selector, \ + int width) { \ + int n = width & ~MASK; \ + ARGBTORGB_SIMD(src, dst, selector, n); \ + ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \ + } + +#if defined(HAS_ARGBTOBAYERROW_SSSE3) +BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERROW_NEON) +BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_SSE2) +BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) +BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif + +#undef BAYERANY + +// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. +#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \ + void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ + ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \ + ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \ + dst_y + (width - NUM) * BPP, NUM); \ + } + +#ifdef HAS_ARGBTOYROW_AVX2 +YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32) +YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32) +YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) +YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16) +YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16) +#endif +#ifdef HAS_ARGBTOYROW_NEON +YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) +YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8) +YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8) +YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8) +YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8) +YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8) +YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) +YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) +YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) +YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) +YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) +YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) +YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) +YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) +YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) +YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) +YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) +#endif +#undef YANY + +#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ + int n = width & ~MASK; \ + ARGBTOY_SIMD(src_argb, dst_y, n); \ + ARGBTOY_C(src_argb + n * SBPP, \ + dst_y + n * BPP, width & MASK); \ + } + +// Attenuate is destructive so last16 method can not be used due to overlap. +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSE2 +YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_AVX2 +YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C, + 4, 4, 7) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C, + 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, + 4, 4, 7) +#endif +#undef YANY + +// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. +#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, int src_stride_argb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \ + ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \ + dst_u + (n >> 1), \ + dst_v + (n >> 1), \ + width & MASK); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 +UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31) +UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) +UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C, + 4, 15) +UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15) +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGBTOUVROW_NEON +UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) +UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) +UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) +UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) +UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) +UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) +UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) +UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) +#endif +#undef UVANY + +#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \ + void NAMEANY(const uint8* src_uv, \ + uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + ANYTOUV_C(src_uv + n * BPP, \ + dst_u + (n >> SHIFT), \ + dst_v + (n >> SHIFT), \ + width & MASK); \ + } + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3, + ARGBToUV444Row_C, 4, 15, 0) +#endif +#ifdef HAS_YUY2TOUV422ROW_AVX2 +UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, + YUY2ToUV422Row_C, 2, 31, 1) +UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, + UYVYToUV422Row_C, 2, 31, 1) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3, + ARGBToUV422Row_C, 4, 15, 1) +UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, + YUY2ToUV422Row_C, 2, 15, 1) +UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, + UYVYToUV422Row_C, 2, 15, 1) +#endif +#ifdef HAS_YUY2TOUV422ROW_NEON +UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, + ARGBToUV444Row_C, 4, 7, 0) +UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, + ARGBToUV422Row_C, 4, 15, 1) +UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, + ARGBToUV411Row_C, 4, 31, 2) +UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, + YUY2ToUV422Row_C, 2, 15, 1) +UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, + UYVYToUV422Row_C, 2, 15, 1) +#endif +#undef UV422ANY + +#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \ + void NAMEANY(const uint8* src_uv, \ + uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + ANYTOUV_C(src_uv + n * 2, \ + dst_u + n, \ + dst_v + n, \ + width & MASK); \ + } + +#ifdef HAS_SPLITUVROW_SSE2 +SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15) +#endif +#ifdef HAS_SPLITUVROW_AVX2 +SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) +#endif +#ifdef HAS_SPLITUVROW_NEON +SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15) +#endif +#ifdef HAS_SPLITUVROW_MIPS_DSPR2 +SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, + SplitUVRow_C, 15) +#endif +#undef SPLITUVROWANY + +#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \ + void NAMEANY(const uint8* src_u, const uint8* src_v, \ + uint8* dst_uv, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \ + ANYTOUV_C(src_u + n, \ + src_v + n, \ + dst_uv + n * 2, \ + width & MASK); \ + } + +#ifdef HAS_MERGEUVROW_SSE2 +MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) +#endif +#undef MERGEUVROW_ANY + +#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK) \ + void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \ + uint8* dst_argb, int width) { \ + int n = width & ~MASK; \ + ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \ + ARGBMATH_C(src_argb0 + n * 4, \ + src_argb1 + n * 4, \ + dst_argb + n * 4, \ + width & MASK); \ + } + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C, + 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C, + 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C, + 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C, + 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C, + 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, + 7) +#endif +#undef MATHROW_ANY + +// Shuffle may want to work in place, so last16 method can not be used. +#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_argb, \ + const uint8* shuffler, int width) { \ + int n = width & ~MASK; \ + ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \ + ARGBTOY_C(src_argb + n * SBPP, \ + dst_argb + n * BPP, shuffler, width & MASK); \ + } + +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, + ARGBShuffleRow_C, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, + ARGBShuffleRow_C, 4, 4, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, + ARGBShuffleRow_C, 4, 4, 15) +#endif +#ifdef HAS_ARGBSHUFFLEROW_NEON +YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, + ARGBShuffleRow_C, 4, 4, 3) +#endif +#undef YANY + +// Interpolate may want to work in place, so last16 method can not be used. +#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, \ + int source_y_fraction) { \ + int n = width & ~MASK; \ + TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \ + n, source_y_fraction); \ + TERP_C(dst_ptr + n * BPP, \ + src_ptr + n * SBPP, src_stride_ptr, \ + width & MASK, source_y_fraction); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 +NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, + InterpolateRow_C, 1, 1, 32) +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 +NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_SSE2 +NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_NEON +NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 +NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, + InterpolateRow_C, 1, 1, 3) +#endif +#undef NANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc new file mode 100644 index 000000000..ceb3836cd --- /dev/null +++ b/third_party/libyuv/source/row_common.cc @@ -0,0 +1,2286 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#include // For memcpy and memset. + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// llvm x86 is poor at ternary operator, so use branchless min/max. + +#define USE_BRANCHLESS 1 +#if USE_BRANCHLESS +static __inline int32 clamp0(int32 v) { + return ((-(v) >> 31) & (v)); +} + +static __inline int32 clamp255(int32 v) { + return (((255 - (v)) >> 31) | (v)) & 255; +} + +static __inline uint32 Clamp(int32 val) { + int v = clamp0(val); + return (uint32)(clamp255(v)); +} + +static __inline uint32 Abs(int32 v) { + int m = v >> 31; + return (v + m) ^ m; +} +#else // USE_BRANCHLESS +static __inline int32 clamp0(int32 v) { + return (v < 0) ? 0 : v; +} + +static __inline int32 clamp255(int32 v) { + return (v > 255) ? 255 : v; +} + +static __inline uint32 Clamp(int32 val) { + int v = clamp0(val); + return (uint32)(clamp255(v)); +} + +static __inline uint32 Abs(int32 v) { + return (v < 0) ? -v : v; +} +#endif // USE_BRANCHLESS + +#ifdef LIBYUV_LITTLE_ENDIAN +#define WRITEWORD(p, v) *(uint32*)(p) = v +#else +static inline void WRITEWORD(uint8* p, uint32 v) { + p[0] = (uint8)(v & 255); + p[1] = (uint8)((v >> 8) & 255); + p[2] = (uint8)((v >> 16) & 255); + p[3] = (uint8)((v >> 24) & 255); +} +#endif + +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb24[0]; + uint8 g = src_rgb24[1]; + uint8 r = src_rgb24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb24 += 3; + } +} + +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 r = src_raw[0]; + uint8 g = src_raw[1]; + uint8 b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb565[0] & 0x1f; + uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r = src_rgb565[1] >> 3; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 2) | (g >> 4); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb565 += 2; + } +} + +void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb1555[0] & 0x1f; + uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r = (src_argb1555[1] & 0x7c) >> 2; + uint8 a = src_argb1555[1] >> 7; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 3) | (g >> 2); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = -a; + dst_argb += 4; + src_argb1555 += 2; + } +} + +void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb4444[0] & 0x0f; + uint8 g = src_argb4444[0] >> 4; + uint8 r = src_argb4444[1] & 0x0f; + uint8 a = src_argb4444[1] >> 4; + dst_argb[0] = (b << 4) | b; + dst_argb[1] = (g << 4) | g; + dst_argb[2] = (r << 4) | r; + dst_argb[3] = (a << 4) | a; + dst_argb += 4; + src_argb4444 += 2; + } +} + +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = b; + dst_rgb[1] = g; + dst_rgb[2] = r; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = r; + dst_rgb[1] = g; + dst_rgb[2] = b; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 2; + uint8 r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 3; + uint8 r1 = src_argb[6] >> 3; + uint8 a1 = src_argb[7] >> 7; + *(uint32*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + *(uint16*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + } +} + +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + uint8 b1 = src_argb[4] >> 4; + uint8 g1 = src_argb[5] >> 4; + uint8 r1 = src_argb[6] >> 4; + uint8 a1 = src_argb[7] >> 4; + *(uint32*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + *(uint16*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +} +static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +} + +#define MAKEROWY(NAME, R, G, B, BPP) \ +void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ + src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ + src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ + src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ +} + +MAKEROWY(ARGB, 2, 1, 0, 4) +MAKEROWY(BGRA, 1, 2, 3, 4) +MAKEROWY(ABGR, 0, 1, 2, 4) +MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 8 bit Y (not used): +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 7 bit Y: +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; +} + +static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +#define MAKEROWYJ(NAME, R, G, B, BPP) \ +void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ +} + +MAKEROWYJ(ARGB, 2, 1, 0, 4) +#undef MAKEROWYJ + +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb565[0] & 0x1f; + uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r = src_rgb565[1] >> 3; + b = (b << 3) | (b >> 2); + g = (g << 2) | (g >> 4); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb1555[0] & 0x1f; + uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r = (src_argb1555[1] & 0x7c) >> 2; + b = (b << 3) | (b >> 2); + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_argb1555 += 2; + dst_y += 1; + } +} + +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb4444[0] & 0x0f; + uint8 g = src_argb4444[0] >> 4; + uint8 r = src_argb4444[1] & 0x0f; + b = (b << 4) | b; + g = (g << 4) | g; + r = (r << 4) | r; + dst_y[0] = RGBToY(r, g, b); + src_argb4444 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b1 = src_rgb565[2] & 0x1f; + uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8 r1 = src_rgb565[3] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b3 = next_rgb565[2] & 0x1f; + uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8 r3 = next_rgb565[3] >> 3; + uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 787 -> 888. + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b = (b0 + b2); // 565 * 2 = 676. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 676 -> 888 + g = (g << 1) | (g >> 6); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b1 = src_argb1555[2] & 0x1f; + uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8 b3 = next_argb1555[2] & 0x1f; + uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 777 -> 888. + g = (g << 1) | (g >> 6); + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = next_argb1555[1] >> 3; + uint8 b = (b0 + b2); // 555 * 2 = 666. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b1 = src_argb4444[2] & 0x0f; + uint8 g1 = src_argb4444[2] >> 4; + uint8 r1 = src_argb4444[3] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b3 = next_argb4444[2] & 0x0f; + uint8 g3 = next_argb4444[2] >> 4; + uint8 r3 = next_argb4444[3] & 0x0f; + uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b = (b0 + b2); // 444 * 2 = 555. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 3) | (b >> 2); // 555 -> 888. + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 4; + dst_u += 1; + dst_v += 1; + } +} + +void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 16; + dst_u += 1; + dst_v += 1; + } + if ((width & 3) == 3) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = src_argb[3]; + dst_argb += 4; + src_argb += 4; + } +} + +// Convert a row of image to Sepia tone. +void ARGBSepiaRow_C(uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not over flow. a is preserved from original. + dst_argb[0] = sb; + dst_argb[1] = clamp255(sg); + dst_argb[2] = clamp255(sr); + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed. +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + + r * matrix_argb[2] + a * matrix_argb[3]) >> 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + + r * matrix_argb[6] + a * matrix_argb[7]) >> 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + + r * matrix_argb[10] + a * matrix_argb[11]) >> 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + dst_argb[0] = Clamp(sb); + dst_argb[1] = Clamp(sg); + dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; + dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; + dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 24 + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + const uint32 b_scale = REPEAT8(value & 0xff); + const uint32 g_scale = REPEAT8((value >> 8) & 0xff); + const uint32 r_scale = REPEAT8((value >> 16) & 0xff); + const uint32 a_scale = REPEAT8(value >> 24); + + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb[0]); + const uint32 g = REPEAT8(src_argb[1]); + const uint32 r = REPEAT8(src_argb[2]); + const uint32 a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 16 + +void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb0[0]); + const uint32 g = REPEAT8(src_argb0[1]); + const uint32 r = REPEAT8(src_argb0[2]); + const uint32 a = REPEAT8(src_argb0[3]); + const uint32 b_scale = src_argb1[0]; + const uint32 g_scale = src_argb1[1]; + const uint32 r_scale = src_argb1[2]; + const uint32 a_scale = src_argb1[3]; + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_add = src_argb1[0]; + const int g_add = src_argb1[1]; + const int r_add = src_argb1[2]; + const int a_add = src_argb1[3]; + dst_argb[0] = SHADE(b, b_add); + dst_argb[1] = SHADE(g, g_add); + dst_argb[2] = SHADE(r, r_add); + dst_argb[3] = SHADE(a, a_add); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_sub = src_argb1[0]; + const int g_sub = src_argb1[1]; + const int r_sub = src_argb1[2]; + const int a_sub = src_argb1[3]; + dst_argb[0] = SHADE(b, b_sub); + dst_argb[1] = SHADE(g, g_sub); + dst_argb[2] = SHADE(r, r_sub); + dst_argb[3] = SHADE(a, a_sub); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +// Sobel functions which mimics SSSE3. +void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobelx[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobely[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_argb[0] = (uint8)(s); + dst_argb[1] = (uint8)(s); + dst_argb[2] = (uint8)(s); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + } +} + +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = (uint8)(s); + } +} + +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int g = clamp255(r + b); + dst_argb[0] = (uint8)(b); + dst_argb[1] = (uint8)(g); + dst_argb[2] = (uint8)(r); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + } +} + +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { + // Copy a Y to RGB. + int x; + for (x = 0; x < width; ++x) { + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// C reference code that mimics the YUV assembly. + +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ + +#define UB 127 /* min(63,(int8)(2.018 * 64)) */ +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + int32 y1 = ((int32)(y) - 16) * YG; + *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6); + *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6); + *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6); +} + +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +// C mimic assembly. +// TODO(fbarchard): Remove subsampling from Neon. +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 u = (src_u[0] + src_u[1] + 1) >> 1; + uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 2; + src_v += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} +#else +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} +#endif +// Also used for 420 +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 4, rgb_buf + 5); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 4, rgb_buf + 3); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + } +} + +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + b1 = b1 >> 4; + g1 = g1 >> 4; + r1 = r1 >> 4; + *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + 0xf000; + } +} + +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 3; + r1 = r1 >> 3; + *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + 0x8000; + } +} + +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_u += 1; + src_v += 1; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + YuvPixel(src_y[2], src_u[0], src_v[0], + rgb_buf + 8, rgb_buf + 9, rgb_buf + 10); + rgb_buf[11] = 255; + YuvPixel(src_y[3], src_u[0], src_v[0], + rgb_buf + 12, rgb_buf + 13, rgb_buf + 14); + rgb_buf[15] = 255; + src_y += 4; + src_u += 1; + src_v += 1; + rgb_buf += 16; // Advance 4 pixels. + } + if (width & 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* usrc_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], usrc_v[0], usrc_v[1], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + usrc_v += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + + YuvPixel(src_y[1], src_vu[1], src_vu[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + + src_y += 2; + src_vu += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* usrc_v, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); + YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + usrc_v += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* vsrc_u, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + vsrc_u += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_yuy2 += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_uyvy += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 7, rgb_buf + 6, rgb_buf + 5); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + } +} + +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 6, rgb_buf + 5, rgb_buf + 4); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + } +} + +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 6, rgb_buf + 7); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + } +} + +void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], 128, 128, + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], 128, 128, + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], 128, 128, + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void MirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + const uint32* src32 = (const uint32*)(src); + uint32* dst32 = (uint32*)(dst); + src32 += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x]; + dst_uv[1] = src_v[x]; + dst_uv[2] = src_u[x + 1]; + dst_uv[3] = src_v[x + 1]; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1]; + dst_uv[1] = src_v[width - 1]; + } +} + +void CopyRow_C(const uint8* src, uint8* dst, int count) { + memcpy(dst, src, count); +} + +void CopyRow_16_C(const uint16* src, uint16* dst, int count) { + memcpy(dst, src, count * 2); +} + +void SetRow_C(uint8* dst, uint32 v8, int count) { +#ifdef _MSC_VER + // VC will generate rep stosb. + int x; + for (x = 0; x < count; ++x) { + dst[x] = v8; + } +#else + memset(dst, v8, count); +#endif +} + +void ARGBSetRows_C(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + int y; + for (y = 0; y < height; ++y) { + uint32* d = (uint32*)(dst); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; + } + dst += dst_stride; + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). +void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + + fb = src_argb0[4 + 0]; + fg = src_argb0[4 + 1]; + fr = src_argb0[4 + 2]; + a = src_argb0[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = BLEND(fb, bb, a); + dst_argb[4 + 1] = BLEND(fg, bg, a); + dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 3] = 255u; + src_argb0 += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + } +} +#undef BLEND +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = a; + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32 b = src_argb[0]; + const uint32 g = src_argb[1]; + const uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. +#define T(a) 0x01000000 + (0x10000 / a) +const uint32 fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; +#undef T + +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + b = (b * ia) >> 8; + g = (g * ia) >> 8; + r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. + dst_argb[0] = clamp255(b); + dst_argb[1] = clamp255(g); + dst_argb[2] = clamp255(r); + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + int32 row_sum[4] = {0, 0, 0, 0}; + int x; + for (x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, + int w, int area, uint8* dst, int count) { + float ooa = 1.0f / area; + int i; + for (i = 0; i < count; ++i) { + dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + int i; + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (i = 0; i < width; ++i) { + int x = (int)(uv[0]); + int y = (int)(uv[1]); + *(uint32*)(dst_argb) = + *(const uint32*)(src_argb + y * src_argb_stride + + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// Blend 2 rows into 1 for conversions such as I422ToI420. +void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. +void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (source_y_fraction == 128) { + HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +void ARGBToBayerRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + int index0 = selector & 0xff; + int index1 = (selector >> 8) & 0xff; + // Copy a row of Bayer. + int x; + for (x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[index0]; + dst_bayer[1] = src_argb[index1]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[index0]; + } +} + +// Select G channel from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + // Copy a row of G. + int x; + for (x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[1]; + dst_bayer[1] = src_argb[5]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[1]; + } +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. + int x; + for (x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 b = src_argb[index0]; + uint8 g = src_argb[index1]; + uint8 r = src_argb[index2]; + uint8 a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[0]; // duplicate last y + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[0]; // duplicate last y + } +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) +// row_win.cc has asm version, but GCC uses 2 step wrapper. +#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) +void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); + ARGBToRGB565Row_SSE2(row, rgb_buf, width); + free_aligned_buffer_64(row); +} +#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); + ARGBToARGB1555Row_SSE2(row, rgb_buf, width); + free_aligned_buffer_64(row); +} + +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); + ARGBToARGB4444Row_SSE2(row, rgb_buf, width); + free_aligned_buffer_64(row); +} + +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + NV12ToARGBRow_SSSE3(src_y, src_uv, row, width); + ARGBToRGB565Row_SSE2(row, dst_rgb565, width); + free_aligned_buffer_64(row); +} + +void NV21ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_rgb565, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + NV21ToARGBRow_SSSE3(src_y, src_vu, row, width); + ARGBToRGB565Row_SSE2(row, dst_rgb565, width); + free_aligned_buffer_64(row); +} + +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width); + YUY2ToYRow_SSE2(src_yuy2, row_y, width); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width); + YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width); + I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width); + UYVYToYRow_SSE2(src_uyvy, row_y, width); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width); + UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width); + I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +#endif // !defined(LIBYUV_DISABLE_X86) + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp((int32)(db)); + dst_argb[1] = Clamp((int32)(dg)); + dst_argb[2] = Clamp((int32)(dr)); + dst_argb[3] = Clamp((int32)(da)); + src_argb += 4; + dst_argb += 4; + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + const uint8* luma1; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + luma1 = ((src_argb[4] * bc + src_argb[5] * gc + + src_argb[6] * rc) & 0x7F00u) + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_mips.cc b/third_party/libyuv/source/row_mips.cc new file mode 100644 index 000000000..a804670d3 --- /dev/null +++ b/third_party/libyuv/source/row_mips.cc @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) + +#ifdef HAS_COPYROW_MIPS +void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { + __asm__ __volatile__ ( + ".set noreorder \n" + ".set noat \n" + "slti $at, %[count], 8 \n" + "bne $at ,$zero, $last8 \n" + "xor $t8, %[src], %[dst] \n" + "andi $t8, $t8, 0x3 \n" + + "bne $t8, $zero, unaligned \n" + "negu $a3, %[dst] \n" + // make dst/src aligned + "andi $a3, $a3, 0x3 \n" + "beq $a3, $zero, $chk16w \n" + // word-aligned now count is the remining bytes count + "subu %[count], %[count], $a3 \n" + + "lwr $t8, 0(%[src]) \n" + "addu %[src], %[src], $a3 \n" + "swr $t8, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + + // Now the dst/src are mutually word-aligned with word-aligned addresses + "$chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? + // t8 is the byte count after 64-byte chunks + "beq %[count], $t8, chk8w \n" + // There will be at most 1 32-byte chunk after it + "subu $a3, %[count], $t8 \n" // the reminder + // Here a3 counts bytes in 16w chunks + "addu $a3, %[dst], $a3 \n" + // Now a3 is the final dst after 64-byte chunks + "addu $t0, %[dst], %[count] \n" + // t0 is the "past the end" address + + // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past + // the "t0-32" address + // This means: for x=128 the last "safe" a1 address is "t0-160" + // Alternatively, for x=64 the last "safe" a1 address is "t0-96" + // we will use "pref 30,128(a1)", so "t0-160" is the limit + "subu $t9, $t0, 160 \n" + // t9 is the "last safe pref 30,128(a1)" address + "pref 0, 0(%[src]) \n" // first line of src + "pref 0, 32(%[src]) \n" // second line of src + "pref 0, 64(%[src]) \n" + "pref 30, 32(%[dst]) \n" + // In case the a1 > t9 don't use "pref 30" at all + "sgtu $v1, %[dst], $t9 \n" + "bgtz $v1, $loop16w \n" + "nop \n" + // otherwise, start with using pref30 + "pref 30, 64(%[dst]) \n" + "$loop16w: \n" + "pref 0, 96(%[src]) \n" + "lw $t0, 0(%[src]) \n" + "bgtz $v1, $skip_pref30_96 \n" // skip + "lw $t1, 4(%[src]) \n" + "pref 30, 96(%[dst]) \n" // continue + "$skip_pref30_96: \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "pref 0, 128(%[src]) \n" + // bring the next lines of src, addr 128 + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "lw $t0, 32(%[src]) \n" + "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) + "lw $t1, 36(%[src]) \n" + "pref 30, 128(%[dst]) \n" // set dest, addr 128 + "$skip_pref30_128: \n" + "lw $t2, 40(%[src]) \n" + "lw $t3, 44(%[src]) \n" + "lw $t4, 48(%[src]) \n" + "lw $t5, 52(%[src]) \n" + "lw $t6, 56(%[src]) \n" + "lw $t7, 60(%[src]) \n" + "pref 0, 160(%[src]) \n" + // bring the next lines of src, addr 160 + "sw $t0, 32(%[dst]) \n" + "sw $t1, 36(%[dst]) \n" + "sw $t2, 40(%[dst]) \n" + "sw $t3, 44(%[dst]) \n" + "sw $t4, 48(%[dst]) \n" + "sw $t5, 52(%[dst]) \n" + "sw $t6, 56(%[dst]) \n" + "sw $t7, 60(%[dst]) \n" + + "addiu %[dst], %[dst], 64 \n" // adding 64 to dest + "sgtu $v1, %[dst], $t9 \n" + "bne %[dst], $a3, $loop16w \n" + " addiu %[src], %[src], 64 \n" // adding 64 to src + "move %[count], $t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? + // the t8 is the reminder count past 32-bytes + "beq %[count], $t8, chk1w \n" + // count=t8,no 32-byte chunk + " nop \n" + + "lw $t0, 0(%[src]) \n" + "lw $t1, 4(%[src]) \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, $last8 \n" + " subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + // copying in words (4-byte chunks) + "$wordCopy_loop: \n" + "lw $t3, 0(%[src]) \n" + // the first t3 may be equal t0 ... optimize? + "addiu %[src], %[src],4 \n" + "addiu %[dst], %[dst],4 \n" + "bne %[dst], $a3,$wordCopy_loop \n" + " sw $t3, -4(%[dst]) \n" + + // For the last (<8) bytes + "$last8: \n" + "blez %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 -last dst address + "$last8loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst], $a3, $last8loop \n" + " sb $v1, -1(%[dst]) \n" + + "leave: \n" + " j $ra \n" + " nop \n" + + // + // UNALIGNED case + // + + "unaligned: \n" + // got here with a3="negu a1" + "andi $a3, $a3, 0x3 \n" // a1 is word aligned? + "beqz $a3, $ua_chk16w \n" + " subu %[count], %[count], $a3 \n" + // bytes left after initial a3 bytes + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 + "swr $v1, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + // below the dst will be word aligned (NOTE1) + "$ua_chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? + // t8 is the byte count after 64-byte chunks + "beq %[count], $t8, ua_chk8w \n" + // if a2==t8, no 64-byte chunks + // There will be at most 1 32-byte chunk after it + "subu $a3, %[count], $t8 \n" // the reminder + // Here a3 counts bytes in 16w chunks + "addu $a3, %[dst], $a3 \n" + // Now a3 is the final dst after 64-byte chunks + "addu $t0, %[dst], %[count] \n" // t0 "past the end" + "subu $t9, $t0, 160 \n" + // t9 is the "last safe pref 30,128(a1)" address + "pref 0, 0(%[src]) \n" // first line of src + "pref 0, 32(%[src]) \n" // second line addr 32 + "pref 0, 64(%[src]) \n" + "pref 30, 32(%[dst]) \n" + // safe, as we have at least 64 bytes ahead + // In case the a1 > t9 don't use "pref 30" at all + "sgtu $v1, %[dst], $t9 \n" + "bgtz $v1, $ua_loop16w \n" + // skip "pref 30,64(a1)" for too short arrays + " nop \n" + // otherwise, start with using pref30 + "pref 30, 64(%[dst]) \n" + "$ua_loop16w: \n" + "pref 0, 96(%[src]) \n" + "lwr $t0, 0(%[src]) \n" + "lwl $t0, 3(%[src]) \n" + "lwr $t1, 4(%[src]) \n" + "bgtz $v1, $ua_skip_pref30_96 \n" + " lwl $t1, 7(%[src]) \n" + "pref 30, 96(%[dst]) \n" + // continue setting up the dest, addr 96 + "$ua_skip_pref30_96: \n" + "lwr $t2, 8(%[src]) \n" + "lwl $t2, 11(%[src]) \n" + "lwr $t3, 12(%[src]) \n" + "lwl $t3, 15(%[src]) \n" + "lwr $t4, 16(%[src]) \n" + "lwl $t4, 19(%[src]) \n" + "lwr $t5, 20(%[src]) \n" + "lwl $t5, 23(%[src]) \n" + "lwr $t6, 24(%[src]) \n" + "lwl $t6, 27(%[src]) \n" + "lwr $t7, 28(%[src]) \n" + "lwl $t7, 31(%[src]) \n" + "pref 0, 128(%[src]) \n" + // bring the next lines of src, addr 128 + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "lwr $t0, 32(%[src]) \n" + "lwl $t0, 35(%[src]) \n" + "lwr $t1, 36(%[src]) \n" + "bgtz $v1, ua_skip_pref30_128 \n" + " lwl $t1, 39(%[src]) \n" + "pref 30, 128(%[dst]) \n" + // continue setting up the dest, addr 128 + "ua_skip_pref30_128: \n" + + "lwr $t2, 40(%[src]) \n" + "lwl $t2, 43(%[src]) \n" + "lwr $t3, 44(%[src]) \n" + "lwl $t3, 47(%[src]) \n" + "lwr $t4, 48(%[src]) \n" + "lwl $t4, 51(%[src]) \n" + "lwr $t5, 52(%[src]) \n" + "lwl $t5, 55(%[src]) \n" + "lwr $t6, 56(%[src]) \n" + "lwl $t6, 59(%[src]) \n" + "lwr $t7, 60(%[src]) \n" + "lwl $t7, 63(%[src]) \n" + "pref 0, 160(%[src]) \n" + // bring the next lines of src, addr 160 + "sw $t0, 32(%[dst]) \n" + "sw $t1, 36(%[dst]) \n" + "sw $t2, 40(%[dst]) \n" + "sw $t3, 44(%[dst]) \n" + "sw $t4, 48(%[dst]) \n" + "sw $t5, 52(%[dst]) \n" + "sw $t6, 56(%[dst]) \n" + "sw $t7, 60(%[dst]) \n" + + "addiu %[dst],%[dst],64 \n" // adding 64 to dest + "sgtu $v1,%[dst],$t9 \n" + "bne %[dst],$a3,$ua_loop16w \n" + " addiu %[src],%[src],64 \n" // adding 64 to src + "move %[count],$t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "ua_chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? + // the t8 is the reminder count + "beq %[count], $t8, $ua_chk1w \n" + // when count==t8, no 32-byte chunk + + "lwr $t0, 0(%[src]) \n" + "lwl $t0, 3(%[src]) \n" + "lwr $t1, 4(%[src]) \n" + "lwl $t1, 7(%[src]) \n" + "lwr $t2, 8(%[src]) \n" + "lwl $t2, 11(%[src]) \n" + "lwr $t3, 12(%[src]) \n" + "lwl $t3, 15(%[src]) \n" + "lwr $t4, 16(%[src]) \n" + "lwl $t4, 19(%[src]) \n" + "lwr $t5, 20(%[src]) \n" + "lwl $t5, 23(%[src]) \n" + "lwr $t6, 24(%[src]) \n" + "lwl $t6, 27(%[src]) \n" + "lwr $t7, 28(%[src]) \n" + "lwl $t7, 31(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "$ua_chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, ua_smallCopy \n" + "subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + + // copying in words (4-byte chunks) + "$ua_wordCopy_loop: \n" + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addiu %[src], %[src], 4 \n" + "addiu %[dst], %[dst], 4 \n" + // note: dst=a1 is word aligned here, see NOTE1 + "bne %[dst], $a3, $ua_wordCopy_loop \n" + " sw $v1,-4(%[dst]) \n" + + // Now less than 4 bytes (value in count) left to copy + "ua_smallCopy: \n" + "beqz %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 = last dst address + "$ua_smallCopy_loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst],$a3,$ua_smallCopy_loop \n" + " sb $v1, -1(%[dst]) \n" + + "j $ra \n" + " nop \n" + ".set at \n" + ".set reorder \n" + : [dst] "+r" (dst), [src] "+r" (src) + : [count] "r" (count) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", + "t8", "t9", "a3", "v1", "at" + ); +} +#endif // HAS_COPYROW_MIPS + +// MIPS DSPR2 functions +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ + (__mips_dsp_rev >= 2) +void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "srl $t4, %[width], 4 \n" // multiplies of 16 + "blez $t4, 2f \n" + " andi %[width], %[width], 0xf \n" // residual + + ".p2align 2 \n" + "1: \n" + "addiu $t4, $t4, -1 \n" + "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 + "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 + "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 + "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 + "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 + "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10 + "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12 + "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14 + "addiu %[src_uv], %[src_uv], 32 \n" + "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 + "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 + "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 + "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 + "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 + "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 + "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 + "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 + "sw $t9, 0(%[dst_v]) \n" + "sw $t0, 0(%[dst_u]) \n" + "sw $t1, 4(%[dst_v]) \n" + "sw $t2, 4(%[dst_u]) \n" + "sw $t3, 8(%[dst_v]) \n" + "sw $t5, 8(%[dst_u]) \n" + "sw $t6, 12(%[dst_v]) \n" + "sw $t7, 12(%[dst_u]) \n" + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz $t4, 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + + "beqz %[width], 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, 0(%[src_uv]) \n" + "lbu $t1, 1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], 2 \n" + "addiu %[width], %[width], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[width], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [width] "+r" (width), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", "t8", "t9" + ); +} + +void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, + uint8* dst_v, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "srl $t4, %[width], 4 \n" // multiplies of 16 + "blez $t4, 2f \n" + " andi %[width], %[width], 0xf \n" // residual + + ".p2align 2 \n" + "1: \n" + "addiu $t4, $t4, -1 \n" + "lwr $t0, 0(%[src_uv]) \n" + "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0 + "lwr $t1, 4(%[src_uv]) \n" + "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2 + "lwr $t2, 8(%[src_uv]) \n" + "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4 + "lwr $t3, 12(%[src_uv]) \n" + "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6 + "lwr $t5, 16(%[src_uv]) \n" + "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8 + "lwr $t6, 20(%[src_uv]) \n" + "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10 + "lwr $t7, 24(%[src_uv]) \n" + "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12 + "lwr $t8, 28(%[src_uv]) \n" + "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14 + "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 + "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 + "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 + "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 + "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 + "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 + "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 + "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 + "addiu %[src_uv], %[src_uv], 32 \n" + "swr $t9, 0(%[dst_v]) \n" + "swl $t9, 3(%[dst_v]) \n" + "swr $t0, 0(%[dst_u]) \n" + "swl $t0, 3(%[dst_u]) \n" + "swr $t1, 4(%[dst_v]) \n" + "swl $t1, 7(%[dst_v]) \n" + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" + "swr $t3, 8(%[dst_v]) \n" + "swl $t3, 11(%[dst_v]) \n" + "swr $t5, 8(%[dst_u]) \n" + "swl $t5, 11(%[dst_u]) \n" + "swr $t6, 12(%[dst_v]) \n" + "swl $t6, 15(%[dst_v]) \n" + "swr $t7, 12(%[dst_u]) \n" + "swl $t7, 15(%[dst_u]) \n" + "addiu %[dst_u], %[dst_u], 16 \n" + "bgtz $t4, 1b \n" + " addiu %[dst_v], %[dst_v], 16 \n" + + "beqz %[width], 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, 0(%[src_uv]) \n" + "lbu $t1, 1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], 2 \n" + "addiu %[width], %[width], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[width], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [width] "+r" (width), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", "t8", "t9" + ); +} + +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t4, %[width], 4 \n" // multiplies of 16 + "andi $t5, %[width], 0xf \n" + "blez $t4, 2f \n" + " addu %[src], %[src], %[width] \n" // src += width + + ".p2align 2 \n" + "1: \n" + "lw $t0, -16(%[src]) \n" // |3|2|1|0| + "lw $t1, -12(%[src]) \n" // |7|6|5|4| + "lw $t2, -8(%[src]) \n" // |11|10|9|8| + "lw $t3, -4(%[src]) \n" // |15|14|13|12| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t1, $t1 \n" // |6|7|4|5| + "wsbh $t2, $t2 \n" // |10|11|8|9| + "wsbh $t3, $t3 \n" // |14|15|12|13| + "rotr $t0, $t0, 16 \n" // |0|1|2|3| + "rotr $t1, $t1, 16 \n" // |4|5|6|7| + "rotr $t2, $t2, 16 \n" // |8|9|10|11| + "rotr $t3, $t3, 16 \n" // |12|13|14|15| + "addiu %[src], %[src], -16 \n" + "addiu $t4, $t4, -1 \n" + "sw $t3, 0(%[dst]) \n" // |15|14|13|12| + "sw $t2, 4(%[dst]) \n" // |11|10|9|8| + "sw $t1, 8(%[dst]) \n" // |7|6|5|4| + "sw $t0, 12(%[dst]) \n" // |3|2|1|0| + "bgtz $t4, 1b \n" + " addiu %[dst], %[dst], 16 \n" + "beqz $t5, 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, -1(%[src]) \n" + "addiu $t5, $t5, -1 \n" + "addiu %[src], %[src], -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgez $t5, 2b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src] "+r" (src), [dst] "+r" (dst) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", "t5" + ); +} + +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + int x = 0; + int y = 0; + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "addu $t4, %[width], %[width] \n" + "srl %[x], %[width], 4 \n" + "andi %[y], %[width], 0xf \n" + "blez %[x], 2f \n" + " addu %[src_uv], %[src_uv], $t4 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| + "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| + "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| + "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| + "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| + "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| + "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| + "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| + + "rotr $t0, $t0, 16 \n" // |1|0|3|2| + "rotr $t1, $t1, 16 \n" // |5|4|7|6| + "rotr $t2, $t2, 16 \n" // |9|8|11|10| + "rotr $t3, $t3, 16 \n" // |13|12|15|14| + "rotr $t4, $t4, 16 \n" // |17|16|19|18| + "rotr $t6, $t6, 16 \n" // |21|20|23|22| + "rotr $t7, $t7, 16 \n" // |25|24|27|26| + "rotr $t8, $t8, 16 \n" // |29|28|31|30| + "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| + "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| + "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| + "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| + "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| + "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| + "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| + "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| + "addiu %[src_uv], %[src_uv], -32 \n" + "addiu %[x], %[x], -1 \n" + "swr $t4, 0(%[dst_u]) \n" + "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| + "swr $t6, 0(%[dst_v]) \n" + "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| + "swr $t3, 4(%[dst_v]) \n" + "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| + "swr $t0, 8(%[dst_u]) \n" + "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| + "swr $t1, 8(%[dst_v]) \n" + "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| + "swr $t9, 12(%[dst_u]) \n" + "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| + "swr $t5, 12(%[dst_v]) \n" + "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz %[x], 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + "beqz %[y], 3f \n" + " nop \n" + "b 2f \n" + " nop \n" + + "2: \n" + "lbu $t0, -2(%[src_uv]) \n" + "lbu $t1, -1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], -2 \n" + "addiu %[y], %[y], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[y], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v), + [x] "=&r" (x), + [y] "+r" (y) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t7", "t8", "t9" + ); +} + +// Convert (4 Y and 2 VU) I422 and arrange RGB values into +// t5 = | 0 | B0 | 0 | b0 | +// t4 = | 0 | B1 | 0 | b1 | +// t9 = | 0 | G0 | 0 | g0 | +// t8 = | 0 | G1 | 0 | g1 | +// t2 = | 0 | R0 | 0 | r0 | +// t1 = | 0 | R1 | 0 | r1 | +#define I422ToTransientMipsRGB \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + "shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ + "addu.ph $t1, $t1, $s5 \n" + +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| // clipping + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into argb format + "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| + "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| + "addiu %[width], -4 \n" + "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| + "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| + "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| +// Store results. + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into abgr format + "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0| +// Store results. + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 | + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff \n" + "ori $s6, 0xff \n" // |00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB + // Arranging into bgra format + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 | + "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 | + "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff| + "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff| + "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff| + "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff| + "sll $t1, $t1, 16 \n" + "sll $t2, $t2, 16 \n" + "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff| + "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff| +// Store results. + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + int y0_fraction = 256 - source_y_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "replv.ph $t0, %[y0_fraction] \n" + "replv.ph $t1, %[source_y_fraction] \n" + + ".p2align 2 \n" + "1: \n" + "lw $t2, 0(%[src_ptr]) \n" + "lw $t3, 0(%[src_ptr1]) \n" + "lw $t4, 4(%[src_ptr]) \n" + "lw $t5, 4(%[src_ptr1]) \n" + "muleu_s.ph.qbl $t6, $t2, $t0 \n" + "muleu_s.ph.qbr $t7, $t2, $t0 \n" + "muleu_s.ph.qbl $t8, $t3, $t1 \n" + "muleu_s.ph.qbr $t9, $t3, $t1 \n" + "muleu_s.ph.qbl $t2, $t4, $t0 \n" + "muleu_s.ph.qbr $t3, $t4, $t0 \n" + "muleu_s.ph.qbl $t4, $t5, $t1 \n" + "muleu_s.ph.qbr $t5, $t5, $t1 \n" + "addq.ph $t6, $t6, $t8 \n" + "addq.ph $t7, $t7, $t9 \n" + "addq.ph $t2, $t2, $t4 \n" + "addq.ph $t3, $t3, $t5 \n" + "shra.ph $t6, $t6, 8 \n" + "shra.ph $t7, $t7, 8 \n" + "shra.ph $t2, $t2, 8 \n" + "shra.ph $t3, $t3, 8 \n" + "precr.qb.ph $t6, $t6, $t7 \n" + "precr.qb.ph $t2, $t2, $t3 \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[src_ptr1], %[src_ptr1], 8 \n" + "addiu %[dst_width], %[dst_width], -8 \n" + "sw $t6, 0(%[dst_ptr]) \n" + "sw $t2, 4(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[dst_ptr], %[dst_ptr], 8 \n" + + ".set pop \n" + : [dst_ptr] "+r" (dst_ptr), + [src_ptr1] "+r" (src_ptr1), + [src_ptr] "+r" (src_ptr), + [dst_width] "+r" (dst_width) + : [source_y_fraction] "r" (source_y_fraction), + [y0_fraction] "r" (y0_fraction), + [src_stride] "r" (src_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} +#endif // __mips_dsp_rev >= 2 + +#endif // defined(__mips__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc new file mode 100644 index 000000000..c5ae2c583 --- /dev/null +++ b/third_party/libyuv/source/row_neon.cc @@ -0,0 +1,2844 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.16 {d2[0]}, [%1]! \n" \ + "vld1.16 {d2[1]}, [%2]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUV422TORGB \ + "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ + "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ + "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ + "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ + "vtrn.u8 d0, d1 \n" \ + "vsub.s16 q0, q0, q15 \n"/* offset y */\ + "vmul.s16 q0, q0, q14 \n" \ + "vadd.s16 d18, d19 \n" \ + "vqadd.s16 d20, d0, d16 \n" /* B */ \ + "vqadd.s16 d21, d1, d16 \n" \ + "vqadd.s16 d22, d0, d17 \n" /* R */ \ + "vqadd.s16 d23, d1, d17 \n" \ + "vqadd.s16 d16, d0, d18 \n" /* G */ \ + "vqadd.s16 d17, d1, d18 \n" \ + "vqshrun.s16 d0, q10, #6 \n" /* B */ \ + "vqshrun.s16 d1, q11, #6 \n" /* G */ \ + "vqshrun.s16 d2, q8, #6 \n" /* R */ \ + "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ + "vmovl.u8 q11, d1 \n" \ + "vmovl.u8 q8, d2 \n" \ + "vtrn.u8 d20, d21 \n" \ + "vtrn.u8 d22, d23 \n" \ + "vtrn.u8 d16, d17 \n" \ + "vmov.u8 d21, d16 \n" + +static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTORGB565 \ + "vshr.u8 d20, d20, #3 \n" /* B */ \ + "vshr.u8 d21, d21, #2 \n" /* G */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #11 \n" /* R */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q0, q0, q10 \n" /* BGR */ + +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB1555 \ + "vshr.u8 q10, q10, #3 \n" /* B */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vshr.u8 d23, d23, #7 \n" /* A */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vmovl.u8 q11, d23 \n" /* A */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #10 \n" /* R */ \ + "vshl.u16 q11, q11, #15 \n" /* A */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q1, q10, q11 \n" /* RA */ \ + "vorr q0, q0, q1 \n" /* BGRA */ + +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + asm volatile ( + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%3] \n" + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + ".p2align 2 \n" + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} + +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%3] \n" + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.8 {d24}, [%3] \n" + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +void SetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0" + ); +} + +// TODO(fbarchard): Make fully assembler +// SetRow32 writes 'count' words using a 32 bit value repeated. +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + SetRow_NEON(dst, v32, width << 2); + dst += dst_stride; + } +} + +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0" + ); +} + +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. + "subs %3, %3, #16 \n" // 16 processed per loop + "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. + "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(src_uv_stride), // %1 + "+r"(dst_uv), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "vmov.u32 d6[0], %3 \n" // selector + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels + "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels + "vtrn.u32 d4, d5 \n" // combine 8 pixels + "vst1.8 {d4}, [%1]! \n" // store 8. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "r"(selector) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Select G channels from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" + ); +} + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. + "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. + + "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. + "vpadd.u16 d1, d8, d9 \n" // B + "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. + "vpadd.u16 d3, d10, d11 \n" // G + "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. + "vpadd.u16 d5, d12, d13 \n" // R + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + ); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + ); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + ); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13" + ); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + ".p2align 2 \n" + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q15, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +#ifdef HAS_ARGBMULTIPLYROW_NEON +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_posix.cc b/third_party/libyuv/source/row_posix.cc new file mode 100644 index 000000000..e47708802 --- /dev/null +++ b/third_party/libyuv/source/row_posix.cc @@ -0,0 +1,6443 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +// Constants for ARGB +static vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +// JPeg full range. +static vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// Constants for BGRA +static vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +static vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +static vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 + +// Shuffle table for converting RGB24 to ARGB. +static uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. +static uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#if defined(TESTING) && defined(__x86_64__) +void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 5 \n" + "mov %%eax,%%eax \n" + "mov %%ebx,%%ebx \n" + "mov %%ecx,%%ecx \n" + "mov %%edx,%%edx \n" + "mov %%esi,%%esi \n" + "mov %%edi,%%edi \n" + "mov %%ebp,%%ebp \n" + "mov %%esp,%%esp \n" + ".p2align 5 \n" + "mov %%r8d,%%r8d \n" + "mov %%r9d,%%r9d \n" + "mov %%r10d,%%r10d \n" + "mov %%r11d,%%r11d \n" + "mov %%r12d,%%r12d \n" + "mov %%r13d,%%r13d \n" + "mov %%r14d,%%r14d \n" + "mov %%r15d,%%r15d \n" + ".p2align 5 \n" + "lea (%%rax),%%eax \n" + "lea (%%rbx),%%ebx \n" + "lea (%%rcx),%%ecx \n" + "lea (%%rdx),%%edx \n" + "lea (%%rsi),%%esi \n" + "lea (%%rdi),%%edi \n" + "lea (%%rbp),%%ebp \n" + "lea (%%rsp),%%esp \n" + ".p2align 5 \n" + "lea (%%r8),%%r8d \n" + "lea (%%r9),%%r9d \n" + "lea (%%r10),%%r10d \n" + "lea (%%r11),%%r11d \n" + "lea (%%r12),%%r12d \n" + "lea (%%r13),%%r13d \n" + "lea (%%r14),%%r14d \n" + "lea (%%r15),%%r15d \n" + + ".p2align 5 \n" + "lea 0x10(%%rax),%%eax \n" + "lea 0x10(%%rbx),%%ebx \n" + "lea 0x10(%%rcx),%%ecx \n" + "lea 0x10(%%rdx),%%edx \n" + "lea 0x10(%%rsi),%%esi \n" + "lea 0x10(%%rdi),%%edi \n" + "lea 0x10(%%rbp),%%ebp \n" + "lea 0x10(%%rsp),%%esp \n" + ".p2align 5 \n" + "lea 0x10(%%r8),%%r8d \n" + "lea 0x10(%%r9),%%r9d \n" + "lea 0x10(%%r10),%%r10d \n" + "lea 0x10(%%r11),%%r11d \n" + "lea 0x10(%%r12),%%r12d \n" + "lea 0x10(%%r13),%%r13d \n" + "lea 0x10(%%r14),%%r14d \n" + "lea 0x10(%%r15),%%r15d \n" + + ".p2align 5 \n" + "add 0x10,%%eax \n" + "add 0x10,%%ebx \n" + "add 0x10,%%ecx \n" + "add 0x10,%%edx \n" + "add 0x10,%%esi \n" + "add 0x10,%%edi \n" + "add 0x10,%%ebp \n" + "add 0x10,%%esp \n" + ".p2align 5 \n" + "add 0x10,%%r8d \n" + "add 0x10,%%r9d \n" + "add 0x10,%%r10d \n" + "add 0x10,%%r11d \n" + "add 0x10,%%r12d \n" + "add 0x10,%%r13d \n" + "add 0x10,%%r14d \n" + "add 0x10,%%r15d \n" + + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // TESTING + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, + int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_RGB24TOARGBROW_SSSE3 +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) + MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMACCESS2(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_ARGBTOYJROW_SSSE3 +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOYJROW_SSSE3 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +// TODO(fbarchard): pass xmm constants to single block of assembly. +// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes +// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, +// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around +// and considered unsafe. +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToUJ), // %0 + "m"(kARGBToVJ), // %1 + "m"(kAddUVJ128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToUJ), // %0 + "m"(kARGBToVJ), // %1 + "m"(kAddUVJ128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6" +#endif + ); +} + +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, + uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6" +#endif + ); +} + +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kRGBAToU), // %0 + "m"(kRGBAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)) + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kRGBAToU), // %0 + "m"(kRGBAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBTOUVROW_SSSE3 + +#ifdef HAS_I422TOARGBROW_SSSE3 +#define UB 127 /* min(63,(int8)(2.018 * 64)) */ +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ + +struct { + vec8 kUVToB; // 0 + vec8 kUVToG; // 16 + vec8 kUVToR; // 32 + vec16 kUVBiasB; // 48 + vec16 kUVBiasG; // 64 + vec16 kUVBiasR; // 80 + vec16 kYSub16; // 96 + vec16 kYToRgb; // 112 + vec8 kVUToB; // 128 + vec8 kVUToG; // 144 + vec8 kVUToR; // 160 +} static SIMD_ALIGNED(kYuvConstants) = { + { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR }, + { 16, 16, 16, 16, 16, 16, 16, 16 }, + { YG, YG, YG, YG, YG, YG, YG, YG }, + { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } +}; + + +// Read 8 UV from 411 +#define READYUV444 \ + "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" + +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" + +// Read 2 UV from 411, upsample to 8 UV +#define READYUV411 \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpckldq %%xmm0,%%xmm0 \n" + +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 \ + "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ + "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" + +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Convert 8 pixels: 8 VU and 8 Y +#define YVUTORGB \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { +// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. +#if defined(__i386__) + asm volatile ( + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)); +#endif + + asm volatile ( +#if !defined(__i386__) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" +#endif + "sub %[u_buf],%[v_buf] \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" + "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) +#if !defined(__i386__) + , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) +#endif + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { +// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. +#if defined(__i386__) + asm volatile ( + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" + :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)); +#endif + + asm volatile ( +#if !defined(__i386__) + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" +#endif + "sub %[u_buf],%[v_buf] \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_raw]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" + "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_raw]"+r"(dst_raw), // %[dst_raw] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) +#if !defined(__i386__) + , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) +#endif + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV411 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV411 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_YTOARGBROW_SSE2 +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "mov $0x00100010,%%eax \n" + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "mov $0x004a004a,%%eax \n" + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "pmullw %%xmm2,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_YTOARGBROW_SSE2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %3,%%xmm5 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + LABELALIGN + "1: \n" + MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_SSE2 +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "lea " MEMLEA(-0x10,0) ",%0 \n" + LABELALIGN + "1: \n" + MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 + "movdqa %%xmm0,%%xmm1 \n" + "psllw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshufd $0x4e,%%xmm0,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1)",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_MIRRORROW_SSE2 + +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %4,%%xmm1 \n" + "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "sub $8,%3 \n" + "movlpd %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static uvec8 kARGBShuffleMirror = { + 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u +}; + +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" + "movdqa %3,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMIRRORROW_SSSE3 + +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} + +void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, + uint8* dst_uv, int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_X86 +void CopyRow_X86(const uint8* src, uint8* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "shr $0x2,%2 \n" + "rep movsl " MEMMOVESTRING(0,1) " \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + ); +} +#endif // HAS_COPYROW_X86 + +#ifdef HAS_COPYROW_ERMS +// Unaligned Multiple of 1. +void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep movsb " MEMMOVESTRING(0,1) " \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + ); +} +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" + "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +void SetRow_X86(uint8* dst, uint32 v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "shr $0x2,%1 \n" + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + size_t width_tmp = (size_t)(width); + uint32* d = (uint32*)(dst); + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(d), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); + dst += dst_stride; + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x1,%3 \n" + "je 91f \n" + "jl 99f \n" + + // 1 pixel loop until destination pointer is aligned. + "10: \n" + "test $0xf,%2 \n" + "je 19f \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 10b \n" + + "19: \n" + "add $1-4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "41: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + +// Blend 8 pixels at a time +// Shuffle table for reversing the bytes. + +// Same as SSE2, but replaces +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3,0F5h // 8 alpha words +// pshuflw xmm3, xmm3,0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha + +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x1,%3 \n" + "je 91f \n" + "jl 99f \n" + + // 1 pixel loop until destination pointer is aligned. + "10: \n" + "test $0xf,%2 \n" + "je 19f \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 10b \n" + + "19: \n" + "add $1-4,%3 \n" + "jl 49f \n" + "test $0xf,%0 \n" + "jne 41f \n" + "test $0xf,%1 \n" + "jne 41f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqa " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + "jmp 49f \n" + + // 4 pixel unaligned loop. + LABELALIGN + "41: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x8,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pshufhw $0xff,%%xmm0,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pshufhw $0xff,%%xmm1,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm4,%%xmm2 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha +static uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +// Attenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movzb " MEMACCESS2(0x03,0) ",%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone +static vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +static vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 +}; + +static vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + asm volatile ( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "sub $0x8,%1 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +// aligned to 16 bytes +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "sub $0x4,%1 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +// Aligned to 16 bytes. +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels, 4 pixels at a time. +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 + MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 + MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x8,%4 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) + "lea " MEMLEA(0x8,0) ",%0 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" + MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" + MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x8,%3 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) + "lea " MEMLEA(0x8,0) ",%0 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm1," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm6," MEMACCESS(2) " \n" + "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value, inclusive of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqa " MEMACCESS(2) ",%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "movd " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, + int count) { + asm volatile ( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop \n" + LABELALIGN + "4: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + "lea " MEMLEA(0x10,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* src_dudv, int width) { + intptr_t src_argb_stride_temp = src_argb_stride; + intptr_t temp = 0; + asm volatile ( + "movq " MEMACCESS(3) ",%%xmm2 \n" + "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1," MEMACCESS(2) " \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "sub $0x4,%4 \n" + "movq %%xmm0," MEMACCESS2(0x08,2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 + "sub $0x1,%4 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x04,2) ",%2 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "+r"(temp) // %5 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "paddw %%xmm2,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) + "movdqu %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "paddw %%xmm2,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSE2 + +#ifdef HAS_HALFROW_SSE2 +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 + "sub $0x10,%2 \n" + MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) + "lea " MEMLEA(0x10,0) ",%0 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(pix) // %2 + : "r"((intptr_t)(src_uv_stride)) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0" +#endif + ); +} +#endif // HAS_HALFROW_SSE2 + +#ifdef HAS_ARGBTOBAYERROW_SSSE3 +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + // NaCL caveat - assumes movd is from GPR + "movd %3,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "sub $0x8,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "g"(selector) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOBAYERROW_SSSE3 + +#ifdef HAS_ARGBTOBAYERGGROW_SSE2 +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrld $0x8,%%xmm0 \n" + "psrld $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x8,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOBAYERGGROW_SSE2 + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "sub $0x10,%2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + uintptr_t pixel_temp = 0u; + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "mov " MEMACCESS(4) ",%k2 \n" + "cmp $0x3000102,%k2 \n" + "je 3012f \n" + "cmp $0x10203,%k2 \n" + "je 123f \n" + "cmp $0x30201,%k2 \n" + "je 321f \n" + "cmp $0x2010003,%k2 \n" + "je 2103f \n" + + LABELALIGN + "1: \n" + "movzb " MEMACCESS(4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS(1) " \n" + "movzb " MEMACCESS2(0x1,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x1,1) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x2,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x2,1) " \n" + "movzb " MEMACCESS2(0x3,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x3,1) " \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "sub $0x1,%3 \n" + "jg 1b \n" + "jmp 99f \n" + + LABELALIGN + "123: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm1,%%xmm1 \n" + "pshuflw $0x1b,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 123b \n" + "jmp 99f \n" + + LABELALIGN + "321: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x39,%%xmm0,%%xmm0 \n" + "pshuflw $0x39,%%xmm0,%%xmm0 \n" + "pshufhw $0x39,%%xmm1,%%xmm1 \n" + "pshuflw $0x39,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 321b \n" + "jmp 99f \n" + + LABELALIGN + "2103: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x93,%%xmm0,%%xmm0 \n" + "pshuflw $0x93,%%xmm0,%%xmm0 \n" + "pshufhw $0x93,%%xmm1,%%xmm1 \n" + "pshuflw $0x93,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 2103b \n" + "jmp 99f \n" + + LABELALIGN + "3012: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0xc6,%%xmm0,%%xmm0 \n" + "pshuflw $0xc6,%%xmm0,%%xmm0 \n" + "pshufhw $0xc6,%%xmm1,%%xmm1 \n" + "pshuflw $0xc6,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 3012b \n" + + "99: \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+d"(pixel_temp), // %2 + "+r"(pix) // %3 + : "r"(shuffler) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(3) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} +#endif // HAS_I422TOYUY2ROW_SSE2 + +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS(3) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" + "addps " MEMACCESS(3) ",%%xmm0 \n" + "addps " MEMACCESS(3) ",%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" + "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" + "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" + "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels + "lea " MEMLEA(0x8,0) ",%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "vmovq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) +// TODO(fbarchard): declare ymm usage when applicable. + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "movzb " MEMACCESS2(-0x1,0) ",%1 \n" + MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x1,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + uintptr_t pixel_temp = 0u; + uintptr_t table_temp = 0u; + asm volatile ( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(2) ",%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS(2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS(3) " \n" + "movzb " MEMACCESS2(0x1,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x1,3) " \n" + "movzb " MEMACCESS2(0x2,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x2,3) " \n" + "movzb " MEMACCESS2(0x3,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x3,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x4,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x4,3) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x5,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x5,3) " \n" + "movzb " MEMACCESS2(0x6,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x6,3) " \n" + "movzb " MEMACCESS2(0x7,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x7,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x8,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x8,3) " \n" + "movzb " MEMACCESS2(0x9,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x9,3) " \n" + "movzb " MEMACCESS2(0xa,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xa,3) " \n" + "movzb " MEMACCESS2(0xb,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xb,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb " MEMACCESS2(0xc,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xc,3) " \n" + "movzb " MEMACCESS2(0xd,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xd,3) " \n" + "movzb " MEMACCESS2(0xe,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xe,3) " \n" + "movzb " MEMACCESS2(0xf,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xf,3) " \n" + "sub $0x4,%4 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "lea " MEMLEA(0x10,3) ",%3 \n" + "jg 1b \n" + : "+d"(pixel_temp), // %0 + "+a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc new file mode 100644 index 000000000..2cfacad1b --- /dev/null +++ b/third_party/libyuv/source/row_win.cc @@ -0,0 +1,7284 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB. +static const vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +// JPeg full range. +static const vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; + +static const vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static const vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static const vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static const vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +// Constants for BGRA. +static const vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static const vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static const vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR. +static const vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static const vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static const vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static const vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static const vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static const vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static const uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +static const vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static const uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static const uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; + +// Duplicates gray value 3 times and fills in alpha opaque. +__declspec(naked) __declspec(align(16)) +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, + int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRGB24ToARGB + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + sub ecx, 16 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRAWToARGB + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + sub ecx, 16 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. +__declspec(naked) __declspec(align(16)) +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 24 instructions +__declspec(naked) __declspec(align(16)) +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. +__declspec(naked) __declspec(align(16)) +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRGB24 + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRAW + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + align 4 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// TODO(fbarchard): Improve sign extension/packing. +__declspec(naked) __declspec(align(16)) +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + align 4 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + align 4 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrl xmm0, 4 + psrl xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) __declspec(align(16)) +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kARGBToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) __declspec(align(16)) +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 // Add .5 for rounding. + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTOYROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) __declspec(align(32)) +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToY + vbroadcastf128 ymm5, kAddY16 + vmovdqa ymm6, kPermdARGBToY_AVX + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vpaddb ymm0, ymm0, ymm5 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) __declspec(align(32)) +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToYJ + vbroadcastf128 ymm5, kAddYJ64 + vmovdqa ymm6, kPermdARGBToY_AVX + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. + vpaddw ymm2, ymm2, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYJROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kARGBToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kBGRAToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kBGRAToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kABGRToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kABGRToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToUJ + movdqa xmm6, kARGBToVJ + movdqa xmm5, kAddUVJ128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBTOUVROW_AVX2 +__declspec(naked) __declspec(align(32)) +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vbroadcastf128 ymm5, kAddUV128 + vbroadcastf128 ymm6, kARGBToV + vbroadcastf128 ymm7, kARGBToU + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw + vpaddb ymm0, ymm0, ymm5 // -> unsigned + + // step 3 - store 16 U and 16 V values + sub ecx, 32 + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToUJ + movdqa xmm6, kARGBToVJ + movdqa xmm5, kAddUVJ128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV444Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* convert to U and V */ + movdqa xmm0, [eax] // U + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + + movdqa xmm0, [eax] // V + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqa [edx + edi], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* convert to U and V */ + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqu [edx + edi], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kBGRAToU + movdqa xmm6, kBGRAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kBGRAToU + movdqa xmm6, kBGRAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kABGRToU + movdqa xmm6, kABGRToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kABGRToU + movdqa xmm6, kABGRToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ + +#define UB 127 /* min(63,(int8)(2.018 * 64)) */ +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +#ifdef HAS_I422TOARGBROW_AVX2 + +static const lvec8 kUVToB_AVX = { + UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, + UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB +}; +static const lvec8 kUVToR_AVX = { + UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, + UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR +}; +static const lvec8 kUVToG_AVX = { + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG +}; +static const lvec16 kYToRgb_AVX = { + YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG +}; +static const lvec16 kYSub16_AVX = { + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 +}; +static const lvec16 kUVBiasB_AVX = { + BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB +}; +static const lvec16 kUVBiasG_AVX = { + BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG +}; +static const lvec16 kUVBiasR_AVX = { + BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR +}; + +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpxor ymm4, ymm4, ymm4 + + align 4 + convertloop: + vmovq xmm0, qword ptr [esi] // U + vmovq xmm1, qword ptr [esi + edi] // V + lea esi, [esi + 8] + vpunpcklbw ymm0, ymm0, ymm1 // UV + vpermq ymm0, ymm0, 0xd8 + vpunpcklwd ymm0, ymm0, ymm0 // UVUV + vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV + vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV + vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV + vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed + vpsubw ymm1, ymm1, kUVBiasG_AVX + vpsubw ymm0, ymm0, kUVBiasR_AVX + + // Step 2: Find Y contribution to 16 R,G,B values + vmovdqu xmm3, [eax] // NOLINT + lea eax, [eax + 16] + vpermq ymm3, ymm3, 0xd8 + vpunpcklbw ymm3, ymm3, ymm4 + vpsubsw ymm3, ymm3, kYSub16_AVX + vpmullw ymm3, ymm3, kYToRgb_AVX + vpaddsw ymm2, ymm2, ymm3 // B += Y + vpaddsw ymm1, ymm1, ymm3 // G += Y + vpaddsw ymm0, ymm0, ymm3 // R += Y + vpsraw ymm2, ymm2, 6 + vpsraw ymm1, ymm1, 6 + vpsraw ymm0, ymm0, 6 + vpackuswb ymm2, ymm2, ymm2 // B + vpackuswb ymm1, ymm1, ymm1 // G + vpackuswb ymm0, ymm0, ymm0 // R + + // Step 3: Weave into ARGB + vpunpcklbw ymm2, ymm2, ymm1 // BG + vpermq ymm2, ymm2, 0xd8 + vpunpcklbw ymm0, ymm0, ymm5 // RA + vpermq ymm0, ymm0, 0xd8 + vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels + vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + + pop edi + pop esi + ret + } +} +#endif // HAS_I422TOARGBROW_AVX2 + +#ifdef HAS_I422TOARGBROW_SSSE3 + +static const vec8 kUVToB = { + UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB +}; + +static const vec8 kUVToR = { + UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR +}; + +static const vec8 kUVToG = { + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG +}; + +static const vec8 kVUToB = { + VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, +}; + +static const vec8 kVUToR = { + VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, +}; + +static const vec8 kVUToG = { + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, +}; + +static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; +static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; +static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; +static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; +static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; + +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. + +// Read 8 UV from 444. +#define READYUV444 __asm { \ + __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + } + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 2 UV from 411, upsample to 8 UV. +#define READYUV411 __asm { \ + __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ + __asm movd xmm0, ebx \ + __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movd xmm1, ebx \ + __asm lea esi, [esi + 2] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Convert 8 pixels: 8 VU and 8 Y. +#define YVUTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// 8 pixels, dest aligned 16. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV444 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb24 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + movdqa xmm5, kShuffleMaskARGBToRGB24_0 + movdqa xmm6, kShuffleMaskARGBToRGB24 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RRGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. + pshufb xmm1, xmm6 // Pack into first 12 bytes. + palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 + movq qword ptr [edx], xmm0 // First 8 bytes + movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // raw + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + movdqa xmm5, kShuffleMaskARGBToRAW_0 + movdqa xmm6, kShuffleMaskARGBToRAW + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RRGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. + pshufb xmm1, xmm6 // Pack into first 12 bytes. + palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 + movq qword ptr [edx], xmm0 // First 8 bytes + movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest unaligned. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToRGB565Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb565_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb565 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + psrld xmm5, 27 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + psrld xmm6, 26 + pslld xmm6, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pslld xmm7, 11 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RRGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + + // Step 3b: RRGB -> RGB565 + movdqa xmm3, xmm0 // B first 4 pixels of argb + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm3, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm3, xmm5 // B + pand xmm2, xmm6 // G + pand xmm0, xmm7 // R + por xmm3, xmm2 // BG + por xmm0, xmm3 // BGR + movdqa xmm3, xmm1 // B next 4 pixels of argb + movdqa xmm2, xmm1 // G + pslld xmm1, 8 // R + psrld xmm3, 3 // B + psrld xmm2, 5 // G + psrad xmm1, 16 // R + pand xmm3, xmm5 // B + pand xmm2, xmm6 // G + pand xmm1, xmm7 // R + por xmm3, xmm2 // BG + por xmm1, xmm3 // BGR + packssdw xmm0, xmm1 + sub ecx, 8 + movdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. +__declspec(naked) __declspec(align(16)) +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV411 // modifies EBX + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, unaligned. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV444 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, unaligned. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, unaligned. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. +__declspec(naked) __declspec(align(16)) +void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV411 // modifies EBX + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqa [edx], xmm5 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqa [edx], xmm2 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqu [edx], xmm2 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqa [edx], xmm5 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_YTOARGBROW_SSE2 +__declspec(naked) __declspec(align(16)) +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + pxor xmm5, xmm5 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + mov eax, 0x00100010 + movd xmm3, eax + pshufd xmm3, xmm3, 0 + mov eax, 0x004a004a // 74 + movd xmm2, eax + pshufd xmm2, xmm2,0 + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + align 4 + convertloop: + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm5 // 0.Y + psubusw xmm0, xmm3 + pmullw xmm0, xmm2 + psrlw xmm0, 6 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm0, xmm4 + por xmm1, xmm4 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_YTOARGBROW_SSE2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, kShuffleMirror + lea eax, [eax - 16] + + align 4 + convertloop: + movdqa xmm0, [eax + ecx] + pshufb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec8 kShuffleMirror_AVX2 = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vmovdqa ymm5, kShuffleMirror_AVX2 + lea eax, [eax - 32] + + align 4 + convertloop: + vmovdqu ymm0, [eax + ecx] + vpshufb ymm0, ymm0, ymm5 + vpermq ymm0, ymm0, 0x4e // swap high and low halfs + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORROW_SSE2 +// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 +// version can not. +__declspec(naked) __declspec(align(16)) +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16] + + align 4 + convertloop: + movdqu xmm0, [eax + ecx] + movdqa xmm1, xmm0 // swap bytes + psllw xmm0, 8 + psrlw xmm1, 8 + por xmm0, xmm1 + pshuflw xmm0, xmm0, 0x1b // swap words + pshufhw xmm0, xmm0, 0x1b + pshufd xmm0, xmm0, 0x4e // swap qwords + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSE2 + +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm1 + sub ecx, 8 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kARGBShuffleMirror = { + 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u +}; + +__declspec(naked) __declspec(align(16)) +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. + movdqa xmm5, kARGBShuffleMirror + + align 4 + convertloop: + movdqa xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm5 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} +#endif // HAS_ARGBMIRRORROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) __declspec(align(16)) +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 32] + vmovdqa ymm5, kARGBShuffleMirror_AVX2 + + align 4 + convertloop: + vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order + sub ecx, 8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +__declspec(naked) __declspec(align(16)) +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqa [edx], xmm0 + movdqa [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_SPLITUVROW_AVX2 +__declspec(naked) __declspec(align(16)) +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpackuswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vmovdqu [edx], ymm0 + vmovdqu [edx + edi], ymm2 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) __declspec(align(16)) +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 U's + movdqa xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqa [edi], xmm0 + movdqa [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, + uint8* dst_uv, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // read 16 U's + movdqu xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqu [edi], xmm0 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +__declspec(naked) __declspec(align(16)) +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 4 + convertloop: + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's + lea eax, [eax + 32] + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 + vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 + vmovdqu [edi], ymm1 + vmovdqu [edi + 32], ymm2 + lea edi, [edi + 64] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_COPYROW_SSE2 +// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. +__declspec(naked) __declspec(align(16)) +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + ret + } +} +#endif // HAS_COPYROW_SSE2 + +// Unaligned Multiple of 1. +__declspec(naked) __declspec(align(16)) +void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + rep movsb + mov edi, edx + mov esi, eax + ret + } +} + +#ifdef HAS_COPYROW_X86 +__declspec(naked) __declspec(align(16)) +void CopyRow_X86(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + shr ecx, 2 + rep movsd + mov edi, edx + mov esi, eax + ret + } +} +#endif // HAS_COPYROW_X86 + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movdqa xmm2, [eax] + movdqa xmm3, [eax + 16] + lea eax, [eax + 32] + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +__declspec(naked) __declspec(align(16)) +void SetRow_X86(uint8* dst, uint32 v32, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // count + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// SetRow32 writes 'count' words using a 32 bit value repeated. +__declspec(naked) __declspec(align(16)) +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + __asm { + push esi + push edi + push ebp + mov edi, [esp + 12 + 4] // dst + mov eax, [esp + 12 + 8] // v32 + mov ebp, [esp + 12 + 12] // width + mov edx, [esp + 12 + 16] // dst_stride + mov esi, [esp + 12 + 20] // height + lea ecx, [ebp * 4] + sub edx, ecx // stride - width * 4 + + align 4 + convertloop: + mov ecx, ebp + rep stosd + add edi, edx + sub esi, 1 + jg convertloop + + pop ebp + pop edi + pop esi + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_AVX2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_AVX2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + ret + vzeroupper + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 1 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + sub ecx, 1 + je convertloop1 // only 1 pixel? + jl convertloop1b + + // 1 pixel loop until destination pointer is aligned. + alignloop1: + test edx, 15 // aligned? + je alignloop1b + movd xmm3, [eax] + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge alignloop1 + + alignloop1b: + add ecx, 1 - 4 + jl convertloop4b + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; +// Same as SSE2, but replaces: +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3, 0F5h // 8 alpha words +// pshuflw xmm3, xmm3, 0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha +// Blend 8 pixels at a time. + +__declspec(naked) __declspec(align(16)) +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 0x0001 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + sub ecx, 1 + je convertloop1 // only 1 pixel? + jl convertloop1b + + // 1 pixel loop until destination pointer is aligned. + alignloop1: + test edx, 15 // aligned? + je alignloop1b + movd xmm3, [eax] + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge alignloop1 + + alignloop1b: + add ecx, 1 - 4 + jl convertloop4b + + test eax, 15 // unaligned? + jne convertuloop4 + test esi, 15 // unaligned? + jne convertuloop4 + + // 4 pixel loop. + convertloop4: + movdqa xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqa xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqa xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + jmp convertloop4b + + // 4 pixel unaligned loop. + convertuloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertuloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff + psrld xmm5, 8 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm0 // first 2 + pshufhw xmm2, xmm0, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm0, xmm2 // rgb * a + movdqa xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm1 // next 2 pixels + pshufhw xmm2, xmm1, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm1, xmm2 // rgb * a + movdqa xmm2, [eax] // alphas + lea eax, [eax + 16] + psrlw xmm0, 8 + pand xmm2, xmm4 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + pand xmm0, xmm5 // keep original alphas + por xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, kShuffleAlpha0 + movdqa xmm5, kShuffleAlpha1 + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const ulvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, + 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, + 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, +}; +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm4, kShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + align 4 + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpshufb ymm2, ymm0, ymm4 // low 4 alphas + vpshufb ymm3, ymm1, ymm4 // high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * a + vpmulhuw ymm1, ymm1, ymm3 // rgb * a + vpand ymm6, ymm6, ymm5 // isolate alpha + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpor ymm0, ymm0, ymm6 // copy original alpha + sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb0 + mov edx, [esp + 8 + 8] // dst_argb + mov ecx, [esp + 8 + 12] // width + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 3] // first alpha + movzx edi, byte ptr [eax + 7] // second alpha + punpcklbw xmm0, xmm0 // first 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm0, xmm2 // rgb * a + + movdqu xmm1, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 11] // third alpha + movzx edi, byte ptr [eax + 15] // forth alpha + punpckhbw xmm1, xmm1 // next 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm1, xmm2 // rgb * a + lea eax, [eax + 16] + + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const ulvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, +}; +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. +// USE_GATHER is not on by default, due to being a slow instruction. +#ifdef USE_GATHER +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 + + align 4 + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + vzeroupper + ret + } +} +#else // USE_GATHER +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 + + push esi + push edi + + align 4 + convertloop: + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 + vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + // end of VPGATHER + + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // USE_GATHER +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. +__declspec(naked) __declspec(align(16)) +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + align 4 + convertloop: + movdqa xmm0, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // Add .5 for rounding. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 G bytes + movdqa xmm2, [eax] // A + movdqa xmm3, [eax + 16] + lea eax, [eax + 32] + psrld xmm2, 24 + psrld xmm3, 24 + packuswb xmm2, xmm3 + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone. +static const vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +static const vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 +}; + +static const vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +__declspec(naked) __declspec(align(16)) +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ + movdqa xmm2, kARGBToSepiaB + movdqa xmm3, kARGBToSepiaG + movdqa xmm4, kARGBToSepiaR + + align 4 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddw xmm0, xmm6 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 B values + movdqa xmm5, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqa xmm5, [eax] // R + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqa xmm6, [eax] // A + movdqa xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + sub ecx, 8 + movdqa [eax], xmm0 + movdqa [eax + 16], xmm1 + lea eax, [eax + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. +__declspec(naked) __declspec(align(16)) +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ + + align 4 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm7, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm7, xmm2 + movdqa xmm6, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm6, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqa xmm1, [eax] // R + movdqa xmm7, [eax + 16] + pmaddubsw xmm1, xmm4 + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R + movdqa xmm6, [eax] // A + movdqa xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm6 + lea eax, [eax + 32] + lea edx, [edx + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqa xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqa xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + sub ecx, 4 + movdqa [eax], xmm0 + lea eax, [eax + 16] + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// TODO(fbarchard): Port this to posix, neon and other math functions. +__declspec(naked) __declspec(align(16)) +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + sub ecx, 4 + jl convertloop49 + + align 4 + convertloop4: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + + convertloop49: + add ecx, 4 - 1 + jl convertloop19 + + convertloop1: + movd xmm0, [eax] // read 1 pixels from src_argb0 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop19: + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + psubusb xmm0, xmm1 // src_argb0 - src_argb1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + vpxor ymm5, ymm5, ymm5 // constant 0 + + align 4 + convertloop: + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpackuswb ymm0, ymm0, ymm1 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + align 4 + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + align 4 + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) __declspec(align(16)) +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) __declspec(align(16)) +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +__declspec(naked) __declspec(align(16)) +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA + por xmm2, xmm5 + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA + por xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm1 + movdqa [edx + 16], xmm2 + movdqa [edx + 32], xmm3 + movdqa [edx + 48], xmm0 + lea edx, [edx + 64] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +__declspec(naked) __declspec(align(16)) +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +__declspec(naked) __declspec(align(16)) +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + movdqa xmm2, xmm0 + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA + punpcklbw xmm3, xmm5 + punpckhbw xmm0, xmm5 + movdqa xmm4, xmm1 // YS + punpcklbw xmm4, xmm2 + punpckhbw xmm1, xmm2 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 + sub ecx, 16 + movdqa [edx], xmm6 + movdqa [edx + 16], xmm4 + movdqa [edx + 32], xmm7 + movdqa [edx + 48], xmm1 + lea edx, [edx + 64] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. +// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte +// aligned. +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, + int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm5, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. + align 4 + s4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + + // 4 pixel loop + align 4 + l4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movdqa xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + align 4 + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqa xmm2, [esi] // previous row above. + paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqa xmm3, [esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqa xmm4, [esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqa xmm5, [esi + 48] + lea esi, [esi + 64] + paddd xmm5, xmm0 + + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + movdqa [edx + 32], xmm4 + movdqa [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [esi] + lea esi, [esi + 16] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) __declspec(align(16)) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + align 4 + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + sub ecx, 4 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + sub edi, esi + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + vmovd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + vmovd xmm5, eax // low fraction 128..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vpxor ymm0, ymm0, ymm0 + vpermd ymm5, ymm0, ymm5 + + align 4 + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 // mutates + vpmaddubsw ymm0, ymm0, ymm5 + vpmaddubsw ymm1, ymm1, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm1, ymm1, 7 + vpackuswb ymm0, ymm0, ymm1 // unmutates + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + vmovdqu ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi] + vpavgb ymm0, ymm0, [esi] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + align 4 + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + movdqa xmm1, [esi] + movdqa xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + movdqa xmm0, [esi] + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + cmp eax, 64 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + cmp eax, 192 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. + + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 + psrlw xmm5, 1 + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + punpcklqdq xmm5, xmm5 + pxor xmm4, xmm4 + + align 4 + xloop: + movdqa xmm0, [esi] // row0 + movdqa xmm2, [esi + edx] // row1 + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 + paddw xmm3, xmm3 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + movdqa xmm1, [esi] + movdqa xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + movdqa xmm0, [esi] + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSE2 + +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + align 4 + xloop: + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + movdqu xmm0, [esi] + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + cmp eax, 64 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + cmp eax, 192 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. + + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 + psrlw xmm5, 1 + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + punpcklqdq xmm5, xmm5 + pxor xmm4, xmm4 + + align 4 + xloop: + movdqu xmm0, [esi] // row0 + movdqu xmm2, [esi + edx] // row1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 + paddw xmm3, xmm3 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + movdqu xmm0, [esi] + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSE2 + +__declspec(naked) __declspec(align(16)) +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // src_uv_stride + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + sub edi, eax + + align 4 + convertloop: + movdqa xmm0, [eax] + pavgb xmm0, [eax + edx] + sub ecx, 16 + movdqa [eax + edi], xmm0 + lea eax, [eax + 16] + jg convertloop + pop edi + ret + } +} + +#ifdef HAS_HALFROW_AVX2 +__declspec(naked) __declspec(align(16)) +void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // src_uv_stride + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + sub edi, eax + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vpavgb ymm0, ymm0, [eax + edx] + sub ecx, 32 + vmovdqu [eax + edi], ymm0 + lea eax, [eax + 32] + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_HALFROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + movd xmm5, [esp + 12] // selector + mov ecx, [esp + 16] // pix + pshufd xmm5, xmm5, 0 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + punpckldq xmm0, xmm1 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + ret + } +} + +// Specialized ARGB to Bayer that just isolates G channel. +__declspec(naked) __declspec(align(16)) +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + // selector + mov ecx, [esp + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrld xmm0, 8 // Move green to bottom. + psrld xmm1, 8 + pand xmm0, xmm5 + pand xmm1, xmm5 + packssdw xmm0, xmm1 + packuswb xmm0, xmm1 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + ret + } +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqa xmm5, [ecx] + mov ecx, [esp + 16] // pix + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqa xmm5, [ecx] + mov ecx, [esp + 16] // pix + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + sub ecx, 8 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + ret + } +} + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // pix + + align 4 + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpshufb ymm0, ymm0, ymm5 + vpshufb ymm1, ymm1, ymm5 + sub ecx, 16 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + jg wloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + push ebx + push esi + mov eax, [esp + 8 + 4] // src_argb + mov edx, [esp + 8 + 8] // dst_argb + mov esi, [esp + 8 + 12] // shuffler + mov ecx, [esp + 8 + 16] // pix + pxor xmm5, xmm5 + + mov ebx, [esi] // shuffler + cmp ebx, 0x03000102 + je shuf_3012 + cmp ebx, 0x00010203 + je shuf_0123 + cmp ebx, 0x00030201 + je shuf_0321 + cmp ebx, 0x02010003 + je shuf_2103 + + // TODO(fbarchard): Use one source pointer and 3 offsets. + shuf_any1: + movzx ebx, byte ptr [esi] + movzx ebx, byte ptr [eax + ebx] + mov [edx], bl + movzx ebx, byte ptr [esi + 1] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 1], bl + movzx ebx, byte ptr [esi + 2] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 2], bl + movzx ebx, byte ptr [esi + 3] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 3], bl + lea eax, [eax + 4] + lea edx, [edx + 4] + sub ecx, 1 + jg shuf_any1 + jmp shuf99 + + align 4 + shuf_0123: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB + pshuflw xmm0, xmm0, 01Bh + pshufhw xmm1, xmm1, 01Bh + pshuflw xmm1, xmm1, 01Bh + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0123 + jmp shuf99 + + align 4 + shuf_0321: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB + pshuflw xmm0, xmm0, 039h + pshufhw xmm1, xmm1, 039h + pshuflw xmm1, xmm1, 039h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0321 + jmp shuf99 + + align 4 + shuf_2103: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA + pshuflw xmm0, xmm0, 093h + pshufhw xmm1, xmm1, 093h + pshuflw xmm1, xmm1, 093h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_2103 + jmp shuf99 + + align 4 + shuf_3012: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB + pshuflw xmm0, xmm0, 0C6h + pshufhw xmm1, xmm1, 0C6h + pshuflw xmm1, xmm1, 0C6h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_3012 + + shuf99: + pop esi + pop ebx + ret + } +} + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... + +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) __declspec(align(16)) +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 4 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 4 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqu [edi], xmm1 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. + align 4 + convertloop: +// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel +// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] + punpcklbw xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 + cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 + movdqa xmm1, xmm0 // X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] + movdqa xmm2, xmm1 + movdqa xmm6, xmm5 + mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 + mulps xmm1, xmm2 // X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] + addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 + addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 + cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 + packuswb xmm0, xmm0 + sub ecx, 2 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 + mov ecx, [esp + 16] /* width */ + + // 2 pixel loop. + align 4 + convertloop: + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + sub ecx, 2 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +__declspec(naked) __declspec(align(16)) +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 + + // 4 pixel loop. + align 4 + convertloop: + movdqu xmm0, qword ptr [eax] // generate luma ptr + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha. + mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. + mov byte ptr [edi + 15], dl + + sub ecx, 4 + lea eax, [eax + 16] + lea edi, [edi + 16] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c deleted file mode 100644 index 1809300c0..000000000 --- a/third_party/libyuv/source/scale.c +++ /dev/null @@ -1,3884 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "third_party/libyuv/include/libyuv/scale.h" - -#include -#include - -#include "third_party/libyuv/include/libyuv/cpu_id.h" -#include "third_party/libyuv/source/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -/* - * Note: Defining YUV_DISABLE_ASM allows to use c version. - */ -//#define YUV_DISABLE_ASM - -#if defined(_MSC_VER) -#define ALIGN16(var) __declspec(align(16)) var -#else -#define ALIGN16(var) var __attribute__((aligned(16))) -#endif - -// Note: A Neon reference manual -// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html -// Note: Some SSE2 reference manuals -// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf - -// Set the following flag to true to revert to only -// using the reference implementation ScalePlaneBox(), and -// NOT the optimized versions. Useful for debugging and -// when comparing the quality of the resulting YUV planes -// as produced by the optimized and non-optimized versions. - -static int use_reference_impl_ = 0; - -void SetUseReferenceImpl(int use) { - use_reference_impl_ = use; -} - -// ScaleRowDown2Int also used by planar functions - -/** - * NEON downscalers with interpolation. - * - * Provided by Fritz Koenig - * - */ - -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) -#define HAS_SCALEROWDOWN2_NEON -void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 - "vst1.u8 {q0}, [%1]! \n" // store even pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} - -void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "add %1, %0 \n" // change the stride to row 2 pointer - "1: \n" - "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment - "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.u8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" // 16 processed per loop - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define HAS_SCALEROWDOWN4_NEON -static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - "vld2.u8 {d0, d1}, [%0]! \n" - "vtrn.u8 d1, d0 \n" - "vshrn.u16 d0, q0, #8 \n" - "vst1.u32 {d0[1]}, [%1]! \n" - - "subs %2, #4 \n" - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc" - ); -} - -static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "add r4, %0, %3 \n" - "add r5, r4, %3 \n" - "add %3, r5, %3 \n" - "1: \n" - "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data - "vld1.u8 {q1}, [r4]! \n" - "vld1.u8 {q2}, [r5]! \n" - "vld1.u8 {q3}, [%3]! \n" - - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - - "vpaddl.u16 q0, q0 \n" - - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - - "vmovn.u16 d0, q0 \n" - "vst1.u32 {d0[0]}, [%1]! \n" - - "subs %2, #4 \n" - "bhi 1b \n" - - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" - ); -} - -#define HAS_SCALEROWDOWN34_NEON -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. -static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vmov d2, d3 \n" // order needs to be d0, d1, d2 - "vst3.u8 {d0, d1, d2}, [%1]! \n" - "subs %2, #24 \n" - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc" - ); -} - -static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" - - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" - - // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" - - "vst3.u8 {d0, d1, d2}, [%1]! \n" - - "subs %2, #24 \n" - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" - ); -} - -static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" - - "vst3.u8 {d0, d1, d2}, [%1]! \n" - - "subs %2, #24 \n" - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" - ); -} - -#define HAS_SCALEROWDOWN38_NEON -const uint8 shuf38[16] __attribute__ ((aligned(16))) = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -const uint8 shuf38_2[16] __attribute__ ((aligned(16))) = - { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; - -// 32 -> 12 -static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vld1.u8 {q3}, [%3] \n" - "1: \n" - "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.u8 {d4}, [%1]! \n" - "vst1.u32 {d5[0]}, [%1]! \n" - "subs %2, #12 \n" - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(shuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" - ); -} - -// 32x3 -> 12x1 -static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vld1.u16 {q13}, [%4] \n" - "vld1.u8 {q14}, [%5] \n" - "vld1.u8 {q15}, [%6] \n" - "add r4, %0, %3, lsl #1 \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q13 \n" - "vmovn.u16 d4, q2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q15 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.u8 {d3}, [%1]! \n" - "vst1.u32 {d4[0]}, [%1]! \n" - "subs %2, #12 \n" - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(mult38_div6), // %4 - "r"(shuf38_2), // %5 - "r"(mult38_div9) // %6 - : "r4", "q0", "q1", "q2", "q3", "q8", "q9", - "q13", "q14", "q15", "memory", "cc" - ); -} - -// 32x2 -> 12x1 -static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vld1.u16 {q13}, [%4] \n" - "vld1.u8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q13 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.u8 {d3}, [%1]! \n" - "vst1.u32 {d4[0]}, [%1]! \n" - "subs %2, #12 \n" - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(mult38_div6), // %4 - "r"(shuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" - ); -} - -/** - * SSE2 downscalers with interpolation. - * - * Provided by Frank Barchard (fbarchard@google.com) - * - */ - -// Constants for SSE2 code -#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ - !defined(YUV_DISABLE_ASM) -#if defined(_MSC_VER) -#define TALIGN16(t, var) __declspec(align(16)) t _ ## var -#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__) -#define TALIGN16(t, var) t var __attribute__((aligned(16))) -#else -#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) -#endif - -#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \ - defined(__i386__) -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".globl _" #name " \n" \ -"_" #name ": \n" -#else -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".global " #name " \n" \ -#name ": \n" -#endif - - -// Offsets for source bytes 0 to 9 -//extern "C" -TALIGN16(const uint8, shuf0[16]) = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -//extern "C" -TALIGN16(const uint8, shuf1[16]) = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -//extern "C" -TALIGN16(const uint8, shuf2[16]) = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 0 to 10 -//extern "C" -TALIGN16(const uint8, shuf01[16]) = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -//extern "C" -TALIGN16(const uint8, shuf11[16]) = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -//extern "C" -TALIGN16(const uint8, shuf21[16]) = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; - -// Coefficients for source bytes 0 to 10 -//extern "C" -TALIGN16(const uint8, madd01[16]) = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; - -// Coefficients for source bytes 10 to 21 -//extern "C" -TALIGN16(const uint8, madd11[16]) = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; - -// Coefficients for source bytes 21 to 31 -//extern "C" -TALIGN16(const uint8, madd21[16]) = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; - -// Coefficients for source bytes 21 to 31 -//extern "C" -TALIGN16(const int16, round34[8]) = - { 2, 2, 2, 2, 2, 2, 2, 2 }; - -//extern "C" -TALIGN16(const uint8, shuf38a[16]) = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -//extern "C" -TALIGN16(const uint8, shuf38b[16]) = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 0,1,2 -//extern "C" -TALIGN16(const uint8, shufac0[16]) = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 3,4,5 -//extern "C" -TALIGN16(const uint8, shufac3[16]) = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x3 and 2x3 -//extern "C" -TALIGN16(const uint16, scaleac3[8]) = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; - -// Arrange first value for pixels 0,1,2,3,4,5 -//extern "C" -TALIGN16(const uint8, shufab0[16]) = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; - -// Arrange second value for pixels 0,1,2,3,4,5 -//extern "C" -TALIGN16(const uint8, shufab1[16]) = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; - -// Arrange third value for pixels 0,1,2,3,4,5 -//extern "C" -TALIGN16(const uint8, shufab2[16]) = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x2 and 2x2 -//extern "C" -TALIGN16(const uint16, scaleab2[8]) = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; -#endif - -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER) - -#define HAS_SCALEROWDOWN2_SSE2 -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) -static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja wloop - - ret - } -} -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) -void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja wloop - - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN4_SSE2 -// Point samples 32 pixels to 8 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - // src_stride ignored - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x000000ff - psrld xmm5, 24 - - wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - lea esi, [esi + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 8 - ja wloop - - popad - ret - } -} - -// Blends 32x4 rectangle to 8x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - lea edx, [ebx + ebx * 2] // src_stride * 3 - - wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - movdqa xmm2, [esi + ebx] - movdqa xmm3, [esi + ebx + 16] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, [esi + ebx * 2] - movdqa xmm3, [esi + ebx * 2 + 16] - movdqa xmm4, [esi + edx] - movdqa xmm5, [esi + edx + 16] - lea esi, [esi + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm7 - pand xmm3, xmm7 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - movdqa xmm2, xmm0 // average columns (16 to 8 pixels) - psrlw xmm0, 8 - pand xmm2, xmm7 - pavgw xmm0, xmm2 - packuswb xmm0, xmm0 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 8 - ja wloop - - popad - ret - } -} - -#define HAS_SCALEROWDOWN8_SSE2 -// Point samples 32 pixels to 4 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) -static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - // src_stride ignored - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes - psrlq xmm5, 56 - - wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - lea esi, [esi + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 // 32->16 - packuswb xmm0, xmm0 // 16->8 - packuswb xmm0, xmm0 // 8->4 - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] - sub ecx, 4 - ja wloop - - popad - ret - } -} - -// Blends 32x8 rectangle to 4x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) -static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - lea edx, [ebx + ebx * 2] // src_stride * 3 - pxor xmm7, xmm7 - - wloop: - movdqa xmm0, [esi] // average 8 rows to 1 - movdqa xmm1, [esi + 16] - movdqa xmm2, [esi + ebx] - movdqa xmm3, [esi + ebx + 16] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - movdqa xmm2, [esi + ebx * 2] - movdqa xmm3, [esi + ebx * 2 + 16] - movdqa xmm4, [esi + edx] - movdqa xmm5, [esi + edx + 16] - lea ebp, [esi + ebx * 4] - lea esi, [esi + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, [ebp] - movdqa xmm3, [ebp + 16] - movdqa xmm4, [ebp + ebx] - movdqa xmm5, [ebp + ebx + 16] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - movdqa xmm4, [ebp + ebx * 2] - movdqa xmm5, [ebp + ebx * 2 + 16] - movdqa xmm6, [ebp + edx] - pavgb xmm4, xmm6 - movdqa xmm6, [ebp + edx + 16] - pavgb xmm5, xmm6 - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - psadbw xmm0, xmm7 // average 32 pixels to 4 - psadbw xmm1, xmm7 - pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 - pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx - por xmm0, xmm1 // -> 3201 - psrlw xmm0, 3 - packuswb xmm0, xmm0 - packuswb xmm0, xmm0 - movd dword ptr [edi], xmm0 - - lea edi, [edi + 4] - sub ecx, 4 - ja wloop - - popad - ret - } -} - -#define HAS_SCALEROWDOWN34_SSSE3 -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - // src_stride ignored - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm3, _shuf0 - movdqa xmm4, _shuf1 - movdqa xmm5, _shuf2 - - wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - lea esi, [esi + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edi], xmm0 - movq qword ptr [edi + 8], xmm1 - movq qword ptr [edi + 16], xmm2 - lea edi, [edi + 24] - sub ecx, 24 - ja wloop - - popad - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 round34 - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm2, _shuf01 - movdqa xmm3, _shuf11 - movdqa xmm4, _shuf21 - movdqa xmm5, _madd01 - movdqa xmm6, _madd11 - movdqa xmm7, _round34 - - wloop: - movdqa xmm0, [esi] // pixels 0..7 - movdqa xmm1, [esi+ebx] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edi], xmm0 - movdqu xmm0, [esi+8] // pixels 8..15 - movdqu xmm1, [esi+ebx+8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edi+8], xmm0 - movdqa xmm0, [esi+16] // pixels 16..23 - movdqa xmm1, [esi+ebx+16] - lea esi, [esi+32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, _madd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edi+16], xmm0 - lea edi, [edi+24] - sub ecx, 24 - ja wloop - - popad - ret - } -} - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm2, _shuf01 - movdqa xmm3, _shuf11 - movdqa xmm4, _shuf21 - movdqa xmm5, _madd01 - movdqa xmm6, _madd11 - movdqa xmm7, _round34 - - wloop: - movdqa xmm0, [esi] // pixels 0..7 - movdqa xmm1, [esi+ebx] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edi], xmm0 - movdqu xmm0, [esi+8] // pixels 8..15 - movdqu xmm1, [esi+ebx+8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edi+8], xmm0 - movdqa xmm0, [esi+16] // pixels 16..23 - movdqa xmm1, [esi+ebx+16] - lea esi, [esi+32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, _madd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edi+16], xmm0 - lea edi, [edi+24] - sub ecx, 24 - ja wloop - - popad - ret - } -} - -#define HAS_SCALEROWDOWN38_SSSE3 -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm4, _shuf38a - movdqa xmm5, _shuf38b - - xloop: - movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 - movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 - lea esi, [esi + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - movq qword ptr [edi], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edi + 8], xmm1 - lea edi, [edi + 12] - sub ecx, 12 - ja xloop - - popad - ret - } -} - -// Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) -static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm4, _shufac0 - movdqa xmm5, _shufac3 - movdqa xmm6, _scaleac3 - pxor xmm7, xmm7 - - xloop: - movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 - movdqa xmm2, [esi + edx] - movhlps xmm1, xmm0 - movhlps xmm3, xmm2 - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - movdqa xmm2, [esi + edx * 2] - lea esi, [esi + 16] - movhlps xmm3, xmm2 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - - movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 - psrldq xmm0, 2 - paddusw xmm2, xmm0 - psrldq xmm0, 2 - paddusw xmm2, xmm0 - pshufb xmm2, xmm4 - - movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 - psrldq xmm1, 2 - paddusw xmm3, xmm1 - psrldq xmm1, 2 - paddusw xmm3, xmm1 - pshufb xmm3, xmm5 - paddusw xmm2, xmm3 - - pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 - packuswb xmm2, xmm2 - - movd [edi], xmm2 // write 6 pixels - pextrw eax, xmm2, 2 - mov [edi + 4], ax - lea edi, [edi + 6] - sub ecx, 6 - ja xloop - - popad - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) -static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm4, _shufab0 - movdqa xmm5, _shufab1 - movdqa xmm6, _shufab2 - movdqa xmm7, _scaleab2 - - xloop: - movdqa xmm2, [esi] // average 2 rows into xmm2 - pavgb xmm2, [esi + edx] - lea esi, [esi + 16] - - movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 - pshufb xmm0, xmm4 - movdqa xmm1, xmm2 - pshufb xmm1, xmm5 - paddusw xmm0, xmm1 - pshufb xmm2, xmm6 - paddusw xmm0, xmm2 - - pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 - packuswb xmm0, xmm0 - - movd [edi], xmm0 // write 6 pixels - pextrw eax, xmm0, 2 - mov [edi + 4], ax - lea edi, [edi + 6] - sub ecx, 6 - ja xloop - - popad - ret - } -} - -#define HAS_SCALEADDROWS_SSE2 - -// Reads 8xN bytes and produces 16 shorts at a time. -__declspec(naked) -static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, - int src_height) { - __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - mov ebx, [esp + 32 + 20] // height - pxor xmm5, xmm5 - dec ebx - - xloop: - // first row - movdqa xmm2, [esi] - lea eax, [esi + edx] - movhlps xmm3, xmm2 - mov ebp, ebx - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - - // sum remaining rows - yloop: - movdqa xmm0, [eax] // read 16 pixels - lea eax, [eax + edx] // advance to next row - movhlps xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - paddusw xmm2, xmm0 // sum 16 words - paddusw xmm3, xmm1 - sub ebp, 1 - ja yloop - - movdqa [edi], xmm2 - movdqa [edi + 16], xmm3 - lea edi, [edi + 32] - lea esi, [esi + 16] - - sub ecx, 16 - ja xloop - - popad - ret - } -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. -#define HAS_SCALEFILTERROWS_SSE2 -__declspec(naked) -static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - cmp eax, 0 - je xloop1 - cmp eax, 128 - je xloop2 - - movd xmm6, eax // xmm6 = y fraction - punpcklwd xmm6, xmm6 - pshufd xmm6, xmm6, 0 - neg eax // xmm5 = 256 - y fraction - add eax, 256 - movd xmm5, eax - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - pxor xmm7, xmm7 - - xloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - lea esi, [esi + 16] - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - punpcklbw xmm0, xmm7 - punpcklbw xmm2, xmm7 - punpckhbw xmm1, xmm7 - punpckhbw xmm3, xmm7 - pmullw xmm0, xmm5 // scale row 0 - pmullw xmm1, xmm5 - pmullw xmm2, xmm6 // scale row 1 - pmullw xmm3, xmm6 - paddusw xmm0, xmm2 // sum rows - paddusw xmm1, xmm3 - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 16 - ja xloop - - mov al, [edi - 1] - mov [edi], al - pop edi - pop esi - ret - - xloop1: - movdqa xmm0, [esi] - lea esi, [esi + 16] - movdqa [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 16 - ja xloop1 - - mov al, [edi - 1] - mov [edi], al - pop edi - pop esi - ret - - xloop2: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - lea esi, [esi + 16] - pavgb xmm0, xmm2 - movdqa [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 16 - ja xloop2 - - mov al, [edi - 1] - mov [edi], al - pop edi - pop esi - ret - } -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. -#define HAS_SCALEFILTERROWS_SSSE3 -__declspec(naked) -static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - shr eax, 1 - cmp eax, 0 - je xloop1 - cmp eax, 64 - je xloop2 - - mov ah,al - neg al - add al, 128 - movd xmm5, eax - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - - xloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - lea esi, [esi + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - movdqa [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 16 - ja xloop - - mov al, [edi - 1] - mov [edi], al - pop edi - pop esi - ret - - xloop1: - movdqa xmm0, [esi] - lea esi, [esi + 16] - movdqa [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 16 - ja xloop1 - - mov al, [edi - 1] - mov [edi], al - pop edi - pop esi - ret - - xloop2: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - lea esi, [esi + 16] - pavgb xmm0, xmm2 - movdqa [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 16 - ja xloop2 - - mov al, [edi - 1] - mov [edi], al - pop edi - pop esi - ret - - } -} - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - movdqa xmm1, _round34 - movdqa xmm2, _shuf01 - movdqa xmm3, _shuf11 - movdqa xmm4, _shuf21 - movdqa xmm5, _madd01 - movdqa xmm6, _madd11 - movdqa xmm7, _madd21 - - wloop: - movdqa xmm0, [eax] // pixels 0..7 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm1 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax+8] // pixels 8..15 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm1 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx+8], xmm0 - movdqa xmm0, [eax+16] // pixels 16..23 - lea eax, [eax+32] - pshufb xmm0, xmm4 - pmaddubsw xmm0, xmm7 - paddsw xmm0, xmm1 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx+16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - ja wloop - ret - } -} - -#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) - -// GCC versions of row functions are verbatim conversions from Visual C. -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt -#define HAS_SCALEROWDOWN2_SSE2 -static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -); -} - -static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%3,1),%%xmm2 \n" - "movdqa 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc" -); -} - -#define HAS_SCALEROWDOWN4_SSE2 -static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -); -} - -static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t temp = 0; - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0x8,%%xmm7 \n" - "lea (%4,%4,2),%3 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa (%0,%4,2),%%xmm2 \n" - "movdqa 0x10(%0,%4,2),%%xmm3 \n" - "movdqa (%0,%3,1),%%xmm4 \n" - "movdqa 0x10(%0,%3,1),%%xmm5 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm7,%%xmm2 \n" - "pand %%xmm7,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "pavgw %%xmm2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(temp) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__x86_64__) - , "xmm6", "xmm7" -#endif -); -} - -#define HAS_SCALEROWDOWN8_SSE2 -static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlq $0x38,%%xmm5 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "sub $0x4,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -); -} - -#if defined(__i386__) -void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - DECLARE_FUNCTION(ScaleRowDown8Int_SSE2) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x28(%esp),%ebx \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "lea (%ebx,%ebx,2),%edx \n" - "pxor %xmm7,%xmm7 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa 0x10(%esi),%xmm1 \n" - "movdqa (%esi,%ebx,1),%xmm2 \n" - "movdqa 0x10(%esi,%ebx,1),%xmm3 \n" - "pavgb %xmm2,%xmm0 \n" - "pavgb %xmm3,%xmm1 \n" - "movdqa (%esi,%ebx,2),%xmm2 \n" - "movdqa 0x10(%esi,%ebx,2),%xmm3 \n" - "movdqa (%esi,%edx,1),%xmm4 \n" - "movdqa 0x10(%esi,%edx,1),%xmm5 \n" - "lea (%esi,%ebx,4),%ebp \n" - "lea 0x20(%esi),%esi \n" - "pavgb %xmm4,%xmm2 \n" - "pavgb %xmm5,%xmm3 \n" - "pavgb %xmm2,%xmm0 \n" - "pavgb %xmm3,%xmm1 \n" - "movdqa 0x0(%ebp),%xmm2 \n" - "movdqa 0x10(%ebp),%xmm3 \n" - "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n" - "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n" - "pavgb %xmm4,%xmm2 \n" - "pavgb %xmm5,%xmm3 \n" - "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n" - "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n" - "movdqa 0x0(%ebp,%edx,1),%xmm6 \n" - "pavgb %xmm6,%xmm4 \n" - "movdqa 0x10(%ebp,%edx,1),%xmm6 \n" - "pavgb %xmm6,%xmm5 \n" - "pavgb %xmm4,%xmm2 \n" - "pavgb %xmm5,%xmm3 \n" - "pavgb %xmm2,%xmm0 \n" - "pavgb %xmm3,%xmm1 \n" - "psadbw %xmm7,%xmm0 \n" - "psadbw %xmm7,%xmm1 \n" - "pshufd $0xd8,%xmm0,%xmm0 \n" - "pshufd $0x8d,%xmm1,%xmm1 \n" - "por %xmm1,%xmm0 \n" - "psrlw $0x3,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movd %xmm0,(%edi) \n" - "lea 0x4(%edi),%edi \n" - "sub $0x4,%ecx \n" - "ja 1b \n" - "popa \n" - "ret \n" -); - -// fpic is used for magiccam plugin -#if !defined(__PIC__) -#define HAS_SCALEROWDOWN34_SSSE3 -void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - DECLARE_FUNCTION(ScaleRowDown34_SSSE3) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "movdqa _shuf0,%xmm3 \n" - "movdqa _shuf1,%xmm4 \n" - "movdqa _shuf2,%xmm5 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa 0x10(%esi),%xmm2 \n" - "lea 0x20(%esi),%esi \n" - "movdqa %xmm2,%xmm1 \n" - "palignr $0x8,%xmm0,%xmm1 \n" - "pshufb %xmm3,%xmm0 \n" - "pshufb %xmm4,%xmm1 \n" - "pshufb %xmm5,%xmm2 \n" - "movq %xmm0,(%edi) \n" - "movq %xmm1,0x8(%edi) \n" - "movq %xmm2,0x10(%edi) \n" - "lea 0x18(%edi),%edi \n" - "sub $0x18,%ecx \n" - "ja 1b \n" - "popa \n" - "ret \n" -); - -void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x28(%esp),%ebp \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "movdqa _shuf01,%xmm2 \n" - "movdqa _shuf11,%xmm3 \n" - "movdqa _shuf21,%xmm4 \n" - "movdqa _madd01,%xmm5 \n" - "movdqa _madd11,%xmm6 \n" - "movdqa _round34,%xmm7 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa (%esi,%ebp),%xmm1 \n" - "pavgb %xmm1,%xmm0 \n" - "pshufb %xmm2,%xmm0 \n" - "pmaddubsw %xmm5,%xmm0 \n" - "paddsw %xmm7,%xmm0 \n" - "psrlw $0x2,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movq %xmm0,(%edi) \n" - "movdqu 0x8(%esi),%xmm0 \n" - "movdqu 0x8(%esi,%ebp),%xmm1 \n" - "pavgb %xmm1,%xmm0 \n" - "pshufb %xmm3,%xmm0 \n" - "pmaddubsw %xmm6,%xmm0 \n" - "paddsw %xmm7,%xmm0 \n" - "psrlw $0x2,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movq %xmm0,0x8(%edi) \n" - "movdqa 0x10(%esi),%xmm0 \n" - "movdqa 0x10(%esi,%ebp),%xmm1 \n" - "lea 0x20(%esi),%esi \n" - "pavgb %xmm1,%xmm0 \n" - "pshufb %xmm4,%xmm0 \n" - "movdqa _madd21,%xmm1 \n" - "pmaddubsw %xmm1,%xmm0 \n" - "paddsw %xmm7,%xmm0 \n" - "psrlw $0x2,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movq %xmm0,0x10(%edi) \n" - "lea 0x18(%edi),%edi \n" - "sub $0x18,%ecx \n" - "ja 1b \n" - - "popa \n" - "ret \n" -); - -void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x28(%esp),%ebp \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "movdqa _shuf01,%xmm2 \n" - "movdqa _shuf11,%xmm3 \n" - "movdqa _shuf21,%xmm4 \n" - "movdqa _madd01,%xmm5 \n" - "movdqa _madd11,%xmm6 \n" - "movdqa _round34,%xmm7 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa (%esi,%ebp,1),%xmm1 \n" - "pavgb %xmm0,%xmm1 \n" - "pavgb %xmm1,%xmm0 \n" - "pshufb %xmm2,%xmm0 \n" - "pmaddubsw %xmm5,%xmm0 \n" - "paddsw %xmm7,%xmm0 \n" - "psrlw $0x2,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movq %xmm0,(%edi) \n" - "movdqu 0x8(%esi),%xmm0 \n" - "movdqu 0x8(%esi,%ebp,1),%xmm1 \n" - "pavgb %xmm0,%xmm1 \n" - "pavgb %xmm1,%xmm0 \n" - "pshufb %xmm3,%xmm0 \n" - "pmaddubsw %xmm6,%xmm0 \n" - "paddsw %xmm7,%xmm0 \n" - "psrlw $0x2,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movq %xmm0,0x8(%edi) \n" - "movdqa 0x10(%esi),%xmm0 \n" - "movdqa 0x10(%esi,%ebp,1),%xmm1 \n" - "lea 0x20(%esi),%esi \n" - "pavgb %xmm0,%xmm1 \n" - "pavgb %xmm1,%xmm0 \n" - "pshufb %xmm4,%xmm0 \n" - "movdqa _madd21,%xmm1 \n" - "pmaddubsw %xmm1,%xmm0 \n" - "paddsw %xmm7,%xmm0 \n" - "psrlw $0x2,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movq %xmm0,0x10(%edi) \n" - "lea 0x18(%edi),%edi \n" - "sub $0x18,%ecx \n" - "ja 1b \n" - "popa \n" - "ret \n" -); - -#define HAS_SCALEROWDOWN38_SSSE3 -void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - DECLARE_FUNCTION(ScaleRowDown38_SSSE3) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x28(%esp),%edx \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "movdqa _shuf38a ,%xmm4 \n" - "movdqa _shuf38b ,%xmm5 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa 0x10(%esi),%xmm1 \n" - "lea 0x20(%esi),%esi \n" - "pshufb %xmm4,%xmm0 \n" - "pshufb %xmm5,%xmm1 \n" - "paddusb %xmm1,%xmm0 \n" - "movq %xmm0,(%edi) \n" - "movhlps %xmm0,%xmm1 \n" - "movd %xmm1,0x8(%edi) \n" - "lea 0xc(%edi),%edi \n" - "sub $0xc,%ecx \n" - "ja 1b \n" - "popa \n" - "ret \n" -); - -void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x28(%esp),%edx \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "movdqa _shufac0,%xmm4 \n" - "movdqa _shufac3,%xmm5 \n" - "movdqa _scaleac3,%xmm6 \n" - "pxor %xmm7,%xmm7 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa (%esi,%edx,1),%xmm2 \n" - "movhlps %xmm0,%xmm1 \n" - "movhlps %xmm2,%xmm3 \n" - "punpcklbw %xmm7,%xmm0 \n" - "punpcklbw %xmm7,%xmm1 \n" - "punpcklbw %xmm7,%xmm2 \n" - "punpcklbw %xmm7,%xmm3 \n" - "paddusw %xmm2,%xmm0 \n" - "paddusw %xmm3,%xmm1 \n" - "movdqa (%esi,%edx,2),%xmm2 \n" - "lea 0x10(%esi),%esi \n" - "movhlps %xmm2,%xmm3 \n" - "punpcklbw %xmm7,%xmm2 \n" - "punpcklbw %xmm7,%xmm3 \n" - "paddusw %xmm2,%xmm0 \n" - "paddusw %xmm3,%xmm1 \n" - "movdqa %xmm0,%xmm2 \n" - "psrldq $0x2,%xmm0 \n" - "paddusw %xmm0,%xmm2 \n" - "psrldq $0x2,%xmm0 \n" - "paddusw %xmm0,%xmm2 \n" - "pshufb %xmm4,%xmm2 \n" - "movdqa %xmm1,%xmm3 \n" - "psrldq $0x2,%xmm1 \n" - "paddusw %xmm1,%xmm3 \n" - "psrldq $0x2,%xmm1 \n" - "paddusw %xmm1,%xmm3 \n" - "pshufb %xmm5,%xmm3 \n" - "paddusw %xmm3,%xmm2 \n" - "pmulhuw %xmm6,%xmm2 \n" - "packuswb %xmm2,%xmm2 \n" - "movd %xmm2,(%edi) \n" - "pextrw $0x2,%xmm2,%eax \n" - "mov %ax,0x4(%edi) \n" - "lea 0x6(%edi),%edi \n" - "sub $0x6,%ecx \n" - "ja 1b \n" - "popa \n" - "ret \n" -); - -void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x28(%esp),%edx \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "movdqa _shufab0,%xmm4 \n" - "movdqa _shufab1,%xmm5 \n" - "movdqa _shufab2,%xmm6 \n" - "movdqa _scaleab2,%xmm7 \n" - -"1:" - "movdqa (%esi),%xmm2 \n" - "pavgb (%esi,%edx,1),%xmm2 \n" - "lea 0x10(%esi),%esi \n" - "movdqa %xmm2,%xmm0 \n" - "pshufb %xmm4,%xmm0 \n" - "movdqa %xmm2,%xmm1 \n" - "pshufb %xmm5,%xmm1 \n" - "paddusw %xmm1,%xmm0 \n" - "pshufb %xmm6,%xmm2 \n" - "paddusw %xmm2,%xmm0 \n" - "pmulhuw %xmm7,%xmm0 \n" - "packuswb %xmm0,%xmm0 \n" - "movd %xmm0,(%edi) \n" - "pextrw $0x2,%xmm0,%eax \n" - "mov %ax,0x4(%edi) \n" - "lea 0x6(%edi),%edi \n" - "sub $0x6,%ecx \n" - "ja 1b \n" - "popa \n" - "ret \n" -); -#endif // __PIC__ - -#define HAS_SCALEADDROWS_SSE2 -void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, - int src_height); - asm( - DECLARE_FUNCTION(ScaleAddRows_SSE2) - "pusha \n" - "mov 0x24(%esp),%esi \n" - "mov 0x28(%esp),%edx \n" - "mov 0x2c(%esp),%edi \n" - "mov 0x30(%esp),%ecx \n" - "mov 0x34(%esp),%ebx \n" - "pxor %xmm5,%xmm5 \n" - -"1:" - "movdqa (%esi),%xmm2 \n" - "lea (%esi,%edx,1),%eax \n" - "movhlps %xmm2,%xmm3 \n" - "lea -0x1(%ebx),%ebp \n" - "punpcklbw %xmm5,%xmm2 \n" - "punpcklbw %xmm5,%xmm3 \n" - -"2:" - "movdqa (%eax),%xmm0 \n" - "lea (%eax,%edx,1),%eax \n" - "movhlps %xmm0,%xmm1 \n" - "punpcklbw %xmm5,%xmm0 \n" - "punpcklbw %xmm5,%xmm1 \n" - "paddusw %xmm0,%xmm2 \n" - "paddusw %xmm1,%xmm3 \n" - "sub $0x1,%ebp \n" - "ja 2b \n" - - "movdqa %xmm2,(%edi) \n" - "movdqa %xmm3,0x10(%edi) \n" - "lea 0x20(%edi),%edi \n" - "lea 0x10(%esi),%esi \n" - "sub $0x10,%ecx \n" - "ja 1b \n" - "popa \n" - "ret \n" -); - -// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version -#define HAS_SCALEFILTERROWS_SSE2 -void ScaleFilterRows_SSE2(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, - int dst_width, int source_y_fraction); - asm( - DECLARE_FUNCTION(ScaleFilterRows_SSE2) - "push %esi \n" - "push %edi \n" - "mov 0xc(%esp),%edi \n" - "mov 0x10(%esp),%esi \n" - "mov 0x14(%esp),%edx \n" - "mov 0x18(%esp),%ecx \n" - "mov 0x1c(%esp),%eax \n" - "cmp $0x0,%eax \n" - "je 2f \n" - "cmp $0x80,%eax \n" - "je 3f \n" - "movd %eax,%xmm6 \n" - "punpcklwd %xmm6,%xmm6 \n" - "pshufd $0x0,%xmm6,%xmm6 \n" - "neg %eax \n" - "add $0x100,%eax \n" - "movd %eax,%xmm5 \n" - "punpcklwd %xmm5,%xmm5 \n" - "pshufd $0x0,%xmm5,%xmm5 \n" - "pxor %xmm7,%xmm7 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa (%esi,%edx,1),%xmm2 \n" - "lea 0x10(%esi),%esi \n" - "movdqa %xmm0,%xmm1 \n" - "movdqa %xmm2,%xmm3 \n" - "punpcklbw %xmm7,%xmm0 \n" - "punpcklbw %xmm7,%xmm2 \n" - "punpckhbw %xmm7,%xmm1 \n" - "punpckhbw %xmm7,%xmm3 \n" - "pmullw %xmm5,%xmm0 \n" - "pmullw %xmm5,%xmm1 \n" - "pmullw %xmm6,%xmm2 \n" - "pmullw %xmm6,%xmm3 \n" - "paddusw %xmm2,%xmm0 \n" - "paddusw %xmm3,%xmm1 \n" - "psrlw $0x8,%xmm0 \n" - "psrlw $0x8,%xmm1 \n" - "packuswb %xmm1,%xmm0 \n" - "movdqa %xmm0,(%edi) \n" - "lea 0x10(%edi),%edi \n" - "sub $0x10,%ecx \n" - "ja 1b \n" - "mov -0x1(%edi),%al \n" - "mov %al,(%edi) \n" - "pop %edi \n" - "pop %esi \n" - "ret \n" - -"2:" - "movdqa (%esi),%xmm0 \n" - "lea 0x10(%esi),%esi \n" - "movdqa %xmm0,(%edi) \n" - "lea 0x10(%edi),%edi \n" - "sub $0x10,%ecx \n" - "ja 2b \n" - - "mov -0x1(%edi),%al \n" - "mov %al,(%edi) \n" - "pop %edi \n" - "pop %esi \n" - "ret \n" - -"3:" - "movdqa (%esi),%xmm0 \n" - "movdqa (%esi,%edx,1),%xmm2 \n" - "lea 0x10(%esi),%esi \n" - "pavgb %xmm2,%xmm0 \n" - "movdqa %xmm0,(%edi) \n" - "lea 0x10(%edi),%edi \n" - "sub $0x10,%ecx \n" - "ja 3b \n" - - "mov -0x1(%edi),%al \n" - "mov %al,(%edi) \n" - "pop %edi \n" - "pop %esi \n" - "ret \n" -); - -// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version -#define HAS_SCALEFILTERROWS_SSSE3 -void ScaleFilterRows_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, - int dst_width, int source_y_fraction); - asm( - DECLARE_FUNCTION(ScaleFilterRows_SSSE3) - "push %esi \n" - "push %edi \n" - "mov 0xc(%esp),%edi \n" - "mov 0x10(%esp),%esi \n" - "mov 0x14(%esp),%edx \n" - "mov 0x18(%esp),%ecx \n" - "mov 0x1c(%esp),%eax \n" - "shr %eax \n" - "cmp $0x0,%eax \n" - "je 2f \n" - "cmp $0x40,%eax \n" - "je 3f \n" - "mov %al,%ah \n" - "neg %al \n" - "add $0x80,%al \n" - "movd %eax,%xmm5 \n" - "punpcklwd %xmm5,%xmm5 \n" - "pshufd $0x0,%xmm5,%xmm5 \n" - -"1:" - "movdqa (%esi),%xmm0 \n" - "movdqa (%esi,%edx,1),%xmm2 \n" - "lea 0x10(%esi),%esi \n" - "movdqa %xmm0,%xmm1 \n" - "punpcklbw %xmm2,%xmm0 \n" - "punpckhbw %xmm2,%xmm1 \n" - "pmaddubsw %xmm5,%xmm0 \n" - "pmaddubsw %xmm5,%xmm1 \n" - "psrlw $0x7,%xmm0 \n" - "psrlw $0x7,%xmm1 \n" - "packuswb %xmm1,%xmm0 \n" - "movdqa %xmm0,(%edi) \n" - "lea 0x10(%edi),%edi \n" - "sub $0x10,%ecx \n" - "ja 1b \n" - "mov -0x1(%edi),%al \n" - "mov %al,(%edi) \n" - "pop %edi \n" - "pop %esi \n" - "ret \n" - -"2:" - "movdqa (%esi),%xmm0 \n" - "lea 0x10(%esi),%esi \n" - "movdqa %xmm0,(%edi) \n" - "lea 0x10(%edi),%edi \n" - "sub $0x10,%ecx \n" - "ja 2b \n" - "mov -0x1(%edi),%al \n" - "mov %al,(%edi) \n" - "pop %edi \n" - "pop %esi \n" - "ret \n" - -"3:" - "movdqa (%esi),%xmm0 \n" - "movdqa (%esi,%edx,1),%xmm2 \n" - "lea 0x10(%esi),%esi \n" - "pavgb %xmm2,%xmm0 \n" - "movdqa %xmm0,(%edi) \n" - "lea 0x10(%edi),%edi \n" - "sub $0x10,%ecx \n" - "ja 3b \n" - "mov -0x1(%edi),%al \n" - "mov %al,(%edi) \n" - "pop %edi \n" - "pop %esi \n" - "ret \n" -); - -#elif defined(__x86_64__) -static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "lea (%3,%3,2),%%r10 \n" - "pxor %%xmm7,%%xmm7 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%3,1),%%xmm2 \n" - "movdqa 0x10(%0,%3,1),%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa (%0,%3,2),%%xmm2 \n" - "movdqa 0x10(%0,%3,2),%%xmm3 \n" - "movdqa (%0,%%r10,1),%%xmm4 \n" - "movdqa 0x10(%0,%%r10,1),%%xmm5 \n" - "lea (%0,%3,4),%%r11 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa 0x0(%%r11),%%xmm2 \n" - "movdqa 0x10(%%r11),%%xmm3 \n" - "movdqa 0x0(%%r11,%3,1),%%xmm4 \n" - "movdqa 0x10(%%r11,%3,1),%%xmm5 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm5,%%xmm3 \n" - "movdqa 0x0(%%r11,%3,2),%%xmm4 \n" - "movdqa 0x10(%%r11,%3,2),%%xmm5 \n" - "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n" - "pavgb %%xmm6,%%xmm4 \n" - "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n" - "pavgb %%xmm6,%%xmm5 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psadbw %%xmm7,%%xmm0 \n" - "psadbw %%xmm7,%%xmm1 \n" - "pshufd $0xd8,%%xmm0,%%xmm0 \n" - "pshufd $0x8d,%%xmm1,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "psrlw $0x3,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "sub $0x4,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "r10", "r11", "xmm6", "xmm7" -); -} - -#define HAS_SCALEROWDOWN34_SSSE3 -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa (%3),%%xmm3 \n" - "movdqa (%4),%%xmm4 \n" - "movdqa (%5),%%xmm5 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(_shuf0), // %3 - "r"(_shuf1), // %4 - "r"(_shuf2) // %5 - : "memory", "cc" -); -} - -static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa (%4),%%xmm2 \n" // _shuf01 - "movdqa (%5),%%xmm3 \n" // _shuf11 - "movdqa (%6),%%xmm4 \n" // _shuf21 - "movdqa (%7),%%xmm5 \n" // _madd01 - "movdqa (%8),%%xmm6 \n" // _madd11 - "movdqa (%9),%%xmm7 \n" // _round34 - "movdqa (%10),%%xmm8 \n" // _madd21 -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%3),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "paddsw %%xmm7,%%xmm0 \n" - "psrlw $0x2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqu 0x8(%0),%%xmm0 \n" - "movdqu 0x8(%0,%3),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm6,%%xmm0 \n" - "paddsw %%xmm7,%%xmm0 \n" - "psrlw $0x2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x8(%1) \n" - "movdqa 0x10(%0),%%xmm0 \n" - "movdqa 0x10(%0,%3),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm8,%%xmm0 \n" - "paddsw %%xmm7,%%xmm0 \n" - "psrlw $0x2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"(_shuf01), // %4 - "r"(_shuf11), // %5 - "r"(_shuf21), // %6 - "r"(_madd01), // %7 - "r"(_madd11), // %8 - "r"(_round34), // %9 - "r"(_madd21) // %10 - : "memory", "cc", "xmm6", "xmm7", "xmm8" -); -} - -static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa (%4),%%xmm2 \n" // _shuf01 - "movdqa (%5),%%xmm3 \n" // _shuf11 - "movdqa (%6),%%xmm4 \n" // _shuf21 - "movdqa (%7),%%xmm5 \n" // _madd01 - "movdqa (%8),%%xmm6 \n" // _madd11 - "movdqa (%9),%%xmm7 \n" // _round34 - "movdqa (%10),%%xmm8 \n" // _madd21 -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%3,1),%%xmm1 \n" - "pavgb %%xmm0,%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "paddsw %%xmm7,%%xmm0 \n" - "psrlw $0x2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqu 0x8(%0),%%xmm0 \n" - "movdqu 0x8(%0,%3,1),%%xmm1 \n" - "pavgb %%xmm0,%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm6,%%xmm0 \n" - "paddsw %%xmm7,%%xmm0 \n" - "psrlw $0x2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x8(%1) \n" - "movdqa 0x10(%0),%%xmm0 \n" - "movdqa 0x10(%0,%3,1),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm0,%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm8,%%xmm0 \n" - "paddsw %%xmm7,%%xmm0 \n" - "psrlw $0x2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"(_shuf01), // %4 - "r"(_shuf11), // %5 - "r"(_shuf21), // %6 - "r"(_madd01), // %7 - "r"(_madd11), // %8 - "r"(_round34), // %9 - "r"(_madd21) // %10 - : "memory", "cc", "xmm6", "xmm7", "xmm8" -); -} - -#define HAS_SCALEROWDOWN38_SSSE3 -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa (%3),%%xmm4 \n" - "movdqa (%4),%%xmm5 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(_shuf38a), // %3 - "r"(_shuf38b) // %4 - : "memory", "cc" -); -} - -static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa (%4),%%xmm4 \n" - "movdqa (%5),%%xmm5 \n" - "movdqa (%6),%%xmm6 \n" - "pxor %%xmm7,%%xmm7 \n" -"1:" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%3,1),%%xmm2 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm7,%%xmm0 \n" - "punpcklbw %%xmm7,%%xmm1 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqa (%0,%3,2),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm2 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm3 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm3 \n" - "pshufb %%xmm5,%%xmm3 \n" - "paddusw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,(%1) \n" - "pextrw $0x2,%%xmm2,%%eax \n" - "mov %%ax,0x4(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"(_shufac0), // %4 - "r"(_shufac3), // %5 - "r"(_scaleac3) // %6 - : "memory", "cc", "rax", "xmm6", "xmm7" -); -} - -static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa (%4),%%xmm4 \n" - "movdqa (%5),%%xmm5 \n" - "movdqa (%6),%%xmm6 \n" - "movdqa (%7),%%xmm7 \n" -"1:" - "movdqa (%0),%%xmm2 \n" - "pavgb (%0,%3,1),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusw %%xmm1,%%xmm0 \n" - "pshufb %%xmm6,%%xmm2 \n" - "paddusw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%1) \n" - "pextrw $0x2,%%xmm0,%%eax \n" - "mov %%ax,0x4(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"(_shufab0), // %4 - "r"(_shufab1), // %5 - "r"(_shufab2), // %6 - "r"(_scaleab2) // %7 - : "memory", "cc", "rax", "xmm6", "xmm7" -); -} - -#define HAS_SCALEADDROWS_SSE2 -static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, - int src_height) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" -"1:" - "movdqa (%0),%%xmm2 \n" - "lea (%0,%4,1),%%r10 \n" - "movhlps %%xmm2,%%xmm3 \n" - "lea -0x1(%3),%%r11 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - -"2:" - "movdqa (%%r10),%%xmm0 \n" - "lea (%%r10,%4,1),%%r10 \n" - "movhlps %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "paddusw %%xmm0,%%xmm2 \n" - "paddusw %%xmm1,%%xmm3 \n" - "sub $0x1,%%r11 \n" - "ja 2b \n" - - "movdqa %%xmm2,(%1) \n" - "movdqa %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width), // %2 - "+r"(src_height) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "r10", "r11" -); -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version -#define HAS_SCALEFILTERROWS_SSE2 -static void ScaleFilterRows_SSE2(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, - int dst_width, int source_y_fraction) { - if (source_y_fraction == 0) { - asm volatile ( - "1:" - "movdqa (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - "mov -0x1(%0),%%al \n" - "mov %%al,(%0) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "rax" - ); - return; - } else if (source_y_fraction == 128) { - asm volatile ( - "1:" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%3,1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - "mov -0x1(%0),%%al \n" - "mov %%al,(%0) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "rax" - ); - return; - } else { - asm volatile ( - "mov %3,%%eax \n" - "movd %%eax,%%xmm6 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "neg %%eax \n" - "add $0x100,%%eax \n" - "movd %%eax,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm7,%%xmm7 \n" - "1:" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm7,%%xmm0 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "punpckhbw %%xmm7,%%xmm3 \n" - "pmullw %%xmm5,%%xmm0 \n" - "pmullw %%xmm5,%%xmm1 \n" - "pmullw %%xmm6,%%xmm2 \n" - "pmullw %%xmm6,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - "mov -0x1(%0),%%al \n" - "mov %%al,(%0) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "rax", "xmm6", "xmm7" - ); - } - return; -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version -#define HAS_SCALEFILTERROWS_SSSE3 -static void ScaleFilterRows_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, - int dst_width, int source_y_fraction) { - source_y_fraction >>= 1; - if (source_y_fraction == 0) { - asm volatile ( - "1:" - "movdqa (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - "mov -0x1(%0),%%al \n" - "mov %%al,(%0) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "rax" - ); - return; - } else if (source_y_fraction == 64) { - asm volatile ( - "1:" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%3,1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - "mov -0x1(%0),%%al \n" - "mov %%al,(%0) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "rax" - ); - return; - } else { - asm volatile ( - "mov %3,%%eax \n" - "mov %%al,%%ah \n" - "neg %%al \n" - "add $0x80,%%al \n" - "movd %%eax,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "1:" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - "mov -0x1(%0),%%al \n" - "mov %%al,(%0) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "rax" - ); - } - return; -} -#endif -#endif - -// CPU agnostic row functions -static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - *dst++ = *src_ptr; - src_ptr += 2; - } -} - -static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - *dst++ = (src_ptr[0] + src_ptr[1] + - src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; - src_ptr += 2; - } -} - -static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - *dst++ = *src_ptr; - src_ptr += 4; - } -} - -static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + - 8) >> 4; - src_ptr += 4; - } -} - -// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. -// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. -// The following 2 lines cause error on Windows. -//static const int kMaxOutputWidth = 640; -//static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; -#define kMaxOutputWidth 640 -#define kMaxRow12 1280 - -static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - *dst++ = *src_ptr; - src_ptr += 8; - } -} - -// Note calling code checks width is less than max and if not -// uses ScaleRowDown8_C instead. -static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - ALIGN16(uint8 src_row[kMaxRow12 * 2]); - assert(dst_width <= kMaxOutputWidth); - ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); - ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, - src_row + kMaxOutputWidth, - dst_width * 2); - ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); -} - -static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - uint8* dend; - assert((dst_width % 3 == 0) && (dst_width > 0)); - dend = dst + dst_width; - do { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } while (dst < dend); -} - -// Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, - uint8* d, int dst_width) { - uint8* dend; - const uint8* s; - const uint8* t; - assert((dst_width % 3 == 0) && (dst_width > 0)); - dend = d + dst_width; - s = src_ptr; - t = src_ptr + src_stride; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } while (d < dend); -} - -// Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, - uint8* d, int dst_width) { - uint8* dend; - const uint8* s; - const uint8* t; - assert((dst_width % 3 == 0) && (dst_width > 0)); - dend = d + dst_width; - s = src_ptr; - t = src_ptr + src_stride; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } while (d < dend); -} - -#if defined(HAS_SCALEFILTERROWS_SSE2) -// Filter row to 3/4 -static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width) { - uint8* dend; - const uint8* s; - assert((dst_width % 3 == 0) && (dst_width > 0)); - dend = dst_ptr + dst_width; - s = src_ptr; - do { - dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; - dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; - dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; - dst_ptr += 3; - s += 4; - } while (dst_ptr < dend); -} -#endif - -static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int dx) { - int x = 0; - int j; - for (j = 0; j < dst_width; ++j) { - int xi = x >> 16; - int xf1 = x & 0xffff; - int xf0 = 65536 - xf1; - - *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; - x += dx; - } -} - -//Not work on Windows -//static const int kMaxInputWidth = 2560; -#define kMaxInputWidth 2560 -#if defined(HAS_SCALEFILTERROWS_SSE2) -#define HAS_SCALEROWDOWN34_SSE2 -// Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - ALIGN16(uint8 row[kMaxInputWidth]); - assert((dst_width % 3 == 0) && (dst_width > 0)); - ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); - ScaleFilterCols34_C(dst_ptr, row, dst_width); -} - -// Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - ALIGN16(uint8 row[kMaxInputWidth]); - assert((dst_width % 3 == 0) && (dst_width > 0)); - ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); - ScaleFilterCols34_C(dst_ptr, row, dst_width); -} -#endif - -static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - int x; - assert(dst_width % 3 == 0); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i+=3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + - src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * - (65536 / 6) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i+=3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * - (65536 / 4) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// C version 8x2 -> 8x1 -static void ScaleFilterRows_C(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, - int dst_width, int source_y_fraction) { - int y1_fraction; - int y0_fraction; - const uint8* src_ptr1; - uint8* end; - assert(dst_width > 0); - y1_fraction = source_y_fraction; - y0_fraction = 256 - y1_fraction; - src_ptr1 = src_ptr + src_stride; - end = dst_ptr + dst_width; - do { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; - dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; - dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; - dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; - dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; - dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; - src_ptr += 8; - src_ptr1 += 8; - dst_ptr += 8; - } while (dst_ptr < end); - dst_ptr[0] = dst_ptr[-1]; -} - -void ScaleAddRows_C(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, int src_height) { - int x,y; - assert(src_width > 0); - assert(src_height > 0); - for (x = 0; x < src_width; ++x) { - const uint8* s = src_ptr + x; - int sum = 0; - for (y = 0; y < src_height; ++y) { - sum += s[0]; - s += src_stride; - } - dst_ptr[x] = sum; - } -} - -/** - * Scale plane, 1/2 - * - * This is an optimized version for scaling down a plane to 1/2 of - * its original size. - * - */ -static void ScalePlaneDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterModeEnum filtering) { - void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - assert(IS_ALIGNED(src_width, 2)); - assert(IS_ALIGNED(src_height, 2)); - -#if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; - } else -#endif -#if defined(HAS_SCALEROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 16) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; - } else -#endif - { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; - } - - { - int y; - for (y = 0; y < dst_height; ++y) { - ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += (src_stride << 1); - dst_ptr += dst_stride; - } - } -} - -/** - * Scale plane, 1/4 - * - * This is an optimized version for scaling down a plane to 1/4 of - * its original size. - */ -static void ScalePlaneDown4(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterModeEnum filtering) { - void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - assert(IS_ALIGNED(src_width, 4)); - assert(IS_ALIGNED(src_height, 4)); - -#if defined(HAS_SCALEROWDOWN4_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(dst_width, 4)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; - } else -#endif -#if defined(HAS_SCALEROWDOWN4_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; - } else -#endif - { - ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; - } - - { - int y; - for (y = 0; y < dst_height; ++y) { - ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += (src_stride << 2); - dst_ptr += dst_stride; - } - } -} - -/** - * Scale plane, 1/8 - * - * This is an optimized version for scaling down a plane to 1/8 - * of its original size. - * - */ -static void ScalePlaneDown8(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterModeEnum filtering) { - void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - assert(IS_ALIGNED(src_width, 8)); - assert(IS_ALIGNED(src_height, 8)); - -#if defined(HAS_SCALEROWDOWN8_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; - } else -#endif - { - ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? - ScaleRowDown8Int_C : ScaleRowDown8_C; - } - - { - int y; - for (y = 0; y < dst_height; ++y) { - ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += (src_stride << 3); - dst_ptr += dst_stride; - } - } -} - -/** - * Scale plane down, 3/4 - * - * Provided by Frank Barchard (fbarchard@google.com) - * - */ -static void ScalePlaneDown34(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterModeEnum filtering) { - void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - assert(dst_width % 3 == 0); -#if defined(HAS_SCALEROWDOWN34_NEON) - if (TestCpuFlag(kCpuHasNEON) && - (dst_width % 24 == 0)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_NEON; - ScaleRowDown34_1 = ScaleRowDown34_NEON; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; - } - } else -#endif - -#if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; - } - } else -#endif -#if defined(HAS_SCALEROWDOWN34_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_stride, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && - filtering) { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; - } else -#endif - { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_C; - ScaleRowDown34_1 = ScaleRowDown34_C; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; - } - } - { - int src_row = 0; - int y; - for (y = 0; y < dst_height; ++y) { - switch (src_row) { - case 0: - ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); - break; - - case 1: - ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); - break; - - case 2: - ScaleRowDown34_0(src_ptr + src_stride, -src_stride, - dst_ptr, dst_width); - break; - } - ++src_row; - src_ptr += src_stride; - dst_ptr += dst_stride; - if (src_row >= 3) { - src_ptr += src_stride; - src_row = 0; - } - } -} -} - -/** - * Scale plane, 3/8 - * - * This is an optimized version for scaling down a plane to 3/8 - * of its original size. - * - * Reduces 16x3 to 6x1 - */ -static void ScalePlaneDown38(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterModeEnum filtering) { - void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - assert(dst_width % 3 == 0); -#if defined(HAS_SCALEROWDOWN38_NEON) - if (TestCpuFlag(kCpuHasNEON) && - (dst_width % 12 == 0)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_NEON; - ScaleRowDown38_2 = ScaleRowDown38_NEON; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; - } - } else -#endif - -#if defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_stride, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_SSSE3; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; - } - } else -#endif - { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_C; - ScaleRowDown38_2 = ScaleRowDown38_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; - } - } - { - int src_row = 0; - int y; - for (y = 0; y < dst_height; ++y) { - switch (src_row) { - case 0: - case 1: - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - ++src_row; - break; - - case 2: - ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - src_row = 0; - break; - } - dst_ptr += dst_stride; - } -} -} - -__inline static uint32 SumBox(int iboxwidth, int iboxheight, - int src_stride, const uint8* src_ptr) { - int x, y; - uint32 sum; - assert(iboxwidth > 0); - assert(iboxheight > 0); - sum = 0u; - for (y = 0; y < iboxheight; ++y) { - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - src_ptr += src_stride; - } - return sum; -} - -static void ScalePlaneBoxRow(int dst_width, int boxheight, - int dx, int src_stride, - const uint8* src_ptr, uint8* dst_ptr) { - int x = 0; - int i; - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - int boxwidth; - x += dx; - boxwidth = (x >> 16) - ix; - *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / - (boxwidth * boxheight); - } -} - -__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { - uint32 sum; - int x; - assert(iboxwidth > 0); - sum = 0u; - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, - const uint16* src_ptr, uint8* dst_ptr) { - int scaletbl[2]; - int minboxwidth = (dx >> 16); - scaletbl[0] = 65536 / (minboxwidth * boxheight); - scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); - { - int *scaleptr = scaletbl - minboxwidth; - int x = 0; - int i; - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - int boxwidth; - x += dx; - boxwidth = (x >> 16) - ix; - *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; - } - } -} - -static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, - const uint16* src_ptr, uint8* dst_ptr) { - int boxwidth = (dx >> 16); - int scaleval = 65536 / (boxwidth * boxheight); - int x = 0; - int i; - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; - x += boxwidth; - } -} - -/** - * Scale plane down to any dimensions, with interpolation. - * (boxfilter). - * - * Same method as SimpleScale, which is fixed point, outputting - * one pixel of destination using fixed point (16.16) to step - * through source, sampling a box of pixel with simple - * averaging. - */ -static void ScalePlaneBox(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - int dx, dy; - assert(dst_width > 0); - assert(dst_height > 0); - dy = (src_height << 16) / dst_height; - dx = (src_width << 16) / dst_width; - if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || - dst_height * 2 > src_height) { - uint8* dst = dst_ptr; - int dy = (src_height << 16) / dst_height; - int dx = (src_width << 16) / dst_width; - int y = 0; - int j; - for (j = 0; j < dst_height; ++j) { - int iy = y >> 16; - const uint8* const src = src_ptr + iy * src_stride; - int boxheight; - y += dy; - if (y > (src_height << 16)) { - y = (src_height << 16); - } - boxheight = (y >> 16) - iy; - ScalePlaneBoxRow(dst_width, boxheight, - dx, src_stride, - src, dst); - - dst += dst_stride; - } - } else { - ALIGN16(uint16 row[kMaxInputWidth]); - void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, int src_height); - void (*ScaleAddCols)(int dst_width, int boxheight, int dx, - const uint16* src_ptr, uint8* dst_ptr); -#if defined(HAS_SCALEADDROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(src_width, 16)) { - ScaleAddRows = ScaleAddRows_SSE2; - } else -#endif - { - ScaleAddRows = ScaleAddRows_C; - } - if (dx & 0xffff) { - ScaleAddCols = ScaleAddCols2_C; - } else { - ScaleAddCols = ScaleAddCols1_C; - } - - { - int y = 0; - int j; - for (j = 0; j < dst_height; ++j) { - int iy = y >> 16; - const uint8* const src = src_ptr + iy * src_stride; - int boxheight; - y += dy; - if (y > (src_height << 16)) { - y = (src_height << 16); - } - boxheight = (y >> 16) - iy; - ScaleAddRows(src, src_stride, row, src_width, boxheight); - ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); - dst_ptr += dst_stride; - } - } - } -} - -/** - * Scale plane to/from any dimensions, with interpolation. - */ -static void ScalePlaneBilinearSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - int i, j; - uint8* dst = dst_ptr; - int dx = (src_width << 16) / dst_width; - int dy = (src_height << 16) / dst_height; - int maxx = ((src_width - 1) << 16) - 1; - int maxy = ((src_height - 1) << 16) - 1; - int y = (dst_height < src_height) ? 32768 : - (src_height << 16) / dst_height - 32768; - for (i = 0; i < dst_height; ++i) { - int cy = (y < 0) ? 0 : y; - int yi = cy >> 16; - int yf = cy & 0xffff; - const uint8* const src = src_ptr + yi * src_stride; - int x = (dst_width < src_width) ? 32768 : - (src_width << 16) / dst_width - 32768; - for (j = 0; j < dst_width; ++j) { - int cx = (x < 0) ? 0 : x; - int xi = cx >> 16; - int xf = cx & 0xffff; - int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; - int r1 = (src[xi + src_stride] * (65536 - xf) + - src[xi + src_stride + 1] * xf) >> 16; - *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; - x += dx; - if (x > maxx) - x = maxx; - } - dst += dst_stride - dst_width; - y += dy; - if (y > maxy) - y = maxy; - } -} - -/** - * Scale plane to/from any dimensions, with bilinear - * interpolation. - */ -static void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - int dy; - int dx; - assert(dst_width > 0); - assert(dst_height > 0); - dy = (src_height << 16) / dst_height; - dx = (src_width << 16) / dst_width; - if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { - ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - - } else { - ALIGN16(uint8 row[kMaxInputWidth + 1]); - void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, - int src_stride, - int dst_width, int source_y_fraction); - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int dx); -#if defined(HAS_SCALEFILTERROWS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(src_width, 16)) { - ScaleFilterRows = ScaleFilterRows_SSSE3; - } else -#endif -#if defined(HAS_SCALEFILTERROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(src_width, 16)) { - ScaleFilterRows = ScaleFilterRows_SSE2; - } else -#endif - { - ScaleFilterRows = ScaleFilterRows_C; - } - ScaleFilterCols = ScaleFilterCols_C; - - { - int y = 0; - int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. - int j; - for (j = 0; j < dst_height; ++j) { - int iy = y >> 16; - int fy = (y >> 8) & 255; - const uint8* const src = src_ptr + iy * src_stride; - ScaleFilterRows(row, src, src_stride, src_width, fy); - ScaleFilterCols(dst_ptr, row, dst_width, dx); - dst_ptr += dst_stride; - y += dy; - if (y > maxy) { - y = maxy; - } - } - } -} -} - -/** - * Scale plane to/from any dimensions, without interpolation. - * Fixed point math is used for performance: The upper 16 bits - * of x and dx is the integer part of the source position and - * the lower 16 bits are the fixed decimal part. - */ -static void ScalePlaneSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - uint8* dst = dst_ptr; - int dx = (src_width << 16) / dst_width; - int y; - for (y = 0; y < dst_height; ++y) { - const uint8* const src = src_ptr + (y * src_height / dst_height) * - src_stride; - // TODO(fbarchard): Round X coordinate by setting x=0x8000. - int x = 0; - int i; - for (i = 0; i < dst_width; ++i) { - *dst++ = src[x >> 16]; - x += dx; - } - dst += dst_stride - dst_width; - } -} - -/** - * Scale plane to/from any dimensions. - */ -static void ScalePlaneAnySize(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterModeEnum filtering) { - if (!filtering) { - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else { - // fall back to non-optimized version - ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } -} - -/** - * Scale plane down, any size - * - * This is an optimized version for scaling down a plane to any size. - * The current implementation is ~10 times faster compared to the - * reference implementation for e.g. XGA->LowResPAL - * - */ -static void ScalePlaneDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterModeEnum filtering) { - if (!filtering) { - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { - // between 1/2x and 1x use bilinear - ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } -} - -/** - * Copy plane, no scaling - * - * This simply copies the given plane without scaling. - * The current implementation is ~115 times faster - * compared to the reference implementation. - * - */ -static void CopyPlane(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - if (src_stride == src_width && dst_stride == dst_width) { - // All contiguous, so can use REALLY fast path. - memcpy(dst_ptr, src_ptr, src_width * src_height); - } else { - // Not all contiguous; must copy scanlines individually - const uint8* src = src_ptr; - uint8* dst = dst_ptr; - int i; - for (i = 0; i < src_height; ++i) { - memcpy(dst, src, src_width); - dst += dst_stride; - src += src_stride; - } - } -} - -static void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, - FilterModeEnum filtering, int use_ref) { - // Use specialized scales to improve performance for common resolutions. - // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (dst_width == src_width && dst_height == src_height) { - // Straight copy. - CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - } else if (dst_width <= src_width && dst_height <= src_height) { - // Scale down. - if (use_ref) { - // For testing, allow the optimized versions to be disabled. - ScalePlaneDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } else if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { - // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { - // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - // 3/8 rounded up for odd sized chroma height. - } else if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { - // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { - // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { - // optimized, 1/8 - ScalePlaneDown8(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } else { - // Arbitrary downsample - ScalePlaneDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } - } else { - // Arbitrary scale up and/or down. - ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } -} - -/** - * Scale a plane. - * - * This function in turn calls a scaling function - * suitable for handling the desired resolutions. - * - */ - -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, - FilterModeEnum filtering) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { - return -1; - } - // Negative height means invert the image. - if (src_height < 0) { - int halfheight; - src_height = -src_height; - halfheight = (src_height + 1) >> 1; - src_y = src_y + (src_height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - { - int src_halfwidth = (src_width + 1) >> 1; - int src_halfheight = (src_height + 1) >> 1; - int dst_halfwidth = (dst_width + 1) >> 1; - int dst_halfheight = (dst_height + 1) >> 1; - - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering, use_reference_impl_); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering, use_reference_impl_); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering, use_reference_impl_); - } - return 0; -} - -// Deprecated api -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, - int interpolate) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { - return -1; - } - // Negative height means invert the image. - if (src_height < 0) { - int halfheight; - src_height = -src_height; - halfheight = (src_height + 1) >> 1; - src_y = src_y + (src_height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - { - int src_halfwidth = (src_width + 1) >> 1; - int src_halfheight = (src_height + 1) >> 1; - int dst_halfwidth = (dst_width + 1) >> 1; - int dst_halfheight = (dst_height + 1) >> 1; - FilterModeEnum filtering = interpolate ? kFilterBox : kFilterNone; - - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering, use_reference_impl_); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering, use_reference_impl_); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering, use_reference_impl_); - } - return 0; -} - -// Deprecated api -int ScaleOffset(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int dst_yoffset, - int interpolate) { - if (!src || src_width <= 0 || src_height <= 0 || - !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || - dst_yoffset >= dst_height) { - return -1; - } - dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. - { - int src_halfwidth = (src_width + 1) >> 1; - int src_halfheight = (src_height + 1) >> 1; - int dst_halfwidth = (dst_width + 1) >> 1; - int dst_halfheight = (dst_height + 1) >> 1; - int aheight = dst_height - dst_yoffset * 2; // actual output height - const uint8* const src_y = src; - const uint8* const src_u = src + src_width * src_height; - const uint8* const src_v = src + src_width * src_height + - src_halfwidth * src_halfheight; - uint8* dst_y = dst + dst_yoffset * dst_width; - uint8* dst_u = dst + dst_width * dst_height + - (dst_yoffset >> 1) * dst_halfwidth; - uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + - (dst_yoffset >> 1) * dst_halfwidth; - return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, - src_width, src_height, dst_y, dst_u, dst_v, dst_width, - dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc new file mode 100644 index 000000000..31cedf11d --- /dev/null +++ b/third_party/libyuv/source/scale.cc @@ -0,0 +1,1716 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/scale.h" + +#include +#include + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyPlane +#include "third_party/libyuv/include/libyuv/row.h" +#include "third_party/libyuv/include/libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Remove this macro if OVERREAD is safe. +#define AVOID_OVERREAD 1 + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) + +// Scale plane, 1/2 +// This is an optimized version for scaling down a plane to 1/2 of +// its original size. + +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_C : + ScaleRowDown2Box_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; + } +#elif defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 : + ScaleRowDown2Box_Unaligned_SSE2); + if (IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); + } + } +#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown2_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_16_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : + ScaleRowDown2Box_16_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : + ScaleRowDown2_16_NEON; + } +#elif defined(HAS_SCALEROWDOWN2_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? + ScaleRowDown2_Unaligned_16_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 : + ScaleRowDown2Box_Unaligned_16_SSE2); + if (IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : + ScaleRowDown2Box_16_SSE2); + } + } +#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } +#elif defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + } +#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown4_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : + ScaleRowDown4_16_NEON; + } +#elif defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : + ScaleRowDown4_16_SSE2; + } +#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 + +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown34_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_C; + ScaleRowDown34_1 = ScaleRowDown34_16_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN34_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_16_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. +// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } + } +#elif defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown38_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_C; + ScaleRowDown38_2 = ScaleRowDown38_16_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN38_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_16_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; + } + } +#elif defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; + } + } +#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static __inline uint32 SumBox(int iboxwidth, int iboxheight, + ptrdiff_t src_stride, const uint8* src_ptr) { + uint32 sum = 0u; + int y; + assert(iboxwidth > 0); + assert(iboxheight > 0); + for (y = 0; y < iboxheight; ++y) { + int x; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static __inline uint32 SumBox_16(int iboxwidth, int iboxheight, + ptrdiff_t src_stride, const uint16* src_ptr) { + uint32 sum = 0u; + int y; + assert(iboxwidth > 0); + assert(iboxheight > 0); + for (y = 0; y < iboxheight; ++y) { + int x; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static void ScalePlaneBoxRow_C(int dst_width, int boxheight, + int x, int dx, ptrdiff_t src_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i; + int boxwidth; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight, + int x, int dx, ptrdiff_t src_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int i; + int boxwidth; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = (dx >> 16); + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = (dx >> 16); + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * + scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. + if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { + uint8* dst = dst_ptr; + int j; + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = (y >> 16) - iy; + ScalePlaneBoxRow_C(dst_width, boxheight, + x, dx, src_stride, + src, dst); + dst += dst_stride; + } + return; + } + { + // Allocate a row buffer of uint16. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C; + void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; + +#if defined(HAS_SCALEADDROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && +#ifdef AVOID_OVERREAD + IS_ALIGNED(src_width, 16) && +#endif + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleAddRows = ScaleAddRows_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8* src = src_ptr + iy * src_stride; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScaleAddRows(src, src_stride, (uint16*)(row16), + src_width, boxheight); + ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), + dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +static void ScalePlaneBox_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. + if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { + uint16* dst = dst_ptr; + int j; + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = (y >> 16) - iy; + ScalePlaneBoxRow_16_C(dst_width, boxheight, + x, dx, src_stride, + src, dst); + dst += dst_stride; + } + return; + } + { + // Allocate a row buffer of uint32. + align_buffer_64(row32, src_width * 4); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; + void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride, + uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C; + +#if defined(HAS_SCALEADDROWS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && +#ifdef AVOID_OVERREAD + IS_ALIGNED(src_width, 16) && +#endif + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleAddRows = ScaleAddRows_16_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16* src = src_ptr + iy * src_stride; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScaleAddRows(src, src_stride, (uint32*)(row32), + src_width, boxheight); + ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), + dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row32); + } +} + +// Scale plane down with bilinear interpolation. +void ScalePlaneBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +void ScalePlaneBilinearDown_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width * 2); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; + void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSE2; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + + +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint16* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow((uint16*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +// Scale up down with bilinear interpolation. +void ScalePlaneBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +void ScalePlaneBilinearUp_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSE2; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_16_C; + } +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleFilterCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint16* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 15) & ~15; + align_buffer_64(row, kRowSize * 4); + + uint16* rowptr = (uint16*)row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i; + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +static void ScalePlaneSimple_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int i; + void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = ScaleCols_16_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. + +LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + filtering != kFilterBilinear) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); +} + +LIBYUV_API +void ScalePlane_16(const uint16* src, int src_stride, + int src_width, int src_height, + uint16* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical_16(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + filtering != kFilterBilinear) { + // optimized, 1/4 + ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); +} + +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +LIBYUV_API +int I420Scale_16(const uint16* src_y, int src_stride_y, + const uint16* src_u, int src_stride_u, + const uint16* src_v, int src_stride_v, + int src_width, int src_height, + uint16* dst_y, int dst_stride_y, + uint16* dst_u, int dst_stride_u, + uint16* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +// Deprecated api +LIBYUV_API +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + LIBYUV_BOOL interpolate) { + return I420Scale(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + dst_width, dst_height, + interpolate ? kFilterBox : kFilterNone); +} + +// Deprecated api +LIBYUV_API +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + LIBYUV_BOOL interpolate) { + // Chroma requires offset to multiple of 2. + int dst_yoffset_even = dst_yoffset & ~1; + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int aheight = dst_height - dst_yoffset_even * 2; // actual output height + const uint8* src_y = src; + const uint8* src_u = src + src_width * src_height; + const uint8* src_v = src + src_width * src_height + + src_halfwidth * src_halfheight; + uint8* dst_y = dst + dst_yoffset_even * dst_width; + uint8* dst_u = dst + dst_width * dst_height + + (dst_yoffset_even >> 1) * dst_halfwidth; + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + + (dst_yoffset_even >> 1) * dst_halfwidth; + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || + dst_yoffset_even >= dst_height) { + return -1; + } + return I420Scale(src_y, src_width, + src_u, src_halfwidth, + src_v, src_halfwidth, + src_width, src_height, + dst_y, dst_width, + dst_u, dst_halfwidth, + dst_v, dst_halfwidth, + dst_width, aheight, + interpolate ? kFilterBox : kFilterNone); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc new file mode 100644 index 000000000..595ad66ba --- /dev/null +++ b/third_party/libyuv/source/scale_common.cc @@ -0,0 +1,1165 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/scale.h" + +#include +#include + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyARGB +#include "third_party/libyuv/include/libyuv/row.h" +#include "third_party/libyuv/include/libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#define BLENDER(a, b, f) (uint8)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +#define BLENDER(a, b, f) (uint16)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int x; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + unsigned int sum = 0u; + int y; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + // TODO(fbarchard): Consider limitting height to 256 to avoid overflow. + dst_ptr[x] = sum < 65535u ? sum : 65535u; + } +} + +void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint32* dst_ptr, int src_width, int src_height) { + int x; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint16* s = src_ptr + x; + unsigned int sum = 0u; + int y; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + // No risk of overflow here now + dst_ptr[x] = sum; + } +} + +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 +#define BLENDERC(a, b, f, s) (uint32)( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, + src_stride, dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + int j; + assert(wpp >= 1 && wpp <= 2); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * wpp; +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, + src_stride, dst_width_words, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + // If scaling to larger, switch from Box to Bilinear. + if (dst_width >= src_width || dst_height >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. + if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div) { + return (int)(((int64)(num) << 16) / div); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div) { + return (int)((((int64)(num) << 16) - 0x00010001) / + (div - 1)); +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Compute slope values for stepping. +void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} +#undef CENTERSTART + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/scale_mips.cc b/third_party/libyuv/source/scale_mips.cc new file mode 100644 index 000000000..5722dea80 --- /dev/null +++ b/third_party/libyuv/source/scale_mips.cc @@ -0,0 +1,653 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC MIPS DSPR2 +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + // TODO(fbarchard): Use odd pixels instead of even. + "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| + "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| + "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| + "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t8, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t1, 8(%[dst]) \n" + "sw $t2, 12(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 16 \n" + + "2: \n" + "andi $t9, %[dst_width], 0xf \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t0, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 2 \n" + "addiu $t9, $t9, -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* t = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 + "bltz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 0(%[t]) \n" // |19|18|17|16| + "lw $t5, 4(%[t]) \n" // |23|22|21|20| + "lw $t6, 8(%[t]) \n" // |27|26|25|24| + "lw $t7, 12(%[t]) \n" // |31|30|29|28| + "addiu $t9, $t9, -1 \n" + "srl $t8, $t0, 16 \n" // |X|X|3|2| + "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| + "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| + "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| + "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| + "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 + "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 + "srl $t8, $t1, 16 \n" // |X|X|7|6| + "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| + "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| + "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| + "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| + "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 + "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 + "srl $t8, $t2, 16 \n" // |X|X|11|10| + "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| + "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| + "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| + "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| + "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 + "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 + "srl $t8, $t3, 16 \n" // |X|X|15|14| + "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| + "ins $t7, $t8, 0, 16 \n" // |31|30|15|14| + "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| + "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| + "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 + "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 + "addiu %[src_ptr], %[src_ptr], 16 \n" + "addiu %[t], %[t], 16 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "sb $t1, 2(%[dst]) \n" + "sb $t5, 3(%[dst]) \n" + "sb $t2, 4(%[dst]) \n" + "sb $t6, 5(%[dst]) \n" + "sb $t3, 6(%[dst]) \n" + "sb $t7, 7(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 0x7 \n" // x = residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lwr $t1, 0(%[src_ptr]) \n" + "lwl $t1, 3(%[src_ptr]) \n" + "lwr $t2, 0(%[t]) \n" + "lwl $t2, 3(%[t]) \n" + "srl $t8, $t1, 16 \n" + "ins $t1, $t2, 16, 16 \n" + "ins $t2, $t8, 0, 16 \n" + "raddu.w.qb $t1, $t1 \n" + "raddu.w.qb $t2, $t2 \n" + "shra_r.w $t1, $t1, 2 \n" + "shra_r.w $t2, $t2, 2 \n" + "sb $t1, 0(%[dst]) \n" + "sb $t2, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -2 \n" + "addiu %[t], %[t], 4 \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 2 \n" + + "3: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), [t] "+r" (t) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| + "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| + "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| + "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| + "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| + "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t5, 4(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 7 \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t1, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -1 \n" + "sb $t1, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + const uint8* s2 = s1 + stride; + const uint8* s3 = s2 + stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 1 \n" + "andi $t8, %[dst_width], 1 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 4(%[s1]) \n" // |23|22|21|20| + "lw $t6, 4(%[s2]) \n" // |27|26|25|24| + "lw $t7, 4(%[s3]) \n" // |31|30|29|28| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| + "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| + "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| + "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "add $t4, $t4, $t5 \n" + "add $t6, $t6, $t7 \n" + "add $t4, $t4, $t6 \n" + "shra_r.w $t0, $t0, 4 \n" + "shra_r.w $t4, $t4, 4 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[s3], %[s3], 8 \n" + "addiu $t9, $t9, -1 \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 2 \n" + "beqz $t8, 2f \n" + " nop \n" + + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "shra_r.w $t0, $t0, 4 \n" + "sb $t0, 0(%[dst]) \n" + + "2: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [s1] "+r" (s1), + [s2] "+r" (s2), + [s3] "+r" (s3) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| + "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| + "addiu %[dst_width], %[dst_width], -24 \n" + "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| + "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| + "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| + "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| + "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| + "prepend $t1, $t2, 8 \n" // |4|3|1|0| + "prepend $t3, $t4, 24 \n" // |15|13|12|11| + "prepend $t5, $t6, 8 \n" // |20|19|17|16| + "prepend $t7, $t8, 24 \n" // |31|29|28|27| + "sw $t1, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t3, 8(%[dst]) \n" + "sw $t5, 12(%[dst]) \n" + "sw $t9, 16(%[dst]) \n" + "sw $t7, 20(%[dst]) \n" + "bnez %[dst_width], 1b \n" + " addiu %[dst], %[dst], 24 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t3, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| + "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t2, $t2, $t4 \n" + "addu.ph $t6, $t6, $t5 \n" + "sll $t5, $t0, 1 \n" + "add $t0, $t5, $t0 \n" + "shra_r.ph $t2, $t2, 2 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shll.ph $t4, $t2, 1 \n" + "addq.ph $t4, $t4, $t2 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.w $t0, $t0, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "shra_r.ph $t6, $t6, 2 \n" + "srl $t1, $t6, 16 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t2, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| + "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t4, $t4, $t3 \n" + "addu.ph $t6, $t6, $t5 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shra_r.ph $t4, $t4, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.ph $t6, $t6, 1 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "shra_r.w $t0, $t0, 1 \n" + "srl $t1, $t6, 16 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t6, $t6 \n" // |26|27|24|25| + "srl $t0, $t0, 8 \n" // |X|2|3|0| + "srl $t3, $t3, 16 \n" // |X|X|15|14| + "srl $t5, $t5, 16 \n" // |X|X|23|22| + "srl $t7, $t7, 16 \n" // |X|X|31|30| + "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| + "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| + "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| + "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| + "prepend $t2, $t3, 24 \n" // |X|15|14|11| + "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| + "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu %[dst_width], %[dst_width], -12 \n" + "addiu $t8,%[dst_width], -12 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t4, 4(%[dst]) \n" + "sw $t6, 8(%[dst]) \n" + "bgez $t8, 1b \n" + " addiu %[dst], %[dst], 12 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* t = src_ptr + stride; + const int c = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| + "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 + "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 + "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| + "srl $t4, $t4, 2 \n" // t4 / 4 + "srl $t6, $t6, 16 \n" // |0|0|S3|T3| + "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 + "addu $t6, $t5, $t6 \n" + "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 + "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 + "addu $t0, $t0, $t2 \n" + "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[t], %[t], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t4, -1(%[dst_ptr]) \n" + "sb $t6, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [t] "+r" (t), + [dst_width] "+r" (dst_width) + : [c] "r" (c) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + stride += stride; + const uint8* s2 = src_ptr + stride; + const int c1 = 0x1C71; + const int c2 = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| + "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| + "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| + "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 + "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 + "sll $t8, $t5, 16 \n" // |R5|R4|0|0| + "raddu.w.qb $t8, $t8 \n" // R5+R4 + "addu $t7, $t7, $t8 \n" + "srl $t8, $t5, 16 \n" // |0|0|R7|R6| + "raddu.w.qb $t8, $t8 \n" // R7 + R6 + "addu $t6, $t6, $t8 \n" + "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA + "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| + "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| + "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 + "addu $t7, $t7, $t8 \n" + "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t2, $t2 \n" + "raddu.w.qb $t4, $t4 \n" + "addu $t0, $t0, $t2 \n" + "addu $t0, $t0, $t4 \n" + "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t7, $t7, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t6, -1(%[dst_ptr]) \n" + "sb $t7, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [s1] "+r" (s1), + [s2] "+r" (s2), + [dst_width] "+r" (dst_width) + : [c1] "r" (c1), [c2] "r" (c2) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc new file mode 100644 index 000000000..704cfd251 --- /dev/null +++ b/third_party/libyuv/source/scale_neon.cc @@ -0,0 +1,684 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +// NEON downscalers with interpolation. +// Provided by Fritz Koenig + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [r4]! \n" + "vld1.8 {q2}, [r5]! \n" + "vld1.8 {q3}, [%3]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "vld1.8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [r4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2), // %5 + "r"(&kMult38_Div9) // %6 + : "r4", "q0", "q1", "q2", "q3", "q8", "q9", + "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.32 {q0, q1}, [%0]! \n" + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/scale_posix.cc b/third_party/libyuv/source/scale_posix.cc new file mode 100644 index 000000000..18b081026 --- /dev/null +++ b/third_party/libyuv/source/scale_posix.cc @@ -0,0 +1,1315 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 + MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" +#endif + ); +} + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" + "lea " MEMLEA(0xc,1) ",%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "sub $0x6,%2 \n" + "movd %%xmm1," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "sub $0x6,%2 \n" + "movd %%xmm6," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "sub $0x1,%5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "mov %0,%3 \n" + "add %6,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "mov %5,%2 \n" + "test %2,%2 \n" + "je 3f \n" + + LABELALIGN + "2: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "add %6,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 2b \n" + + LABELALIGN + "3: \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x10,3) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"((intptr_t)(src_stride)) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + asm volatile ( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %w2," MEMACCESS(0) " \n" + "lea " MEMLEA(0x2,0) ",%0 \n" + "sub $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %b2," MEMACCESS(0) " \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 + : "rm"(x), // %6 + "rm"(dx) // %7 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "sub $0x20,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + LABELALIGN + "1: \n" + "movd " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + "punpckldq %%xmm1,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = (intptr_t)(src_stride); + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 + MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 + BUNDLEALIGN + MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "movq " MEMACCESS(5) ",%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 + MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 + MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 + MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "sub $0x4,%4 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x8,2) ",%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + "movd %%xmm0," MEMACCESS(2) " \n" + "99: \n" + : "+a"(x0), // %0 + "+d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(0) " \n" + + LABELALIGN + "99: \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "+r"(x0), // %3 + "+r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc new file mode 100644 index 000000000..bd5cca8af --- /dev/null +++ b/third_party/libyuv/source/scale_win.cc @@ -0,0 +1,1320 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [eax + esi * 2] + movdqa xmm3, [eax + esi * 2 + 16] + movdqa xmm4, [eax + edi] + movdqa xmm5, [eax + edi + 16] + lea eax, [eax + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + pop edi + pop esi + ret + } +} + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + + align 4 + xloop: + movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + sub ecx, 12 + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + + align 4 + xloop: + movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqa xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqa xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + sub ecx, 6 + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + + align 4 + xloop: + movdqa xmm0, [eax] // average 2 rows into xmm0 + pavgb xmm0, [eax + esi] + lea eax, [eax + 16] + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + sub ecx, 6 + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Reads 16xN bytes and produces 16 shorts at a time. +// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. +__declspec(naked) __declspec(align(16)) +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + push esi + push edi + push ebx + push ebp + mov esi, [esp + 16 + 4] // src_ptr + mov edx, [esp + 16 + 8] // src_stride + mov edi, [esp + 16 + 12] // dst_ptr + mov ecx, [esp + 16 + 16] // dst_width + mov ebx, [esp + 16 + 20] // height + pxor xmm4, xmm4 + dec ebx + + align 4 + xloop: + // first row + movdqa xmm0, [esi] + lea eax, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + lea esi, [esi + 16] + mov ebp, ebx + test ebp, ebp + je ydone + + // sum remaining rows + align 4 + yloop: + movdqa xmm2, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + sub ebp, 1 + jg yloop + + align 4 + ydone: + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 + lea edi, [edi + 32] + + sub ecx, 16 + jg xloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// Bilinear column filtering. SSSE3 version. +// TODO(fbarchard): Port to Neon +// TODO(fbarchard): Switch the following: +// xor ebx, ebx +// mov bx, word ptr [esi + eax] // 2 source x0 pixels +// To +// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels +// when drmemory bug fixed. +// https://code.google.com/p/drmemory/issues/detail?id=1396 + +__declspec(naked) __declspec(align(16)) +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // 16 bit + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits + movd ebx, xmm0 + mov [edi], bl + + align 4 + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + sub ecx, 32 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. +__declspec(naked) __declspec(align(16)) +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + align 4 + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + sub ecx, 4 // 4 pixels + movdqu [edi], xmm0 + lea edi, [edi + 16] + jge xloop4 + + align 4 + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + align 4 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. +// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) __declspec(align(16)) +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, kShuffleColARGB + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + align 4 + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/vpxdec.c b/vpxdec.c index ed37c7083..127e65f89 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -873,8 +873,16 @@ int main_loop(int argc, const char **argv_) { } if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) { +#if CONFIG_LIBYUV vpx_image_scale(img, scaled_img, kFilterBox); img = scaled_img; +#else + fprintf(stderr, "Failed to scale output frame: %s.\n" + "Scaling is disabled in this configuration. " + "To enable scaling, configure with --enable-libyuv\n", + vpx_codec_error(&decoder)); + return EXIT_FAILURE; +#endif } } diff --git a/vpxenc.c b/vpxenc.c index 4d9629408..d46a83eb0 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1268,6 +1268,7 @@ static void encode_frame(struct stream_state *stream, fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name); exit(EXIT_FAILURE); } +#if CONFIG_LIBYUV if (!stream->img) stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16); @@ -1283,8 +1284,15 @@ static void encode_frame(struct stream_state *stream, stream->img->stride[VPX_PLANE_V], stream->img->d_w, stream->img->d_h, kFilterBox); - img = stream->img; +#else + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "Scaling disabled in this configuration. \n" + "To enable, configure with --enable-libyuv\n", + stream->index); +#endif } vpx_usec_timer_start(&timer);