From b644eb9f446c89a25b26a19882f394b6f3711d64 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 22 Aug 2014 10:31:01 -0700 Subject: [PATCH] libyuv: update to r1060 picks up some lint, build fixes Change-Id: I0efb19385afa4ea3073a53e2b8334e57f245eea0 --- third_party/libyuv/README.libvpx | 2 +- .../libyuv/include/libyuv/mjpeg_decoder.h | 1 - third_party/libyuv/include/libyuv/row.h | 90 +++- third_party/libyuv/include/libyuv/scale_row.h | 8 + third_party/libyuv/include/libyuv/version.h | 2 +- third_party/libyuv/source/compare.cc | 2 +- third_party/libyuv/source/compare_neon.cc | 39 ++ third_party/libyuv/source/convert.cc | 306 ++++++------ .../libyuv/source/convert_from_argb.cc | 168 ++++--- third_party/libyuv/source/cpu_id.cc | 11 +- .../libyuv/source/format_conversion.cc | 12 +- third_party/libyuv/source/mjpeg_decoder.cc | 10 +- third_party/libyuv/source/row_any.cc | 24 +- third_party/libyuv/source/row_neon64.cc | 437 +++++++++--------- third_party/libyuv/source/row_win.cc | 4 +- 15 files changed, 663 insertions(+), 453 deletions(-) diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx index fa5b498ca..fb2b9d162 100644 --- a/third_party/libyuv/README.libvpx +++ b/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1041 +Version: 1060 License: BSD License File: LICENSE diff --git a/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/libyuv/include/libyuv/mjpeg_decoder.h index 82fd95df2..8423121d1 100644 --- a/third_party/libyuv/include/libyuv/mjpeg_decoder.h +++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -153,7 +153,6 @@ class LIBYUV_API MJpegDecoder { int* subsample_x, int* subsample_y, int number_of_components); private: - void AllocOutputBuffers(int num_outbufs); void DestroyOutputBuffers(); diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index fdfe1ae35..4b3c870f9 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -252,6 +252,94 @@ extern "C" { // The following are available on arm64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +// #define HAS_I444TOARGBROW_NEON +// #define HAS_I422TOARGBROW_NEON +// #define HAS_I411TOARGBROW_NEON +// #define HAS_I422TOBGRAROW_NEON +// #define HAS_I422TOABGRROW_NEON +// #define HAS_I422TORGBAROW_NEON +// #define HAS_I422TORGB24ROW_NEON +// #define HAS_I422TORAWROW_NEON +// #define HAS_I422TORGB565ROW_NEON +// #define HAS_I422TOARGB1555ROW_NEON +// #define HAS_I422TOARGB4444ROW_NEON +// #define HAS_YTOARGBROW_NEON +// #define HAS_I400TOARGBROW_NEON +// #define HAS_NV12TOARGBROW_NEON +// #define HAS_NV21TOARGBROW_NEON +// #define HAS_NV12TORGB565ROW_NEON +// #define HAS_NV21TORGB565ROW_NEON +// #define HAS_YUY2TOARGBROW_NEON +// #define HAS_UYVYTOARGBROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_COPYROW_NEON +#define HAS_SETROW_NEON +#define HAS_ARGBSETROWS_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RAWTOARGBROW_NEON +// #define HAS_RGB565TOARGBROW_NEON +// #define HAS_ARGB1555TOARGBROW_NEON +// #define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_YUY2TOYROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_HALFROW_NEON +#define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I422TOUYVYROW_NEON +// #define HAS_ARGBTORGB565ROW_NEON +// #define HAS_ARGBTOARGB1555ROW_NEON +// #define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOYJROW_NEON +// #define HAS_ARGBTOUV444ROW_NEON +// #define HAS_ARGBTOUV422ROW_NEON +// #define HAS_ARGBTOUV411ROW_NEON +// #define HAS_ARGBTOUVROW_NEON +// #define HAS_ARGBTOUVJROW_NEON +// #define HAS_BGRATOUVROW_NEON +// #define HAS_ABGRTOUVROW_NEON +// #define HAS_RGBATOUVROW_NEON +// #define HAS_RGB24TOUVROW_NEON +// #define HAS_RAWTOUVROW_NEON +// #define HAS_RGB565TOUVROW_NEON +// #define HAS_ARGB1555TOUVROW_NEON +// #define HAS_ARGB4444TOUVROW_NEON +// #define HAS_RGB565TOYROW_NEON +// #define HAS_ARGB1555TOYROW_NEON +// #define HAS_ARGB4444TOYROW_NEON +// #define HAS_BGRATOYROW_NEON +// #define HAS_ABGRTOYROW_NEON +// #define HAS_RGBATOYROW_NEON +// #define HAS_RGB24TOYROW_NEON +// #define HAS_RAWTOYROW_NEON +// #define HAS_INTERPOLATEROW_NEON +// #define HAS_ARGBBLENDROW_NEON +// #define HAS_ARGBATTENUATEROW_NEON +// #define HAS_ARGBQUANTIZEROW_NEON +// #define HAS_ARGBSHADEROW_NEON +// #define HAS_ARGBGRAYROW_NEON +// #define HAS_ARGBSEPIAROW_NEON +// #define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELYROW_NEON #endif // The following are available on Neon platforms: @@ -465,7 +553,7 @@ typedef uint8 uvec8[16]; #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" #endif // defined(__native_client__) && defined(__x86_64__) -#if defined(__arm__) +#if defined(__arm__) || defined(__aarch64__) #undef MEMACCESS #if defined(__native_client__) #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h index 8dc0762f2..3c495424f 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -51,6 +51,14 @@ extern "C" { #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWN2_NEON +#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__aarch64__) || defined(LIBYUV_NEON)) +/* #define HAS_SCALEROWDOWN2_NEON */ +/* #define HAS_SCALEROWDOWN4_NEON */ +/* #define HAS_SCALEROWDOWN34_NEON */ +/* #define HAS_SCALEROWDOWN38_NEON */ +/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */ +/* #define HAS_SCALEARGBROWDOWN2_NEON */ #endif // The following are available on Mips platforms: diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h index 912c4c9e0..73a7f1b01 100644 --- a/third_party/libyuv/include/libyuv/version.h +++ b/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1041 +#define LIBYUV_VERSION 1059 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc index 9ea81b4e2..dc715e019 100644 --- a/third_party/libyuv/source/compare.cc +++ b/third_party/libyuv/source/compare.cc @@ -80,7 +80,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); #if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SUMSQUAREERROR_NEON uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); #endif diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc index 5e7b8e443..55052c0ee 100644 --- a/third_party/libyuv/source/compare_neon.cc +++ b/third_party/libyuv/source/compare_neon.cc @@ -56,6 +56,45 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { return sse; } +#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %2, %2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "bgt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc index 874a6cb7c..a8e294f47 100644 --- a/third_party/libyuv/source/convert.cc +++ b/third_party/libyuv/source/convert.cc @@ -401,7 +401,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height) { int y; - int halfheight = (height + 1) >> 1; + int halfheight; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUV422Row_C; @@ -711,11 +711,13 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif @@ -963,9 +965,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1022,36 +1021,44 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_RGB24TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_RGB24TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RGB24TOYROW_NEON) - RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); - RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_RGB24TOYROW_NEON) - RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); #else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_RGB24TOYROW_NEON) - free_aligned_buffer_64(row); + free_aligned_buffer_64(row); #endif + } return 0; } @@ -1075,9 +1082,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1134,36 +1138,42 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_RAWTOYROW_NEON - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RAWTOYROW_NEON) - RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); - RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); -#else - RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_raw += src_stride_raw * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if defined(HAS_RAWTOYROW_NEON) - RAWToUVRow(src_raw, 0, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); -#else - RAWToARGBRow(src_raw, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + for (y = 0; y < height - 1; y += 2) { + #if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); + #else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + #endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + #if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + #else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + #endif + } + #if !defined(HAS_RAWTOYROW_NEON) + free_aligned_buffer_64(row); + #endif } -#if !defined(HAS_RAWTOYROW_NEON) - free_aligned_buffer_64(row); -#endif return 0; } @@ -1187,9 +1197,6 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1246,36 +1253,44 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_RGB565TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_RGB565TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RGB565TOYROW_NEON) - RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); - RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); #else - RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_rgb565 += src_stride_rgb565 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_RGB565TOYROW_NEON) - RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); #else - RGB565ToARGBRow(src_rgb565, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_RGB565TOYROW_NEON) - free_aligned_buffer_64(row); + free_aligned_buffer_64(row); #endif + } return 0; } @@ -1299,9 +1314,6 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1358,38 +1370,45 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGB1555TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_ARGB1555TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_ARGB1555TOYROW_NEON) - ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); - ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, - width); + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); #else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_argb1555 += src_stride_argb1555 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_ARGB1555TOYROW_NEON) - ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); #else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_ARGB1555TOYROW_NEON) free_aligned_buffer_64(row); #endif + } return 0; } @@ -1413,9 +1432,6 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1472,38 +1488,46 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGB4444TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_ARGB4444TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_ARGB4444TOYROW_NEON) - ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); - ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, - width); + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); #else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_argb4444 += src_stride_argb4444 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_ARGB4444TOYROW_NEON) - ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); #else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_ARGB4444TOYROW_NEON) - free_aligned_buffer_64(row); + free_aligned_buffer_64(row); #endif + } return 0; } diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc index 121a41611..de461ddb0 100644 --- a/third_party/libyuv/source/convert_from_argb.cc +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -60,6 +60,13 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { @@ -76,10 +83,8 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, #elif defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYRow = ARGBToYRow_Any_NEON; - ARGBToUV444Row = ARGBToUV444Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; - ARGBToUV444Row = ARGBToUV444Row_NEON; } } #endif @@ -134,6 +139,13 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) @@ -153,12 +165,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } } #endif @@ -228,11 +234,13 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 32) { - ARGBToUV411Row = ARGBToUV411Row_Any_NEON; - if (IS_ALIGNED(width, 32)) { - ARGBToUV411Row = ARGBToUV411Row_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUV411ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 32) { + ARGBToUV411Row = ARGBToUV411Row_Any_NEON; + if (IS_ALIGNED(width, 32)) { + ARGBToUV411Row = ARGBToUV411Row_NEON; } } #endif @@ -261,9 +269,6 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); - uint8* row_v = row_u + ((halfwidth + 15) & ~15); if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { @@ -296,11 +301,13 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif @@ -331,22 +338,27 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); + uint8* row_v = row_u + ((halfwidth + 15) & ~15); - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); } - free_aligned_buffer_64(row_u); return 0; } @@ -364,9 +376,6 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); - uint8* row_v = row_u + ((halfwidth + 15) & ~15); if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { @@ -399,11 +408,13 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif @@ -434,22 +445,27 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); + uint8* row_v = row_u + ((halfwidth + 15) & ~15); - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); } - free_aligned_buffer_64(row_u); return 0; } @@ -493,6 +509,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { @@ -510,12 +533,6 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } } #endif @@ -594,6 +611,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { @@ -611,12 +635,6 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } } #endif @@ -1022,11 +1040,13 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_NEON; } - if (width >= 16) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; } } #endif diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc index 2e0d61d20..deb4c4465 100644 --- a/third_party/libyuv/source/cpu_id.cc +++ b/third_party/libyuv/source/cpu_id.cc @@ -14,7 +14,7 @@ #include // For __cpuidex() #endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(__native_client__) && defined(_M_X64) && \ + !defined(__native_client__) && \ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) #include // For _xgetbv() #endif @@ -97,7 +97,7 @@ int TestOsSaveYmm() { uint32 xcr0 = 0u; #if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. -#elif defined(_M_IX86) +#elif defined(_M_IX86) && defined(_MSC_VER) __asm { xor ecx, ecx // xcr 0 _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. @@ -256,12 +256,17 @@ int InitCpuFlags(void) { if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { cpu_info_ &= ~kCpuHasMIPS_DSPR2; } -#elif defined(__arm__) +#elif defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. // For Linux, /proc/cpuinfo can be tested but without that assume Neon. #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) cpu_info_ = kCpuHasNEON; +// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon +// flag in it. +// So for aarch64, neon enabling is hard coded here. +#elif defined(__aarch64__) + cpu_info_ = kCpuHasNEON; #else // Linux arm parse text file for neon detect. cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc index a3daf96a9..3c1737153 100644 --- a/third_party/libyuv/source/format_conversion.cc +++ b/third_party/libyuv/source/format_conversion.cc @@ -332,11 +332,13 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc index 15b0ed88a..36028c3cc 100644 --- a/third_party/libyuv/source/mjpeg_decoder.cc +++ b/third_party/libyuv/source/mjpeg_decoder.cc @@ -13,8 +13,8 @@ #ifdef HAVE_JPEG #include -#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\ - !defined(TARGET_IPHONE_SIMULATOR) +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) // Must be included before jpeglib. #include #define HAVE_SETJMP @@ -101,7 +101,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { } buf_.data = src; - buf_.len = (int)(src_len); + buf_.len = static_cast(src_len); buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP @@ -411,7 +411,7 @@ void init_source(j_decompress_ptr cinfo) { } boolean fill_input_buffer(j_decompress_ptr cinfo) { - BufferVector* buf_vec = (BufferVector*)(cinfo->client_data); + BufferVector* buf_vec = reinterpret_cast(cinfo->client_data); if (buf_vec->pos >= buf_vec->len) { assert(0 && "No more data"); // ERROR: No more data @@ -447,7 +447,7 @@ void ErrorHandler(j_common_ptr cinfo) { // ERROR: Error in jpeglib: buf #endif - SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err); + SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err); // This rewinds the call stack to the point of the corresponding setjmp() // and causes it to return (for a second time) with value 1. longjmp(mgr->setjmp_buffer, 1); diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc index 97ef84417..ce8b3dad1 100644 --- a/third_party/libyuv/source/row_any.cc +++ b/third_party/libyuv/source/row_any.cc @@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) +#endif // HAS_I422TOARGBROW_NEON +#ifdef HAS_I422TOYUY2ROW_NEON YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) +#endif // HAS_I422TOYUY2ROW_NEON +#ifdef HAS_I422TOUYVYROW_NEON YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) -#endif // HAS_I422TOARGBROW_NEON +#endif // HAS_I422TOUYVYROW_NEON #undef YANY // Wrappers to handle odd width @@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_YUY2TOYROW_NEON YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_UYVYTOYROW_NEON YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RAWTOARGBROW_NEON YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) #endif #undef YANY @@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) #endif #undef UVANY diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc index 46e9ceb33..21111cf60 100644 --- a/third_party/libyuv/source/row_neon64.cc +++ b/third_party/libyuv/source/row_neon64.cc @@ -824,19 +824,19 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store U + "st1 {v0.16b}, [%1], #16 \n" // store U MEMACCESS(2) - "vst1.8 {q1}, [%2]! \n" // store V + "st1 {v1.16b}, [%2], #16 \n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 // Output registers : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_SPLITUVROW_NEON @@ -849,12 +849,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load U + "ld1 {v0.16b}, [%0], #16 \n" // load U MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load V + "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(2) - "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV "bgt 1b \n" : "+r"(src_u), // %0 @@ -862,7 +862,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "+r"(dst_uv), // %2 "+r"(width) // %3 // Output registers : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_MERGEUVROW_NEON @@ -874,16 +874,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop MEMACCESS(1) - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(count) // %2 // Output registers : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_COPYROW_NEON @@ -892,16 +892,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { #ifdef HAS_SETROW_NEON void SetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints + "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store + "st1 {v0.16b}, [%0], #16 \n" // store "bgt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 - : "cc", "memory", "q0" + : "cc", "memory", "v0" ); } #endif // HAS_SETROW_NEON @@ -922,26 +922,25 @@ void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { asm volatile ( // Start at end of source row. - "mov r3, #-16 \n" "add %0, %0, %2 \n" - "sub %0, #16 \n" + "sub %0, %0, #16 \n" ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" + "st1 {v0.D}[0], [%1], #8 \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" ); } #endif // HAS_MIRRORROW_NEON @@ -951,27 +950,27 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( // Start at end of source row. - "mov r12, #-16 \n" "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" + "sub %0, %0, #16 \n" ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %3, %3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // dst += 8 + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" + "st1 {v1.8b}, [%2], #8 \n" "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0" + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1" ); } #endif // HAS_MIRRORUVROW_NEON @@ -980,26 +979,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { asm volatile ( // Start at end of source row. - "mov r3, #-16 \n" "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" + "sub %0, %0, #16 \n" ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" + "st1 {v0.D}[0], [%1], #8 \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" ); } #endif // HAS_ARGBMIRRORROW_NEON @@ -1007,20 +1005,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { #ifdef HAS_RGB24TOARGBROW_NEON void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha + "movi v4.8b, #255 \n" // Alpha ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } #endif // HAS_RGB24TOARGBROW_NEON @@ -1028,21 +1026,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { #ifdef HAS_RAWTOARGBROW_NEON void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha + "movi v5.8b, #255 \n" // Alpha ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B + "mov v3.8b, v1.8b \n" // move g + "mov v4.8b, v0.8b \n" // move r MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } #endif // HAS_RAWTOARGBROW_NEON @@ -1170,16 +1169,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } #endif // HAS_ARGBTORGB24ROW_NEON @@ -1190,17 +1189,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B + "mov v4.8b, v2.8b \n" // mov g + "mov v5.8b, v1.8b \n" // mov b MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } #endif // HAS_ARGBTORAWROW_NEON @@ -1211,16 +1211,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_YUY2TOYROW_NEON @@ -1231,16 +1231,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_UYVYTOYROW_NEON @@ -1252,19 +1252,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 U. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) - "vst1.8 {d3}, [%2]! \n" // store 8 V. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_YUY2TOUV422ROW_NEON @@ -1276,19 +1276,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 U. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) - "vst1.8 {d2}, [%2]! \n" // store 8 V. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_UYVYTOUV422ROW_NEON @@ -1297,20 +1297,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 + "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2 ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. MEMACCESS(3) - "vst1.8 {d3}, [%3]! \n" // store 8 V. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(stride_yuy2), // %1 @@ -1318,7 +1318,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } #endif // HAS_YUY2TOUVROW_NEON @@ -1327,20 +1327,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - "add %1, %0, %1 \n" // stride + src_uyvy + "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. MEMACCESS(3) - "vst1.8 {d2}, [%3]! \n" // store 8 V. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(stride_uyvy), // %1 @@ -1348,7 +1348,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } #endif // HAS_UYVYTOUVROW_NEON @@ -1358,23 +1358,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { asm volatile ( // change the stride to row 2 pointer - "add %1, %0 \n" + "add %x1, %x0, %w1, sxtw \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels. "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. - "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels. + "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" + "st1 {v0.16b}, [%2], #16 \n" "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(src_uv_stride), // %1 "+r"(dst_uv), // %2 "+r"(pix) // %3 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_HALFROW_NEON @@ -1384,22 +1384,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( - "vmov.u32 d6[0], %3 \n" // selector + "mov v2.s[0], %w3 \n" // selector "1: \n" MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop - "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels - "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels - "vtrn.u32 d4, d5 \n" // combine 8 pixels + "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels + "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels + "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" // store 8. + "st1 {v4.8b}, [%1], #8 \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 : "r"(selector) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List ); } #endif // HAS_ARGBTOBAYERROW_NEON @@ -1411,16 +1411,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_ARGBTOBAYERGGROW_NEON @@ -1431,21 +1431,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // shuffler + "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 4. + "st1 {v1.16b}, [%1], #16 \n" // store 4. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List + : "cc", "memory", "v0", "v1", "v2" // Clobber List ); } #endif // HAS_ARGBSHUFFLEROW_NEON @@ -1459,14 +1458,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "mov v2.8b, v1.8b \n" MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 Us + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) - "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -1474,7 +1474,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, "+r"(dst_yuy2), // %3 "+r"(width) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_I422TOYUY2ROW_NEON @@ -1488,14 +1488,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys + "mov v3.8b, v2.8b \n" MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 Us + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) - "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -1503,7 +1504,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, "+r"(dst_uyvy), // %3 "+r"(width) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_I422TOUYVYROW_NEON @@ -1577,28 +1578,28 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, #ifdef HAS_ARGBTOYROW_NEON void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBTOYROW_NEON @@ -1606,26 +1607,26 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { #ifdef HAS_ARGBTOYJROW_NEON void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } #endif // HAS_ARGBTOYJROW_NEON @@ -3048,20 +3049,20 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb0), // %0 @@ -3069,7 +3070,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBMULTIPLYROW_NEON @@ -3083,14 +3084,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb0), // %0 @@ -3098,7 +3101,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBADDROW_NEON @@ -3112,14 +3115,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb0), // %0 @@ -3127,7 +3132,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBSUBTRACTROW_NEON @@ -3141,27 +3146,27 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d3, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "mov v1.8b, v0.8b \n" + "mov v2.8b, v0.8b \n" MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_SOBELROW_NEON @@ -3175,20 +3180,20 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add + "uqadd v0.16b, v0.16b, v1.16b \n" // add MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1" ); } #endif // HAS_SOBELTOPLANEROW_NEON @@ -3202,25 +3207,25 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d3, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add + "uqadd v1.8b, v0.8b, v2.8b \n" // add MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_SOBELXYROW_NEON @@ -3236,28 +3241,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0],%5 \n" // top + "ld1 {v0.8b}, [%0],%5 \n" // top MEMACCESS(0) - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" MEMACCESS(1) - "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 MEMACCESS(1) - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" MEMACCESS(2) - "vld1.8 {d2}, [%2],%5 \n" // bottom + "ld1 {v2.8b}, [%2],%5 \n" // bottom MEMACCESS(2) - "vld1.8 {d3}, [%2],%6 \n" + "ld1 {v3.8b}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" MEMACCESS(3) - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -3266,7 +3271,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %4 : "r"(2), // %5 "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_SOBELXROW_NEON @@ -3282,28 +3287,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0],%4 \n" // left + "ld1 {v0.8b}, [%0],%4 \n" // left MEMACCESS(1) - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" MEMACCESS(0) - "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 MEMACCESS(1) - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" MEMACCESS(0) - "vld1.8 {d2}, [%0],%5 \n" // right + "ld1 {v2.8b}, [%0],%5 \n" // right MEMACCESS(1) - "vld1.8 {d3}, [%1],%5 \n" + "ld1 {v3.8b}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -3311,7 +3316,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %3 : "r"(1), // %4 "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_SOBELYROW_NEON diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc index 8eb888926..f58fc5138 100644 --- a/third_party/libyuv/source/row_win.cc +++ b/third_party/libyuv/source/row_win.cc @@ -10,7 +10,7 @@ #include "libyuv/row.h" -#if defined (_M_X64) +#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) #include #include // For _mm_maddubs_epi16 #endif @@ -78,7 +78,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* dst_argb, int width) { - __m128i xmm0, xmm1, xmm2, xmm3; const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm4 = _mm_setzero_si128(); @@ -132,7 +131,6 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* dst_argb, int width) { - __m128i xmm0, xmm1, xmm2, xmm3; const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm4 = _mm_setzero_si128(); -- 2.40.0