2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "libyuv/row.h"
13 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
14 defined(_MSC_VER) && !defined(__clang__)
15 #include <emmintrin.h>
16 #include <tmmintrin.h> // For _mm_maddubs_epi16
24 // This module is for Visual C.
25 #if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
26 defined(_MSC_VER) && !defined(__clang__)
// Tail of the YuvConstants struct declaration (the opening of the struct is
// not visible in this excerpt).  The trailing numeric comments appear to be
// byte offsets of each field within the struct — TODO confirm against the
// full declaration.
32 lvec16 kUVBiasB; // 96
33 lvec16 kUVBiasG; // 128
34 lvec16 kUVBiasR; // 160
35 lvec16 kYToRgb; // 192
38 // BT.601 YUV to RGB reference
39 // R = (Y - 16) * 1.164 - V * -1.596
40 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
41 // B = (Y - 16) * 1.164 - U * -2.018
// All coefficients below are the reference values scaled to 6-bit fixed
// point (x64) so that pmaddubsw/pmulhuw can be used; results are later
// shifted right by 6.
43 // Y contribution to R,G,B. Scale and bias.
44 // TODO(fbarchard): Consider moving constants into a common header.
45 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
46 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
48 // U and V contributions to R,G,B.
// UB is clamped to -128 because round(-2.018 * 64) = -129 does not fit int8.
49 #define UB -128 /* max(-128, round(-2.018 * 64)) */
50 #define UG 25 /* round(0.391 * 64) */
51 #define VG 52 /* round(0.813 * 64) */
52 #define VR -102 /* round(-1.596 * 64) */
54 // Bias values to subtract 16 from Y and 128 from U and V.
// Folding the 128 chroma bias and the Y bias (YGB) into one constant lets
// the row functions do a single psubw/paddw per channel.
55 #define BB (UB * 128 + YGB)
56 #define BG (UG * 128 + VG * 128 + YGB)
57 #define BR (VR * 128 + YGB)
59 // BT601 constants for YUV to RGB.
// Coefficients are interleaved U,V byte pairs so pmaddubsw computes the
// chroma contribution for B, G and R in one multiply-add; the bias and
// Y-scale rows are 16-bit lanes.  NOTE(review): the closing "};" of this
// initializer is not visible in this excerpt.
60 static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
61 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
62 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
63 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
64 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
65 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
66 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
67 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
68 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
69 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
70 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
73 // BT601 constants for NV21 where chroma plane is VU instead of UV.
// Same values as kYuvConstants but with the U/V byte positions swapped so
// the same row code can consume a VU-interleaved plane.  NOTE(review): the
// closing "};" is not visible in this excerpt.
74 static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
75 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
76 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
77 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
78 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
79 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
80 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
81 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
82 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
83 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
84 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
97 // JPEG YUV to RGB reference
98 // * R = Y - V * -1.40200
99 // * G = Y - U * 0.34414 - V * 0.71414
100 // * B = Y - U * -1.77200
// Full-range (JPEG/JFIF) variant: Y has no -16 offset and unity gain, so
// YGJ is 1.0 in the same fixed-point scheme used by YG above.
102 // Y contribution to R,G,B. Scale and bias.
103 // TODO(fbarchard): Consider moving constants into a common header.
104 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
105 #define YGBJ 32 /* 64 / 2 */
107 // U and V contributions to R,G,B.
108 #define UBJ -113 /* round(-1.77200 * 64) */
109 #define UGJ 22 /* round(0.34414 * 64) */
110 #define VGJ 46 /* round(0.71414 * 64) */
111 #define VRJ -90 /* round(-1.40200 * 64) */
113 // Bias values to subtract 16 from Y and 128 from U and V.
114 #define BBJ (UBJ * 128 + YGBJ)
115 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
116 #define BRJ (VRJ * 128 + YGBJ)
118 // JPEG constants for YUV to RGB.
// Same layout as kYuvConstants (interleaved U,V byte coefficients followed
// by 16-bit bias and Y-scale rows) with full-range JPEG coefficients.
// NOTE(review): the closing "};" is not visible in this excerpt.
119 static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
120 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
121 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
122 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
123 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
124 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
125 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
126 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
127 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
128 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
129 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
130 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
131 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
132 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
133 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
134 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
135 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
150 #if defined(HAS_I422TOARGBROW_SSSE3)
// Convert I422 (planar Y + half-width U,V) to ARGB using SSE intrinsics —
// the x64 MSVC build cannot use inline asm.  Each visible iteration reads
// 4 U and 4 V bytes plus 8 Y bytes and writes 8 ARGB pixels (32 bytes).
// NOTE(review): the remaining parameter lines, loop control and closing
// braces are missing from this excerpt; code is kept byte-identical.
151 void I422ToARGBRow_SSSE3(const uint8* y_buf,
156 __m128i xmm0, xmm1, xmm2, xmm3;
// xmm5 = 0xff in every byte: used as the opaque alpha channel below.
157 const __m128i xmm5 = _mm_set1_epi8(-1);
// V is addressed as u_buf + offset so one pointer register covers both.
158 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
161 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
162 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
// Interleave to UVUV... and double each pair so 4 chroma samples cover
// 8 output pixels.
163 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
164 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
165 xmm1 = _mm_loadu_si128(&xmm0);
166 xmm2 = _mm_loadu_si128(&xmm0);
// pmaddubsw applies the interleaved U,V coefficient pairs per channel.
167 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
168 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
169 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
// Subtract from the bias (bias - uv) so the later Y add yields the result.
170 xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
171 xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
172 xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
// Scale 8 Y bytes: duplicate to 16 bits then take the high half of the
// unsigned multiply by the fixed-point Y gain.
173 xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
174 xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
175 xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
176 xmm0 = _mm_adds_epi16(xmm0, xmm3);
177 xmm1 = _mm_adds_epi16(xmm1, xmm3);
178 xmm2 = _mm_adds_epi16(xmm2, xmm3);
// Drop the 6 fixed-point fraction bits, then saturate to bytes.
179 xmm0 = _mm_srai_epi16(xmm0, 6);
180 xmm1 = _mm_srai_epi16(xmm1, 6);
181 xmm2 = _mm_srai_epi16(xmm2, 6);
182 xmm0 = _mm_packus_epi16(xmm0, xmm0);
183 xmm1 = _mm_packus_epi16(xmm1, xmm1);
184 xmm2 = _mm_packus_epi16(xmm2, xmm2);
// Interleave B,G and R,A then widen to BGRA dwords for the two stores.
185 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
186 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
187 xmm1 = _mm_loadu_si128(&xmm0);
188 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
189 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
191 _mm_storeu_si128((__m128i *)dst_argb, xmm0);
192 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
202 #else // defined(_M_X64)
203 #ifdef HAS_ARGBTOYROW_SSSE3
205 // Constants for ARGB.
// Per-channel weights for pmaddubsw, repeated for 4 pixels per register.
// "J" variants are full-range JPEG coefficients.  NOTE(review): the
// closing "};" of each initializer is not visible in this excerpt.
206 static const vec8 kARGBToY = {
207 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
211 static const vec8 kARGBToYJ = {
212 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
215 static const vec8 kARGBToU = {
216 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
219 static const vec8 kARGBToUJ = {
220 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
223 static const vec8 kARGBToV = {
224 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
227 static const vec8 kARGBToVJ = {
228 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
231 // vpshufb for vphaddw + vpackuswb packed to shorts.
232 static const lvec8 kShufARGBToUV_AVX = {
233 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
234 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
237 // Constants for BGRA.
// Same Y/U/V weights as the ARGB tables above, permuted to match each
// format's byte order.  NOTE(review): closing "};" lines are not visible
// in this excerpt.
238 static const vec8 kBGRAToY = {
239 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
242 static const vec8 kBGRAToU = {
243 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
246 static const vec8 kBGRAToV = {
247 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
250 // Constants for ABGR.
251 static const vec8 kABGRToY = {
252 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
255 static const vec8 kABGRToU = {
256 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
259 static const vec8 kABGRToV = {
260 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
263 // Constants for RGBA.
264 static const vec8 kRGBAToY = {
265 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
268 static const vec8 kRGBAToU = {
269 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
272 static const vec8 kRGBAToV = {
273 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
// Bias vectors and pshufb shuffle tables shared by the row functions below.
// In shuffle tables a byte value of 128 has the high bit set, which makes
// pshufb write zero to that output byte.  NOTE(review): closing "};" lines
// are not visible in this excerpt.
276 static const uvec8 kAddY16 = {
277 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
280 // 7 bit fixed point 0.5.
281 static const vec16 kAddYJ64 = {
282 64, 64, 64, 64, 64, 64, 64, 64
285 static const uvec8 kAddUV128 = {
286 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
287 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
290 static const uvec16 kAddUVJ128 = {
291 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
294 // Shuffle table for converting RGB24 to ARGB.
295 static const uvec8 kShuffleMaskRGB24ToARGB = {
296 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
299 // Shuffle table for converting RAW to ARGB.
300 static const uvec8 kShuffleMaskRAWToARGB = {
301 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
304 // Shuffle table for converting ARGB to RGB24.
305 static const uvec8 kShuffleMaskARGBToRGB24 = {
306 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
309 // Shuffle table for converting ARGB to RAW.
310 static const uvec8 kShuffleMaskARGBToRAW = {
311 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
314 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
315 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
316 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
319 // Shuffle table for converting ARGB to RAW.
320 static const uvec8 kShuffleMaskARGBToRAW_0 = {
321 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
324 // Duplicates gray value 3 times and fills in alpha opaque.
// J400 (gray-only) to ARGB, MSVC inline asm.  NOTE(review): the __asm
// block delimiters, loop label and most of the body are missing from this
// excerpt; lines are kept byte-identical.
326 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
328 mov eax, [esp + 4] // src_y
329 mov edx, [esp + 8] // dst_argb
330 mov ecx, [esp + 12] // pix
331 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
335 movq xmm0, qword ptr [eax]
344 movdqu [edx + 16], xmm1
352 #ifdef HAS_J400TOARGBROW_AVX2
353 // Duplicates gray value 3 times and fills in alpha opaque.
// AVX2 variant of J400ToARGBRow: widens gray bytes to 32-bit BGRA with
// 0xff alpha.  NOTE(review): loop label, load and trailing lines are
// missing from this excerpt.
355 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
357 mov eax, [esp + 4] // src_y
358 mov edx, [esp + 8] // dst_argb
359 mov ecx, [esp + 12] // pix
360 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
361 vpslld ymm5, ymm5, 24
// vpermq reorders 64-bit lanes to undo the in-lane behavior of the
// 256-bit unpack instructions.
366 vpermq ymm0, ymm0, 0xd8
367 vpunpcklbw ymm0, ymm0, ymm0
368 vpermq ymm0, ymm0, 0xd8
369 vpunpckhwd ymm1, ymm0, ymm0
370 vpunpcklwd ymm0, ymm0, ymm0
371 vpor ymm0, ymm0, ymm5
372 vpor ymm1, ymm1, ymm5
374 vmovdqu [edx + 32], ymm1
382 #endif // HAS_J400TOARGBROW_AVX2
// Convert packed 3-byte RGB24 to 4-byte ARGB: 48 input bytes become 16
// ARGB pixels per iteration, realigned with palignr and expanded via the
// kShuffleMaskRGB24ToARGB pshufb table.  NOTE(review): __asm delimiters,
// loop label and several body lines are missing from this excerpt.
385 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
387 mov eax, [esp + 4] // src_rgb24
388 mov edx, [esp + 8] // dst_argb
389 mov ecx, [esp + 12] // pix
390 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
392 movdqa xmm4, kShuffleMaskRGB24ToARGB
396 movdqu xmm1, [eax + 16]
397 movdqu xmm3, [eax + 32]
400 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
403 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
405 movdqu [edx + 32], xmm2
410 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
412 movdqu [edx + 16], xmm1
414 movdqu [edx + 48], xmm3
// Convert packed 3-byte RAW (RGB byte order reversed vs RGB24) to ARGB.
// Identical structure to RGB24ToARGBRow_SSSE3, differing only in the
// pshufb table (kShuffleMaskRAWToARGB swaps R and B).  NOTE(review): __asm
// delimiters, loop label and several body lines are missing from this
// excerpt.
423 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
426 mov eax, [esp + 4] // src_raw
427 mov edx, [esp + 8] // dst_argb
428 mov ecx, [esp + 12] // pix
429 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
431 movdqa xmm4, kShuffleMaskRAWToARGB
435 movdqu xmm1, [eax + 16]
436 movdqu xmm3, [eax + 32]
439 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
442 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
444 movdqu [edx + 32], xmm2
449 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
451 movdqu [edx + 16], xmm1
453 movdqu [edx + 48], xmm3
461 // pmul method to replicate bits.
462 // Math to replicate bits:
463 // (v << 8) | (v << 3)
466 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// RGB565 to ARGB: expands 5/6-bit channels to 8 bits by multiplying with a
// constant that replicates the high bits into the low bits (pmulhuw).
// NOTE(review): __asm delimiters, loop label and several body lines are
// missing from this excerpt.
469 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
472 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
475 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
478 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
480 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
483 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
486 mov eax, [esp + 4] // src_rgb565
487 mov edx, [esp + 8] // dst_argb
488 mov ecx, [esp + 12] // pix
493 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
496 pand xmm1, xmm3 // R in upper 5 bits
497 psllw xmm2, 11 // B in upper 5 bits
498 pmulhuw xmm1, xmm5 // * (256 + 8)
499 pmulhuw xmm2, xmm5 // * (256 + 8)
502 pand xmm0, xmm4 // G in middle 6 bits
503 pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
// Destination is addressed as eax*2 + edx because each 2-byte input pixel
// produces 4 output bytes (edx holds dst - 2*src).
508 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
509 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
517 #ifdef HAS_RGB565TOARGBROW_AVX2
518 // pmul method to replicate bits.
519 // Math to replicate bits:
520 // (v << 8) | (v << 3)
523 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// AVX2 variant of RGB565ToARGBRow: 16 pixels per iteration.
// NOTE(review): __asm delimiters, loop label and some body lines are
// missing from this excerpt.
525 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
528 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
530 vbroadcastss ymm5, xmm5
531 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
533 vbroadcastss ymm6, xmm6
534 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
535 vpsllw ymm3, ymm3, 11
536 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
537 vpsllw ymm4, ymm4, 10
539 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
542 mov eax, [esp + 4] // src_rgb565
543 mov edx, [esp + 8] // dst_argb
544 mov ecx, [esp + 12] // pix
549 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
550 vpand ymm1, ymm0, ymm3 // R in upper 5 bits
551 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
552 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
553 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
555 vpor ymm1, ymm1, ymm2 // RB
556 vpand ymm0, ymm0, ymm4 // G in middle 6 bits
557 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
558 vpor ymm0, ymm0, ymm7 // AG
559 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
560 vpermq ymm1, ymm1, 0xd8
561 vpunpckhbw ymm2, ymm1, ymm0
562 vpunpcklbw ymm1, ymm1, ymm0
563 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
564 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
572 #endif // HAS_RGB565TOARGBROW_AVX2
574 #ifdef HAS_ARGB1555TOARGBROW_AVX2
// ARGB1555 to ARGB, AVX2: replicates the 5-bit channels to 8 bits with
// pmulhuw and sign-extends the 1-bit alpha to a full byte with vpsraw.
// NOTE(review): __asm delimiters, loop label and some body lines are
// missing from this excerpt.
576 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
579 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
581 vbroadcastss ymm5, xmm5
582 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
584 vbroadcastss ymm6, xmm6
585 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
586 vpsllw ymm3, ymm3, 11
587 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
588 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
591 mov eax, [esp + 4] // src_argb1555
592 mov edx, [esp + 8] // dst_argb
593 mov ecx, [esp + 12] // pix
598 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
599 vpsllw ymm1, ymm0, 1 // R in upper 5 bits
600 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
601 vpand ymm1, ymm1, ymm3
602 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
603 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
605 vpor ymm1, ymm1, ymm2 // RB
// Arithmetic shift replicates the alpha bit (bit 15) across the high byte.
606 vpsraw ymm2, ymm0, 8 // A
607 vpand ymm0, ymm0, ymm4 // G in middle 5 bits
608 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
609 vpand ymm2, ymm2, ymm7
610 vpor ymm0, ymm0, ymm2 // AG
611 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
612 vpermq ymm1, ymm1, 0xd8
613 vpunpckhbw ymm2, ymm1, ymm0
614 vpunpcklbw ymm1, ymm1, ymm0
615 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
616 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
624 #endif // HAS_ARGB1555TOARGBROW_AVX2
626 #ifdef HAS_ARGB4444TOARGBROW_AVX2
// ARGB4444 to ARGB, AVX2: each 4-bit nibble is duplicated into both halves
// of its output byte.  NOTE(review): __asm delimiters, loop label and the
// nibble-duplication shift lines are missing from this excerpt.
628 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
631 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
633 vbroadcastss ymm4, xmm4
634 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
635 mov eax, [esp + 4] // src_argb4444
636 mov edx, [esp + 8] // dst_argb
637 mov ecx, [esp + 12] // pix
642 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
643 vpand ymm2, ymm0, ymm5 // mask high nibbles
644 vpand ymm0, ymm0, ymm4 // mask low nibbles
647 vpor ymm2, ymm2, ymm3
648 vpor ymm0, ymm0, ymm1
649 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
650 vpermq ymm2, ymm2, 0xd8
651 vpunpckhbw ymm1, ymm0, ymm2
652 vpunpcklbw ymm0, ymm0, ymm2
653 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
654 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
662 #endif // HAS_ARGB4444TOARGBROW_AVX2
// SSE2 variant of ARGB1555ToARGB: 8 pixels per iteration using the same
// bit-replication multipliers as the AVX2 version.  NOTE(review): __asm
// delimiters, loop label and several body lines are missing from this
// excerpt.
666 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
669 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
672 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
675 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
677 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
679 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
682 mov eax, [esp + 4] // src_argb1555
683 mov edx, [esp + 8] // dst_argb
684 mov ecx, [esp + 12] // pix
689 movdqu xmm0, [eax] // fetch 8 pixels of 1555
692 psllw xmm1, 1 // R in upper 5 bits
693 psllw xmm2, 11 // B in upper 5 bits
695 pmulhuw xmm2, xmm5 // * (256 + 8)
696 pmulhuw xmm1, xmm5 // * (256 + 8)
700 pand xmm0, xmm4 // G in middle 5 bits
702 pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
708 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
709 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
// SSE2 variant of ARGB4444ToARGB: 8 pixels per iteration; nibbles are
// masked then duplicated into full bytes.  NOTE(review): __asm delimiters,
// loop label and the shift/unpack lines are missing from this excerpt.
719 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
722 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
725 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
727 mov eax, [esp + 4] // src_argb4444
728 mov edx, [esp + 8] // dst_argb
729 mov ecx, [esp + 12] // pix
734 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
736 pand xmm0, xmm4 // mask low nibbles
737 pand xmm2, xmm5 // mask high nibbles
747 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
748 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
// Pack 16 ARGB pixels (64 bytes) to 48 bytes of RGB24: pshufb compacts
// each register to 12 bytes, then shifts/ors splice the pieces into three
// contiguous 16-byte stores.  NOTE(review): __asm delimiters, loop label
// and some pshufb lines are missing from this excerpt.
757 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
759 mov eax, [esp + 4] // src_argb
760 mov edx, [esp + 8] // dst_rgb
761 mov ecx, [esp + 12] // pix
762 movdqa xmm6, kShuffleMaskARGBToRGB24
765 movdqu xmm0, [eax] // fetch 16 pixels of argb
766 movdqu xmm1, [eax + 16]
767 movdqu xmm2, [eax + 32]
768 movdqu xmm3, [eax + 48]
770 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
774 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
775 psrldq xmm1, 4 // 8 bytes from 1
776 pslldq xmm4, 12 // 4 bytes from 1 for 0
777 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
778 por xmm0, xmm4 // 4 bytes from 1 for 0
779 pslldq xmm5, 8 // 8 bytes from 2 for 1
780 movdqu [edx], xmm0 // store 0
781 por xmm1, xmm5 // 8 bytes from 2 for 1
782 psrldq xmm2, 8 // 4 bytes from 2
783 pslldq xmm3, 4 // 12 bytes from 3 for 2
784 por xmm2, xmm3 // 12 bytes from 3 for 2
785 movdqu [edx + 16], xmm1 // store 1
786 movdqu [edx + 32], xmm2 // store 2
// Pack 16 ARGB pixels to 48 bytes of RAW (R/B swapped RGB24).  Identical
// splicing logic to ARGBToRGB24Row_SSSE3 with the kShuffleMaskARGBToRAW
// table.  NOTE(review): __asm delimiters, loop label and some pshufb
// lines are missing from this excerpt.
795 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
797 mov eax, [esp + 4] // src_argb
798 mov edx, [esp + 8] // dst_rgb
799 mov ecx, [esp + 12] // pix
800 movdqa xmm6, kShuffleMaskARGBToRAW
803 movdqu xmm0, [eax] // fetch 16 pixels of argb
804 movdqu xmm1, [eax + 16]
805 movdqu xmm2, [eax + 32]
806 movdqu xmm3, [eax + 48]
808 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
812 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
813 psrldq xmm1, 4 // 8 bytes from 1
814 pslldq xmm4, 12 // 4 bytes from 1 for 0
815 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
816 por xmm0, xmm4 // 4 bytes from 1 for 0
817 pslldq xmm5, 8 // 8 bytes from 2 for 1
818 movdqu [edx], xmm0 // store 0
819 por xmm1, xmm5 // 8 bytes from 2 for 1
820 psrldq xmm2, 8 // 4 bytes from 2
821 pslldq xmm3, 4 // 12 bytes from 3 for 2
822 por xmm2, xmm3 // 12 bytes from 3 for 2
823 movdqu [edx + 16], xmm1 // store 1
824 movdqu [edx + 32], xmm2 // store 2
// Pack 4 ARGB pixels down to RGB565 (8 bytes) per iteration by masking
// each channel into its 5/6/5 field and or-ing.  NOTE(review): __asm
// delimiters, loop label and the shift/mask body lines are missing from
// this excerpt.
834 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
836 mov eax, [esp + 4] // src_argb
837 mov edx, [esp + 8] // dst_rgb
838 mov ecx, [esp + 12] // pix
839 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
841 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
844 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
848 movdqu xmm0, [eax] // fetch 4 pixels of argb
849 movdqa xmm1, xmm0 // B
850 movdqa xmm2, xmm0 // G
859 por xmm0, xmm1 // BGR
862 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
// Same as ARGBToRGB565Row_SSE2 but adds a 4-byte dither pattern
// (replicated to 16 bytes, saturating add) before truncating to 565.
// NOTE(review): __asm delimiters, loop label and the shift/mask body
// lines are missing from this excerpt.
872 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
873 const uint32 dither4, int pix) {
876 mov eax, [esp + 4] // src_argb
877 mov edx, [esp + 8] // dst_rgb
878 movd xmm6, [esp + 12] // dither4
879 mov ecx, [esp + 16] // pix
880 punpcklbw xmm6, xmm6 // make dither 16 bytes
884 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
886 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
889 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
893 movdqu xmm0, [eax] // fetch 4 pixels of argb
894 paddusb xmm0, xmm6 // add dither
895 movdqa xmm1, xmm0 // B
896 movdqa xmm2, xmm0 // G
905 por xmm0, xmm1 // BGR
908 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
916 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 dithered ARGB to RGB565: 8 pixels per iteration.  The 4-byte
// dither word is broadcast and widened so every pixel in the register
// gets its per-column dither value.  NOTE(review): __asm delimiters, loop
// label and trailing lines are missing from this excerpt.
918 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
919 const uint32 dither4, int pix) {
921 mov eax, [esp + 4] // src_argb
922 mov edx, [esp + 8] // dst_rgb
923 vbroadcastss xmm6, [esp + 12] // dither4
924 mov ecx, [esp + 16] // pix
925 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
926 vpermq ymm6, ymm6, 0xd8
927 vpunpcklwd ymm6, ymm6, ymm6
928 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
929 vpsrld ymm3, ymm3, 27
930 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
931 vpsrld ymm4, ymm4, 26
933 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
936 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
937 vpaddusb ymm0, ymm0, ymm6 // add dither
938 vpsrld ymm2, ymm0, 5 // G
939 vpsrld ymm1, ymm0, 3 // B
940 vpsrld ymm0, ymm0, 8 // R
941 vpand ymm2, ymm2, ymm4 // G
942 vpand ymm1, ymm1, ymm3 // B
943 vpand ymm0, ymm0, ymm5 // R
944 vpor ymm1, ymm1, ymm2 // BG
945 vpor ymm0, ymm0, ymm1 // BGR
// vpackusdw narrows the 32-bit 565 values to 16 bits; vpermq repairs the
// in-lane pack order before the 16-byte store.
946 vpackusdw ymm0, ymm0, ymm0
947 vpermq ymm0, ymm0, 0xd8
949 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
957 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
959 // TODO(fbarchard): Improve sign extension/packing.
// Pack 4 ARGB pixels down to ARGB1555 (8 bytes) per iteration: 5 bits per
// color channel plus a 1-bit alpha in the top bit.  NOTE(review): __asm
// delimiters, loop label and the shift/mask body lines are missing from
// this excerpt.
961 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
963 mov eax, [esp + 4] // src_argb
964 mov edx, [esp + 8] // dst_rgb
965 mov ecx, [esp + 12] // pix
966 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
968 movdqa xmm5, xmm4 // generate mask 0x000003e0
970 movdqa xmm6, xmm4 // generate mask 0x00007c00
972 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
976 movdqu xmm0, [eax] // fetch 4 pixels of argb
977 movdqa xmm1, xmm0 // B
978 movdqa xmm2, xmm0 // G
979 movdqa xmm3, xmm0 // R
990 por xmm0, xmm2 // BGRA
993 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
// Pack 4 ARGB pixels down to ARGB4444 (8 bytes) per iteration by keeping
// the high nibble of each byte.  NOTE(review): __asm delimiters, loop
// label and the shift/pack lines are missing from this excerpt.
1002 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1004 mov eax, [esp + 4] // src_argb
1005 mov edx, [esp + 8] // dst_rgb
1006 mov ecx, [esp + 12] // pix
1007 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
1009 movdqa xmm3, xmm4 // generate mask 0x00f000f0
1013 movdqu xmm0, [eax] // fetch 4 pixels of argb
1015 pand xmm0, xmm3 // low nibble
1016 pand xmm1, xmm4 // high nibble
1022 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
1030 #ifdef HAS_ARGBTORGB565ROW_AVX2
// AVX2 ARGB to RGB565: 8 pixels per iteration, no dithering — same field
// extraction as the dither variant above.  NOTE(review): __asm
// delimiters, loop label and trailing lines are missing from this
// excerpt.
1032 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1034 mov eax, [esp + 4] // src_argb
1035 mov edx, [esp + 8] // dst_rgb
1036 mov ecx, [esp + 12] // pix
1037 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
1038 vpsrld ymm3, ymm3, 27
1039 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
1040 vpsrld ymm4, ymm4, 26
1041 vpslld ymm4, ymm4, 5
1042 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
1045 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1046 vpsrld ymm2, ymm0, 5 // G
1047 vpsrld ymm1, ymm0, 3 // B
1048 vpsrld ymm0, ymm0, 8 // R
1049 vpand ymm2, ymm2, ymm4 // G
1050 vpand ymm1, ymm1, ymm3 // B
1051 vpand ymm0, ymm0, ymm5 // R
1052 vpor ymm1, ymm1, ymm2 // BG
1053 vpor ymm0, ymm0, ymm1 // BGR
1054 vpackusdw ymm0, ymm0, ymm0
1055 vpermq ymm0, ymm0, 0xd8
1057 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
1065 #endif // HAS_ARGBTORGB565ROW_AVX2
1067 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
// AVX2 ARGB to ARGB1555: 8 pixels per iteration; alpha is reduced to one
// bit by arithmetic-shifting the pixel right 16 so the alpha sign bit
// fills the high halfword.  NOTE(review): __asm delimiters, loop label
// and trailing lines are missing from this excerpt.
1069 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1071 mov eax, [esp + 4] // src_argb
1072 mov edx, [esp + 8] // dst_rgb
1073 mov ecx, [esp + 12] // pix
1074 vpcmpeqb ymm4, ymm4, ymm4
1075 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
1076 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
1077 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
1078 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
1079 vpslld ymm7, ymm7, 15
1082 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1083 vpsrld ymm3, ymm0, 9 // R
1084 vpsrld ymm2, ymm0, 6 // G
1085 vpsrld ymm1, ymm0, 3 // B
1086 vpsrad ymm0, ymm0, 16 // A
1087 vpand ymm3, ymm3, ymm6 // R
1088 vpand ymm2, ymm2, ymm5 // G
1089 vpand ymm1, ymm1, ymm4 // B
1090 vpand ymm0, ymm0, ymm7 // A
1091 vpor ymm0, ymm0, ymm1 // BA
1092 vpor ymm2, ymm2, ymm3 // GR
1093 vpor ymm0, ymm0, ymm2 // BGRA
// Signed pack (vpackssdw) preserves the sign-extended alpha bit.
1094 vpackssdw ymm0, ymm0, ymm0
1095 vpermq ymm0, ymm0, 0xd8
1097 vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
1105 #endif // HAS_ARGBTOARGB1555ROW_AVX2
1107 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
// AVX2 ARGB to ARGB4444: 8 pixels per iteration; keeps the high nibble of
// each channel byte and packs pairs of nibbles into output bytes.
// NOTE(review): __asm delimiters, loop label and trailing lines are
// missing from this excerpt.
1109 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1111 mov eax, [esp + 4] // src_argb
1112 mov edx, [esp + 8] // dst_rgb
1113 mov ecx, [esp + 12] // pix
1114 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
1115 vpsllw ymm4, ymm4, 12
1116 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
1119 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1120 vpand ymm1, ymm0, ymm4 // high nibble
1121 vpand ymm0, ymm0, ymm3 // low nibble
1122 vpsrld ymm1, ymm1, 8
1123 vpsrld ymm0, ymm0, 4
1124 vpor ymm0, ymm0, ymm1
1125 vpackuswb ymm0, ymm0, ymm0
1126 vpermq ymm0, ymm0, 0xd8
1128 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
1136 #endif // HAS_ARGBTOARGB4444ROW_AVX2
1138 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// pmaddubsw with kARGBToY produces per-pixel weighted sums; later (not
// visible here) they are horizontally added, shifted and biased by
// kAddY16.  NOTE(review): __asm delimiters, loop label and the phaddw/
// pack/store tail are missing from this excerpt.
1140 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1142 mov eax, [esp + 4] /* src_argb */
1143 mov edx, [esp + 8] /* dst_y */
1144 mov ecx, [esp + 12] /* pix */
1145 movdqa xmm4, kARGBToY
1146 movdqa xmm5, kAddY16
1150 movdqu xmm1, [eax + 16]
1151 movdqu xmm2, [eax + 32]
1152 movdqu xmm3, [eax + 48]
1153 pmaddubsw xmm0, xmm4
1154 pmaddubsw xmm1, xmm4
1155 pmaddubsw xmm2, xmm4
1156 pmaddubsw xmm3, xmm4
1172 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1173 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// Full-range (JPEG) luma: uses kARGBToYJ weights and adds kAddYJ64 (0.5 in
// 7-bit fixed point) before the shift for round-to-nearest.  NOTE(review):
// __asm delimiters, loop label and the tail are missing from this excerpt.
1175 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1177 mov eax, [esp + 4] /* src_argb */
1178 mov edx, [esp + 8] /* dst_y */
1179 mov ecx, [esp + 12] /* pix */
1180 movdqa xmm4, kARGBToYJ
1181 movdqa xmm5, kAddYJ64
1185 movdqu xmm1, [eax + 16]
1186 movdqu xmm2, [eax + 32]
1187 movdqu xmm3, [eax + 48]
1188 pmaddubsw xmm0, xmm4
1189 pmaddubsw xmm1, xmm4
1190 pmaddubsw xmm2, xmm4
1191 pmaddubsw xmm3, xmm4
1195 paddw xmm0, xmm5 // Add .5 for rounding.
1208 #ifdef HAS_ARGBTOYROW_AVX2
1209 // vpermd for vphaddw + vpackuswb vpermd.
// Dword permutation that undoes the lane interleave produced by the
// in-lane vphaddw + vpackuswb pair.  NOTE(review): the closing "};" of
// the initializer is not visible in this excerpt.
1210 static const lvec32 kPermdARGBToY_AVX = {
1211 0, 4, 1, 5, 2, 6, 3, 7
1214 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow.  NOTE(review): __asm delimiters, loop label
// and the final store are missing from this excerpt.
1216 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
1218 mov eax, [esp + 4] /* src_argb */
1219 mov edx, [esp + 8] /* dst_y */
1220 mov ecx, [esp + 12] /* pix */
1221 vbroadcastf128 ymm4, kARGBToY
1222 vbroadcastf128 ymm5, kAddY16
1223 vmovdqu ymm6, kPermdARGBToY_AVX
1227 vmovdqu ymm1, [eax + 32]
1228 vmovdqu ymm2, [eax + 64]
1229 vmovdqu ymm3, [eax + 96]
1230 vpmaddubsw ymm0, ymm0, ymm4
1231 vpmaddubsw ymm1, ymm1, ymm4
1232 vpmaddubsw ymm2, ymm2, ymm4
1233 vpmaddubsw ymm3, ymm3, ymm4
1234 lea eax, [eax + 128]
1235 vphaddw ymm0, ymm0, ymm1 // mutates.
1236 vphaddw ymm2, ymm2, ymm3
1237 vpsrlw ymm0, ymm0, 7
1238 vpsrlw ymm2, ymm2, 7
1239 vpackuswb ymm0, ymm0, ymm2 // mutates.
1240 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1241 vpaddb ymm0, ymm0, ymm5 // add 16 for Y
1250 #endif // HAS_ARGBTOYROW_AVX2
1252 #ifdef HAS_ARGBTOYJROW_AVX2
1253 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 full-range luma: like ARGBToYRow_AVX2 but adds the 0.5 rounding
// bias before the shift and has no +16 offset.  NOTE(review): __asm
// delimiters, loop label and the final store are missing from this
// excerpt.
1255 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
1257 mov eax, [esp + 4] /* src_argb */
1258 mov edx, [esp + 8] /* dst_y */
1259 mov ecx, [esp + 12] /* pix */
1260 vbroadcastf128 ymm4, kARGBToYJ
1261 vbroadcastf128 ymm5, kAddYJ64
1262 vmovdqu ymm6, kPermdARGBToY_AVX
1266 vmovdqu ymm1, [eax + 32]
1267 vmovdqu ymm2, [eax + 64]
1268 vmovdqu ymm3, [eax + 96]
1269 vpmaddubsw ymm0, ymm0, ymm4
1270 vpmaddubsw ymm1, ymm1, ymm4
1271 vpmaddubsw ymm2, ymm2, ymm4
1272 vpmaddubsw ymm3, ymm3, ymm4
1273 lea eax, [eax + 128]
1274 vphaddw ymm0, ymm0, ymm1 // mutates.
1275 vphaddw ymm2, ymm2, ymm3
1276 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
1277 vpaddw ymm2, ymm2, ymm5
1278 vpsrlw ymm0, ymm0, 7
1279 vpsrlw ymm2, ymm2, 7
1280 vpackuswb ymm0, ymm0, ymm2 // mutates.
1281 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1291 #endif // HAS_ARGBTOYJROW_AVX2
// BGRA to Y: identical structure to ARGBToYRow_SSSE3, only the weight
// table (kBGRAToY) differs to match BGRA byte order.  NOTE(review): __asm
// delimiters, loop label and the tail are missing from this excerpt.
1294 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1296 mov eax, [esp + 4] /* src_argb */
1297 mov edx, [esp + 8] /* dst_y */
1298 mov ecx, [esp + 12] /* pix */
1299 movdqa xmm4, kBGRAToY
1300 movdqa xmm5, kAddY16
1304 movdqu xmm1, [eax + 16]
1305 movdqu xmm2, [eax + 32]
1306 movdqu xmm3, [eax + 48]
1307 pmaddubsw xmm0, xmm4
1308 pmaddubsw xmm1, xmm4
1309 pmaddubsw xmm2, xmm4
1310 pmaddubsw xmm3, xmm4
// ABGR to Y: identical structure to ARGBToYRow_SSSE3 with the kABGRToY
// weight table.  NOTE(review): __asm delimiters, loop label and the tail
// are missing from this excerpt.
1327 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1329 mov eax, [esp + 4] /* src_argb */
1330 mov edx, [esp + 8] /* dst_y */
1331 mov ecx, [esp + 12] /* pix */
1332 movdqa xmm4, kABGRToY
1333 movdqa xmm5, kAddY16
1337 movdqu xmm1, [eax + 16]
1338 movdqu xmm2, [eax + 32]
1339 movdqu xmm3, [eax + 48]
1340 pmaddubsw xmm0, xmm4
1341 pmaddubsw xmm1, xmm4
1342 pmaddubsw xmm2, xmm4
1343 pmaddubsw xmm3, xmm4
// RGBA to Y: identical structure to ARGBToYRow_SSSE3 with the kRGBAToY
// weight table.  NOTE(review): __asm delimiters, loop label and the tail
// are missing from this excerpt.
1360 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1362 mov eax, [esp + 4] /* src_argb */
1363 mov edx, [esp + 8] /* dst_y */
1364 mov ecx, [esp + 12] /* pix */
1365 movdqa xmm4, kRGBAToY
1366 movdqa xmm5, kAddY16
1370 movdqu xmm1, [eax + 16]
1371 movdqu xmm2, [eax + 32]
1372 movdqu xmm3, [eax + 48]
1373 pmaddubsw xmm0, xmm4
1374 pmaddubsw xmm1, xmm4
1375 pmaddubsw xmm2, xmm4
1376 pmaddubsw xmm3, xmm4
// Convert two rows of 16 ARGB pixels to 8 U and 8 V values: averages 2x2
// pixel quads, then applies kARGBToU/kARGBToV weights and the kAddUV128
// bias.  dst_v is reached via (edi - edx) so one pointer register serves
// both planes.  NOTE(review): __asm delimiters, push/pop of esi/edi, loop
// label and several averaging lines are missing from this excerpt.
1393 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1394 uint8* dst_u, uint8* dst_v, int width) {
1398 mov eax, [esp + 8 + 4] // src_argb
1399 mov esi, [esp + 8 + 8] // src_stride_argb
1400 mov edx, [esp + 8 + 12] // dst_u
1401 mov edi, [esp + 8 + 16] // dst_v
1402 mov ecx, [esp + 8 + 20] // pix
1403 movdqa xmm5, kAddUV128
1404 movdqa xmm6, kARGBToV
1405 movdqa xmm7, kARGBToU
1406 sub edi, edx // stride from u to v
1409 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1411 movdqu xmm4, [eax + esi]
1413 movdqu xmm1, [eax + 16]
1414 movdqu xmm4, [eax + esi + 16]
1416 movdqu xmm2, [eax + 32]
1417 movdqu xmm4, [eax + esi + 32]
1419 movdqu xmm3, [eax + 48]
1420 movdqu xmm4, [eax + esi + 48]
// shufps 0x88/0xdd split even/odd pixel pairs for horizontal averaging.
1425 shufps xmm0, xmm1, 0x88
1426 shufps xmm4, xmm1, 0xdd
1429 shufps xmm2, xmm3, 0x88
1430 shufps xmm4, xmm3, 0xdd
1433 // step 2 - convert to U and V
1434 // from here down is very similar to Y code except
1435 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1438 pmaddubsw xmm0, xmm7 // U
1439 pmaddubsw xmm2, xmm7
1440 pmaddubsw xmm1, xmm6 // V
1441 pmaddubsw xmm3, xmm6
1447 paddb xmm0, xmm5 // -> unsigned
1449 // step 3 - store 8 U and 8 V values
1450 movlps qword ptr [edx], xmm0 // U
1451 movhps qword ptr [edx + edi], xmm0 // V
// JPeg (full-range) variant of ARGBToUVRow_SSSE3: uses the kARGBToUJ /
// kARGBToVJ coefficients and kAddUVJ128 bias. Unlike the BT.601 version,
// this adds the bias with paddw (word add, +.5 rounding) before packing.
// NOTE(review): excerpt is elided — averaging, packing and epilogue missing.
1463 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1464 uint8* dst_u, uint8* dst_v, int width) {
1468 mov eax, [esp + 8 + 4] // src_argb
1469 mov esi, [esp + 8 + 8] // src_stride_argb
1470 mov edx, [esp + 8 + 12] // dst_u
1471 mov edi, [esp + 8 + 16] // dst_v
1472 mov ecx, [esp + 8 + 20] // pix
1473 movdqa xmm5, kAddUVJ128
1474 movdqa xmm6, kARGBToVJ
1475 movdqa xmm7, kARGBToUJ
1476 sub edi, edx // stride from u to v
1479 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1481 movdqu xmm4, [eax + esi]
1483 movdqu xmm1, [eax + 16]
1484 movdqu xmm4, [eax + esi + 16]
1486 movdqu xmm2, [eax + 32]
1487 movdqu xmm4, [eax + esi + 32]
1489 movdqu xmm3, [eax + 48]
1490 movdqu xmm4, [eax + esi + 48]
1495 shufps xmm0, xmm1, 0x88
1496 shufps xmm4, xmm1, 0xdd
1499 shufps xmm2, xmm3, 0x88
1500 shufps xmm4, xmm3, 0xdd
1503 // step 2 - convert to U and V
1504 // from here down is very similar to Y code except
1505 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1508 pmaddubsw xmm0, xmm7 // U
1509 pmaddubsw xmm2, xmm7
1510 pmaddubsw xmm1, xmm6 // V
1511 pmaddubsw xmm3, xmm6
1514 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1520 // step 3 - store 8 U and 8 V values
1521 movlps qword ptr [edx], xmm0 // U
1522 movhps qword ptr [edx + edi], xmm0 // V
1533 #ifdef HAS_ARGBTOUVROW_AVX2
// AVX2 version: converts two rows of ARGB to U and V, 32x2 pixels subsampled
// to 16x1 per iteration. Constants are broadcast from the 128-bit SSSE3
// tables; vpacksswb/vphaddw lane mutation is undone by vpermq + vpshufb.
// NOTE(review): prologue (push esi/edi), loop label, first vmovdqu into ymm0
// and the epilogue are elided from this excerpt.
1535 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1536 uint8* dst_u, uint8* dst_v, int width) {
1540 mov eax, [esp + 8 + 4] // src_argb
1541 mov esi, [esp + 8 + 8] // src_stride_argb
1542 mov edx, [esp + 8 + 12] // dst_u
1543 mov edi, [esp + 8 + 16] // dst_v
1544 mov ecx, [esp + 8 + 20] // pix
1545 vbroadcastf128 ymm5, kAddUV128
1546 vbroadcastf128 ymm6, kARGBToV
1547 vbroadcastf128 ymm7, kARGBToU
1548 sub edi, edx // stride from u to v
1551 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1553 vmovdqu ymm1, [eax + 32]
1554 vmovdqu ymm2, [eax + 64]
1555 vmovdqu ymm3, [eax + 96]
// Vertical average with the second row, then horizontal average via
// vshufps even/odd dword selection + vpavgb.
1556 vpavgb ymm0, ymm0, [eax + esi]
1557 vpavgb ymm1, ymm1, [eax + esi + 32]
1558 vpavgb ymm2, ymm2, [eax + esi + 64]
1559 vpavgb ymm3, ymm3, [eax + esi + 96]
1560 lea eax, [eax + 128]
1561 vshufps ymm4, ymm0, ymm1, 0x88
1562 vshufps ymm0, ymm0, ymm1, 0xdd
1563 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1564 vshufps ymm4, ymm2, ymm3, 0x88
1565 vshufps ymm2, ymm2, ymm3, 0xdd
1566 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1568 // step 2 - convert to U and V
1569 // from here down is very similar to Y code except
1570 // instead of 32 different pixels, its 16 pixels of U and 16 of V
1571 vpmaddubsw ymm1, ymm0, ymm7 // U
1572 vpmaddubsw ymm3, ymm2, ymm7
1573 vpmaddubsw ymm0, ymm0, ymm6 // V
1574 vpmaddubsw ymm2, ymm2, ymm6
1575 vphaddw ymm1, ymm1, ymm3 // mutates
1576 vphaddw ymm0, ymm0, ymm2
1577 vpsraw ymm1, ymm1, 8
1578 vpsraw ymm0, ymm0, 8
1579 vpacksswb ymm0, ymm1, ymm0 // mutates
1580 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1581 vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
1582 vpaddb ymm0, ymm0, ymm5 // -> unsigned
1584 // step 3 - store 16 U and 16 V values
1585 vextractf128 [edx], ymm0, 0 // U
1586 vextractf128 [edx + edi], ymm0, 1 // V
1597 #endif // HAS_ARGBTOUVROW_AVX2
// Convert a single row of ARGB to U and V with no subsampling (4:4:4):
// one U and one V per source pixel, 16 pixels per iteration.
// NOTE(review): excerpt is elided — loop label, phaddw/psraw/packsswb/paddb
// steps between the two pmaddubsw groups, U store, and epilogue are missing.
1600 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1601 uint8* dst_u, uint8* dst_v, int width) {
1604 mov eax, [esp + 4 + 4] // src_argb
1605 mov edx, [esp + 4 + 8] // dst_u
1606 mov edi, [esp + 4 + 12] // dst_v
1607 mov ecx, [esp + 4 + 16] // pix
1608 movdqa xmm5, kAddUV128
1609 movdqa xmm6, kARGBToV
1610 movdqa xmm7, kARGBToU
1611 sub edi, edx // stride from u to v
1614 /* convert to U and V */
1615 movdqu xmm0, [eax] // U
1616 movdqu xmm1, [eax + 16]
1617 movdqu xmm2, [eax + 32]
1618 movdqu xmm3, [eax + 48]
1619 pmaddubsw xmm0, xmm7
1620 pmaddubsw xmm1, xmm7
1621 pmaddubsw xmm2, xmm7
1622 pmaddubsw xmm3, xmm7
// Second pass over the same 16 pixels with the V coefficients.
1631 movdqu xmm0, [eax] // V
1632 movdqu xmm1, [eax + 16]
1633 movdqu xmm2, [eax + 32]
1634 movdqu xmm3, [eax + 48]
1635 pmaddubsw xmm0, xmm6
1636 pmaddubsw xmm1, xmm6
1637 pmaddubsw xmm2, xmm6
1638 pmaddubsw xmm3, xmm6
1646 movdqu [edx + edi], xmm0
// Convert a single row of ARGB to U and V with horizontal-only subsampling
// (4:2:2): averages horizontal pairs, no second row involved.
// NOTE(review): excerpt is elided — loop label, first load, pavgb averaging,
// packing steps and epilogue are missing.
1657 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1658 uint8* dst_u, uint8* dst_v, int width) {
1661 mov eax, [esp + 4 + 4] // src_argb
1662 mov edx, [esp + 4 + 8] // dst_u
1663 mov edi, [esp + 4 + 12] // dst_v
1664 mov ecx, [esp + 4 + 16] // pix
1665 movdqa xmm5, kAddUV128
1666 movdqa xmm6, kARGBToV
1667 movdqa xmm7, kARGBToU
1668 sub edi, edx // stride from u to v
1671 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1673 movdqu xmm1, [eax + 16]
1674 movdqu xmm2, [eax + 32]
1675 movdqu xmm3, [eax + 48]
1678 shufps xmm0, xmm1, 0x88
1679 shufps xmm4, xmm1, 0xdd
1682 shufps xmm2, xmm3, 0x88
1683 shufps xmm4, xmm3, 0xdd
1686 // step 2 - convert to U and V
1687 // from here down is very similar to Y code except
1688 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1691 pmaddubsw xmm0, xmm7 // U
1692 pmaddubsw xmm2, xmm7
1693 pmaddubsw xmm1, xmm6 // V
1694 pmaddubsw xmm3, xmm6
1700 paddb xmm0, xmm5 // -> unsigned
1702 // step 3 - store 8 U and 8 V values
1703 movlps qword ptr [edx], xmm0 // U
1704 movhps qword ptr [edx + edi], xmm0 // V
// BGRA byte-order variant of ARGBToUVRow_SSSE3: identical structure, but
// uses the kBGRAToU / kBGRAToV coefficient tables.
// NOTE(review): excerpt is elided — averaging, packing and epilogue missing.
1715 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1716 uint8* dst_u, uint8* dst_v, int width) {
1720 mov eax, [esp + 8 + 4] // src_argb
1721 mov esi, [esp + 8 + 8] // src_stride_argb
1722 mov edx, [esp + 8 + 12] // dst_u
1723 mov edi, [esp + 8 + 16] // dst_v
1724 mov ecx, [esp + 8 + 20] // pix
1725 movdqa xmm5, kAddUV128
1726 movdqa xmm6, kBGRAToV
1727 movdqa xmm7, kBGRAToU
1728 sub edi, edx // stride from u to v
1731 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1733 movdqu xmm4, [eax + esi]
1735 movdqu xmm1, [eax + 16]
1736 movdqu xmm4, [eax + esi + 16]
1738 movdqu xmm2, [eax + 32]
1739 movdqu xmm4, [eax + esi + 32]
1741 movdqu xmm3, [eax + 48]
1742 movdqu xmm4, [eax + esi + 48]
1747 shufps xmm0, xmm1, 0x88
1748 shufps xmm4, xmm1, 0xdd
1751 shufps xmm2, xmm3, 0x88
1752 shufps xmm4, xmm3, 0xdd
1755 // step 2 - convert to U and V
1756 // from here down is very similar to Y code except
1757 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1760 pmaddubsw xmm0, xmm7 // U
1761 pmaddubsw xmm2, xmm7
1762 pmaddubsw xmm1, xmm6 // V
1763 pmaddubsw xmm3, xmm6
1769 paddb xmm0, xmm5 // -> unsigned
1771 // step 3 - store 8 U and 8 V values
1772 movlps qword ptr [edx], xmm0 // U
1773 movhps qword ptr [edx + edi], xmm0 // V
// ABGR byte-order variant of ARGBToUVRow_SSSE3 using kABGRToU / kABGRToV.
// NOTE(review): excerpt is elided — averaging, packing and epilogue missing.
1785 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1786 uint8* dst_u, uint8* dst_v, int width) {
1790 mov eax, [esp + 8 + 4] // src_argb
1791 mov esi, [esp + 8 + 8] // src_stride_argb
1792 mov edx, [esp + 8 + 12] // dst_u
1793 mov edi, [esp + 8 + 16] // dst_v
1794 mov ecx, [esp + 8 + 20] // pix
1795 movdqa xmm5, kAddUV128
1796 movdqa xmm6, kABGRToV
1797 movdqa xmm7, kABGRToU
1798 sub edi, edx // stride from u to v
1801 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1803 movdqu xmm4, [eax + esi]
1805 movdqu xmm1, [eax + 16]
1806 movdqu xmm4, [eax + esi + 16]
1808 movdqu xmm2, [eax + 32]
1809 movdqu xmm4, [eax + esi + 32]
1811 movdqu xmm3, [eax + 48]
1812 movdqu xmm4, [eax + esi + 48]
1817 shufps xmm0, xmm1, 0x88
1818 shufps xmm4, xmm1, 0xdd
1821 shufps xmm2, xmm3, 0x88
1822 shufps xmm4, xmm3, 0xdd
1825 // step 2 - convert to U and V
1826 // from here down is very similar to Y code except
1827 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1830 pmaddubsw xmm0, xmm7 // U
1831 pmaddubsw xmm2, xmm7
1832 pmaddubsw xmm1, xmm6 // V
1833 pmaddubsw xmm3, xmm6
1839 paddb xmm0, xmm5 // -> unsigned
1841 // step 3 - store 8 U and 8 V values
1842 movlps qword ptr [edx], xmm0 // U
1843 movhps qword ptr [edx + edi], xmm0 // V
// RGBA byte-order variant of ARGBToUVRow_SSSE3 using kRGBAToU / kRGBAToV.
// NOTE(review): excerpt is elided — averaging, packing and epilogue missing.
1855 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1856 uint8* dst_u, uint8* dst_v, int width) {
1860 mov eax, [esp + 8 + 4] // src_argb
1861 mov esi, [esp + 8 + 8] // src_stride_argb
1862 mov edx, [esp + 8 + 12] // dst_u
1863 mov edi, [esp + 8 + 16] // dst_v
1864 mov ecx, [esp + 8 + 20] // pix
1865 movdqa xmm5, kAddUV128
1866 movdqa xmm6, kRGBAToV
1867 movdqa xmm7, kRGBAToU
1868 sub edi, edx // stride from u to v
1871 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1873 movdqu xmm4, [eax + esi]
1875 movdqu xmm1, [eax + 16]
1876 movdqu xmm4, [eax + esi + 16]
1878 movdqu xmm2, [eax + 32]
1879 movdqu xmm4, [eax + esi + 32]
1881 movdqu xmm3, [eax + 48]
1882 movdqu xmm4, [eax + esi + 48]
1887 shufps xmm0, xmm1, 0x88
1888 shufps xmm4, xmm1, 0xdd
1891 shufps xmm2, xmm3, 0x88
1892 shufps xmm4, xmm3, 0xdd
1895 // step 2 - convert to U and V
1896 // from here down is very similar to Y code except
1897 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1900 pmaddubsw xmm0, xmm7 // U
1901 pmaddubsw xmm2, xmm7
1902 pmaddubsw xmm1, xmm6 // V
1903 pmaddubsw xmm3, xmm6
1909 paddb xmm0, xmm5 // -> unsigned
1911 // step 3 - store 8 U and 8 V values
1912 movlps qword ptr [edx], xmm0 // U
1913 movhps qword ptr [edx + edi], xmm0 // V
1923 #endif // HAS_ARGBTOYROW_SSSE3
// --- AVX2 UV-read helper macros ---------------------------------------------
// Each macro loads chroma for 16 output pixels into ymm0 as interleaved UV
// bytes, upsampling as needed for the source format, and advances esi.
// Registers: esi = U (or packed UV) pointer, edi = V-minus-U offset.
// NOTE(review): trailing continuation/closing lines of each macro appear
// elided from this excerpt.
1925 // Read 16 UV from 444
1926 #define READYUV444_AVX2 __asm { \
1927 __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \
1928 __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \
1929 __asm lea esi, [esi + 16] \
1930 __asm vpermq ymm0, ymm0, 0xd8 \
1931 __asm vpermq ymm1, ymm1, 0xd8 \
1932 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1935 // Read 8 UV from 422, upsample to 16 UV.
1936 #define READYUV422_AVX2 __asm { \
1937 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
1938 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
1939 __asm lea esi, [esi + 8] \
1940 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1941 __asm vpermq ymm0, ymm0, 0xd8 \
1942 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1945 // Read 4 UV from 411, upsample to 16 UV.
1946 #define READYUV411_AVX2 __asm { \
1947 __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \
1948 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \
1949 __asm lea esi, [esi + 4] \
1950 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1951 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1952 __asm vpermq ymm0, ymm0, 0xd8 \
1953 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
1956 // Read 8 UV from NV12, upsample to 16 UV.
1957 #define READNV12_AVX2 __asm { \
1958 __asm vmovdqu xmm0, [esi] /* UV */ \
1959 __asm lea esi, [esi + 16] \
1960 __asm vpermq ymm0, ymm0, 0xd8 \
1961 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1965 #define YUVTORGB_AVX2(YuvConstants) __asm { \
1966 /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
1967 __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \
1968 __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \
1969 __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \
1970 __asm vmovdqu ymm3, YuvConstants.kUVBiasR \
1971 __asm vpsubw ymm2, ymm3, ymm2 \
1972 __asm vmovdqu ymm3, YuvConstants.kUVBiasG \
1973 __asm vpsubw ymm1, ymm3, ymm1 \
1974 __asm vmovdqu ymm3, YuvConstants.kUVBiasB \
1975 __asm vpsubw ymm0, ymm3, ymm0 \
1976 /* Step 2: Find Y contribution to 16 R,G,B values */ \
1977 __asm vmovdqu xmm3, [eax] /* NOLINT */ \
1978 __asm lea eax, [eax + 16] \
1979 __asm vpermq ymm3, ymm3, 0xd8 \
1980 __asm vpunpcklbw ymm3, ymm3, ymm3 \
1981 __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \
1982 __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
1983 __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
1984 __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
1985 __asm vpsraw ymm0, ymm0, 6 \
1986 __asm vpsraw ymm1, ymm1, 6 \
1987 __asm vpsraw ymm2, ymm2, 6 \
1988 __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
1989 __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
1990 __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
// Interleave the packed B/G/R planes (ymm0/ymm1/ymm2) with alpha (ymm5) into
// 16 ARGB pixels and store 64 bytes at [edx]; vpermq corrects the lane
// mutation introduced by vpunpcklbw.
1993 // Store 16 ARGB values.
1994 #define STOREARGB_AVX2 __asm { \
1995 /* Step 3: Weave into ARGB */ \
1996 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
1997 __asm vpermq ymm0, ymm0, 0xd8 \
1998 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
1999 __asm vpermq ymm2, ymm2, 0xd8 \
2000 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
2001 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
2002 __asm vmovdqu 0[edx], ymm1 \
2003 __asm vmovdqu 32[edx], ymm0 \
2004 __asm lea edx, [edx + 64] \
2007 #ifdef HAS_I422TOARGBROW_AVX2
// I422 (planar Y + half-width U/V) to ARGB using the AVX2 macro pipeline:
// READYUV422_AVX2 -> YUVTORGB_AVX2 -> STOREARGB_AVX2 (loop body elided here).
2009 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2011 void I422ToARGBRow_AVX2(const uint8* y_buf,
2019 mov eax, [esp + 8 + 4] // Y
2020 mov esi, [esp + 8 + 8] // U
2021 mov edi, [esp + 8 + 12] // V
2022 mov edx, [esp + 8 + 16] // argb
2023 mov ecx, [esp + 8 + 20] // width
2025 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2029 YUVTORGB_AVX2(kYuvConstants)
2041 #endif // HAS_I422TOARGBROW_AVX2
2043 #ifdef HAS_J422TOARGBROW_AVX2
// JPeg-range J422 to ARGB: identical to I422ToARGBRow_AVX2 but converts with
// kYuvJConstants (full-range coefficients).
2045 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2047 void J422ToARGBRow_AVX2(const uint8* y_buf,
2055 mov eax, [esp + 8 + 4] // Y
2056 mov esi, [esp + 8 + 8] // U
2057 mov edi, [esp + 8 + 12] // V
2058 mov edx, [esp + 8 + 16] // argb
2059 mov ecx, [esp + 8 + 20] // width
2061 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2065 YUVTORGB_AVX2(kYuvJConstants)
2077 #endif // HAS_J422TOARGBROW_AVX2
2079 #ifdef HAS_I444TOARGBROW_AVX2
// I444 (full-resolution U/V) to ARGB; uses READYUV444_AVX2 (no upsampling).
2081 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2083 void I444ToARGBRow_AVX2(const uint8* y_buf,
2091 mov eax, [esp + 8 + 4] // Y
2092 mov esi, [esp + 8 + 8] // U
2093 mov edi, [esp + 8 + 12] // V
2094 mov edx, [esp + 8 + 16] // argb
2095 mov ecx, [esp + 8 + 20] // width
2097 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2101 YUVTORGB_AVX2(kYuvConstants)
2113 #endif // HAS_I444TOARGBROW_AVX2
2115 #ifdef HAS_I411TOARGBROW_AVX2
// I411 (quarter-width U/V) to ARGB; uses READYUV411_AVX2 (4x upsampling).
2117 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2119 void I411ToARGBRow_AVX2(const uint8* y_buf,
2127 mov eax, [esp + 8 + 4] // Y
2128 mov esi, [esp + 8 + 8] // U
2129 mov edi, [esp + 8 + 12] // V
2130 mov edx, [esp + 8 + 16] // argb
2131 mov ecx, [esp + 8 + 20] // width
2133 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2137 YUVTORGB_AVX2(kYuvConstants)
2149 #endif // HAS_I411TOARGBROW_AVX2
2151 #ifdef HAS_NV12TOARGBROW_AVX2
// NV12 (Y plane + interleaved UV plane) to ARGB; only two pointer args for
// pixel data, so stack offsets use +4 (one saved register).
2153 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2155 void NV12ToARGBRow_AVX2(const uint8* y_buf,
2156 const uint8* uv_buf,
2161 mov eax, [esp + 4 + 4] // Y
2162 mov esi, [esp + 4 + 8] // UV
2163 mov edx, [esp + 4 + 12] // argb
2164 mov ecx, [esp + 4 + 16] // width
2165 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2169 YUVTORGB_AVX2(kYuvConstants)
2180 #endif // HAS_NV12TOARGBROW_AVX2
2182 #ifdef HAS_NV21TOARGBROW_AVX2
// NV21 (Y plane + interleaved VU plane) to ARGB; the swapped chroma order is
// handled by converting with kYvuConstants instead of kYuvConstants.
2184 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
2186 void NV21ToARGBRow_AVX2(const uint8* y_buf,
2187 const uint8* uv_buf,
2192 mov eax, [esp + 4 + 4] // Y
2193 mov esi, [esp + 4 + 8] // UV
2194 mov edx, [esp + 4 + 12] // argb
2195 mov ecx, [esp + 4 + 16] // width
2196 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2200 YUVTORGB_AVX2(kYvuConstants)
2211 #endif // HAS_NV21TOARGBROW_AVX2
2213 #ifdef HAS_I422TOBGRAROW_AVX2
// I422 to BGRA: same convert pipeline as I422ToARGBRow_AVX2 but weaves the
// channel planes in B-G-R-A byte order by hand (store macro not used).
2215 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
2216 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
2218 void I422ToBGRARow_AVX2(const uint8* y_buf,
2226 mov eax, [esp + 8 + 4] // Y
2227 mov esi, [esp + 8 + 8] // U
2228 mov edi, [esp + 8 + 12] // V
2229 mov edx, [esp + 8 + 16] // argb
2230 mov ecx, [esp + 8 + 20] // width
2232 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2236 YUVTORGB_AVX2(kYuvConstants)
2238 // Step 3: Weave into BGRA
2239 vpunpcklbw ymm1, ymm1, ymm0 // GB
2240 vpermq ymm1, ymm1, 0xd8
2241 vpunpcklbw ymm2, ymm5, ymm2 // AR
2242 vpermq ymm2, ymm2, 0xd8
2243 vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
2244 vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
2246 vmovdqu [edx + 32], ymm2
2257 #endif // HAS_I422TOBGRAROW_AVX2
2259 #ifdef HAS_I422TORGBAROW_AVX2
// I422 to RGBA: same pipeline, weaving planes in R-G-B-A byte order.
2261 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2262 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
2264 void I422ToRGBARow_AVX2(const uint8* y_buf,
2272 mov eax, [esp + 8 + 4] // Y
2273 mov esi, [esp + 8 + 8] // U
2274 mov edi, [esp + 8 + 12] // V
2275 mov edx, [esp + 8 + 16] // argb
2276 mov ecx, [esp + 8 + 20] // width
2278 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2282 YUVTORGB_AVX2(kYuvConstants)
2284 // Step 3: Weave into RGBA
2285 vpunpcklbw ymm1, ymm1, ymm2 // GR
2286 vpermq ymm1, ymm1, 0xd8
2287 vpunpcklbw ymm2, ymm5, ymm0 // AB
2288 vpermq ymm2, ymm2, 0xd8
2289 vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
2290 vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
2292 vmovdqu [edx + 32], ymm1
2303 #endif // HAS_I422TORGBAROW_AVX2
2305 #ifdef HAS_I422TOABGRROW_AVX2
// I422 to ABGR: same pipeline, weaving planes in A-B-G-R byte order.
2307 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
2308 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
2310 void I422ToABGRRow_AVX2(const uint8* y_buf,
2318 mov eax, [esp + 8 + 4] // Y
2319 mov esi, [esp + 8 + 8] // U
2320 mov edi, [esp + 8 + 12] // V
2321 mov edx, [esp + 8 + 16] // argb
2322 mov ecx, [esp + 8 + 20] // width
2324 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2328 YUVTORGB_AVX2(kYuvConstants)
2330 // Step 3: Weave into ABGR
2331 vpunpcklbw ymm1, ymm2, ymm1 // RG
2332 vpermq ymm1, ymm1, 0xd8
2333 vpunpcklbw ymm2, ymm0, ymm5 // BA
2334 vpermq ymm2, ymm2, 0xd8
2335 vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
2336 vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
2338 vmovdqu [edx + 32], ymm1
2349 #endif // HAS_I422TOABGRROW_AVX2
2351 #if defined(HAS_I422TOARGBROW_SSSE3)
// --- SSSE3 UV-read helper macros --------------------------------------------
// 128-bit counterparts of the AVX2 READ* macros: load chroma for 8 output
// pixels into xmm0 as interleaved UV and advance esi.
// NOTE(review): READYUV411 clobbers ebx — callers must save/restore it.
2352 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2354 // Read 8 UV from 444.
2355 #define READYUV444 __asm { \
2356 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
2357 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
2358 __asm lea esi, [esi + 8] \
2359 __asm punpcklbw xmm0, xmm1 /* UV */ \
2362 // Read 4 UV from 422, upsample to 8 UV.
2363 #define READYUV422 __asm { \
2364 __asm movd xmm0, [esi] /* U */ \
2365 __asm movd xmm1, [esi + edi] /* V */ \
2366 __asm lea esi, [esi + 4] \
2367 __asm punpcklbw xmm0, xmm1 /* UV */ \
2368 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2371 // Read 2 UV from 411, upsample to 8 UV.
2372 #define READYUV411 __asm { \
2373 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
2374 __asm movd xmm0, ebx \
2375 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
2376 __asm movd xmm1, ebx \
2377 __asm lea esi, [esi + 2] \
2378 __asm punpcklbw xmm0, xmm1 /* UV */ \
2379 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2380 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
2383 // Read 4 UV from NV12, upsample to 8 UV.
2384 #define READNV12 __asm { \
2385 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
2386 __asm lea esi, [esi + 8] \
2387 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
// Core SSSE3 YUV->RGB math: consumes interleaved UV in xmm0 and 8 Y bytes at
// [eax], leaving packed B/G/R bytes in xmm0/xmm1/xmm2. Mirrors YUVTORGB_AVX2
// but must copy xmm0 three times first (SSSE3 pmaddubsw is destructive).
2390 // Convert 8 pixels: 8 UV and 8 Y.
2391 #define YUVTORGB(YuvConstants) __asm { \
2392 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
2393 __asm movdqa xmm1, xmm0 \
2394 __asm movdqa xmm2, xmm0 \
2395 __asm movdqa xmm3, xmm0 \
2396 __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \
2397 __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \
2398 __asm psubw xmm0, xmm1 \
2399 __asm movdqa xmm1, YuvConstants.kUVBiasG \
2400 __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \
2401 __asm psubw xmm1, xmm2 \
2402 __asm movdqa xmm2, YuvConstants.kUVBiasR \
2403 __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \
2404 __asm psubw xmm2, xmm3 \
2405 /* Step 2: Find Y contribution to 8 R,G,B values */ \
2406 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
2407 __asm lea eax, [eax + 8] \
2408 __asm punpcklbw xmm3, xmm3 \
2409 __asm pmulhuw xmm3, YuvConstants.kYToRgb \
2410 __asm paddsw xmm0, xmm3 /* B += Y */ \
2411 __asm paddsw xmm1, xmm3 /* G += Y */ \
2412 __asm paddsw xmm2, xmm3 /* R += Y */ \
2413 __asm psraw xmm0, 6 \
2414 __asm psraw xmm1, 6 \
2415 __asm psraw xmm2, 6 \
2416 __asm packuswb xmm0, xmm0 /* B */ \
2417 __asm packuswb xmm1, xmm1 /* G */ \
2418 __asm packuswb xmm2, xmm2 /* R */ \
// --- SSSE3 store macros (4-byte formats) ------------------------------------
// Each weaves packed B/G/R (xmm0/xmm1/xmm2) plus alpha (xmm5) into 8 pixels
// of the named byte order and stores 32 bytes at [edx].
// NOTE(review): STOREBGRA and STORERGBA regenerate alpha in xmm5 internally;
// STOREARGB and STOREABGR expect the caller to have set xmm5 to 0xff...ff.
2421 // Store 8 ARGB values.
2422 #define STOREARGB __asm { \
2423 /* Step 3: Weave into ARGB */ \
2424 __asm punpcklbw xmm0, xmm1 /* BG */ \
2425 __asm punpcklbw xmm2, xmm5 /* RA */ \
2426 __asm movdqa xmm1, xmm0 \
2427 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
2428 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
2429 __asm movdqu 0[edx], xmm0 \
2430 __asm movdqu 16[edx], xmm1 \
2431 __asm lea edx, [edx + 32] \
2434 // Store 8 BGRA values.
2435 #define STOREBGRA __asm { \
2436 /* Step 3: Weave into BGRA */ \
2437 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2438 __asm punpcklbw xmm1, xmm0 /* GB */ \
2439 __asm punpcklbw xmm5, xmm2 /* AR */ \
2440 __asm movdqa xmm0, xmm5 \
2441 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
2442 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
2443 __asm movdqu 0[edx], xmm5 \
2444 __asm movdqu 16[edx], xmm0 \
2445 __asm lea edx, [edx + 32] \
2448 // Store 8 ABGR values.
2449 #define STOREABGR __asm { \
2450 /* Step 3: Weave into ABGR */ \
2451 __asm punpcklbw xmm2, xmm1 /* RG */ \
2452 __asm punpcklbw xmm0, xmm5 /* BA */ \
2453 __asm movdqa xmm1, xmm2 \
2454 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
2455 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
2456 __asm movdqu 0[edx], xmm2 \
2457 __asm movdqu 16[edx], xmm1 \
2458 __asm lea edx, [edx + 32] \
2461 // Store 8 RGBA values.
2462 #define STORERGBA __asm { \
2463 /* Step 3: Weave into RGBA */ \
2464 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2465 __asm punpcklbw xmm1, xmm2 /* GR */ \
2466 __asm punpcklbw xmm5, xmm0 /* AB */ \
2467 __asm movdqa xmm0, xmm5 \
2468 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
2469 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
2470 __asm movdqu 0[edx], xmm5 \
2471 __asm movdqu 16[edx], xmm0 \
2472 __asm lea edx, [edx + 32] \
// --- SSSE3 store macros (3-byte formats) ------------------------------------
// Build BGRR words, then pshufb (xmm5/xmm6 shuffle masks, loaded by the
// caller) + palignr compress 8 pixels into 24 contiguous bytes at [edx].
// RGB24 and RAW differ only in the shuffle tables the caller loads.
2475 // Store 8 RGB24 values.
2476 #define STORERGB24 __asm { \
2477 /* Step 3: Weave into RRGB */ \
2478 __asm punpcklbw xmm0, xmm1 /* BG */ \
2479 __asm punpcklbw xmm2, xmm2 /* RR */ \
2480 __asm movdqa xmm1, xmm0 \
2481 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2482 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2483 /* Step 4: RRGB -> RGB24 */ \
2484 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2485 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2486 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2487 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2488 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2489 __asm lea edx, [edx + 24] \
2492 // Store 8 RAW values.
2493 #define STORERAW __asm { \
2494 /* Step 3: Weave into RRGB */ \
2495 __asm punpcklbw xmm0, xmm1 /* BG */ \
2496 __asm punpcklbw xmm2, xmm2 /* RR */ \
2497 __asm movdqa xmm1, xmm0 \
2498 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2499 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2500 /* Step 4: RRGB -> RAW */ \
2501 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2502 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2503 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2504 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2505 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2506 __asm lea edx, [edx + 24] \
// Pack 8 pixels into RGB565 (5-6-5 bits). Caller pre-loads the field masks:
// xmm5 = 0x0000001f (B), xmm6 = 0x000007e0 (G), xmm7 = 0xfffff800 (R).
// Each dword is shifted so the three fields land in the low 16 bits, OR-ed
// together, then packssdw narrows 8 dwords to 8 RGB565 words.
2509 // Store 8 RGB565 values.
2510 #define STORERGB565 __asm { \
2511 /* Step 3: Weave into RRGB */ \
2512 __asm punpcklbw xmm0, xmm1 /* BG */ \
2513 __asm punpcklbw xmm2, xmm2 /* RR */ \
2514 __asm movdqa xmm1, xmm0 \
2515 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2516 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2517 /* Step 4: RRGB -> RGB565 */ \
2518 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
2519 __asm movdqa xmm2, xmm0 /* G */ \
2520 __asm pslld xmm0, 8 /* R */ \
2521 __asm psrld xmm3, 3 /* B */ \
2522 __asm psrld xmm2, 5 /* G */ \
2523 __asm psrad xmm0, 16 /* R */ \
2524 __asm pand xmm3, xmm5 /* B */ \
2525 __asm pand xmm2, xmm6 /* G */ \
2526 __asm pand xmm0, xmm7 /* R */ \
2527 __asm por xmm3, xmm2 /* BG */ \
2528 __asm por xmm0, xmm3 /* BGR */ \
2529 __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
2530 __asm movdqa xmm2, xmm1 /* G */ \
2531 __asm pslld xmm1, 8 /* R */ \
2532 __asm psrld xmm3, 3 /* B */ \
2533 __asm psrld xmm2, 5 /* G */ \
2534 __asm psrad xmm1, 16 /* R */ \
2535 __asm pand xmm3, xmm5 /* B */ \
2536 __asm pand xmm2, xmm6 /* G */ \
2537 __asm pand xmm1, xmm7 /* R */ \
2538 __asm por xmm3, xmm2 /* BG */ \
2539 __asm por xmm1, xmm3 /* BGR */ \
2540 __asm packssdw xmm0, xmm1 \
2541 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
2542 __asm lea edx, [edx + 16] \
// I444 to ARGB using the SSSE3 macro pipeline: READYUV444 -> YUVTORGB ->
// STOREARGB (loop body elided from this excerpt).
2546 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2548 void I444ToARGBRow_SSSE3(const uint8* y_buf,
2556 mov eax, [esp + 8 + 4] // Y
2557 mov esi, [esp + 8 + 8] // U
2558 mov edi, [esp + 8 + 12] // V
2559 mov edx, [esp + 8 + 16] // argb
2560 mov ecx, [esp + 8 + 20] // width
2562 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2566 YUVTORGB(kYuvConstants)
// I422 to RGB24: loads the ARGB->RGB24 shuffle masks into xmm5/xmm6 for
// STORERGB24 (loop body elided from this excerpt).
2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2581 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2589 mov eax, [esp + 8 + 4] // Y
2590 mov esi, [esp + 8 + 8] // U
2591 mov edi, [esp + 8 + 12] // V
2592 mov edx, [esp + 8 + 16] // rgb24
2593 mov ecx, [esp + 8 + 20] // width
2595 movdqa xmm5, kShuffleMaskARGBToRGB24_0
2596 movdqa xmm6, kShuffleMaskARGBToRGB24
2600 YUVTORGB(kYuvConstants)
// I422 to RAW (RGB byte order reversed vs RGB24): same as I422ToRGB24Row but
// with the ARGBToRAW shuffle masks for STORERAW.
2613 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
2615 void I422ToRAWRow_SSSE3(const uint8* y_buf,
2623 mov eax, [esp + 8 + 4] // Y
2624 mov esi, [esp + 8 + 8] // U
2625 mov edi, [esp + 8 + 12] // V
2626 mov edx, [esp + 8 + 16] // raw
2627 mov ecx, [esp + 8 + 20] // width
2629 movdqa xmm5, kShuffleMaskARGBToRAW_0
2630 movdqa xmm6, kShuffleMaskARGBToRAW
2634 YUVTORGB(kYuvConstants)
// I422 to RGB565: builds the three 565 field masks in xmm5/xmm6/xmm7 for
// STORERGB565 (the shift instructions that finish each mask are elided here).
2647 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2649 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2657 mov eax, [esp + 8 + 4] // Y
2658 mov esi, [esp + 8 + 8] // U
2659 mov edi, [esp + 8 + 12] // V
2660 mov edx, [esp + 8 + 16] // rgb565
2661 mov ecx, [esp + 8 + 20] // width
2663 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2665 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2668 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2673 YUVTORGB(kYuvConstants)
// I422 to ARGB: READYUV422 -> YUVTORGB -> STOREARGB (loop body elided).
2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2688 void I422ToARGBRow_SSSE3(const uint8* y_buf,
2696 mov eax, [esp + 8 + 4] // Y
2697 mov esi, [esp + 8 + 8] // U
2698 mov edi, [esp + 8 + 12] // V
2699 mov edx, [esp + 8 + 16] // argb
2700 mov ecx, [esp + 8 + 20] // width
2702 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2706 YUVTORGB(kYuvConstants)
// Full-range J422 to ARGB: identical to I422ToARGBRow_SSSE3 but converts
// with kYuvJConstants.
2719 // JPeg color space version of I422ToARGB
2720 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2722 void J422ToARGBRow_SSSE3(const uint8* y_buf,
2730 mov eax, [esp + 8 + 4] // Y
2731 mov esi, [esp + 8 + 8] // U
2732 mov edi, [esp + 8 + 12] // V
2733 mov edx, [esp + 8 + 16] // argb
2734 mov ecx, [esp + 8 + 20] // width
2736 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2740 YUVTORGB(kYuvJConstants)
// I411 to ARGB. Stack offsets use +12 (three saved registers) because
// READYUV411 needs ebx in addition to esi/edi.
2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2754 // Similar to I420 but duplicate UV once more.
2756 void I411ToARGBRow_SSSE3(const uint8* y_buf,
2765 mov eax, [esp + 12 + 4] // Y
2766 mov esi, [esp + 12 + 8] // U
2767 mov edi, [esp + 12 + 12] // V
2768 mov edx, [esp + 12 + 16] // argb
2769 mov ecx, [esp + 12 + 20] // width
2771 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2774 READYUV411 // modifies EBX
2775 YUVTORGB(kYuvConstants)
// NV12 (interleaved UV plane) to ARGB: READNV12 -> YUVTORGB -> STOREARGB.
2789 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2791 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2792 const uint8* uv_buf,
2797 mov eax, [esp + 4 + 4] // Y
2798 mov esi, [esp + 4 + 8] // UV
2799 mov edx, [esp + 4 + 12] // argb
2800 mov ecx, [esp + 4 + 16] // width
2801 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2805 YUVTORGB(kYuvConstants)
// NV21 (interleaved VU plane) to ARGB: swapped chroma handled by converting
// with kYvuConstants.
2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2820 const uint8* uv_buf,
2825 mov eax, [esp + 4 + 4] // Y
2826 mov esi, [esp + 4 + 8] // UV
2827 mov edx, [esp + 4 + 12] // argb
2828 mov ecx, [esp + 4 + 16] // width
2829 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2833 YUVTORGB(kYvuConstants)
// I422 to BGRA: uses STOREBGRA, which generates its own alpha in xmm5, so no
// pcmpeqb in the prologue (contrast with the ARGB/ABGR variants).
2845 void I422ToBGRARow_SSSE3(const uint8* y_buf,
2853 mov eax, [esp + 8 + 4] // Y
2854 mov esi, [esp + 8 + 8] // U
2855 mov edi, [esp + 8 + 12] // V
2856 mov edx, [esp + 8 + 16] // bgra
2857 mov ecx, [esp + 8 + 20] // width
2862 YUVTORGB(kYuvConstants)
// I422 to ABGR: pre-sets alpha in xmm5 for STOREABGR.
2875 void I422ToABGRRow_SSSE3(const uint8* y_buf,
2883 mov eax, [esp + 8 + 4] // Y
2884 mov esi, [esp + 8 + 8] // U
2885 mov edi, [esp + 8 + 12] // V
2886 mov edx, [esp + 8 + 16] // abgr
2887 mov ecx, [esp + 8 + 20] // width
2889 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2893 YUVTORGB(kYuvConstants)
// I422 to RGBA: uses STORERGBA (self-generated alpha, like STOREBGRA).
2906 void I422ToRGBARow_SSSE3(const uint8* y_buf,
2914 mov eax, [esp + 8 + 4] // Y
2915 mov esi, [esp + 8 + 8] // U
2916 mov edi, [esp + 8 + 12] // V
2917 mov edx, [esp + 8 + 16] // rgba
2918 mov ecx, [esp + 8 + 20] // width
2923 YUVTORGB(kYuvConstants)
2935 #endif // HAS_I422TOARGBROW_SSSE3
2937 #ifdef HAS_I400TOARGBROW_SSE2
// Grey (Y-only) to ARGB: G = (Y - 16) * 1.164 replicated to B/G/R, alpha
// forced to 0xff via the xmm4 mask. Fixed-point: pmulhuw by 0x4a35 then
// subtract the 0x0488 bias and shift right 6.
// NOTE(review): loop label, psrlw/por steps and epilogue elided here.
2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2940 void I400ToARGBRow_SSE2(const uint8* y_buf,
2944 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
2947 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2949 pshufd xmm3, xmm3, 0
2950 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
2953 mov eax, [esp + 4] // Y
2954 mov edx, [esp + 8] // rgb
2955 mov ecx, [esp + 12] // width
2958 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2959 movq xmm0, qword ptr [eax]
2961 punpcklbw xmm0, xmm0 // Y.Y
2965 packuswb xmm0, xmm0 // G
2967 // Step 2: Weave into ARGB
2968 punpcklbw xmm0, xmm0 // GG
2970 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
2971 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
2975 movdqu [edx + 16], xmm1
2982 #endif // HAS_I400TOARGBROW_SSE2
2984 #ifdef HAS_I400TOARGBROW_AVX2
// AVX2 version of I400ToARGBRow: 16 grey pixels per iteration; vpermq
// corrects vpunpcklbw lane mutation before and after the scale step.
2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2986 // note: vpunpcklbw mutates and vpackuswb unmutates.
2988 void I400ToARGBRow_AVX2(const uint8* y_buf,
2992 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
2994 vbroadcastss ymm2, xmm2
2995 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2997 vbroadcastss ymm3, xmm3
2998 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
2999 vpslld ymm4, ymm4, 24
3001 mov eax, [esp + 4] // Y
3002 mov edx, [esp + 8] // rgb
3003 mov ecx, [esp + 12] // width
3006 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
3009 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
3010 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
3011 vpmulhuw ymm0, ymm0, ymm2
3012 vpsubusw ymm0, ymm0, ymm3
3013 vpsrlw ymm0, ymm0, 6
3014 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
3016 // TODO(fbarchard): Weave alpha with unpack.
3017 // Step 2: Weave into ARGB
3018 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
3019 vpermq ymm1, ymm1, 0xd8
3020 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
3021 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
3022 vpor ymm0, ymm0, ymm4
3023 vpor ymm1, ymm1, ymm4
3025 vmovdqu [edx + 32], ymm1
3033 #endif // HAS_I400TOARGBROW_AVX2
3035 #ifdef HAS_MIRRORROW_SSSE3
3036 // Shuffle table for reversing the bytes.
3037 static const uvec8 kShuffleMirror = {
3038 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3041 // TODO(fbarchard): Replace lea with -16 offset.
3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3045 mov eax, [esp + 4] // src
3046 mov edx, [esp + 8] // dst
3047 mov ecx, [esp + 12] // width
3048 movdqa xmm5, kShuffleMirror
3051 movdqu xmm0, [eax - 16 + ecx]
3060 #endif // HAS_MIRRORROW_SSSE3
3062 #ifdef HAS_MIRRORROW_AVX2
3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3066 mov eax, [esp + 4] // src
3067 mov edx, [esp + 8] // dst
3068 mov ecx, [esp + 12] // width
3069 vbroadcastf128 ymm5, kShuffleMirror
3072 vmovdqu ymm0, [eax - 32 + ecx]
3073 vpshufb ymm0, ymm0, ymm5
3074 vpermq ymm0, ymm0, 0x4e // swap high and low halfs
3083 #endif // HAS_MIRRORROW_AVX2
3085 #ifdef HAS_MIRRORROW_SSE2
3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3089 mov eax, [esp + 4] // src
3090 mov edx, [esp + 8] // dst
3091 mov ecx, [esp + 12] // width
3094 movdqu xmm0, [eax - 16 + ecx]
3095 movdqa xmm1, xmm0 // swap bytes
3099 pshuflw xmm0, xmm0, 0x1b // swap words
3100 pshufhw xmm0, xmm0, 0x1b
3101 pshufd xmm0, xmm0, 0x4e // swap qwords
3109 #endif // HAS_MIRRORROW_SSE2
3111 #ifdef HAS_MIRRORROW_UV_SSSE3
3112 // Shuffle table for reversing the bytes of UV channels.
3113 static const uvec8 kShuffleMirrorUV = {
3114 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3122 mov eax, [esp + 4 + 4] // src
3123 mov edx, [esp + 4 + 8] // dst_u
3124 mov edi, [esp + 4 + 12] // dst_v
3125 mov ecx, [esp + 4 + 16] // width
3126 movdqa xmm1, kShuffleMirrorUV
3127 lea eax, [eax + ecx * 2 - 16]
3134 movlpd qword ptr [edx], xmm0
3135 movhpd qword ptr [edx + edi], xmm0
3144 #endif // HAS_MIRRORROW_UV_SSSE3
3146 #ifdef HAS_ARGBMIRRORROW_SSE2
3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3150 mov eax, [esp + 4] // src
3151 mov edx, [esp + 8] // dst
3152 mov ecx, [esp + 12] // width
3153 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3158 pshufd xmm0, xmm0, 0x1b
3166 #endif // HAS_ARGBMIRRORROW_SSE2
3168 #ifdef HAS_ARGBMIRRORROW_AVX2
3169 // Shuffle table for reversing the bytes.
3170 static const ulvec32 kARGBShuffleMirror_AVX2 = {
3171 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3177 mov eax, [esp + 4] // src
3178 mov edx, [esp + 8] // dst
3179 mov ecx, [esp + 12] // width
3180 vmovdqu ymm5, kARGBShuffleMirror_AVX2
3183 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
3192 #endif // HAS_ARGBMIRRORROW_AVX2
3194 #ifdef HAS_SPLITUVROW_SSE2
3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3199 mov eax, [esp + 4 + 4] // src_uv
3200 mov edx, [esp + 4 + 8] // dst_u
3201 mov edi, [esp + 4 + 12] // dst_v
3202 mov ecx, [esp + 4 + 16] // pix
3203 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3209 movdqu xmm1, [eax + 16]
3213 pand xmm0, xmm5 // even bytes
3216 psrlw xmm2, 8 // odd bytes
3220 movdqu [edx + edi], xmm2
3230 #endif // HAS_SPLITUVROW_SSE2
3232 #ifdef HAS_SPLITUVROW_AVX2
3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3237 mov eax, [esp + 4 + 4] // src_uv
3238 mov edx, [esp + 4 + 8] // dst_u
3239 mov edi, [esp + 4 + 12] // dst_v
3240 mov ecx, [esp + 4 + 16] // pix
3241 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3242 vpsrlw ymm5, ymm5, 8
3247 vmovdqu ymm1, [eax + 32]
3249 vpsrlw ymm2, ymm0, 8 // odd bytes
3250 vpsrlw ymm3, ymm1, 8
3251 vpand ymm0, ymm0, ymm5 // even bytes
3252 vpand ymm1, ymm1, ymm5
3253 vpackuswb ymm0, ymm0, ymm1
3254 vpackuswb ymm2, ymm2, ymm3
3255 vpermq ymm0, ymm0, 0xd8
3256 vpermq ymm2, ymm2, 0xd8
3258 vmovdqu [edx + edi], ymm2
3268 #endif // HAS_SPLITUVROW_AVX2
3270 #ifdef HAS_MERGEUVROW_SSE2
3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3276 mov eax, [esp + 4 + 4] // src_u
3277 mov edx, [esp + 4 + 8] // src_v
3278 mov edi, [esp + 4 + 12] // dst_uv
3279 mov ecx, [esp + 4 + 16] // width
3283 movdqu xmm0, [eax] // read 16 U's
3284 movdqu xmm1, [eax + edx] // and 16 V's
3287 punpcklbw xmm0, xmm1 // first 8 UV pairs
3288 punpckhbw xmm2, xmm1 // next 8 UV pairs
3290 movdqu [edi + 16], xmm2
3299 #endif // HAS_MERGEUVROW_SSE2
3301 #ifdef HAS_MERGEUVROW_AVX2
3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3307 mov eax, [esp + 4 + 4] // src_u
3308 mov edx, [esp + 4 + 8] // src_v
3309 mov edi, [esp + 4 + 12] // dst_uv
3310 mov ecx, [esp + 4 + 16] // width
3314 vmovdqu ymm0, [eax] // read 32 U's
3315 vmovdqu ymm1, [eax + edx] // and 32 V's
3317 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3318 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3319 vextractf128 [edi], ymm2, 0 // bytes 0..15
3320 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
3321 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
3322 vextractf128 [edi + 48], ymm0, 1 // bytes 47..63
3332 #endif // HAS_MERGEUVROW_AVX2
3334 #ifdef HAS_COPYROW_SSE2
3335 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3339 mov eax, [esp + 4] // src
3340 mov edx, [esp + 8] // dst
3341 mov ecx, [esp + 12] // count
3345 movdqu xmm1, [eax + 16]
3348 movdqu [edx + 16], xmm1
3355 #endif // HAS_COPYROW_SSE2
3357 #ifdef HAS_COPYROW_AVX
3358 // CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3362 mov eax, [esp + 4] // src
3363 mov edx, [esp + 8] // dst
3364 mov ecx, [esp + 12] // count
3368 vmovdqu ymm1, [eax + 32]
3371 vmovdqu [edx + 32], ymm1
3380 #endif // HAS_COPYROW_AVX
3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3388 mov esi, [esp + 4] // src
3389 mov edi, [esp + 8] // dst
3390 mov ecx, [esp + 12] // count
3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3403 mov eax, [esp + 4] // src
3404 mov edx, [esp + 8] // dst
3405 mov ecx, [esp + 12] // count
3406 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3408 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3413 movdqu xmm3, [eax + 16]
3416 movdqu xmm5, [edx + 16]
3424 movdqu [edx + 16], xmm3
3432 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3439 mov eax, [esp + 4] // src
3440 mov edx, [esp + 8] // dst
3441 mov ecx, [esp + 12] // count
3442 vpcmpeqb ymm0, ymm0, ymm0
3443 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3447 vmovdqu ymm2, [eax + 32]
3449 vpblendvb ymm1, ymm1, [edx], ymm0
3450 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3452 vmovdqu [edx + 32], ymm2
3461 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3468 mov eax, [esp + 4] // src
3469 mov edx, [esp + 8] // dst
3470 mov ecx, [esp + 12] // count
3471 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3473 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3477 movq xmm2, qword ptr [eax] // 8 Y's
3479 punpcklbw xmm2, xmm2
3480 punpckhwd xmm3, xmm2
3481 punpcklwd xmm2, xmm2
3483 movdqu xmm5, [edx + 16]
3491 movdqu [edx + 16], xmm3
3499 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3506 mov eax, [esp + 4] // src
3507 mov edx, [esp + 8] // dst
3508 mov ecx, [esp + 12] // count
3509 vpcmpeqb ymm0, ymm0, ymm0
3510 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3513 vpmovzxbd ymm1, qword ptr [eax]
3514 vpmovzxbd ymm2, qword ptr [eax + 8]
3516 vpslld ymm1, ymm1, 24
3517 vpslld ymm2, ymm2, 24
3518 vpblendvb ymm1, ymm1, [edx], ymm0
3519 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3521 vmovdqu [edx + 32], ymm2
3530 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3532 #ifdef HAS_SETROW_X86
3533 // Write 'count' bytes using an 8 bit value repeated.
3534 // Count should be multiple of 4.
3536 void SetRow_X86(uint8* dst, uint8 v8, int count) {
3538 movzx eax, byte ptr [esp + 8] // v8
3539 mov edx, 0x01010101 // Duplicate byte to all bytes.
3540 mul edx // overwrites edx with upper part of result.
3542 mov edi, [esp + 4] // dst
3543 mov ecx, [esp + 12] // count
3551 // Write 'count' bytes using an 8 bit value repeated.
3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3556 mov edi, [esp + 4] // dst
3557 mov eax, [esp + 8] // v8
3558 mov ecx, [esp + 12] // count
3565 // Write 'count' 32 bit values.
3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3570 mov edi, [esp + 4] // dst
3571 mov eax, [esp + 8] // v32
3572 mov ecx, [esp + 12] // count
3578 #endif // HAS_SETROW_X86
3580 #ifdef HAS_YUY2TOYROW_AVX2
3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3583 uint8* dst_y, int pix) {
3585 mov eax, [esp + 4] // src_yuy2
3586 mov edx, [esp + 8] // dst_y
3587 mov ecx, [esp + 12] // pix
3588 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3589 vpsrlw ymm5, ymm5, 8
3593 vmovdqu ymm1, [eax + 32]
3595 vpand ymm0, ymm0, ymm5 // even bytes are Y
3596 vpand ymm1, ymm1, ymm5
3597 vpackuswb ymm0, ymm0, ymm1 // mutates.
3598 vpermq ymm0, ymm0, 0xd8
3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3610 uint8* dst_u, uint8* dst_v, int pix) {
3614 mov eax, [esp + 8 + 4] // src_yuy2
3615 mov esi, [esp + 8 + 8] // stride_yuy2
3616 mov edx, [esp + 8 + 12] // dst_u
3617 mov edi, [esp + 8 + 16] // dst_v
3618 mov ecx, [esp + 8 + 20] // pix
3619 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3620 vpsrlw ymm5, ymm5, 8
3625 vmovdqu ymm1, [eax + 32]
3626 vpavgb ymm0, ymm0, [eax + esi]
3627 vpavgb ymm1, ymm1, [eax + esi + 32]
3629 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3630 vpsrlw ymm1, ymm1, 8
3631 vpackuswb ymm0, ymm0, ymm1 // mutates.
3632 vpermq ymm0, ymm0, 0xd8
3633 vpand ymm1, ymm0, ymm5 // U
3634 vpsrlw ymm0, ymm0, 8 // V
3635 vpackuswb ymm1, ymm1, ymm1 // mutates.
3636 vpackuswb ymm0, ymm0, ymm0 // mutates.
3637 vpermq ymm1, ymm1, 0xd8
3638 vpermq ymm0, ymm0, 0xd8
3639 vextractf128 [edx], ymm1, 0 // U
3640 vextractf128 [edx + edi], ymm0, 0 // V
3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3654 uint8* dst_u, uint8* dst_v, int pix) {
3657 mov eax, [esp + 4 + 4] // src_yuy2
3658 mov edx, [esp + 4 + 8] // dst_u
3659 mov edi, [esp + 4 + 12] // dst_v
3660 mov ecx, [esp + 4 + 16] // pix
3661 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3662 vpsrlw ymm5, ymm5, 8
3667 vmovdqu ymm1, [eax + 32]
3669 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3670 vpsrlw ymm1, ymm1, 8
3671 vpackuswb ymm0, ymm0, ymm1 // mutates.
3672 vpermq ymm0, ymm0, 0xd8
3673 vpand ymm1, ymm0, ymm5 // U
3674 vpsrlw ymm0, ymm0, 8 // V
3675 vpackuswb ymm1, ymm1, ymm1 // mutates.
3676 vpackuswb ymm0, ymm0, ymm0 // mutates.
3677 vpermq ymm1, ymm1, 0xd8
3678 vpermq ymm0, ymm0, 0xd8
3679 vextractf128 [edx], ymm1, 0 // U
3680 vextractf128 [edx + edi], ymm0, 0 // V
3692 void UYVYToYRow_AVX2(const uint8* src_uyvy,
3693 uint8* dst_y, int pix) {
3695 mov eax, [esp + 4] // src_uyvy
3696 mov edx, [esp + 8] // dst_y
3697 mov ecx, [esp + 12] // pix
3701 vmovdqu ymm1, [eax + 32]
3703 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3704 vpsrlw ymm1, ymm1, 8
3705 vpackuswb ymm0, ymm0, ymm1 // mutates.
3706 vpermq ymm0, ymm0, 0xd8
3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3718 uint8* dst_u, uint8* dst_v, int pix) {
3722 mov eax, [esp + 8 + 4] // src_yuy2
3723 mov esi, [esp + 8 + 8] // stride_yuy2
3724 mov edx, [esp + 8 + 12] // dst_u
3725 mov edi, [esp + 8 + 16] // dst_v
3726 mov ecx, [esp + 8 + 20] // pix
3727 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3728 vpsrlw ymm5, ymm5, 8
3733 vmovdqu ymm1, [eax + 32]
3734 vpavgb ymm0, ymm0, [eax + esi]
3735 vpavgb ymm1, ymm1, [eax + esi + 32]
3737 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3738 vpand ymm1, ymm1, ymm5
3739 vpackuswb ymm0, ymm0, ymm1 // mutates.
3740 vpermq ymm0, ymm0, 0xd8
3741 vpand ymm1, ymm0, ymm5 // U
3742 vpsrlw ymm0, ymm0, 8 // V
3743 vpackuswb ymm1, ymm1, ymm1 // mutates.
3744 vpackuswb ymm0, ymm0, ymm0 // mutates.
3745 vpermq ymm1, ymm1, 0xd8
3746 vpermq ymm0, ymm0, 0xd8
3747 vextractf128 [edx], ymm1, 0 // U
3748 vextractf128 [edx + edi], ymm0, 0 // V
3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3762 uint8* dst_u, uint8* dst_v, int pix) {
3765 mov eax, [esp + 4 + 4] // src_yuy2
3766 mov edx, [esp + 4 + 8] // dst_u
3767 mov edi, [esp + 4 + 12] // dst_v
3768 mov ecx, [esp + 4 + 16] // pix
3769 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3770 vpsrlw ymm5, ymm5, 8
3775 vmovdqu ymm1, [eax + 32]
3777 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3778 vpand ymm1, ymm1, ymm5
3779 vpackuswb ymm0, ymm0, ymm1 // mutates.
3780 vpermq ymm0, ymm0, 0xd8
3781 vpand ymm1, ymm0, ymm5 // U
3782 vpsrlw ymm0, ymm0, 8 // V
3783 vpackuswb ymm1, ymm1, ymm1 // mutates.
3784 vpackuswb ymm0, ymm0, ymm0 // mutates.
3785 vpermq ymm1, ymm1, 0xd8
3786 vpermq ymm0, ymm0, 0xd8
3787 vextractf128 [edx], ymm1, 0 // U
3788 vextractf128 [edx + edi], ymm0, 0 // V
3798 #endif // HAS_YUY2TOYROW_AVX2
3800 #ifdef HAS_YUY2TOYROW_SSE2
3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3803 uint8* dst_y, int pix) {
3805 mov eax, [esp + 4] // src_yuy2
3806 mov edx, [esp + 8] // dst_y
3807 mov ecx, [esp + 12] // pix
3808 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3813 movdqu xmm1, [eax + 16]
3815 pand xmm0, xmm5 // even bytes are Y
3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3828 uint8* dst_u, uint8* dst_v, int pix) {
3832 mov eax, [esp + 8 + 4] // src_yuy2
3833 mov esi, [esp + 8 + 8] // stride_yuy2
3834 mov edx, [esp + 8 + 12] // dst_u
3835 mov edi, [esp + 8 + 16] // dst_v
3836 mov ecx, [esp + 8 + 20] // pix
3837 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3843 movdqu xmm1, [eax + 16]
3844 movdqu xmm2, [eax + esi]
3845 movdqu xmm3, [eax + esi + 16]
3849 psrlw xmm0, 8 // YUYV -> UVUV
3853 pand xmm0, xmm5 // U
3857 movq qword ptr [edx], xmm0
3858 movq qword ptr [edx + edi], xmm1
3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3871 uint8* dst_u, uint8* dst_v, int pix) {
3874 mov eax, [esp + 4 + 4] // src_yuy2
3875 mov edx, [esp + 4 + 8] // dst_u
3876 mov edi, [esp + 4 + 12] // dst_v
3877 mov ecx, [esp + 4 + 16] // pix
3878 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3884 movdqu xmm1, [eax + 16]
3886 psrlw xmm0, 8 // YUYV -> UVUV
3890 pand xmm0, xmm5 // U
3894 movq qword ptr [edx], xmm0
3895 movq qword ptr [edx + edi], xmm1
3906 void UYVYToYRow_SSE2(const uint8* src_uyvy,
3907 uint8* dst_y, int pix) {
3909 mov eax, [esp + 4] // src_uyvy
3910 mov edx, [esp + 8] // dst_y
3911 mov ecx, [esp + 12] // pix
3915 movdqu xmm1, [eax + 16]
3917 psrlw xmm0, 8 // odd bytes are Y
3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3930 uint8* dst_u, uint8* dst_v, int pix) {
3934 mov eax, [esp + 8 + 4] // src_yuy2
3935 mov esi, [esp + 8 + 8] // stride_yuy2
3936 mov edx, [esp + 8 + 12] // dst_u
3937 mov edi, [esp + 8 + 16] // dst_v
3938 mov ecx, [esp + 8 + 20] // pix
3939 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3945 movdqu xmm1, [eax + 16]
3946 movdqu xmm2, [eax + esi]
3947 movdqu xmm3, [eax + esi + 16]
3951 pand xmm0, xmm5 // UYVY -> UVUV
3955 pand xmm0, xmm5 // U
3959 movq qword ptr [edx], xmm0
3960 movq qword ptr [edx + edi], xmm1
3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3973 uint8* dst_u, uint8* dst_v, int pix) {
3976 mov eax, [esp + 4 + 4] // src_yuy2
3977 mov edx, [esp + 4 + 8] // dst_u
3978 mov edi, [esp + 4 + 12] // dst_v
3979 mov ecx, [esp + 4 + 16] // pix
3980 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3986 movdqu xmm1, [eax + 16]
3988 pand xmm0, xmm5 // UYVY -> UVUV
3992 pand xmm0, xmm5 // U
3996 movq qword ptr [edx], xmm0
3997 movq qword ptr [edx + edi], xmm1
4006 #endif // HAS_YUY2TOYROW_SSE2
4008 #ifdef HAS_ARGBBLENDROW_SSE2
4009 // Blend 8 pixels at a time.
4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4012 uint8* dst_argb, int width) {
4015 mov eax, [esp + 4 + 4] // src_argb0
4016 mov esi, [esp + 4 + 8] // src_argb1
4017 mov edx, [esp + 4 + 12] // dst_argb
4018 mov ecx, [esp + 4 + 16] // width
4019 pcmpeqb xmm7, xmm7 // generate constant 1
4021 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4023 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4025 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4028 jl convertloop4b // less than 4 pixels?
4032 movdqu xmm3, [eax] // src argb
4034 movdqa xmm0, xmm3 // src argb
4035 pxor xmm3, xmm4 // ~alpha
4036 movdqu xmm2, [esi] // _r_b
4037 psrlw xmm3, 8 // alpha
4038 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4039 pshuflw xmm3, xmm3, 0F5h
4040 pand xmm2, xmm6 // _r_b
4041 paddw xmm3, xmm7 // 256 - alpha
4042 pmullw xmm2, xmm3 // _r_b * alpha
4043 movdqu xmm1, [esi] // _a_g
4045 psrlw xmm1, 8 // _a_g
4046 por xmm0, xmm4 // set alpha to 255
4047 pmullw xmm1, xmm3 // _a_g * alpha
4048 psrlw xmm2, 8 // _r_b convert to 8 bits again
4049 paddusb xmm0, xmm2 // + src argb
4050 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4051 paddusb xmm0, xmm1 // + src argb
4063 movd xmm3, [eax] // src argb
4065 movdqa xmm0, xmm3 // src argb
4066 pxor xmm3, xmm4 // ~alpha
4067 movd xmm2, [esi] // _r_b
4068 psrlw xmm3, 8 // alpha
4069 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4070 pshuflw xmm3, xmm3, 0F5h
4071 pand xmm2, xmm6 // _r_b
4072 paddw xmm3, xmm7 // 256 - alpha
4073 pmullw xmm2, xmm3 // _r_b * alpha
4074 movd xmm1, [esi] // _a_g
4076 psrlw xmm1, 8 // _a_g
4077 por xmm0, xmm4 // set alpha to 255
4078 pmullw xmm1, xmm3 // _a_g * alpha
4079 psrlw xmm2, 8 // _r_b convert to 8 bits again
4080 paddusb xmm0, xmm2 // + src argb
4081 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4082 paddusb xmm0, xmm1 // + src argb
4093 #endif // HAS_ARGBBLENDROW_SSE2
4095 #ifdef HAS_ARGBBLENDROW_SSSE3
4096 // Shuffle table for isolating alpha.
4097 static const uvec8 kShuffleAlpha = {
4098 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4099 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4101 // Same as SSE2, but replaces:
4102 // psrlw xmm3, 8 // alpha
4103 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4104 // pshuflw xmm3, xmm3, 0F5h
4106 // pshufb xmm3, kShuffleAlpha // alpha
4107 // Blend 8 pixels at a time.
4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4111 uint8* dst_argb, int width) {
4114 mov eax, [esp + 4 + 4] // src_argb0
4115 mov esi, [esp + 4 + 8] // src_argb1
4116 mov edx, [esp + 4 + 12] // dst_argb
4117 mov ecx, [esp + 4 + 16] // width
4118 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4120 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4122 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4124 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4127 jl convertloop4b // less than 4 pixels?
4131 movdqu xmm3, [eax] // src argb
4133 movdqa xmm0, xmm3 // src argb
4134 pxor xmm3, xmm4 // ~alpha
4135 movdqu xmm2, [esi] // _r_b
4136 pshufb xmm3, kShuffleAlpha // alpha
4137 pand xmm2, xmm6 // _r_b
4138 paddw xmm3, xmm7 // 256 - alpha
4139 pmullw xmm2, xmm3 // _r_b * alpha
4140 movdqu xmm1, [esi] // _a_g
4142 psrlw xmm1, 8 // _a_g
4143 por xmm0, xmm4 // set alpha to 255
4144 pmullw xmm1, xmm3 // _a_g * alpha
4145 psrlw xmm2, 8 // _r_b convert to 8 bits again
4146 paddusb xmm0, xmm2 // + src argb
4147 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4148 paddusb xmm0, xmm1 // + src argb
4160 movd xmm3, [eax] // src argb
4162 movdqa xmm0, xmm3 // src argb
4163 pxor xmm3, xmm4 // ~alpha
4164 movd xmm2, [esi] // _r_b
4165 pshufb xmm3, kShuffleAlpha // alpha
4166 pand xmm2, xmm6 // _r_b
4167 paddw xmm3, xmm7 // 256 - alpha
4168 pmullw xmm2, xmm3 // _r_b * alpha
4169 movd xmm1, [esi] // _a_g
4171 psrlw xmm1, 8 // _a_g
4172 por xmm0, xmm4 // set alpha to 255
4173 pmullw xmm1, xmm3 // _a_g * alpha
4174 psrlw xmm2, 8 // _r_b convert to 8 bits again
4175 paddusb xmm0, xmm2 // + src argb
4176 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4177 paddusb xmm0, xmm1 // + src argb
4188 #endif // HAS_ARGBBLENDROW_SSSE3
4190 #ifdef HAS_ARGBATTENUATEROW_SSE2
4191 // Attenuate 4 pixels at a time.
4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4195 mov eax, [esp + 4] // src_argb0
4196 mov edx, [esp + 8] // dst_argb
4197 mov ecx, [esp + 12] // width
4198 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4200 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
4204 movdqu xmm0, [eax] // read 4 pixels
4205 punpcklbw xmm0, xmm0 // first 2
4206 pshufhw xmm2, xmm0, 0FFh // 8 alpha words
4207 pshuflw xmm2, xmm2, 0FFh
4208 pmulhuw xmm0, xmm2 // rgb * a
4209 movdqu xmm1, [eax] // read 4 pixels
4210 punpckhbw xmm1, xmm1 // next 2 pixels
4211 pshufhw xmm2, xmm1, 0FFh // 8 alpha words
4212 pshuflw xmm2, xmm2, 0FFh
4213 pmulhuw xmm1, xmm2 // rgb * a
4214 movdqu xmm2, [eax] // alphas
4220 pand xmm0, xmm5 // keep original alphas
4230 #endif // HAS_ARGBATTENUATEROW_SSE2
4232 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4233 // Shuffle table duplicating alpha.
4234 static const uvec8 kShuffleAlpha0 = {
4235 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4237 static const uvec8 kShuffleAlpha1 = {
4238 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4239 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4242 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4244 mov eax, [esp + 4] // src_argb0
4245 mov edx, [esp + 8] // dst_argb
4246 mov ecx, [esp + 12] // width
4247 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4249 movdqa xmm4, kShuffleAlpha0
4250 movdqa xmm5, kShuffleAlpha1
4253 movdqu xmm0, [eax] // read 4 pixels
4254 pshufb xmm0, xmm4 // isolate first 2 alphas
4255 movdqu xmm1, [eax] // read 4 pixels
4256 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4257 pmulhuw xmm0, xmm1 // rgb * a
4258 movdqu xmm1, [eax] // read 4 pixels
4259 pshufb xmm1, xmm5 // isolate next 2 alphas
4260 movdqu xmm2, [eax] // read 4 pixels
4261 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4262 pmulhuw xmm1, xmm2 // rgb * a
4263 movdqu xmm2, [eax] // mask original alpha
4269 por xmm0, xmm2 // copy original alpha
4278 #endif // HAS_ARGBATTENUATEROW_SSSE3
4280 #ifdef HAS_ARGBATTENUATEROW_AVX2
4281 // Shuffle table duplicating alpha.
4282 static const uvec8 kShuffleAlpha_AVX2 = {
4283 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
4286 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4288 mov eax, [esp + 4] // src_argb0
4289 mov edx, [esp + 8] // dst_argb
4290 mov ecx, [esp + 12] // width
4292 vbroadcastf128 ymm4,kShuffleAlpha_AVX2
4293 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4294 vpslld ymm5, ymm5, 24
4297 vmovdqu ymm6, [eax] // read 8 pixels.
4298 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4299 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4300 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4301 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4302 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4303 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4304 vpand ymm6, ymm6, ymm5 // isolate alpha
4305 vpsrlw ymm0, ymm0, 8
4306 vpsrlw ymm1, ymm1, 8
4307 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4308 vpor ymm0, ymm0, ymm6 // copy original alpha
4309 vmovdqu [eax + edx], ymm0
4318 #endif // HAS_ARGBATTENUATEROW_AVX2
4320 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4321 // Unattenuate 4 pixels at a time.
4323 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4328 mov eax, [esp + 8 + 4] // src_argb0
4329 mov edx, [esp + 8 + 8] // dst_argb
4330 mov ecx, [esp + 8 + 12] // width
4333 movdqu xmm0, [eax] // read 4 pixels
4334 movzx esi, byte ptr [eax + 3] // first alpha
4335 movzx edi, byte ptr [eax + 7] // second alpha
4336 punpcklbw xmm0, xmm0 // first 2
4337 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4338 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4339 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4340 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4342 pmulhuw xmm0, xmm2 // rgb * a
4344 movdqu xmm1, [eax] // read 4 pixels
4345 movzx esi, byte ptr [eax + 11] // third alpha
4346 movzx edi, byte ptr [eax + 15] // forth alpha
4347 punpckhbw xmm1, xmm1 // next 2
4348 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4349 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4350 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4351 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4353 pmulhuw xmm1, xmm2 // rgb * a
4366 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4368 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4369 // Shuffle table duplicating alpha.
4370 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4371 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
4373 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4374 // USE_GATHER is not on by default, due to being a slow instruction.
4377 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4380 mov eax, [esp + 4] // src_argb0
4381 mov edx, [esp + 8] // dst_argb
4382 mov ecx, [esp + 12] // width
4384 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
4387 vmovdqu ymm6, [eax] // read 8 pixels.
4388 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4389 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4390 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4391 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4392 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4393 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4394 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4395 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4396 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4397 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4398 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4399 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4400 vmovdqu [eax + edx], ymm0
4411 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4415 mov eax, [esp + 4] // src_argb0
4416 mov edx, [esp + 8] // dst_argb
4417 mov ecx, [esp + 12] // width
4419 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
4426 movzx esi, byte ptr [eax + 3] // alpha0
4427 movzx edi, byte ptr [eax + 7] // alpha1
4428 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
4429 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
4430 movzx esi, byte ptr [eax + 11] // alpha2
4431 movzx edi, byte ptr [eax + 15] // alpha3
4432 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4433 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
4434 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
4435 movzx esi, byte ptr [eax + 19] // alpha4
4436 movzx edi, byte ptr [eax + 23] // alpha5
4437 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4438 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
4439 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
4440 movzx esi, byte ptr [eax + 27] // alpha6
4441 movzx edi, byte ptr [eax + 31] // alpha7
4442 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4443 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
4444 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
4445 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4446 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4447 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4448 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4451 vmovdqu ymm6, [eax] // read 8 pixels.
4452 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4453 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4454 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4455 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4456 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
4457 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
4458 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4459 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4460 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4461 vmovdqu [eax + edx], ymm0
4472 #endif // USE_GATHER
4473 #endif // HAS_ARGBATTENUATEROW_AVX2
4475 #ifdef HAS_ARGBGRAYROW_SSSE3
4476 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
4478 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4480 mov eax, [esp + 4] /* src_argb */
4481 mov edx, [esp + 8] /* dst_argb */
4482 mov ecx, [esp + 12] /* width */
4483 movdqa xmm4, kARGBToYJ
4484 movdqa xmm5, kAddYJ64
4487 movdqu xmm0, [eax] // G
4488 movdqu xmm1, [eax + 16]
4489 pmaddubsw xmm0, xmm4
4490 pmaddubsw xmm1, xmm4
4492 paddw xmm0, xmm5 // Add .5 for rounding.
4494 packuswb xmm0, xmm0 // 8 G bytes
4495 movdqu xmm2, [eax] // A
4496 movdqu xmm3, [eax + 16]
4501 packuswb xmm2, xmm2 // 8 A bytes
4502 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
4503 punpcklbw xmm0, xmm0 // 8 GG words
4504 punpcklbw xmm3, xmm2 // 8 GA words
4506 punpcklwd xmm0, xmm3 // GGGA first 4
4507 punpckhwd xmm1, xmm3 // GGGA next 4
4509 movdqu [edx + 16], xmm1
4516 #endif // HAS_ARGBGRAYROW_SSSE3
4518 #ifdef HAS_ARGBSEPIAROW_SSSE3
4519 // b = (r * 35 + g * 68 + b * 17) >> 7
4520 // g = (r * 45 + g * 88 + b * 22) >> 7
4521 // r = (r * 50 + g * 98 + b * 24) >> 7
4522 // Constant for ARGB color to sepia tone.
4523 static const vec8 kARGBToSepiaB = {
4524 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
4527 static const vec8 kARGBToSepiaG = {
4528 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4531 static const vec8 kARGBToSepiaR = {
4532 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4535 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4537 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4539 mov eax, [esp + 4] /* dst_argb */
4540 mov ecx, [esp + 8] /* width */
4541 movdqa xmm2, kARGBToSepiaB
4542 movdqa xmm3, kARGBToSepiaG
4543 movdqa xmm4, kARGBToSepiaR
4546 movdqu xmm0, [eax] // B
4547 movdqu xmm6, [eax + 16]
4548 pmaddubsw xmm0, xmm2
4549 pmaddubsw xmm6, xmm2
4552 packuswb xmm0, xmm0 // 8 B values
4553 movdqu xmm5, [eax] // G
4554 movdqu xmm1, [eax + 16]
4555 pmaddubsw xmm5, xmm3
4556 pmaddubsw xmm1, xmm3
4559 packuswb xmm5, xmm5 // 8 G values
4560 punpcklbw xmm0, xmm5 // 8 BG values
4561 movdqu xmm5, [eax] // R
4562 movdqu xmm1, [eax + 16]
4563 pmaddubsw xmm5, xmm4
4564 pmaddubsw xmm1, xmm4
4567 packuswb xmm5, xmm5 // 8 R values
4568 movdqu xmm6, [eax] // A
4569 movdqu xmm1, [eax + 16]
4573 packuswb xmm6, xmm6 // 8 A values
4574 punpcklbw xmm5, xmm6 // 8 RA values
4575 movdqa xmm1, xmm0 // Weave BG, RA together
4576 punpcklwd xmm0, xmm5 // BGRA first 4
4577 punpckhwd xmm1, xmm5 // BGRA next 4
4579 movdqu [eax + 16], xmm1
4586 #endif // HAS_ARGBSEPIAROW_SSSE3
4588 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4589 // Transform 8 ARGB pixels (32 bytes) with color matrix.
4590 // Same as Sepia except matrix is provided.
4591 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4592 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
// Applies a caller-supplied 4x4 signed-byte color matrix to ARGB pixels,
// 8 pixels (32 bytes) per iteration: each output channel is a pmaddubsw/
// phaddsw dot product of the input pixel with one matrix row.
// NOTE(review): this listing is elided (the matrix load into xmm5, __asm
// braces, loop label, store of xmm0 and epilogue are not visible here);
// comments below cover only the visible instructions — verify against the
// full source.
4594 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4595 const int8* matrix_argb, int width) {
4597 mov eax, [esp + 4] /* src_argb */
4598 mov edx, [esp + 8] /* dst_argb */
4599 mov ecx, [esp + 12] /* matrix_argb */
// Broadcast each 4-byte matrix row (assumed loaded into xmm5 in elided
// lines) into its own register: B row, G row, R row, A row.
4601 pshufd xmm2, xmm5, 0x00
4602 pshufd xmm3, xmm5, 0x55
4603 pshufd xmm4, xmm5, 0xaa
4604 pshufd xmm5, xmm5, 0xff
4605 mov ecx, [esp + 16] /* width */
4608 movdqu xmm0, [eax] // B
4609 movdqu xmm7, [eax + 16]
4610 pmaddubsw xmm0, xmm2
4611 pmaddubsw xmm7, xmm2
4612 movdqu xmm6, [eax] // G
4613 movdqu xmm1, [eax + 16]
4614 pmaddubsw xmm6, xmm3
4615 pmaddubsw xmm1, xmm3
4616 phaddsw xmm0, xmm7 // B
4617 phaddsw xmm6, xmm1 // G
4620 packuswb xmm0, xmm0 // 8 B values
4621 packuswb xmm6, xmm6 // 8 G values
4622 punpcklbw xmm0, xmm6 // 8 BG values
4623 movdqu xmm1, [eax] // R
4624 movdqu xmm7, [eax + 16]
4625 pmaddubsw xmm1, xmm4
4626 pmaddubsw xmm7, xmm4
4627 phaddsw xmm1, xmm7 // R
4628 movdqu xmm6, [eax] // A
4629 movdqu xmm7, [eax + 16]
4630 pmaddubsw xmm6, xmm5
4631 pmaddubsw xmm7, xmm5
4632 phaddsw xmm6, xmm7 // A
4635 packuswb xmm1, xmm1 // 8 R values
4636 packuswb xmm6, xmm6 // 8 A values
4637 punpcklbw xmm1, xmm6 // 8 RA values
4638 movdqa xmm6, xmm0 // Weave BG, RA together
4639 punpcklwd xmm0, xmm1 // BGRA first 4
4640 punpckhwd xmm6, xmm1 // BGRA next 4
4642 movdqu [edx + 16], xmm6
4650 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
4652 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4653 // Quantize 4 ARGB pixels (16 bytes).
// Quantizes ARGB pixels in place (dst_argb is both input and output):
// each color channel becomes (channel * scale >> 16) * interval_size +
// interval_offset; the alpha byte is preserved via the 0xff000000 mask.
// NOTE(review): listing is elided (__asm braces, loop label, the final
// pack/store and epilogue are not visible); comments cover only the
// visible instructions.
4655 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4656 int interval_offset, int width) {
4658 mov eax, [esp + 4] /* dst_argb */
4659 movd xmm2, [esp + 8] /* scale */
4660 movd xmm3, [esp + 12] /* interval_size */
4661 movd xmm4, [esp + 16] /* interval_offset */
4662 mov ecx, [esp + 20] /* width */
// Broadcast each scalar across all 8 words of its register.
4663 pshuflw xmm2, xmm2, 040h
4664 pshufd xmm2, xmm2, 044h
4665 pshuflw xmm3, xmm3, 040h
4666 pshufd xmm3, xmm3, 044h
4667 pshuflw xmm4, xmm4, 040h
4668 pshufd xmm4, xmm4, 044h
4669 pxor xmm5, xmm5 // constant 0
4670 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
4674 movdqu xmm0, [eax] // read 4 pixels
4675 punpcklbw xmm0, xmm5 // first 2 pixels
4676 pmulhuw xmm0, xmm2 // pixel * scale >> 16
4677 movdqu xmm1, [eax] // read 4 pixels
4678 punpckhbw xmm1, xmm5 // next 2 pixels
4680 pmullw xmm0, xmm3 // * interval_size
4681 movdqu xmm7, [eax] // read 4 pixels
4683 pand xmm7, xmm6 // mask alpha
4684 paddw xmm0, xmm4 // + interval_offset (xmm4 loaded from interval_offset above)
4695 #endif // HAS_ARGBQUANTIZEROW_SSE2
4697 #ifdef HAS_ARGBSHADEROW_SSE2
4698 // Shade 4 pixels at a time by specified value.
// Shades ARGB pixels by a packed 32-bit value: each channel is scaled by
// the corresponding byte of `value` via 8.8 fixed-point pmulhuw.
// NOTE(review): listing is elided (the `uint32 value` parameter line,
// __asm braces, loop label, pack/store and epilogue are not visible);
// comments cover only the visible instructions.
4700 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4703 mov eax, [esp + 4] // src_argb
4704 mov edx, [esp + 8] // dst_argb
4705 mov ecx, [esp + 12] // width
4706 movd xmm2, [esp + 16] // value
// Duplicate each value byte into a word, then replicate across the
// whole register so both unpacked pixel halves use the same multiplier.
4707 punpcklbw xmm2, xmm2
4708 punpcklqdq xmm2, xmm2
4711 movdqu xmm0, [eax] // read 4 pixels
4714 punpcklbw xmm0, xmm0 // first 2
4715 punpckhbw xmm1, xmm1 // next 2
4716 pmulhuw xmm0, xmm2 // argb * value
4717 pmulhuw xmm1, xmm2 // argb * value
4729 #endif // HAS_ARGBSHADEROW_SSE2
4731 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4732 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Multiplies two rows of ARGB pixels channel-by-channel, 4 pixels per
// iteration, using pmulhuw on one operand duplicated into 8.8 fixed point
// and the other zero-extended.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, loop
// label, pack/store and epilogue are not visible); comments cover only
// the visible instructions.
4734 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4735 uint8* dst_argb, int width) {
// [esp + 4 + N]: one register (esi) is saved, shifting argument offsets.
4738 mov eax, [esp + 4 + 4] // src_argb0
4739 mov esi, [esp + 4 + 8] // src_argb1
4740 mov edx, [esp + 4 + 12] // dst_argb
4741 mov ecx, [esp + 4 + 16] // width
4742 pxor xmm5, xmm5 // constant 0
4745 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4746 movdqu xmm2, [esi] // read 4 pixels from src_argb1
4749 punpcklbw xmm0, xmm0 // first 2
4750 punpckhbw xmm1, xmm1 // next 2
4751 punpcklbw xmm2, xmm5 // first 2
4752 punpckhbw xmm3, xmm5 // next 2
4753 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
4754 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
4767 #endif // HAS_ARGBMULTIPLYROW_SSE2
4769 #ifdef HAS_ARGBADDROW_SSE2
4770 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4771 // TODO(fbarchard): Port this to posix, neon and other math functions.
// Adds two rows of ARGB pixels with unsigned byte saturation (paddusb).
// Main loop handles 4 pixels (16 bytes) at a time; a tail loop handles
// the remaining pixels one at a time via movd.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, loop
// labels/branches, stores and epilogue are not visible); comments cover
// only the visible instructions.
4773 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4774 uint8* dst_argb, int width) {
4777 mov eax, [esp + 4 + 4] // src_argb0
4778 mov esi, [esp + 4 + 8] // src_argb1
4779 mov edx, [esp + 4 + 12] // dst_argb
4780 mov ecx, [esp + 4 + 16] // width
4786 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4788 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4790 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4801 movd xmm0, [eax] // read 1 pixel from src_argb0
4803 movd xmm1, [esi] // read 1 pixel from src_argb1
4805 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4816 #endif // HAS_ARGBADDROW_SSE2
4818 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4819 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
// Subtracts src_argb1 from src_argb0 with unsigned byte saturation
// (psubusb clamps at 0), 4 ARGB pixels per iteration.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, loop
// label, store and epilogue are not visible).
4821 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4822 uint8* dst_argb, int width) {
4825 mov eax, [esp + 4 + 4] // src_argb0
4826 mov esi, [esp + 4 + 8] // src_argb1
4827 mov edx, [esp + 4 + 12] // dst_argb
4828 mov ecx, [esp + 4 + 16] // width
4831 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4833 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4835 psubusb xmm0, xmm1 // src_argb0 - src_argb1
4845 #endif // HAS_ARGBSUBTRACTROW_SSE2
4847 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4848 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 variant of ARGBMultiplyRow: multiplies two ARGB rows channel-wise,
// 8 pixels (32 bytes) per iteration, via vpunpck + vpmulhuw.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, loop
// label, store, vzeroupper and epilogue are not visible).
4850 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4851 uint8* dst_argb, int width) {
4854 mov eax, [esp + 4 + 4] // src_argb0
4855 mov esi, [esp + 4 + 8] // src_argb1
4856 mov edx, [esp + 4 + 12] // dst_argb
4857 mov ecx, [esp + 4 + 16] // width
4858 vpxor ymm5, ymm5, ymm5 // constant 0
4861 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
4863 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
// src0 is duplicated (8.8 fixed point), src1 is zero-extended, so
// vpmulhuw yields (a * b) >> 8 per channel.
4865 vpunpcklbw ymm0, ymm1, ymm1 // low 4
4866 vpunpckhbw ymm1, ymm1, ymm1 // high 4
4867 vpunpcklbw ymm2, ymm3, ymm5 // low 4
4868 vpunpckhbw ymm3, ymm3, ymm5 // high 4
4869 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
4870 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
4871 vpackuswb ymm0, ymm0, ymm1
4882 #endif // HAS_ARGBMULTIPLYROW_AVX2
4884 #ifdef HAS_ARGBADDROW_AVX2
4885 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 variant of ARGBAddRow: saturating byte add of two ARGB rows,
// 8 pixels (32 bytes) per iteration, with a memory operand for src_argb1.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, loop
// label, store, vzeroupper and epilogue are not visible).
4887 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4888 uint8* dst_argb, int width) {
4891 mov eax, [esp + 4 + 4] // src_argb0
4892 mov esi, [esp + 4 + 8] // src_argb1
4893 mov edx, [esp + 4 + 12] // dst_argb
4894 mov ecx, [esp + 4 + 16] // width
4897 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4899 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
4911 #endif // HAS_ARGBADDROW_AVX2
4913 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4914 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 variant of ARGBSubtractRow: saturating byte subtract of two ARGB
// rows (clamped at 0), 8 pixels (32 bytes) per iteration.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, loop
// label, store, vzeroupper and epilogue are not visible).
4916 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4917 uint8* dst_argb, int width) {
4920 mov eax, [esp + 4 + 4] // src_argb0
4921 mov esi, [esp + 4 + 8] // src_argb1
4922 mov edx, [esp + 4 + 12] // dst_argb
4923 mov ecx, [esp + 4 + 16] // width
4926 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4928 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
4940 #endif // HAS_ARGBSUBTRACTROW_AVX2
4942 #ifdef HAS_SOBELXROW_SSE2
4943 // SobelX as a matrix is
// Computes the horizontal Sobel gradient for 8 pixels per iteration from
// three source rows (src_y0/y1/y2). Row pointers are converted to offsets
// relative to eax in elided lines, hence the [eax + esi]/[eax + edi]
// addressing.
// NOTE(review): listing is elided (push/pop, __asm braces, the pointer-
// to-offset subtraction, psubw/accumulation, loop label and epilogue are
// not visible); comments cover only the visible instructions.
4948 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4949 const uint8* src_y2, uint8* dst_sobelx, int width) {
4953 mov eax, [esp + 8 + 4] // src_y0
4954 mov esi, [esp + 8 + 8] // src_y1
4955 mov edi, [esp + 8 + 12] // src_y2
4956 mov edx, [esp + 8 + 16] // dst_sobelx
4957 mov ecx, [esp + 8 + 20] // width
4961 pxor xmm5, xmm5 // constant 0
4964 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
4965 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
4966 punpcklbw xmm0, xmm5
4967 punpcklbw xmm1, xmm5
4969 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
4970 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
4971 punpcklbw xmm1, xmm5
4972 punpcklbw xmm2, xmm5
4974 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
4975 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
4976 punpcklbw xmm2, xmm5
4977 punpcklbw xmm3, xmm5
4982 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
4986 movq qword ptr [eax + edx], xmm0
4996 #endif // HAS_SOBELXROW_SSE2
4998 #ifdef HAS_SOBELYROW_SSE2
4999 // SobelY as a matrix is
// Computes the vertical Sobel gradient for 8 pixels per iteration from
// two source rows (src_y0/y1), sampling columns [0], [1], [2]. Pointers
// are converted to eax-relative offsets in elided lines.
// NOTE(review): listing is elided (push/pop, __asm braces, the pointer-
// to-offset subtraction, psubw/accumulation, loop label and epilogue are
// not visible); comments cover only the visible instructions.
5004 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5005 uint8* dst_sobely, int width) {
5008 mov eax, [esp + 4 + 4] // src_y0
5009 mov esi, [esp + 4 + 8] // src_y1
5010 mov edx, [esp + 4 + 12] // dst_sobely
5011 mov ecx, [esp + 4 + 16] // width
5014 pxor xmm5, xmm5 // constant 0
5017 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5018 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5019 punpcklbw xmm0, xmm5
5020 punpcklbw xmm1, xmm5
5022 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5023 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5024 punpcklbw xmm1, xmm5
5025 punpcklbw xmm2, xmm5
5027 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5028 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5029 punpcklbw xmm2, xmm5
5030 punpcklbw xmm3, xmm5
5035 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5039 movq qword ptr [eax + edx], xmm0
5048 #endif // HAS_SOBELYROW_SSE2
5050 #ifdef HAS_SOBELROW_SSE2
5051 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// Combines Sobel X and Y planes (saturating add) and expands each result
// byte G into a grey ARGB pixel GGGA (alpha forced to 255), 16 pixels
// (64 output bytes) per iteration.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, the
// pointer-to-offset subtraction, first store to [edx], loop label and
// epilogue are not visible).
5057 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5058 uint8* dst_argb, int width) {
5061 mov eax, [esp + 4 + 4] // src_sobelx
5062 mov esi, [esp + 4 + 8] // src_sobely
5063 mov edx, [esp + 4 + 12] // dst_argb
5064 mov ecx, [esp + 4 + 16] // width
5066 pcmpeqb xmm5, xmm5 // alpha 255
5067 pslld xmm5, 24 // 0xff000000
5070 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5071 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5073 paddusb xmm0, xmm1 // sobel = sobelx + sobely
// Widen each grey byte to 4 bytes: G -> GG -> GGGG, then OR in alpha.
5074 movdqa xmm2, xmm0 // GG
5075 punpcklbw xmm2, xmm0 // First 8
5076 punpckhbw xmm0, xmm0 // Next 8
5077 movdqa xmm1, xmm2 // GGGG
5078 punpcklwd xmm1, xmm2 // First 4
5079 punpckhwd xmm2, xmm2 // Next 4
5080 por xmm1, xmm5 // GGGA
5082 movdqa xmm3, xmm0 // GGGG
5083 punpcklwd xmm3, xmm0 // Next 4
5084 punpckhwd xmm0, xmm0 // Last 4
5085 por xmm3, xmm5 // GGGA
5088 movdqu [edx + 16], xmm2
5089 movdqu [edx + 32], xmm3
5090 movdqu [edx + 48], xmm0
5099 #endif // HAS_SOBELROW_SSE2
5101 #ifdef HAS_SOBELTOPLANEROW_SSE2
5102 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
// Combines Sobel X and Y planes with a saturating byte add and writes the
// result to a single Y plane, 16 pixels per iteration.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, the
// pointer-to-offset subtraction, store, loop label and epilogue are not
// visible).
5104 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5105 uint8* dst_y, int width) {
5108 mov eax, [esp + 4 + 4] // src_sobelx
5109 mov esi, [esp + 4 + 8] // src_sobely
5110 mov edx, [esp + 4 + 12] // dst_y
5111 mov ecx, [esp + 4 + 16] // width
5115 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5116 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5118 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5128 #endif // HAS_SOBELTOPLANEROW_SSE2
5130 #ifdef HAS_SOBELXYROW_SSE2
5131 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
// Mixes Sobel X, Sobel Y and their sum into ARGB: per output pixel the
// channels carry sobelx (X), sobely (Y), the combined sobel (S) and
// alpha=255, 16 pixels (64 output bytes) per iteration.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, the
// pointer-to-offset subtraction, the copy into xmm2, first store to
// [edx], loop label and epilogue are not visible).
5137 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5138 uint8* dst_argb, int width) {
5141 mov eax, [esp + 4 + 4] // src_sobelx
5142 mov esi, [esp + 4 + 8] // src_sobely
5143 mov edx, [esp + 4 + 12] // dst_argb
5144 mov ecx, [esp + 4 + 16] // width
5146 pcmpeqb xmm5, xmm5 // alpha 255
5149 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5150 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5153 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5154 movdqa xmm3, xmm0 // XA
5155 punpcklbw xmm3, xmm5
5156 punpckhbw xmm0, xmm5
5157 movdqa xmm4, xmm1 // YS
5158 punpcklbw xmm4, xmm2
5159 punpckhbw xmm1, xmm2
5160 movdqa xmm6, xmm4 // YSXA
5161 punpcklwd xmm6, xmm3 // First 4
5162 punpckhwd xmm4, xmm3 // Next 4
5163 movdqa xmm7, xmm1 // YSXA
5164 punpcklwd xmm7, xmm0 // Next 4
5165 punpckhwd xmm1, xmm0 // Last 4
5167 movdqu [edx + 16], xmm4
5168 movdqu [edx + 32], xmm7
5169 movdqu [edx + 48], xmm1
5178 #endif // HAS_SOBELXYROW_SSE2
5180 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5181 // Consider float CumulativeSum.
5182 // Consider calling CumulativeSum one row at time as needed.
5183 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5184 // Convert cumulative sum for an area to an average for 1 pixel.
5185 // topleft is pointer to top left of CumulativeSum buffer for area.
5186 // botleft is pointer to bottom left of CumulativeSum buffer.
5187 // width is offset from left to right of area in CumulativeSum buffer measured
5188 // in number of ints.
5189 // area is the number of pixels in the area being averaged.
5190 // dst points to pixel to store result to.
5191 // count is number of averaged pixels to produce.
5192 // Does 4 pixels at a time.
// Converts a rectangle of a cumulative-sum (integral image) buffer into
// per-pixel averages: sum = topleft - topright - botleft + botright, then
// scaled by 1/area. Small areas (<= 128 px, per the cmp below) use a
// 16-bit fixed-point multiply path; larger areas use a float path; a
// scalar loop handles leftover pixels.
// NOTE(review): listing is heavily elided (prologue, the `count`
// parameter line, area -> float conversion, loop labels/branches, pack/
// store instructions and epilogue are not visible); comments cover only
// the visible instructions.
5193 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5194 int width, int area, uint8* dst,
5197 mov eax, topleft // eax topleft
5198 mov esi, botleft // esi botleft
5204 rcpss xmm4, xmm5 // 1.0f / area
5205 pshufd xmm4, xmm4, 0
5209 cmp area, 128 // 128 pixels will not overflow 15 bits.
5212 pshufd xmm5, xmm5, 0 // area
5213 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5216 addps xmm5, xmm6 // (65536.0 + area - 1)
5217 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5218 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5219 packssdw xmm5, xmm5 // 16 bit shorts
5221 // 4 pixel loop small blocks.
5225 movdqu xmm1, [eax + 16]
5226 movdqu xmm2, [eax + 32]
5227 movdqu xmm3, [eax + 48]
// Subtract top-right column (edx holds width in ints, per the contract).
5230 psubd xmm0, [eax + edx * 4]
5231 psubd xmm1, [eax + edx * 4 + 16]
5232 psubd xmm2, [eax + edx * 4 + 32]
5233 psubd xmm3, [eax + edx * 4 + 48]
// Subtract bottom-left, add bottom-right.
5238 psubd xmm1, [esi + 16]
5239 psubd xmm2, [esi + 32]
5240 psubd xmm3, [esi + 48]
5243 paddd xmm0, [esi + edx * 4]
5244 paddd xmm1, [esi + edx * 4 + 16]
5245 paddd xmm2, [esi + edx * 4 + 32]
5246 paddd xmm3, [esi + edx * 4 + 48]
5249 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
// 4 pixel loop, large blocks (float path).
5267 movdqu xmm1, [eax + 16]
5268 movdqu xmm2, [eax + 32]
5269 movdqu xmm3, [eax + 48]
5272 psubd xmm0, [eax + edx * 4]
5273 psubd xmm1, [eax + edx * 4 + 16]
5274 psubd xmm2, [eax + edx * 4 + 32]
5275 psubd xmm3, [eax + edx * 4 + 48]
5280 psubd xmm1, [esi + 16]
5281 psubd xmm2, [esi + 32]
5282 psubd xmm3, [esi + 48]
5285 paddd xmm0, [esi + edx * 4]
5286 paddd xmm1, [esi + edx * 4 + 16]
5287 paddd xmm2, [esi + edx * 4 + 32]
5288 paddd xmm3, [esi + edx * 4 + 48]
5291 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
// 1 pixel leftover loop.
5318 psubd xmm0, [eax + edx * 4]
5321 paddd xmm0, [esi + edx * 4]
5328 movd dword ptr [edi], xmm0
5335 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5337 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5338 // Creates a table of cumulative sums where each value is a sum of all values
5339 // above and to the left of the value.
// Builds one row of a cumulative-sum (integral image) table: each output
// int32 is the running sum of the ARGB bytes to its left plus the value
// from the same column of previous_cumsum. Main loop handles 4 pixels;
// a movd tail loop handles 1 pixel at a time.
// NOTE(review): listing is heavily elided (prologue, register setup, the
// running-sum paddd chain, stores to [edx], loop labels and epilogue are
// not visible); comments cover only the visible instructions.
5340 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5341 const int32* previous_cumsum, int width) {
5345 mov esi, previous_cumsum
5357 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
// Zero-extend bytes to words then to dwords across xmm2..xmm5.
5361 punpcklbw xmm2, xmm1
5363 punpcklwd xmm2, xmm1
5364 punpckhwd xmm3, xmm1
5366 punpckhbw xmm4, xmm1
5368 punpcklwd xmm4, xmm1
5369 punpckhwd xmm5, xmm1
5372 movdqu xmm2, [esi] // previous row above.
5376 movdqu xmm3, [esi + 16]
5380 movdqu xmm4, [esi + 32]
5384 movdqu xmm5, [esi + 48]
5389 movdqu [edx + 16], xmm3
5390 movdqu [edx + 32], xmm4
5391 movdqu [edx + 48], xmm5
// 1 pixel leftover loop.
5403 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
5405 punpcklbw xmm2, xmm1
5406 punpcklwd xmm2, xmm1
5419 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
5421 #ifdef HAS_ARGBAFFINEROW_SSE2
5422 // Copy ARGB pixels from source image with slope to a row of destination.
// Copies ARGB pixels along an affine-transformed source path: uv_dudv
// supplies the start (u,v) and per-pixel step (du,dv); byte offsets are
// computed as x*4 + y*stride via pmaddwd against a packed (4, stride)
// vector. 4 pixels per main-loop iteration with a 1-pixel tail.
// NOTE(review): listing is elided (push/pop, __asm braces, the load of
// (4, stride) into xmm5, extraction of offsets into esi/edi via movd/
// pextrw, loop labels and epilogue are not visible); comments cover only
// the visible instructions.
5425 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5426 uint8* dst_argb, const float* uv_dudv, int width) {
5430 mov eax, [esp + 12] // src_argb
5431 mov esi, [esp + 16] // stride
5432 mov edx, [esp + 20] // dst_argb
5433 mov ecx, [esp + 24] // pointer to uv_dudv
5434 movq xmm2, qword ptr [ecx] // uv
5435 movq xmm7, qword ptr [ecx + 8] // dudv
5436 mov ecx, [esp + 28] // width
5437 shl esi, 16 // 4, stride
5443 // setup for 4 pixel loop
5444 pshufd xmm7, xmm7, 0x44 // dup dudv
5445 pshufd xmm5, xmm5, 0 // dup 4, stride
5446 movdqa xmm0, xmm2 // x0, y0, x1, y1
5450 addps xmm4, xmm4 // dudv *= 2
5451 movdqa xmm3, xmm2 // x2, y2, x3, y3
5453 addps xmm4, xmm4 // dudv *= 4
5457 cvttps2dq xmm0, xmm2 // x, y float to int first 2
5458 cvttps2dq xmm1, xmm3 // x, y float to int next 2
5459 packssdw xmm0, xmm1 // x, y as 8 shorts
5460 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
5462 pshufd xmm0, xmm0, 0x39 // shift right
5464 pshufd xmm0, xmm0, 0x39 // shift right
5465 movd xmm1, [eax + esi] // read pixel 0
5466 movd xmm6, [eax + edi] // read pixel 1
5467 punpckldq xmm1, xmm6 // combine pixel 0 and 1
5468 addps xmm2, xmm4 // x, y += dx, dy first 2
5469 movq qword ptr [edx], xmm1
5471 pshufd xmm0, xmm0, 0x39 // shift right
5473 movd xmm6, [eax + esi] // read pixel 2
5474 movd xmm0, [eax + edi] // read pixel 3
5475 punpckldq xmm6, xmm0 // combine pixel 2 and 3
5476 addps xmm3, xmm4 // x, y += dx, dy next 2
5477 movq qword ptr 8[edx], xmm6
// 1 pixel leftover loop.
5488 cvttps2dq xmm0, xmm2 // x, y float to int
5489 packssdw xmm0, xmm0 // x, y as shorts
5490 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
5491 addps xmm2, xmm7 // x, y += dx, dy
5493 movd xmm0, [eax + esi] // copy a pixel
5504 #endif // HAS_ARGBAFFINEROW_SSE2
5506 #ifdef HAS_INTERPOLATEROW_AVX2
5507 // Bilinear filter 32x2 -> 32x1
// Bilinear vertical filter (32x2 -> 32x1): blends two source rows by
// source_y_fraction. Exact fractions 0, 32, 64, 96 (of 128) dispatch to
// specialized copy / 75-25 / 50-50 / 25-75 vpavgb paths; the general
// path uses vpmaddubsw with a packed (low, high) fraction pair.
// NOTE(review): listing is elided (push/pop, __asm braces, the fraction
// arithmetic between the vmovd loads, several loop labels, pointer
// advances, vzeroupper and epilogue are not visible).
5509 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
5510 ptrdiff_t src_stride, int dst_width,
5511 int source_y_fraction) {
5515 mov edi, [esp + 8 + 4] // dst_ptr
5516 mov esi, [esp + 8 + 8] // src_ptr
5517 mov edx, [esp + 8 + 12] // src_stride
5518 mov ecx, [esp + 8 + 16] // dst_width
5519 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5521 // Dispatch to specialized filters if applicable.
5523 je xloop100 // 0 / 128. Blend 100 / 0.
5526 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
5528 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
5530 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
5532 vmovd xmm0, eax // high fraction 0..127
5535 vmovd xmm5, eax // low fraction 128..1
5536 vpunpcklbw xmm5, xmm5, xmm0
5537 vpunpcklwd xmm5, xmm5, xmm5
5538 vpxor ymm0, ymm0, ymm0
5539 vpermd ymm5, ymm0, ymm5 // broadcast fraction pair to all lanes
// General blend loop: weighted sum of row0 and row1, >> 7.
5543 vmovdqu ymm2, [esi + edx]
5544 vpunpckhbw ymm1, ymm0, ymm2 // mutates
5545 vpunpcklbw ymm0, ymm0, ymm2 // mutates
5546 vpmaddubsw ymm0, ymm0, ymm5
5547 vpmaddubsw ymm1, ymm1, ymm5
5548 vpsrlw ymm0, ymm0, 7
5549 vpsrlw ymm1, ymm1, 7
5550 vpackuswb ymm0, ymm0, ymm1 // unmutates
5551 vmovdqu [esi + edi], ymm0
// Blend 25 / 75: two rounding averages toward row1.
5560 vmovdqu ymm1, [esi + edx]
5561 vpavgb ymm0, ymm0, ymm1
5562 vpavgb ymm0, ymm0, ymm1
5563 vmovdqu [esi + edi], ymm0
// Blend 50 / 50: single average.
5572 vpavgb ymm0, ymm0, [esi + edx]
5573 vmovdqu [esi + edi], ymm0
// Blend 75 / 25: two rounding averages toward row0.
5582 vmovdqu ymm0, [esi + edx]
5583 vpavgb ymm0, ymm0, ymm1
5584 vpavgb ymm0, ymm0, ymm1
5585 vmovdqu [esi + edi], ymm0
5591 // Blend 100 / 0 - Copy row unchanged.
5602 #endif // HAS_INTERPOLATEROW_AVX2
5604 // Bilinear filter 16x2 -> 16x1
// Bilinear vertical filter (16x2 -> 16x1), SSSE3 pmaddubsw variant of
// InterpolateRow_AVX2: same dispatch to copy / 75-25 / 50-50 / 25-75
// fast paths, general path blends with a packed (low, high) fraction.
// NOTE(review): listing is elided (push/pop, __asm braces, fraction
// arithmetic, pavgb instructions in the fast paths, loop labels, pointer
// advances and epilogue are not visible).
5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5607 ptrdiff_t src_stride, int dst_width,
5608 int source_y_fraction) {
5612 mov edi, [esp + 8 + 4] // dst_ptr
5613 mov esi, [esp + 8 + 8] // src_ptr
5614 mov edx, [esp + 8 + 12] // src_stride
5615 mov ecx, [esp + 8 + 16] // dst_width
5616 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5619 // Dispatch to specialized filters if applicable.
5621 je xloop100 // 0 / 128. Blend 100 / 0.
5623 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
5625 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
5627 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
5629 movd xmm0, eax // high fraction 0..127
5632 movd xmm5, eax // low fraction 128..1
5633 punpcklbw xmm5, xmm0
5634 punpcklwd xmm5, xmm5
5635 pshufd xmm5, xmm5, 0
// General blend loop.
5639 movdqu xmm2, [esi + edx]
5641 punpcklbw xmm0, xmm2
5642 punpckhbw xmm1, xmm2
5643 pmaddubsw xmm0, xmm5
5644 pmaddubsw xmm1, xmm5
5648 movdqu [esi + edi], xmm0
// Blend 25 / 75.
5657 movdqu xmm1, [esi + edx]
5660 movdqu [esi + edi], xmm0
// Blend 50 / 50.
5669 movdqu xmm1, [esi + edx]
5671 movdqu [esi + edi], xmm0
// Blend 75 / 25.
5680 movdqu xmm0, [esi + edx]
5683 movdqu [esi + edi], xmm0
5689 // Blend 100 / 0 - Copy row unchanged.
5692 movdqu [esi + edi], xmm0
5704 #ifdef HAS_INTERPOLATEROW_SSE2
5705 // Bilinear filter 16x2 -> 16x1
// Bilinear vertical filter (16x2 -> 16x1), SSE2 variant: the general path
// computes row0 + ((row1 - row0) * fraction) in 16-bit arithmetic via
// pmulhw instead of SSSE3 pmaddubsw. Fraction granularity is /256 here
// (vs /128 in the SSSE3/AVX2 variants).
// NOTE(review): listing is elided (push/pop, __asm braces, shifts on the
// fraction, pavgb fast-path bodies, loop labels, pointer advances and
// epilogue are not visible).
5707 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
5708 ptrdiff_t src_stride, int dst_width,
5709 int source_y_fraction) {
5713 mov edi, [esp + 8 + 4] // dst_ptr
5714 mov esi, [esp + 8 + 8] // src_ptr
5715 mov edx, [esp + 8 + 12] // src_stride
5716 mov ecx, [esp + 8 + 16] // dst_width
5717 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5719 // Dispatch to specialized filters if applicable.
5721 je xloop100 // 0 / 256. Blend 100 / 0.
5723 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
5725 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
5727 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
5729 movd xmm5, eax // xmm5 = y fraction
// Broadcast the fraction to all 8 words.
5730 punpcklbw xmm5, xmm5
5732 punpcklwd xmm5, xmm5
5733 punpckldq xmm5, xmm5
5734 punpcklqdq xmm5, xmm5
// General blend loop.
5738 movdqu xmm0, [esi] // row0
5739 movdqu xmm2, [esi + edx] // row1
5742 punpcklbw xmm2, xmm4
5743 punpckhbw xmm3, xmm4
5744 punpcklbw xmm0, xmm4
5745 punpckhbw xmm1, xmm4
5746 psubw xmm2, xmm0 // row1 - row0
5748 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
5750 pmulhw xmm2, xmm5 // scale diff
5752 paddw xmm0, xmm2 // sum rows
5755 movdqu [esi + edi], xmm0
// Blend 25 / 75.
5764 movdqu xmm1, [esi + edx]
5767 movdqu [esi + edi], xmm0
// Blend 50 / 50.
5776 movdqu xmm1, [esi + edx]
5778 movdqu [esi + edi], xmm0
// Blend 75 / 25.
5787 movdqu xmm0, [esi + edx]
5790 movdqu [esi + edi], xmm0
5796 // Blend 100 / 0 - Copy row unchanged.
5799 movdqu [esi + edi], xmm0
5810 #endif // HAS_INTERPOLATEROW_SSE2
5812 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 bytes of each ARGB pixel per the 16-byte `shuffler`
// control vector (pshufb), 8 pixels (32 bytes) per iteration. Used for
// BGRA/ABGR/RGBA <-> ARGB conversions.
// NOTE(review): listing is elided (__asm braces, the shuffler load into
// an xmm register, the pshufb/store for the first 16 bytes, loop label
// and epilogue are not visible).
5814 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5815 const uint8* shuffler, int pix) {
5817 mov eax, [esp + 4] // src_argb
5818 mov edx, [esp + 8] // dst_argb
5819 mov ecx, [esp + 12] // shuffler
5821 mov ecx, [esp + 16] // pix
5825 movdqu xmm1, [eax + 16]
5830 movdqu [edx + 16], xmm1
5838 #ifdef HAS_ARGBSHUFFLEROW_AVX2
// AVX2 variant of ARGBShuffleRow: broadcasts the 16-byte shuffler into
// both 128-bit lanes and reorders 16 pixels (64 bytes) per iteration
// with vpshufb.
// NOTE(review): listing is elided (__asm braces, first load/store pair,
// loop label, vzeroupper and epilogue are not visible).
5840 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5841 const uint8* shuffler, int pix) {
5843 mov eax, [esp + 4] // src_argb
5844 mov edx, [esp + 8] // dst_argb
5845 mov ecx, [esp + 12] // shuffler
5846 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
5847 mov ecx, [esp + 16] // pix
5851 vmovdqu ymm1, [eax + 32]
5853 vpshufb ymm0, ymm0, ymm5
5854 vpshufb ymm1, ymm1, ymm5
5856 vmovdqu [edx + 32], ymm1
5865 #endif // HAS_ARGBSHUFFLEROW_AVX2
// SSE2 fallback for ARGBShuffleRow: reads the first 4 shuffler bytes and
// dispatches to a dedicated pshufhw/pshuflw loop for each of the four
// recognized orders (BGRA/RGBA/ARGB-to-RGBA/ABGR), falling back to a
// scalar byte-gather loop for any other shuffler.
// NOTE(review): listing is heavily elided (push/pop, __asm braces, the
// cmp/je dispatch on ebx, per-path loads/stores, loop labels and
// epilogue are not visible); comments cover only the visible
// instructions.
5868 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5869 const uint8* shuffler, int pix) {
5873 mov eax, [esp + 8 + 4] // src_argb
5874 mov edx, [esp + 8 + 8] // dst_argb
5875 mov esi, [esp + 8 + 12] // shuffler
5876 mov ecx, [esp + 8 + 16] // pix
5879 mov ebx, [esi] // shuffler
// Scalar fallback: gather one output pixel byte-by-byte via the
// shuffler indices.
5889 // TODO(fbarchard): Use one source pointer and 3 offsets.
5891 movzx ebx, byte ptr [esi]
5892 movzx ebx, byte ptr [eax + ebx]
5894 movzx ebx, byte ptr [esi + 1]
5895 movzx ebx, byte ptr [eax + ebx]
5897 movzx ebx, byte ptr [esi + 2]
5898 movzx ebx, byte ptr [eax + ebx]
5900 movzx ebx, byte ptr [esi + 3]
5901 movzx ebx, byte ptr [eax + ebx]
// Specialized path: full byte reversal within each pixel.
5913 punpcklbw xmm0, xmm5
5914 punpckhbw xmm1, xmm5
5915 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
5916 pshuflw xmm0, xmm0, 01Bh
5917 pshufhw xmm1, xmm1, 01Bh
5918 pshuflw xmm1, xmm1, 01Bh
// Specialized path: rotate alpha from front to back.
5930 punpcklbw xmm0, xmm5
5931 punpckhbw xmm1, xmm5
5932 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
5933 pshuflw xmm0, xmm0, 039h
5934 pshufhw xmm1, xmm1, 039h
5935 pshuflw xmm1, xmm1, 039h
// Specialized path: rotate alpha from back to front.
5947 punpcklbw xmm0, xmm5
5948 punpckhbw xmm1, xmm5
5949 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
5950 pshuflw xmm0, xmm0, 093h
5951 pshufhw xmm1, xmm1, 093h
5952 pshuflw xmm1, xmm1, 093h
// Specialized path: swap R and B, keep G and A.
5964 punpcklbw xmm0, xmm5
5965 punpckhbw xmm1, xmm5
5966 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
5967 pshuflw xmm0, xmm0, 0C6h
5968 pshufhw xmm1, xmm1, 0C6h
5969 pshuflw xmm1, xmm1, 0C6h
5983 // YUY2 - Macro-pixel = 2 image pixels
5984 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5986 // UYVY - Macro-pixel = 2 image pixels
// Packs planar I422 (separate Y, U, V planes) into interleaved YUY2
// (Y0 U0 Y1 V0 ...), 16 Y values -> 32 output bytes per iteration.
// src_v is converted to an esi-relative offset (edx) in elided lines,
// hence the [esi + edx] addressing for V.
// NOTE(review): listing is elided (push/pop, __asm braces, the `src_u`/
// `src_v` parameter lines, pointer-to-offset subtraction, first store,
// loop label and epilogue are not visible).
5990 void I422ToYUY2Row_SSE2(const uint8* src_y,
5993 uint8* dst_frame, int width) {
5997 mov eax, [esp + 8 + 4] // src_y
5998 mov esi, [esp + 8 + 8] // src_u
5999 mov edx, [esp + 8 + 12] // src_v
6000 mov edi, [esp + 8 + 16] // dst_frame
6001 mov ecx, [esp + 8 + 20] // width
6005 movq xmm2, qword ptr [esi] // U
6006 movq xmm3, qword ptr [esi + edx] // V
6008 punpcklbw xmm2, xmm3 // UV
6009 movdqu xmm0, [eax] // Y
6012 punpcklbw xmm0, xmm2 // YUYV
6013 punpckhbw xmm1, xmm2
6015 movdqu [edi + 16], xmm1
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...); mirror of
// I422ToYUY2Row_SSE2 with the UV bytes leading each pair.
// NOTE(review): listing is elided (push/pop, __asm braces, the `src_u`/
// `src_v` parameter lines, pointer-to-offset subtraction, first store,
// loop label and epilogue are not visible).
6027 void I422ToUYVYRow_SSE2(const uint8* src_y,
6030 uint8* dst_frame, int width) {
6034 mov eax, [esp + 8 + 4] // src_y
6035 mov esi, [esp + 8 + 8] // src_u
6036 mov edx, [esp + 8 + 12] // src_v
6037 mov edi, [esp + 8 + 16] // dst_frame
6038 mov ecx, [esp + 8 + 20] // width
6042 movq xmm2, qword ptr [esi] // U
6043 movq xmm3, qword ptr [esi + edx] // V
6045 punpcklbw xmm2, xmm3 // UV
6046 movdqu xmm0, [eax] // Y
6049 punpcklbw xmm1, xmm0 // UYVY
6050 punpckhbw xmm2, xmm0
6052 movdqu [edi + 16], xmm2
6063 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial C0 + C1*X + C2*X^2 + C3*X^3 to each channel
// of ARGB pixels, 2 pixels per iteration, in single-precision float.
// poly points to 4 coefficient vectors of 4 floats each (C0..C3).
// NOTE(review): listing is elided (push/pop of esi, __asm braces, the
// remaining parameter line, the xmm4 (pixel 1) half of several steps,
// pack instructions, loop label and epilogue are not visible).
6065 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
6066 uint8* dst_argb, const float* poly,
6070 mov eax, [esp + 4 + 4] /* src_argb */
6071 mov edx, [esp + 4 + 8] /* dst_argb */
6072 mov esi, [esp + 4 + 12] /* poly */
6073 mov ecx, [esp + 4 + 16] /* width */
6074 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
6078 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
6079 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
6080 movq xmm0, qword ptr [eax] // BGRABGRA
6082 punpcklbw xmm0, xmm3
6084 punpcklwd xmm0, xmm3 // pixel 0
6085 punpckhwd xmm4, xmm3 // pixel 1
6086 cvtdq2ps xmm0, xmm0 // 4 floats
6088 movdqa xmm1, xmm0 // X
6090 mulps xmm0, [esi + 16] // C1 * X
6091 mulps xmm4, [esi + 16]
6092 addps xmm0, [esi] // result = C0 + C1 * X
6096 mulps xmm2, xmm1 // X * X
6098 mulps xmm1, xmm2 // X * X * X
6100 mulps xmm2, [esi + 32] // C2 * X * X
6101 mulps xmm6, [esi + 32]
6102 mulps xmm1, [esi + 48] // C3 * X * X * X
6103 mulps xmm5, [esi + 48]
6104 addps xmm0, xmm2 // result += C2 * X * X
6106 addps xmm0, xmm1 // result += C3 * X * X * X
6108 cvttps2dq xmm0, xmm0
6109 cvttps2dq xmm4, xmm4
6112 movq qword ptr [edx], xmm0
6120 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
6122 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 variant of ARGBPolynomialRow: evaluates the cubic with two
// vfmadd instructions over 2 pixels (8 floats) per iteration, with all
// four coefficient vectors broadcast once outside the loop.
// NOTE(review): listing is elided (the remaining parameter line, __asm
// braces, loop label, pointer advances, vzeroupper and epilogue are not
// visible).
6124 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
6125 uint8* dst_argb, const float* poly,
6128 mov eax, [esp + 4] /* src_argb */
6129 mov edx, [esp + 8] /* dst_argb */
6130 mov ecx, [esp + 12] /* poly */
6131 vbroadcastf128 ymm4, [ecx] // C0
6132 vbroadcastf128 ymm5, [ecx + 16] // C1
6133 vbroadcastf128 ymm6, [ecx + 32] // C2
6134 vbroadcastf128 ymm7, [ecx + 48] // C3
6135 mov ecx, [esp + 16] /* width */
// 2 pixel loop.
6139 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
6141 vcvtdq2ps ymm0, ymm0 // X 8 floats
6142 vmulps ymm2, ymm0, ymm0 // X * X
6143 vmulps ymm3, ymm0, ymm7 // C3 * X
6144 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
6145 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
6146 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
6147 vcvttps2dq ymm0, ymm0
6148 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
6149 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
6150 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
6151 vmovq qword ptr [edx], xmm0
6159 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
6161 #ifdef HAS_ARGBCOLORTABLEROW_X86
6162 // Transform ARGB pixels with color table.
// Remaps each ARGB channel in place through a 256-entry-per-channel
// lookup table (table_argb is interleaved BGRA, hence the *4 stride and
// +0/+1/+2/+3 lane offsets). Scalar x86, one pixel per iteration; eax is
// pre-advanced in an elided line, so reads/writes use [eax - 4 + lane].
// NOTE(review): listing is elided (push/pop of esi, __asm braces, the
// `int width` parameter line, the eax advance, loop label and epilogue
// are not visible).
6164 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
6168 mov eax, [esp + 4 + 4] /* dst_argb */
6169 mov esi, [esp + 4 + 8] /* table_argb */
6170 mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop: B, G, R, A lanes each indexed into its own table stripe.
6174 movzx edx, byte ptr [eax]
6176 movzx edx, byte ptr [esi + edx * 4]
6177 mov byte ptr [eax - 4], dl
6178 movzx edx, byte ptr [eax - 4 + 1]
6179 movzx edx, byte ptr [esi + edx * 4 + 1]
6180 mov byte ptr [eax - 4 + 1], dl
6181 movzx edx, byte ptr [eax - 4 + 2]
6182 movzx edx, byte ptr [esi + edx * 4 + 2]
6183 mov byte ptr [eax - 4 + 2], dl
6184 movzx edx, byte ptr [eax - 4 + 3]
6185 movzx edx, byte ptr [esi + edx * 4 + 3]
6186 mov byte ptr [eax - 4 + 3], dl
6193 #endif // HAS_ARGBCOLORTABLEROW_X86
6195 #ifdef HAS_RGBCOLORTABLEROW_X86
6196 // Transform RGB pixels with color table.
// Same per-channel table remap as ARGBColorTableRow_X86 but only for the
// B, G and R lanes — the alpha byte (+3) is left untouched. Scalar x86,
// one pixel per iteration with a pre-advanced eax.
// NOTE(review): listing is elided (push/pop of esi, __asm braces, the
// eax advance, loop label and epilogue are not visible).
6198 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
6201 mov eax, [esp + 4 + 4] /* dst_argb */
6202 mov esi, [esp + 4 + 8] /* table_argb */
6203 mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop: remap B, G, R; skip alpha.
6207 movzx edx, byte ptr [eax]
6209 movzx edx, byte ptr [esi + edx * 4]
6210 mov byte ptr [eax - 4], dl
6211 movzx edx, byte ptr [eax - 4 + 1]
6212 movzx edx, byte ptr [esi + edx * 4 + 1]
6213 mov byte ptr [eax - 4 + 1], dl
6214 movzx edx, byte ptr [eax - 4 + 2]
6215 movzx edx, byte ptr [esi + edx * 4 + 2]
6216 mov byte ptr [eax - 4 + 2], dl
6224 #endif // HAS_RGBCOLORTABLEROW_X86
6226 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6227 // Transform RGB pixels with luma table.
// Remaps B/G/R of each ARGB pixel through a luma-dependent table: a
// per-pixel luma (pmaddubsw dot product with lumacoeff) selects which
// table row to use, the table row pointer is rotated into esi per pixel
// (elided movd/pextrw), then each color byte is looked up; alpha is
// copied through unchanged. 4 pixels per visible group.
// NOTE(review): listing is heavily elided (push/pop, __asm braces, the
// `int width` parameter line, extraction of the table pointer into esi,
// pointer advances, loop label and epilogue are not visible); comments
// cover only the visible instructions.
6229 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6231 const uint8* luma, uint32 lumacoeff) {
6235 mov eax, [esp + 8 + 4] /* src_argb */
6236 mov edi, [esp + 8 + 8] /* dst_argb */
6237 mov ecx, [esp + 8 + 12] /* width */
6238 movd xmm2, dword ptr [esp + 8 + 16] // luma table
6239 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
6240 pshufd xmm2, xmm2, 0
6241 pshufd xmm3, xmm3, 0
6242 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
// 4 pixel loop: build 4 luma-table pointers in xmm0.
6248 movdqu xmm0, qword ptr [eax] // generate luma ptr
6249 pmaddubsw xmm0, xmm3
6251 pand xmm0, xmm4 // mask out low bits
6252 punpcklwd xmm0, xmm5
6253 paddd xmm0, xmm2 // add table base
// Pixel 0.
6255 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6257 movzx edx, byte ptr [eax]
6258 movzx edx, byte ptr [esi + edx]
6259 mov byte ptr [edi], dl
6260 movzx edx, byte ptr [eax + 1]
6261 movzx edx, byte ptr [esi + edx]
6262 mov byte ptr [edi + 1], dl
6263 movzx edx, byte ptr [eax + 2]
6264 movzx edx, byte ptr [esi + edx]
6265 mov byte ptr [edi + 2], dl
6266 movzx edx, byte ptr [eax + 3] // copy alpha.
6267 mov byte ptr [edi + 3], dl
// Pixel 1.
6270 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6272 movzx edx, byte ptr [eax + 4]
6273 movzx edx, byte ptr [esi + edx]
6274 mov byte ptr [edi + 4], dl
6275 movzx edx, byte ptr [eax + 5]
6276 movzx edx, byte ptr [esi + edx]
6277 mov byte ptr [edi + 5], dl
6278 movzx edx, byte ptr [eax + 6]
6279 movzx edx, byte ptr [esi + edx]
6280 mov byte ptr [edi + 6], dl
6281 movzx edx, byte ptr [eax + 7] // copy alpha.
6282 mov byte ptr [edi + 7], dl
// Pixel 2.
6285 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6287 movzx edx, byte ptr [eax + 8]
6288 movzx edx, byte ptr [esi + edx]
6289 mov byte ptr [edi + 8], dl
6290 movzx edx, byte ptr [eax + 9]
6291 movzx edx, byte ptr [esi + edx]
6292 mov byte ptr [edi + 9], dl
6293 movzx edx, byte ptr [eax + 10]
6294 movzx edx, byte ptr [esi + edx]
6295 mov byte ptr [edi + 10], dl
6296 movzx edx, byte ptr [eax + 11] // copy alpha.
6297 mov byte ptr [edi + 11], dl
// Pixel 3.
6301 movzx edx, byte ptr [eax + 12]
6302 movzx edx, byte ptr [esi + edx]
6303 mov byte ptr [edi + 12], dl
6304 movzx edx, byte ptr [eax + 13]
6305 movzx edx, byte ptr [esi + edx]
6306 mov byte ptr [edi + 13], dl
6307 movzx edx, byte ptr [eax + 14]
6308 movzx edx, byte ptr [esi + edx]
6309 mov byte ptr [edi + 14], dl
6310 movzx edx, byte ptr [eax + 15] // copy alpha.
6311 mov byte ptr [edi + 15], dl
6323 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6325 #endif // defined(_M_X64)
6326 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6330 } // namespace libyuv