3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree.
12 #include "libyuv/row.h"
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Luma coefficients in memory (B,G,R,0) order, repeated 4x to fill a 16-byte
// vector consumed by pmaddubsw. 7-bit fixed point — the row functions below
// shift the pmaddubsw/phaddw result right by 7 (see "psrlw $0x7").
// kARGBToY:  13/65/33 over 128 ~ 0.101B + 0.508G + 0.258R (studio swing).
// kARGBToYJ: 15/75/38 over 128 ~ 0.117B + 0.586G + 0.297R (full-range "J").
// NOTE(review): the closing "};" of each initializer is not visible in this
// extract — lines appear to have been dropped during extraction.
25 static vec8 kARGBToY = {
26 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
30 static vec8 kARGBToYJ = {
31 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
33 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
35 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
// Chroma (U/V) coefficients in memory (B,G,R,0) order, repeated 4x for
// pmaddubsw; signed values, /256 scale (results are shifted right by 8 in
// the UV row functions below). "J" variants are full-range JPEG versions.
37 static vec8 kARGBToU = {
38 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
41 static vec8 kARGBToUJ = {
42 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
45 static vec8 kARGBToV = {
46 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
49 static vec8 kARGBToVJ = {
50 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
// Same Y/U/V coefficients permuted for the BGRA byte order (A,R,G,B in mem).
54 static vec8 kBGRAToY = {
55 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
58 static vec8 kBGRAToU = {
59 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
62 static vec8 kBGRAToV = {
63 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
// Coefficients permuted for the ABGR byte order (R,G,B,A in memory).
67 static vec8 kABGRToY = {
68 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
71 static vec8 kABGRToU = {
72 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
75 static vec8 kABGRToV = {
76 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
79 // Constants for RGBA.
80 static vec8 kRGBAToY = {
81 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
84 static vec8 kRGBAToU = {
85 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
88 static vec8 kRGBAToV = {
89 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
// +16 bias added to packed Y bytes (studio-swing black level).
92 static uvec8 kAddY16 = {
93 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
96 // 7 bit fixed point 0.5.
97 static vec16 kAddYJ64 = {
98 64, 64, 64, 64, 64, 64, 64, 64
// +128 bias to recenter signed U/V results into the 0..255 byte range.
101 static uvec8 kAddUV128 = {
102 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
103 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
// Word constant 0x8080 = 128.5 in 8-bit fixed point: bias + rounding added
// to 16-bit U/V sums before the arithmetic shift right by 8 (UVJ path).
106 static uvec16 kAddUVJ128 = {
107 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
109 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
111 #ifdef HAS_RGB24TOARGBROW_SSSE3
// pshufb control tables. Index values select source bytes; 128u (bit 7 set)
// zeroes the destination byte, per the PSHUFB instruction definition.
113 // Shuffle table for converting RGB24 to ARGB.
114 static uvec8 kShuffleMaskRGB24ToARGB = {
115 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
118 // Shuffle table for converting RAW to ARGB.
119 static uvec8 kShuffleMaskRAWToARGB = {
120 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
123 // Shuffle table for converting ARGB to RGB24.
124 static uvec8 kShuffleMaskARGBToRGB24 = {
125 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
128 // Shuffle table for converting ARGB to RAW.
129 static uvec8 kShuffleMaskARGBToRAW = {
130 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
133 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
134 static uvec8 kShuffleMaskARGBToRGB24_0 = {
135 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
138 // Shuffle table for converting ARGB to RAW.
// NOTE(review): the comment above looks copy-pasted from kShuffleMaskARGBToRAW;
// this _0 variant mirrors kShuffleMaskARGBToRGB24_0 (first 8 bytes + next 4,
// with a zeroed gap) for the interleaved I422-to-RAW path.
139 static uvec8 kShuffleMaskARGBToRAW_0 = {
140 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142 #endif // HAS_RGB24TOARGBROW_SSSE3
144 #if defined(TESTING) && defined(__x86_64__)
145 void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
// Test-only scaffold (compiled only under TESTING on x64). The mov/lea runs
// below exercise 32-bit-destination encodings of every 64-bit register; the
// tail (movq/movdqu) copies 8 bytes per iteration from %0 to %1.
// NOTE(review): the "asm volatile (" opener, the loop label, the pix counter
// update/branch, the closing ");" and "}" are missing from this extract.
159 "mov %%r10d,%%r10d \n"
160 "mov %%r11d,%%r11d \n"
161 "mov %%r12d,%%r12d \n"
162 "mov %%r13d,%%r13d \n"
163 "mov %%r14d,%%r14d \n"
164 "mov %%r15d,%%r15d \n"
166 "lea (%%rax),%%eax \n"
167 "lea (%%rbx),%%ebx \n"
168 "lea (%%rcx),%%ecx \n"
169 "lea (%%rdx),%%edx \n"
170 "lea (%%rsi),%%esi \n"
171 "lea (%%rdi),%%edi \n"
172 "lea (%%rbp),%%ebp \n"
173 "lea (%%rsp),%%esp \n"
175 "lea (%%r8),%%r8d \n"
176 "lea (%%r9),%%r9d \n"
177 "lea (%%r10),%%r10d \n"
178 "lea (%%r11),%%r11d \n"
179 "lea (%%r12),%%r12d \n"
180 "lea (%%r13),%%r13d \n"
181 "lea (%%r14),%%r14d \n"
182 "lea (%%r15),%%r15d \n"
185 "lea 0x10(%%rax),%%eax \n"
186 "lea 0x10(%%rbx),%%ebx \n"
187 "lea 0x10(%%rcx),%%ecx \n"
188 "lea 0x10(%%rdx),%%edx \n"
189 "lea 0x10(%%rsi),%%esi \n"
190 "lea 0x10(%%rdi),%%edi \n"
191 "lea 0x10(%%rbp),%%ebp \n"
192 "lea 0x10(%%rsp),%%esp \n"
194 "lea 0x10(%%r8),%%r8d \n"
195 "lea 0x10(%%r9),%%r9d \n"
196 "lea 0x10(%%r10),%%r10d \n"
197 "lea 0x10(%%r11),%%r11d \n"
198 "lea 0x10(%%r12),%%r12d \n"
199 "lea 0x10(%%r13),%%r13d \n"
200 "lea 0x10(%%r14),%%r14d \n"
201 "lea 0x10(%%r15),%%r15d \n"
// Copy loop body: load 8 bytes of Y, advance src by 8, store, advance dst.
224 "movq " MEMACCESS(0) ",%%xmm0 \n"
225 "lea " MEMLEA(0x8,0) ",%0 \n"
226 "movdqu %%xmm0," MEMACCESS(1) " \n"
227 "lea " MEMLEA(0x20,1) ",%1 \n"
231 "+r"(dst_argb), // %1
234 : "memory", "cc", "xmm0", "xmm1", "xmm5"
239 #ifdef HAS_J400TOARGBROW_SSE2
// Expand 8 grey (J400) samples per iteration into 8 ARGB pixels:
// each Y byte is replicated into B, G and R, and alpha is forced to 0xff.
240 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
// xmm5 = 0xff000000 in each dword: all-ones (pcmpeqb) shifted left 24.
242 "pcmpeqb %%xmm5,%%xmm5 \n"
243 "pslld $0x18,%%xmm5 \n"
// Loop body: read 8 Y bytes, duplicate bytes (punpcklbw) then words
// (punpcklwd/punpckhwd) to get YYY0 dwords, OR in the alpha mask, and
// store 32 bytes of ARGB. NOTE(review): loop label/counter/close missing
// from this extract.
246 "movq " MEMACCESS(0) ",%%xmm0 \n"
247 "lea " MEMLEA(0x8,0) ",%0 \n"
248 "punpcklbw %%xmm0,%%xmm0 \n"
249 "movdqa %%xmm0,%%xmm1 \n"
250 "punpcklwd %%xmm0,%%xmm0 \n"
251 "punpckhwd %%xmm1,%%xmm1 \n"
252 "por %%xmm5,%%xmm0 \n"
253 "por %%xmm5,%%xmm1 \n"
254 "movdqu %%xmm0," MEMACCESS(1) " \n"
255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
256 "lea " MEMLEA(0x20,1) ",%1 \n"
260 "+r"(dst_argb), // %1
262 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
265 #endif // HAS_J400TOARGBROW_SSE2
267 #ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert 16 RGB24 pixels (48 bytes) per iteration to 16 ARGB pixels
// (64 bytes): palignr realigns the packed 3-byte triples across the three
// source vectors, pshufb (kShuffleMaskRGB24ToARGB) expands each triple to
// B,G,R,0, and por with xmm5 sets alpha to 0xff.
268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
271 "pslld $0x18,%%xmm5 \n"
272 "movdqa %3,%%xmm4 \n"
// NOTE(review): loop label, pix counter update and asm close are missing
// from this extract.
275 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
276 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
277 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
278 "lea " MEMLEA(0x30,0) ",%0 \n"
279 "movdqa %%xmm3,%%xmm2 \n"
280 "palignr $0x8,%%xmm1,%%xmm2 \n"
281 "pshufb %%xmm4,%%xmm2 \n"
282 "por %%xmm5,%%xmm2 \n"
283 "palignr $0xc,%%xmm0,%%xmm1 \n"
284 "pshufb %%xmm4,%%xmm0 \n"
285 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
286 "por %%xmm5,%%xmm0 \n"
287 "pshufb %%xmm4,%%xmm1 \n"
288 "movdqu %%xmm0," MEMACCESS(1) " \n"
289 "por %%xmm5,%%xmm1 \n"
290 "palignr $0x4,%%xmm3,%%xmm3 \n"
291 "pshufb %%xmm4,%%xmm3 \n"
292 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
293 "por %%xmm5,%%xmm3 \n"
294 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
295 "lea " MEMLEA(0x40,1) ",%1 \n"
298 : "+r"(src_rgb24), // %0
299 "+r"(dst_argb), // %1
301 : "m"(kShuffleMaskRGB24ToARGB) // %3
302 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 16 RAW (RGB byte order) pixels per iteration to 16 ARGB pixels.
// Identical structure to RGB24ToARGBRow_SSSE3 above; only the pshufb table
// (kShuffleMaskRAWToARGB) differs, swapping R and B during expansion.
306 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
308 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
309 "pslld $0x18,%%xmm5 \n"
310 "movdqa %3,%%xmm4 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
313 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
314 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
315 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
316 "lea " MEMLEA(0x30,0) ",%0 \n"
317 "movdqa %%xmm3,%%xmm2 \n"
318 "palignr $0x8,%%xmm1,%%xmm2 \n"
319 "pshufb %%xmm4,%%xmm2 \n"
320 "por %%xmm5,%%xmm2 \n"
321 "palignr $0xc,%%xmm0,%%xmm1 \n"
322 "pshufb %%xmm4,%%xmm0 \n"
323 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
324 "por %%xmm5,%%xmm0 \n"
325 "pshufb %%xmm4,%%xmm1 \n"
326 "movdqu %%xmm0," MEMACCESS(1) " \n"
327 "por %%xmm5,%%xmm1 \n"
328 "palignr $0x4,%%xmm3,%%xmm3 \n"
329 "pshufb %%xmm4,%%xmm3 \n"
330 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
331 "por %%xmm5,%%xmm3 \n"
332 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
333 "lea " MEMLEA(0x40,1) ",%1 \n"
336 : "+r"(src_raw), // %0
337 "+r"(dst_argb), // %1
339 : "m"(kShuffleMaskRAWToARGB) // %3
340 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 8 RGB565 pixels (16 bytes) per iteration to 8 ARGB pixels.
// Channel expansion uses pmulhuw with replication multipliers so the top
// bits of each 5/6-bit field are copied into the low bits of the 8-bit
// result; alpha is forced to 0xff via the xmm7 mask.
344 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
// xmm5 = 0x0108 per word (5-bit expand), xmm6 = 0x2080 per word (6-bit
// expand), xmm3 = red-field mask (0xf800 words), xmm4 = green-field mask,
// xmm7 = 0xff00 words (alpha).
346 "mov $0x1080108,%%eax \n"
347 "movd %%eax,%%xmm5 \n"
348 "pshufd $0x0,%%xmm5,%%xmm5 \n"
349 "mov $0x20802080,%%eax \n"
350 "movd %%eax,%%xmm6 \n"
351 "pshufd $0x0,%%xmm6,%%xmm6 \n"
352 "pcmpeqb %%xmm3,%%xmm3 \n"
353 "psllw $0xb,%%xmm3 \n"
354 "pcmpeqb %%xmm4,%%xmm4 \n"
355 "psllw $0xa,%%xmm4 \n"
356 "psrlw $0x5,%%xmm4 \n"
357 "pcmpeqb %%xmm7,%%xmm7 \n"
358 "psllw $0x8,%%xmm7 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
// Stores use (%1,%0,2): dst is indexed at twice the (advancing) src offset,
// since each 2-byte pixel becomes 4 bytes.
363 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
364 "movdqa %%xmm0,%%xmm1 \n"
365 "movdqa %%xmm0,%%xmm2 \n"
366 "pand %%xmm3,%%xmm1 \n"
367 "psllw $0xb,%%xmm2 \n"
368 "pmulhuw %%xmm5,%%xmm1 \n"
369 "pmulhuw %%xmm5,%%xmm2 \n"
370 "psllw $0x8,%%xmm1 \n"
371 "por %%xmm2,%%xmm1 \n"
372 "pand %%xmm4,%%xmm0 \n"
373 "pmulhuw %%xmm6,%%xmm0 \n"
374 "por %%xmm7,%%xmm0 \n"
375 "movdqa %%xmm1,%%xmm2 \n"
376 "punpcklbw %%xmm0,%%xmm1 \n"
377 "punpckhbw %%xmm0,%%xmm2 \n"
378 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
379 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
380 "lea " MEMLEA(0x10,0) ",%0 \n"
387 : "memory", "cc", "eax", NACL_R14
388 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Convert 8 ARGB1555 pixels (16 bytes) per iteration to 8 ARGB pixels.
// The three 5-bit channels are expanded to 8 bits via pmulhuw replication
// multipliers; the 1-bit alpha is sign-extended (psraw $0x8) to 0x00/0xff.
392 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
// xmm5 = 0x0108 words (5->8 bit expand), xmm6 = 0x4200 words (alpha-bit
// expand), xmm3 = 0xf800-word mask, xmm4 = xmm3 >> 6 (green mask),
// xmm7 = 0xff00 words (alpha byte select).
394 "mov $0x1080108,%%eax \n"
395 "movd %%eax,%%xmm5 \n"
396 "pshufd $0x0,%%xmm5,%%xmm5 \n"
397 "mov $0x42004200,%%eax \n"
398 "movd %%eax,%%xmm6 \n"
399 "pshufd $0x0,%%xmm6,%%xmm6 \n"
400 "pcmpeqb %%xmm3,%%xmm3 \n"
401 "psllw $0xb,%%xmm3 \n"
402 "movdqa %%xmm3,%%xmm4 \n"
403 "psrlw $0x6,%%xmm4 \n"
404 "pcmpeqb %%xmm7,%%xmm7 \n"
405 "psllw $0x8,%%xmm7 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
410 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
411 "movdqa %%xmm0,%%xmm1 \n"
412 "movdqa %%xmm0,%%xmm2 \n"
413 "psllw $0x1,%%xmm1 \n"
414 "psllw $0xb,%%xmm2 \n"
415 "pand %%xmm3,%%xmm1 \n"
416 "pmulhuw %%xmm5,%%xmm2 \n"
417 "pmulhuw %%xmm5,%%xmm1 \n"
418 "psllw $0x8,%%xmm1 \n"
419 "por %%xmm2,%%xmm1 \n"
420 "movdqa %%xmm0,%%xmm2 \n"
421 "pand %%xmm4,%%xmm0 \n"
422 "psraw $0x8,%%xmm2 \n"
423 "pmulhuw %%xmm6,%%xmm0 \n"
424 "pand %%xmm7,%%xmm2 \n"
425 "por %%xmm2,%%xmm0 \n"
426 "movdqa %%xmm1,%%xmm2 \n"
427 "punpcklbw %%xmm0,%%xmm1 \n"
428 "punpckhbw %%xmm0,%%xmm2 \n"
429 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
430 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
431 "lea " MEMLEA(0x10,0) ",%0 \n"
438 : "memory", "cc", "eax", NACL_R14
439 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Convert 8 ARGB4444 pixels (16 bytes) per iteration to 8 ARGB pixels.
// Each 4-bit nibble is duplicated into both halves of its byte
// (x | x<<4 for low nibbles, x | x>>4 for high nibbles).
443 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
// xmm4 = 0x0f0f... (low-nibble mask), xmm5 = 0xf0f0... (high-nibble mask).
445 "mov $0xf0f0f0f,%%eax \n"
446 "movd %%eax,%%xmm4 \n"
447 "pshufd $0x0,%%xmm4,%%xmm4 \n"
448 "movdqa %%xmm4,%%xmm5 \n"
449 "pslld $0x4,%%xmm5 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
454 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
455 "movdqa %%xmm0,%%xmm2 \n"
456 "pand %%xmm4,%%xmm0 \n"
457 "pand %%xmm5,%%xmm2 \n"
458 "movdqa %%xmm0,%%xmm1 \n"
459 "movdqa %%xmm2,%%xmm3 \n"
460 "psllw $0x4,%%xmm1 \n"
461 "psrlw $0x4,%%xmm3 \n"
462 "por %%xmm1,%%xmm0 \n"
463 "por %%xmm3,%%xmm2 \n"
464 "movdqa %%xmm0,%%xmm1 \n"
465 "punpcklbw %%xmm2,%%xmm0 \n"
466 "punpckhbw %%xmm2,%%xmm1 \n"
467 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
468 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
469 "lea " MEMLEA(0x10,0) ",%0 \n"
476 : "memory", "cc", "eax", NACL_R14
477 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 16 ARGB pixels (64 bytes) per iteration to 16 RGB24 pixels
// (48 bytes): pshufb drops the alpha byte from each vector, then shifted
// ORs (pslldq/psrldq) splice the four 12-byte results into three 16-byte
// stores.
481 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
483 "movdqa %3,%%xmm6 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
486 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
487 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
488 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
489 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
490 "lea " MEMLEA(0x40,0) ",%0 \n"
491 "pshufb %%xmm6,%%xmm0 \n"
492 "pshufb %%xmm6,%%xmm1 \n"
493 "pshufb %%xmm6,%%xmm2 \n"
494 "pshufb %%xmm6,%%xmm3 \n"
495 "movdqa %%xmm1,%%xmm4 \n"
496 "psrldq $0x4,%%xmm1 \n"
497 "pslldq $0xc,%%xmm4 \n"
498 "movdqa %%xmm2,%%xmm5 \n"
499 "por %%xmm4,%%xmm0 \n"
500 "pslldq $0x8,%%xmm5 \n"
501 "movdqu %%xmm0," MEMACCESS(1) " \n"
502 "por %%xmm5,%%xmm1 \n"
503 "psrldq $0x8,%%xmm2 \n"
504 "pslldq $0x4,%%xmm3 \n"
505 "por %%xmm3,%%xmm2 \n"
506 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
507 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
508 "lea " MEMLEA(0x30,1) ",%1 \n"
514 : "m"(kShuffleMaskARGBToRGB24) // %3
515 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
// Convert 16 ARGB pixels per iteration to 16 RAW (RGB order) pixels.
// Identical structure to ARGBToRGB24Row_SSSE3 above; only the pshufb table
// (kShuffleMaskARGBToRAW) differs, swapping R and B while dropping alpha.
519 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
521 "movdqa %3,%%xmm6 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
524 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
525 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
526 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
527 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
528 "lea " MEMLEA(0x40,0) ",%0 \n"
529 "pshufb %%xmm6,%%xmm0 \n"
530 "pshufb %%xmm6,%%xmm1 \n"
531 "pshufb %%xmm6,%%xmm2 \n"
532 "pshufb %%xmm6,%%xmm3 \n"
533 "movdqa %%xmm1,%%xmm4 \n"
534 "psrldq $0x4,%%xmm1 \n"
535 "pslldq $0xc,%%xmm4 \n"
536 "movdqa %%xmm2,%%xmm5 \n"
537 "por %%xmm4,%%xmm0 \n"
538 "pslldq $0x8,%%xmm5 \n"
539 "movdqu %%xmm0," MEMACCESS(1) " \n"
540 "por %%xmm5,%%xmm1 \n"
541 "psrldq $0x8,%%xmm2 \n"
542 "pslldq $0x4,%%xmm3 \n"
543 "por %%xmm3,%%xmm2 \n"
544 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
545 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
546 "lea " MEMLEA(0x30,1) ",%1 \n"
552 : "m"(kShuffleMaskARGBToRAW) // %3
553 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 RGB565 pixels
// (8 bytes): shift each channel into its 5/6/5 field position, mask,
// OR together, and narrow dwords to words with packssdw.
557 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
// xmm3 = 0x1f dword mask (blue), xmm4 = 0x7e0 (green), xmm5 = 0xf800 (red).
559 "pcmpeqb %%xmm3,%%xmm3 \n"
560 "psrld $0x1b,%%xmm3 \n"
561 "pcmpeqb %%xmm4,%%xmm4 \n"
562 "psrld $0x1a,%%xmm4 \n"
563 "pslld $0x5,%%xmm4 \n"
564 "pcmpeqb %%xmm5,%%xmm5 \n"
565 "pslld $0xb,%%xmm5 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
568 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
569 "movdqa %%xmm0,%%xmm1 \n"
570 "movdqa %%xmm0,%%xmm2 \n"
571 "pslld $0x8,%%xmm0 \n"
572 "psrld $0x3,%%xmm1 \n"
573 "psrld $0x5,%%xmm2 \n"
574 "psrad $0x10,%%xmm0 \n"
575 "pand %%xmm3,%%xmm1 \n"
576 "pand %%xmm4,%%xmm2 \n"
577 "pand %%xmm5,%%xmm0 \n"
578 "por %%xmm2,%%xmm1 \n"
579 "por %%xmm1,%%xmm0 \n"
580 "packssdw %%xmm0,%%xmm0 \n"
581 "lea " MEMLEA(0x10,0) ",%0 \n"
582 "movq %%xmm0," MEMACCESS(1) " \n"
583 "lea " MEMLEA(0x8,1) ",%1 \n"
589 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 ARGB1555 pixels
// (8 bytes): each channel is shifted into its 1:5:5:5 field, masked and
// ORed; psrad $0x10 propagates the alpha top bit into bit 15.
593 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
// xmm4 = 0x1f (blue), xmm5 = blue<<5 (green), xmm6 = blue<<10 (red),
// xmm7 = 0x8000 (alpha bit).
595 "pcmpeqb %%xmm4,%%xmm4 \n"
596 "psrld $0x1b,%%xmm4 \n"
597 "movdqa %%xmm4,%%xmm5 \n"
598 "pslld $0x5,%%xmm5 \n"
599 "movdqa %%xmm4,%%xmm6 \n"
600 "pslld $0xa,%%xmm6 \n"
601 "pcmpeqb %%xmm7,%%xmm7 \n"
602 "pslld $0xf,%%xmm7 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
605 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "movdqa %%xmm0,%%xmm2 \n"
608 "movdqa %%xmm0,%%xmm3 \n"
609 "psrad $0x10,%%xmm0 \n"
610 "psrld $0x3,%%xmm1 \n"
611 "psrld $0x6,%%xmm2 \n"
612 "psrld $0x9,%%xmm3 \n"
613 "pand %%xmm7,%%xmm0 \n"
614 "pand %%xmm4,%%xmm1 \n"
615 "pand %%xmm5,%%xmm2 \n"
616 "pand %%xmm6,%%xmm3 \n"
617 "por %%xmm1,%%xmm0 \n"
618 "por %%xmm3,%%xmm2 \n"
619 "por %%xmm2,%%xmm0 \n"
620 "packssdw %%xmm0,%%xmm0 \n"
621 "lea " MEMLEA(0x10,0) ",%0 \n"
622 "movq %%xmm0," MEMACCESS(1) " \n"
623 "lea " MEMLEA(0x8,1) ",%1 \n"
630 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 ARGB4444 pixels
// (8 bytes): keep the high nibble of each byte, shift pairs together and
// narrow with packuswb.
634 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
// xmm4 = 0xf000 word mask, xmm3 = 0x00f0 word mask.
636 "pcmpeqb %%xmm4,%%xmm4 \n"
637 "psllw $0xc,%%xmm4 \n"
638 "movdqa %%xmm4,%%xmm3 \n"
639 "psrlw $0x8,%%xmm3 \n"
// NOTE(review): loop label, counter update and asm close not visible here.
642 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
643 "movdqa %%xmm0,%%xmm1 \n"
644 "pand %%xmm3,%%xmm0 \n"
645 "pand %%xmm4,%%xmm1 \n"
646 "psrlq $0x4,%%xmm0 \n"
647 "psrlq $0x8,%%xmm1 \n"
648 "por %%xmm1,%%xmm0 \n"
649 "packuswb %%xmm0,%%xmm0 \n"
650 "lea " MEMLEA(0x10,0) ",%0 \n"
651 "movq %%xmm0," MEMACCESS(1) " \n"
652 "lea " MEMLEA(0x8,1) ",%1 \n"
658 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
663 #ifdef HAS_ARGBTOYROW_SSSE3
664 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// xmm4 = kARGBToY coefficients, xmm5 = kAddY16 (+16 bias). Per iteration:
// pmaddubsw + phaddw sum B,G,R products per pixel, >>7 rescales the 7-bit
// fixed-point result, packuswb narrows to bytes, paddb adds the bias.
665 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
667 "movdqa %3,%%xmm4 \n"
668 "movdqa %4,%%xmm5 \n"
// NOTE(review): loop label, counter update and asm close not visible here;
// constraint %4 (presumably "m"(kAddY16)) is also missing from the extract.
671 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
672 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
673 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
674 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
675 "pmaddubsw %%xmm4,%%xmm0 \n"
676 "pmaddubsw %%xmm4,%%xmm1 \n"
677 "pmaddubsw %%xmm4,%%xmm2 \n"
678 "pmaddubsw %%xmm4,%%xmm3 \n"
679 "lea " MEMLEA(0x40,0) ",%0 \n"
680 "phaddw %%xmm1,%%xmm0 \n"
681 "phaddw %%xmm3,%%xmm2 \n"
682 "psrlw $0x7,%%xmm0 \n"
683 "psrlw $0x7,%%xmm2 \n"
684 "packuswb %%xmm2,%%xmm0 \n"
685 "paddb %%xmm5,%%xmm0 \n"
686 "movdqu %%xmm0," MEMACCESS(1) " \n"
687 "lea " MEMLEA(0x10,1) ",%1 \n"
690 : "+r"(src_argb), // %0
693 : "m"(kARGBToY), // %3
695 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
698 #endif // HAS_ARGBTOYROW_SSSE3
700 #ifdef HAS_ARGBTOYJROW_SSSE3
701 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
702 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// Rounding: paddw of kAddYJ64 (64 = 0.5 in 7-bit fixed point, per %4)
// before the >>7 shift, instead of the +16 byte bias of the non-J path.
703 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
705 "movdqa %3,%%xmm4 \n"
706 "movdqa %4,%%xmm5 \n"
// NOTE(review): loop label, counter update and asm close not visible here;
// constraint %4 (presumably "m"(kAddYJ64)) is also missing from the extract.
709 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
710 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
711 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
712 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
713 "pmaddubsw %%xmm4,%%xmm0 \n"
714 "pmaddubsw %%xmm4,%%xmm1 \n"
715 "pmaddubsw %%xmm4,%%xmm2 \n"
716 "pmaddubsw %%xmm4,%%xmm3 \n"
717 "lea " MEMLEA(0x40,0) ",%0 \n"
718 "phaddw %%xmm1,%%xmm0 \n"
719 "phaddw %%xmm3,%%xmm2 \n"
720 "paddw %%xmm5,%%xmm0 \n"
721 "paddw %%xmm5,%%xmm2 \n"
722 "psrlw $0x7,%%xmm0 \n"
723 "psrlw $0x7,%%xmm2 \n"
724 "packuswb %%xmm2,%%xmm0 \n"
725 "movdqu %%xmm0," MEMACCESS(1) " \n"
726 "lea " MEMLEA(0x10,1) ",%1 \n"
729 : "+r"(src_argb), // %0
732 : "m"(kARGBToYJ), // %3
734 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
737 #endif // HAS_ARGBTOYJROW_SSSE3
739 #ifdef HAS_ARGBTOYROW_AVX2
740 // vpermd for vphaddw + vpackuswb vpermd.
741 static const lvec32 kPermdARGBToY_AVX = {
742 0, 4, 1, 5, 2, 6, 3, 7
745 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3: ymm4 = coefficients (broadcast), ymm5 =
// +16 bias, ymm6 = kPermdARGBToY_AVX, used to restore lane order after the
// in-lane vphaddw/vpackuswb mutations.
746 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
748 "vbroadcastf128 %3,%%ymm4 \n"
749 "vbroadcastf128 %4,%%ymm5 \n"
750 "vmovdqu %5,%%ymm6 \n"
// NOTE(review): loop label, counter update, vzeroupper and asm close are
// not visible here; constraint %4 (presumably "m"(kAddY16)) is missing.
753 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
754 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
755 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
756 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
757 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
758 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
759 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
760 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
761 "lea " MEMLEA(0x80,0) ",%0 \n"
762 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
763 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
764 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
765 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
766 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
767 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
768 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
769 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
770 "lea " MEMLEA(0x20,1) ",%1 \n"
774 : "+r"(src_argb), // %0
777 : "m"(kARGBToY), // %3
779 "m"(kPermdARGBToY_AVX) // %5
780 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
783 #endif // HAS_ARGBTOYROW_AVX2
785 #ifdef HAS_ARGBTOYJROW_AVX2
786 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 full-range ("J") luma: like ARGBToYRow_AVX2 but uses kARGBToYJ
// coefficients and adds the 0.5 rounding words (%4) before >>7, with no
// +16 bias afterwards.
787 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
789 "vbroadcastf128 %3,%%ymm4 \n"
790 "vbroadcastf128 %4,%%ymm5 \n"
791 "vmovdqu %5,%%ymm6 \n"
// NOTE(review): loop label, counter update, vzeroupper and asm close are
// not visible here; constraint %4 (presumably "m"(kAddYJ64)) is missing.
794 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
795 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
796 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
797 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
798 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
799 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
800 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
801 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
802 "lea " MEMLEA(0x80,0) ",%0 \n"
803 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
804 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
805 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
806 "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
807 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
808 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
809 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
810 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
811 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
812 "lea " MEMLEA(0x20,1) ",%1 \n"
816 : "+r"(src_argb), // %0
819 : "m"(kARGBToYJ), // %3
821 "m"(kPermdARGBToY_AVX) // %5
822 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
825 #endif // HAS_ARGBTOYJROW_AVX2
827 #ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16 ARGB pixels from two adjacent rows to 8 U and 8 V values
// (2x2 box subsample): pavgb averages vertically with the next row at
// stride %4, shufps+pavgb average horizontal pixel pairs, then pmaddubsw
// with kARGBToU/kARGBToV (%6/%5), >>8, packsswb and +128 bias.
828 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
829 uint8* dst_u, uint8* dst_v, int width) {
831 "movdqa %5,%%xmm3 \n"
832 "movdqa %6,%%xmm4 \n"
833 "movdqa %7,%%xmm5 \n"
// NOTE(review): this extract omits the "sub %1,%2" setup (so (%1,%2,1)
// reaches dst_v), the loop label, the width counter update and asm close,
// and constraints %5-%7 (presumably kARGBToV/kARGBToU/kAddUV128).
837 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
838 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
839 "pavgb %%xmm7,%%xmm0 \n"
840 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
841 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
842 "pavgb %%xmm7,%%xmm1 \n"
843 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
844 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
845 "pavgb %%xmm7,%%xmm2 \n"
846 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
847 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
848 "pavgb %%xmm7,%%xmm6 \n"
850 "lea " MEMLEA(0x40,0) ",%0 \n"
851 "movdqa %%xmm0,%%xmm7 \n"
852 "shufps $0x88,%%xmm1,%%xmm0 \n"
853 "shufps $0xdd,%%xmm1,%%xmm7 \n"
854 "pavgb %%xmm7,%%xmm0 \n"
855 "movdqa %%xmm2,%%xmm7 \n"
856 "shufps $0x88,%%xmm6,%%xmm2 \n"
857 "shufps $0xdd,%%xmm6,%%xmm7 \n"
858 "pavgb %%xmm7,%%xmm2 \n"
859 "movdqa %%xmm0,%%xmm1 \n"
860 "movdqa %%xmm2,%%xmm6 \n"
861 "pmaddubsw %%xmm4,%%xmm0 \n"
862 "pmaddubsw %%xmm4,%%xmm2 \n"
863 "pmaddubsw %%xmm3,%%xmm1 \n"
864 "pmaddubsw %%xmm3,%%xmm6 \n"
865 "phaddw %%xmm2,%%xmm0 \n"
866 "phaddw %%xmm6,%%xmm1 \n"
867 "psraw $0x8,%%xmm0 \n"
868 "psraw $0x8,%%xmm1 \n"
869 "packsswb %%xmm1,%%xmm0 \n"
870 "paddb %%xmm5,%%xmm0 \n"
871 "movlps %%xmm0," MEMACCESS(1) " \n"
872 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
873 "lea " MEMLEA(0x8,1) ",%1 \n"
876 : "+r"(src_argb0), // %0
880 : "r"((intptr_t)(src_stride_argb)), // %4
884 : "memory", "cc", NACL_R14
885 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
888 #endif // HAS_ARGBTOUVROW_SSSE3
890 #ifdef HAS_ARGBTOUVROW_AVX2
891 // vpshufb for vphaddw + vpackuswb packed to shorts.
892 static const lvec8 kShufARGBToUV_AVX = {
893 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
894 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
// AVX2 version of ARGBToUVRow_SSSE3: 32 pixels of two rows -> 16 U + 16 V.
// ymm6/ymm7 are U/V coefficients, ymm5 = +128 bias; vpermq + the vpshufb
// table above undo the in-lane mutations of vphaddw/vpacksswb.
896 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
897 uint8* dst_u, uint8* dst_v, int width) {
899 "vbroadcastf128 %5,%%ymm5 \n"
900 "vbroadcastf128 %6,%%ymm6 \n"
901 "vbroadcastf128 %7,%%ymm7 \n"
// NOTE(review): this extract omits the "sub %1,%2" setup, loop label,
// width counter update, vzeroupper/close, and constraints %6/%7
// (presumably kAddUV128/kARGBToU/kARGBToV in some order).
905 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
906 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
907 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
908 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
909 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
910 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
911 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
912 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
913 "lea " MEMLEA(0x80,0) ",%0 \n"
914 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
915 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
916 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
917 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
918 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
919 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
921 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
922 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
923 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
924 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
925 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
926 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
927 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
928 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
929 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
930 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
931 "vpshufb %8,%%ymm0,%%ymm0 \n"
932 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
934 "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
935 VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
936 "lea " MEMLEA(0x10,1) ",%1 \n"
940 : "+r"(src_argb0), // %0
944 : "r"((intptr_t)(src_stride_argb)), // %4
945 "m"(kAddUV128), // %5
948 "m"(kShufARGBToUV_AVX) // %8
949 : "memory", "cc", NACL_R14
950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
953 #endif // HAS_ARGBTOUVROW_AVX2
955 #ifdef HAS_ARGBTOUVJROW_SSSE3
// Full-range ("J") variant of ARGBToUVRow_SSSE3: same 2x2 subsample and
// pmaddubsw pipeline, but uses kARGBToUJ/kARGBToVJ coefficients and adds
// kAddUVJ128 (0x8080 words = bias + 0.5 rounding) before psraw $0x8,
// instead of paddb of 128 after packing.
956 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
957 uint8* dst_u, uint8* dst_v, int width) {
959 "movdqa %5,%%xmm3 \n"
960 "movdqa %6,%%xmm4 \n"
961 "movdqa %7,%%xmm5 \n"
// NOTE(review): the "sub %1,%2" setup, loop label, width counter update
// and asm close are not visible in this extract.
965 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
966 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
967 "pavgb %%xmm7,%%xmm0 \n"
968 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
969 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
970 "pavgb %%xmm7,%%xmm1 \n"
971 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
972 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
973 "pavgb %%xmm7,%%xmm2 \n"
974 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
975 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
976 "pavgb %%xmm7,%%xmm6 \n"
978 "lea " MEMLEA(0x40,0) ",%0 \n"
979 "movdqa %%xmm0,%%xmm7 \n"
980 "shufps $0x88,%%xmm1,%%xmm0 \n"
981 "shufps $0xdd,%%xmm1,%%xmm7 \n"
982 "pavgb %%xmm7,%%xmm0 \n"
983 "movdqa %%xmm2,%%xmm7 \n"
984 "shufps $0x88,%%xmm6,%%xmm2 \n"
985 "shufps $0xdd,%%xmm6,%%xmm7 \n"
986 "pavgb %%xmm7,%%xmm2 \n"
987 "movdqa %%xmm0,%%xmm1 \n"
988 "movdqa %%xmm2,%%xmm6 \n"
989 "pmaddubsw %%xmm4,%%xmm0 \n"
990 "pmaddubsw %%xmm4,%%xmm2 \n"
991 "pmaddubsw %%xmm3,%%xmm1 \n"
992 "pmaddubsw %%xmm3,%%xmm6 \n"
993 "phaddw %%xmm2,%%xmm0 \n"
994 "phaddw %%xmm6,%%xmm1 \n"
995 "paddw %%xmm5,%%xmm0 \n"
996 "paddw %%xmm5,%%xmm1 \n"
997 "psraw $0x8,%%xmm0 \n"
998 "psraw $0x8,%%xmm1 \n"
999 "packsswb %%xmm1,%%xmm0 \n"
1000 "movlps %%xmm0," MEMACCESS(1) " \n"
1001 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1002 "lea " MEMLEA(0x8,1) ",%1 \n"
1005 : "+r"(src_argb0), // %0
1009 : "r"((intptr_t)(src_stride_argb)), // %4
1010 "m"(kARGBToVJ), // %5
1011 "m"(kARGBToUJ), // %6
1012 "m"(kAddUVJ128) // %7
1013 : "memory", "cc", NACL_R14
1014 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1017 #endif // HAS_ARGBTOUVJROW_SSSE3
1019 #ifdef HAS_ARGBTOUV444ROW_SSSE3
// Convert 16 ARGB pixels to 16 U and 16 V values with NO subsampling
// (4:4:4). The same 16 pixels are read twice: first pass applies the U
// coefficients (xmm4, %5) and stores to dst_u, second pass applies the V
// coefficients (xmm3, %4) and stores to dst_v.
1020 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1023 "movdqa %4,%%xmm3 \n"
1024 "movdqa %5,%%xmm4 \n"
1025 "movdqa %6,%%xmm5 \n"
// NOTE(review): the "sub %1,%2" setup (making (%1,%2,1) address dst_v),
// loop label, width counter update and asm close are not visible here.
1029 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1030 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1031 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1032 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1033 "pmaddubsw %%xmm4,%%xmm0 \n"
1034 "pmaddubsw %%xmm4,%%xmm1 \n"
1035 "pmaddubsw %%xmm4,%%xmm2 \n"
1036 "pmaddubsw %%xmm4,%%xmm6 \n"
1037 "phaddw %%xmm1,%%xmm0 \n"
1038 "phaddw %%xmm6,%%xmm2 \n"
1039 "psraw $0x8,%%xmm0 \n"
1040 "psraw $0x8,%%xmm2 \n"
1041 "packsswb %%xmm2,%%xmm0 \n"
1042 "paddb %%xmm5,%%xmm0 \n"
1043 "movdqu %%xmm0," MEMACCESS(1) " \n"
1044 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1045 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1046 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1047 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1048 "pmaddubsw %%xmm3,%%xmm0 \n"
1049 "pmaddubsw %%xmm3,%%xmm1 \n"
1050 "pmaddubsw %%xmm3,%%xmm2 \n"
1051 "pmaddubsw %%xmm3,%%xmm6 \n"
1052 "phaddw %%xmm1,%%xmm0 \n"
1053 "phaddw %%xmm6,%%xmm2 \n"
1054 "psraw $0x8,%%xmm0 \n"
1055 "psraw $0x8,%%xmm2 \n"
1056 "packsswb %%xmm2,%%xmm0 \n"
1057 "paddb %%xmm5,%%xmm0 \n"
1058 "lea " MEMLEA(0x40,0) ",%0 \n"
1059 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
1060 "lea " MEMLEA(0x10,1) ",%1 \n"
1063 : "+r"(src_argb), // %0
1067 : "m"(kARGBToV), // %4
1068 "m"(kARGBToU), // %5
1069 "m"(kAddUV128) // %6
1070 : "memory", "cc", NACL_R14
1071 "xmm0", "xmm1", "xmm2", "xmm6"
1074 #endif // HAS_ARGBTOUV444ROW_SSSE3
1076 #ifdef HAS_ARGBTOUV422ROW_SSSE3
// Convert 16 ARGB pixels from a SINGLE row to 8 U and 8 V values (4:2:2:
// horizontal-only subsample). Same pipeline as ARGBToUVRow_SSSE3 but with
// no second-row pavgb averaging.
1077 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1078 uint8* dst_u, uint8* dst_v, int width) {
1080 "movdqa %4,%%xmm3 \n"
1081 "movdqa %5,%%xmm4 \n"
1082 "movdqa %6,%%xmm5 \n"
// NOTE(review): the "sub %1,%2" setup, loop label, width counter update
// and asm close are not visible in this extract.
1086 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1087 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1088 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1089 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1090 "lea " MEMLEA(0x40,0) ",%0 \n"
1091 "movdqa %%xmm0,%%xmm7 \n"
1092 "shufps $0x88,%%xmm1,%%xmm0 \n"
1093 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1094 "pavgb %%xmm7,%%xmm0 \n"
1095 "movdqa %%xmm2,%%xmm7 \n"
1096 "shufps $0x88,%%xmm6,%%xmm2 \n"
1097 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1098 "pavgb %%xmm7,%%xmm2 \n"
1099 "movdqa %%xmm0,%%xmm1 \n"
1100 "movdqa %%xmm2,%%xmm6 \n"
1101 "pmaddubsw %%xmm4,%%xmm0 \n"
1102 "pmaddubsw %%xmm4,%%xmm2 \n"
1103 "pmaddubsw %%xmm3,%%xmm1 \n"
1104 "pmaddubsw %%xmm3,%%xmm6 \n"
1105 "phaddw %%xmm2,%%xmm0 \n"
1106 "phaddw %%xmm6,%%xmm1 \n"
1107 "psraw $0x8,%%xmm0 \n"
1108 "psraw $0x8,%%xmm1 \n"
1109 "packsswb %%xmm1,%%xmm0 \n"
1110 "paddb %%xmm5,%%xmm0 \n"
1111 "movlps %%xmm0," MEMACCESS(1) " \n"
1112 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1113 "lea " MEMLEA(0x8,1) ",%1 \n"
1116 : "+r"(src_argb0), // %0
1120 : "m"(kARGBToV), // %4
1121 "m"(kARGBToU), // %5
1122 "m"(kAddUV128) // %6
1123 : "memory", "cc", NACL_R14
1124 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1127 #endif // HAS_ARGBTOUV422ROW_SSSE3
// Convert 16 BGRA pixels (64 bytes) to 16 Y values. Same pipeline as
// ARGBToYRow_SSSE3; only the coefficient table (kBGRAToY, permuted for the
// BGRA byte order) differs.
1129 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
1131 "movdqa %4,%%xmm5 \n"
1132 "movdqa %3,%%xmm4 \n"
// NOTE(review): loop label, pix counter update, asm close and constraint
// %4 (presumably "m"(kAddY16)) are not visible in this extract.
1135 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1136 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1137 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1138 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1139 "pmaddubsw %%xmm4,%%xmm0 \n"
1140 "pmaddubsw %%xmm4,%%xmm1 \n"
1141 "pmaddubsw %%xmm4,%%xmm2 \n"
1142 "pmaddubsw %%xmm4,%%xmm3 \n"
1143 "lea " MEMLEA(0x40,0) ",%0 \n"
1144 "phaddw %%xmm1,%%xmm0 \n"
1145 "phaddw %%xmm3,%%xmm2 \n"
1146 "psrlw $0x7,%%xmm0 \n"
1147 "psrlw $0x7,%%xmm2 \n"
1148 "packuswb %%xmm2,%%xmm0 \n"
1149 "paddb %%xmm5,%%xmm0 \n"
1150 "movdqu %%xmm0," MEMACCESS(1) " \n"
1151 "lea " MEMLEA(0x10,1) ",%1 \n"
1154 : "+r"(src_bgra), // %0
1157 : "m"(kBGRAToY), // %3
1159 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 16 BGRA pixels from two adjacent rows to 8 U and 8 V values
// (2x2 subsample). Same pipeline as ARGBToUVRow_SSSE3; only the
// coefficient tables (kBGRAToV/kBGRAToU) differ.
1163 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1164 uint8* dst_u, uint8* dst_v, int width) {
1166 "movdqa %5,%%xmm3 \n"
1167 "movdqa %6,%%xmm4 \n"
1168 "movdqa %7,%%xmm5 \n"
// NOTE(review): the "sub %1,%2" setup, loop label, width counter update
// and asm close are not visible in this extract.
1172 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1173 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1174 "pavgb %%xmm7,%%xmm0 \n"
1175 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1176 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1177 "pavgb %%xmm7,%%xmm1 \n"
1178 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1179 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1180 "pavgb %%xmm7,%%xmm2 \n"
1181 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1182 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1183 "pavgb %%xmm7,%%xmm6 \n"
1185 "lea " MEMLEA(0x40,0) ",%0 \n"
1186 "movdqa %%xmm0,%%xmm7 \n"
1187 "shufps $0x88,%%xmm1,%%xmm0 \n"
1188 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1189 "pavgb %%xmm7,%%xmm0 \n"
1190 "movdqa %%xmm2,%%xmm7 \n"
1191 "shufps $0x88,%%xmm6,%%xmm2 \n"
1192 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1193 "pavgb %%xmm7,%%xmm2 \n"
1194 "movdqa %%xmm0,%%xmm1 \n"
1195 "movdqa %%xmm2,%%xmm6 \n"
1196 "pmaddubsw %%xmm4,%%xmm0 \n"
1197 "pmaddubsw %%xmm4,%%xmm2 \n"
1198 "pmaddubsw %%xmm3,%%xmm1 \n"
1199 "pmaddubsw %%xmm3,%%xmm6 \n"
1200 "phaddw %%xmm2,%%xmm0 \n"
1201 "phaddw %%xmm6,%%xmm1 \n"
1202 "psraw $0x8,%%xmm0 \n"
1203 "psraw $0x8,%%xmm1 \n"
1204 "packsswb %%xmm1,%%xmm0 \n"
1205 "paddb %%xmm5,%%xmm0 \n"
1206 "movlps %%xmm0," MEMACCESS(1) " \n"
1207 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1208 "lea " MEMLEA(0x8,1) ",%1 \n"
1211 : "+r"(src_bgra0), // %0
1215 : "r"((intptr_t)(src_stride_bgra)), // %4
1216 "m"(kBGRAToV), // %5
1217 "m"(kBGRAToU), // %6
1218 "m"(kAddUV128) // %7
1219 : "memory", "cc", NACL_R14
1220 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
// Convert 16 ABGR pixels to 16 Y values per iteration: pmaddubsw with the
// kABGRToY coefficients (%3), pairwise sum, >>7, pack to bytes, then add
// the bias constant held in %4 (applied via paddb).
1224 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1226 "movdqa %4,%%xmm5 \n"
1227 "movdqa %3,%%xmm4 \n"
1230 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1231 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1232 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1233 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1234 "pmaddubsw %%xmm4,%%xmm0 \n"
1235 "pmaddubsw %%xmm4,%%xmm1 \n"
1236 "pmaddubsw %%xmm4,%%xmm2 \n"
1237 "pmaddubsw %%xmm4,%%xmm3 \n"
1238 "lea " MEMLEA(0x40,0) ",%0 \n"
1239 "phaddw %%xmm1,%%xmm0 \n"
1240 "phaddw %%xmm3,%%xmm2 \n"
1241 "psrlw $0x7,%%xmm0 \n"
1242 "psrlw $0x7,%%xmm2 \n"
1243 "packuswb %%xmm2,%%xmm0 \n"
1244 "paddb %%xmm5,%%xmm0 \n"
1245 "movdqu %%xmm0," MEMACCESS(1) " \n"
1246 "lea " MEMLEA(0x10,1) ",%1 \n"
1249 : "+r"(src_abgr), // %0
1252 : "m"(kABGRToY), // %3
1254 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 16 RGBA pixels to 16 Y values per iteration: pmaddubsw with the
// kRGBAToY coefficients (%3), pairwise sum, >>7, pack to bytes, then add
// the bias constant held in %4 (applied via paddb).
1258 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1260 "movdqa %4,%%xmm5 \n"
1261 "movdqa %3,%%xmm4 \n"
1264 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1265 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1266 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1267 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1268 "pmaddubsw %%xmm4,%%xmm0 \n"
1269 "pmaddubsw %%xmm4,%%xmm1 \n"
1270 "pmaddubsw %%xmm4,%%xmm2 \n"
1271 "pmaddubsw %%xmm4,%%xmm3 \n"
1272 "lea " MEMLEA(0x40,0) ",%0 \n"
1273 "phaddw %%xmm1,%%xmm0 \n"
1274 "phaddw %%xmm3,%%xmm2 \n"
1275 "psrlw $0x7,%%xmm0 \n"
1276 "psrlw $0x7,%%xmm2 \n"
1277 "packuswb %%xmm2,%%xmm0 \n"
1278 "paddb %%xmm5,%%xmm0 \n"
1279 "movdqu %%xmm0," MEMACCESS(1) " \n"
1280 "lea " MEMLEA(0x10,1) ",%1 \n"
1283 : "+r"(src_rgba), // %0
1286 : "m"(kRGBAToY), // %3
1288 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Compute 8 U and 8 V values from two rows of 32 ABGR pixels (the row at
// src_abgr0 and the row src_stride_abgr bytes below it), 2x2 subsampled.
// Operands: %5 = kABGRToV, %6 = kABGRToU, %7 = kAddUV128 (unsigned bias).
1292 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1293 uint8* dst_u, uint8* dst_v, int width) {
1295 "movdqa %5,%%xmm3 \n"
1296 "movdqa %6,%%xmm4 \n"
1297 "movdqa %7,%%xmm5 \n"
// Load 64 bytes from each of the two rows and vertically average them.
1301 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1302 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1303 "pavgb %%xmm7,%%xmm0 \n"
1304 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1305 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1306 "pavgb %%xmm7,%%xmm1 \n"
1307 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1308 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1309 "pavgb %%xmm7,%%xmm2 \n"
1310 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1311 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1312 "pavgb %%xmm7,%%xmm6 \n"
// Horizontally average adjacent pixel pairs, completing the 2x2 subsample.
1314 "lea " MEMLEA(0x40,0) ",%0 \n"
1315 "movdqa %%xmm0,%%xmm7 \n"
1316 "shufps $0x88,%%xmm1,%%xmm0 \n"
1317 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1318 "pavgb %%xmm7,%%xmm0 \n"
1319 "movdqa %%xmm2,%%xmm7 \n"
1320 "shufps $0x88,%%xmm6,%%xmm2 \n"
1321 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1322 "pavgb %%xmm7,%%xmm2 \n"
// Apply U (xmm4) and V (xmm3) coefficients, pack, then bias by kAddUV128.
1323 "movdqa %%xmm0,%%xmm1 \n"
1324 "movdqa %%xmm2,%%xmm6 \n"
1325 "pmaddubsw %%xmm4,%%xmm0 \n"
1326 "pmaddubsw %%xmm4,%%xmm2 \n"
1327 "pmaddubsw %%xmm3,%%xmm1 \n"
1328 "pmaddubsw %%xmm3,%%xmm6 \n"
1329 "phaddw %%xmm2,%%xmm0 \n"
1330 "phaddw %%xmm6,%%xmm1 \n"
1331 "psraw $0x8,%%xmm0 \n"
1332 "psraw $0x8,%%xmm1 \n"
1333 "packsswb %%xmm1,%%xmm0 \n"
1334 "paddb %%xmm5,%%xmm0 \n"
// Store 8 U bytes (low qword, at %1) and 8 V bytes (high qword, at (%1,%2)).
1335 "movlps %%xmm0," MEMACCESS(1) " \n"
1336 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1337 "lea " MEMLEA(0x8,1) ",%1 \n"
1340 : "+r"(src_abgr0), // %0
1344 : "r"((intptr_t)(src_stride_abgr)), // %4
1345 "m"(kABGRToV), // %5
1346 "m"(kABGRToU), // %6
1347 "m"(kAddUV128) // %7
1348 : "memory", "cc", NACL_R14
1349 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
// Compute 8 U and 8 V values from two rows of 32 RGBA pixels (the row at
// src_rgba0 and the row src_stride_rgba bytes below it), 2x2 subsampled.
// Operands: %5 = kRGBAToV, %6 = kRGBAToU, %7 = kAddUV128 (unsigned bias).
1353 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1354 uint8* dst_u, uint8* dst_v, int width) {
1356 "movdqa %5,%%xmm3 \n"
1357 "movdqa %6,%%xmm4 \n"
1358 "movdqa %7,%%xmm5 \n"
// Load 64 bytes from each of the two rows and vertically average them.
1362 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1363 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1364 "pavgb %%xmm7,%%xmm0 \n"
1365 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1366 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1367 "pavgb %%xmm7,%%xmm1 \n"
1368 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1369 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1370 "pavgb %%xmm7,%%xmm2 \n"
1371 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1372 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1373 "pavgb %%xmm7,%%xmm6 \n"
// Horizontally average adjacent pixel pairs, completing the 2x2 subsample.
1375 "lea " MEMLEA(0x40,0) ",%0 \n"
1376 "movdqa %%xmm0,%%xmm7 \n"
1377 "shufps $0x88,%%xmm1,%%xmm0 \n"
1378 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1379 "pavgb %%xmm7,%%xmm0 \n"
1380 "movdqa %%xmm2,%%xmm7 \n"
1381 "shufps $0x88,%%xmm6,%%xmm2 \n"
1382 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1383 "pavgb %%xmm7,%%xmm2 \n"
// Apply U (xmm4) and V (xmm3) coefficients, pack, then bias by kAddUV128.
1384 "movdqa %%xmm0,%%xmm1 \n"
1385 "movdqa %%xmm2,%%xmm6 \n"
1386 "pmaddubsw %%xmm4,%%xmm0 \n"
1387 "pmaddubsw %%xmm4,%%xmm2 \n"
1388 "pmaddubsw %%xmm3,%%xmm1 \n"
1389 "pmaddubsw %%xmm3,%%xmm6 \n"
1390 "phaddw %%xmm2,%%xmm0 \n"
1391 "phaddw %%xmm6,%%xmm1 \n"
1392 "psraw $0x8,%%xmm0 \n"
1393 "psraw $0x8,%%xmm1 \n"
1394 "packsswb %%xmm1,%%xmm0 \n"
1395 "paddb %%xmm5,%%xmm0 \n"
// Store 8 U bytes (low qword, at %1) and 8 V bytes (high qword, at (%1,%2)).
1396 "movlps %%xmm0," MEMACCESS(1) " \n"
1397 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1398 "lea " MEMLEA(0x8,1) ",%1 \n"
1401 : "+r"(src_rgba0), // %0
1405 : "r"((intptr_t)(src_stride_rgba)), // %4
1406 "m"(kRGBAToV), // %5
1407 "m"(kRGBAToU), // %6
1408 "m"(kAddUV128) // %7
1409 : "memory", "cc", NACL_R14
1410 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1414 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1416 struct YuvConstants {
1420 lvec16 kUVBiasB; // 96
1421 lvec16 kUVBiasG; // 128
1422 lvec16 kUVBiasR; // 160
1423 lvec16 kYToRgb; // 192
1426 // BT.601 YUV to RGB reference
1427 // R = (Y - 16) * 1.164 - V * -1.596
1428 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
1429 // B = (Y - 16) * 1.164 - U * -2.018
1431 // Y contribution to R,G,B. Scale and bias.
1432 // TODO(fbarchard): Consider moving constants into a common header.
1433 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
1434 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
1436 // U and V contributions to R,G,B.
1437 #define UB -128 /* max(-128, round(-2.018 * 64)) */
1438 #define UG 25 /* round(0.391 * 64) */
1439 #define VG 52 /* round(0.813 * 64) */
1440 #define VR -102 /* round(-1.596 * 64) */
1442 // Bias values to subtract 16 from Y and 128 from U and V.
1443 #define BB (UB * 128 + YGB)
1444 #define BG (UG * 128 + VG * 128 + YGB)
1445 #define BR (VR * 128 + YGB)
1447 // BT601 constants for YUV to RGB.
1448 static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
1449 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
1450 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
1451 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
1452 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1453 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
1454 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
1455 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1456 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1457 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1458 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1461 // BT601 constants for NV21 where chroma plane is VU instead of UV.
1462 static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
1463 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
1464 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
1465 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
1466 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1467 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
1468 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
1469 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1470 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1471 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1472 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1485 // JPEG YUV to RGB reference
1486 // * R = Y - V * -1.40200
1487 // * G = Y - U * 0.34414 - V * 0.71414
1488 // * B = Y - U * -1.77200
1490 // Y contribution to R,G,B. Scale and bias.
1491 // TODO(fbarchard): Consider moving constants into a common header.
1492 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
1493 #define YGBJ 32 /* 64 / 2 */
1495 // U and V contributions to R,G,B.
1496 #define UBJ -113 /* round(-1.77200 * 64) */
1497 #define UGJ 22 /* round(0.34414 * 64) */
1498 #define VGJ 46 /* round(0.71414 * 64) */
1499 #define VRJ -90 /* round(-1.40200 * 64) */
1501 // Bias values to subtract 16 from Y and 128 from U and V.
1502 #define BBJ (UBJ * 128 + YGBJ)
1503 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
1504 #define BRJ (VRJ * 128 + YGBJ)
1506 // JPEG constants for YUV to RGB.
1507 YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
1508 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
1509 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
1510 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
1511 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
1512 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
1513 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
1514 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
1515 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
1516 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
1517 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
1518 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
1519 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
1520 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
1521 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
1522 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
1523 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
1536 // Read 8 UV from 444: one U and one V byte per pixel; v_buf is read
// relative to u_buf (caller does "sub %[u_buf],%[v_buf]" beforehand)
// and the 8 U/V pairs are interleaved into xmm0.
1537 #define READYUV444 \
1538 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1539 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1540 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1541 "punpcklbw %%xmm1,%%xmm0 \n"
1543 // Read 4 UV from 422, upsample to 8 UV
1544 #define READYUV422 \
1545 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1546 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1547 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
1548 "punpcklbw %%xmm1,%%xmm0 \n" \
1549 "punpcklwd %%xmm0,%%xmm0 \n"
1551 // Read 2 UV from 411, upsample to 8 UV
1552 #define READYUV411 \
1553 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1554 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1555 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
1556 "punpcklbw %%xmm1,%%xmm0 \n" \
1557 "punpcklwd %%xmm0,%%xmm0 \n" \
1558 "punpckldq %%xmm0,%%xmm0 \n"
1560 // Read 4 UV from NV12, upsample to 8 UV
1562 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
1563 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
1564 "punpcklwd %%xmm0,%%xmm0 \n"
1566 // Convert 8 pixels: 8 UV and 8 Y
// Reads the YuvConstants struct at byte offsets 0/32/64 (UV-to-B/G/R
// coefficients), 96/128/160 (kUVBiasB/G/R) and 192 (kYToRgb), per the
// struct's declared layout. Results are left as packed bytes in
// xmm0 = B, xmm1 = G, xmm2 = R.
1567 #define YUVTORGB(YuvConstants) \
1568 "movdqa %%xmm0,%%xmm1 \n" \
1569 "movdqa %%xmm0,%%xmm2 \n" \
1570 "movdqa %%xmm0,%%xmm3 \n" \
1571 "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
1572 "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
1573 "psubw %%xmm1,%%xmm0 \n" \
1574 "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
1575 "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
1576 "psubw %%xmm2,%%xmm1 \n" \
1577 "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
1578 "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
1579 "psubw %%xmm3,%%xmm2 \n" \
1580 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1581 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
1582 "punpcklbw %%xmm3,%%xmm3 \n" \
1583 "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
1584 "paddsw %%xmm3,%%xmm0 \n" \
1585 "paddsw %%xmm3,%%xmm1 \n" \
1586 "paddsw %%xmm3,%%xmm2 \n" \
1587 "psraw $0x6,%%xmm0 \n" \
1588 "psraw $0x6,%%xmm1 \n" \
1589 "psraw $0x6,%%xmm2 \n" \
1590 "packuswb %%xmm0,%%xmm0 \n" \
1591 "packuswb %%xmm1,%%xmm1 \n" \
1592 "packuswb %%xmm2,%%xmm2 \n"
1594 // Store 8 ARGB values. Assumes XMM5 is zero.
1596 "punpcklbw %%xmm1,%%xmm0 \n" \
1597 "punpcklbw %%xmm5,%%xmm2 \n" \
1598 "movdqa %%xmm0,%%xmm1 \n" \
1599 "punpcklwd %%xmm2,%%xmm0 \n" \
1600 "punpckhwd %%xmm2,%%xmm1 \n" \
1601 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1602 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
1603 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
1605 // Store 8 BGRA values. Assumes XMM5 is zero.
1607 "pcmpeqb %%xmm5,%%xmm5 \n" \
1608 "punpcklbw %%xmm0,%%xmm1 \n" \
1609 "punpcklbw %%xmm2,%%xmm5 \n" \
1610 "movdqa %%xmm5,%%xmm0 \n" \
1611 "punpcklwd %%xmm1,%%xmm5 \n" \
1612 "punpckhwd %%xmm1,%%xmm0 \n" \
1613 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
1614 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
1615 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
1617 // Store 8 ABGR values. Assumes XMM5 is zero.
1619 "punpcklbw %%xmm1,%%xmm2 \n" \
1620 "punpcklbw %%xmm5,%%xmm0 \n" \
1621 "movdqa %%xmm2,%%xmm1 \n" \
1622 "punpcklwd %%xmm0,%%xmm2 \n" \
1623 "punpckhwd %%xmm0,%%xmm1 \n" \
1624 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
1625 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
1626 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
1628 // Store 8 RGBA values. Assumes XMM5 is zero.
1630 "pcmpeqb %%xmm5,%%xmm5 \n" \
1631 "punpcklbw %%xmm2,%%xmm1 \n" \
1632 "punpcklbw %%xmm0,%%xmm5 \n" \
1633 "movdqa %%xmm5,%%xmm0 \n" \
1634 "punpcklwd %%xmm1,%%xmm5 \n" \
1635 "punpckhwd %%xmm1,%%xmm0 \n" \
1636 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
1637 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
1638 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
// Convert 8 I444 pixels (Y plus a U and V byte per pixel) to 8 ARGB pixels
// per iteration, using BT.601 kYuvConstants. v_buf is rebased relative to
// u_buf; xmm5 is set to all ones to supply 0xff alpha.
1640 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1646 "sub %[u_buf],%[v_buf] \n"
1647 "pcmpeqb %%xmm5,%%xmm5 \n"
1651 YUVTORGB(kYuvConstants)
1653 "sub $0x8,%[width] \n"
1655 : [y_buf]"+r"(y_buf), // %[y_buf]
1656 [u_buf]"+r"(u_buf), // %[u_buf]
1657 [v_buf]"+r"(v_buf), // %[v_buf]
1658 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1659 [width]"+rm"(width) // %[width]
1660 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1661 : "memory", "cc", NACL_R14
1662 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1666 // TODO(fbarchard): Consider putting masks into constants.
// Convert 8 I422 pixels to 24 bytes of RGB24 per iteration (BT.601).
// The ARGB intermediate from YUVTORGB is compacted to 3 bytes/pixel with
// the two pshufb masks (xmm5/xmm6) and a palignr.
1667 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1673 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1674 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1675 "sub %[u_buf],%[v_buf] \n"
1679 YUVTORGB(kYuvConstants)
1680 "punpcklbw %%xmm1,%%xmm0 \n"
1681 "punpcklbw %%xmm2,%%xmm2 \n"
1682 "movdqa %%xmm0,%%xmm1 \n"
1683 "punpcklwd %%xmm2,%%xmm0 \n"
1684 "punpckhwd %%xmm2,%%xmm1 \n"
1685 "pshufb %%xmm5,%%xmm0 \n"
1686 "pshufb %%xmm6,%%xmm1 \n"
1687 "palignr $0xc,%%xmm0,%%xmm1 \n"
1688 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1689 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1690 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1691 "subl $0x8,%[width] \n"
1693 : [y_buf]"+r"(y_buf), // %[y_buf]
1694 [u_buf]"+r"(u_buf), // %[u_buf]
1695 [v_buf]"+r"(v_buf), // %[v_buf]
1696 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
1697 // TODO(fbarchard): Make width a register for 32 bit.
1698 #if defined(__i386__) && defined(__pic__)
1699 [width]"+m"(width) // %[width]
1701 [width]"+rm"(width) // %[width]
1703 : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1704 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1705 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1706 : "memory", "cc", NACL_R14
1707 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
// Convert 8 I422 pixels to 24 bytes of RAW (RGB24 with channels in the
// opposite order) per iteration, using the ARGBToRAW shuffle masks.
1711 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1717 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1718 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1719 "sub %[u_buf],%[v_buf] \n"
1723 YUVTORGB(kYuvConstants)
1724 "punpcklbw %%xmm1,%%xmm0 \n"
1725 "punpcklbw %%xmm2,%%xmm2 \n"
1726 "movdqa %%xmm0,%%xmm1 \n"
1727 "punpcklwd %%xmm2,%%xmm0 \n"
1728 "punpckhwd %%xmm2,%%xmm1 \n"
1729 "pshufb %%xmm5,%%xmm0 \n"
1730 "pshufb %%xmm6,%%xmm1 \n"
1731 "palignr $0xc,%%xmm0,%%xmm1 \n"
1732 "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
1733 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
1734 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
1735 "subl $0x8,%[width] \n"
1737 : [y_buf]"+r"(y_buf), // %[y_buf]
1738 [u_buf]"+r"(u_buf), // %[u_buf]
1739 [v_buf]"+r"(v_buf), // %[v_buf]
1740 [dst_raw]"+r"(dst_raw), // %[dst_raw]
1741 // TODO(fbarchard): Make width a register for 32 bit.
1742 #if defined(__i386__) && defined(__pic__)
1743 [width]"+m"(width) // %[width]
1745 [width]"+rm"(width) // %[width]
1747 : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1748 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1749 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
1750 : "memory", "cc", NACL_R14
1751 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
// Convert 8 I422 pixels (2x1-subsampled U,V) to 8 ARGB pixels per
// iteration using BT.601 kYuvConstants; xmm5 all-ones supplies 0xff alpha.
1755 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1761 "sub %[u_buf],%[v_buf] \n"
1762 "pcmpeqb %%xmm5,%%xmm5 \n"
1766 YUVTORGB(kYuvConstants)
1768 "sub $0x8,%[width] \n"
1770 : [y_buf]"+r"(y_buf), // %[y_buf]
1771 [u_buf]"+r"(u_buf), // %[u_buf]
1772 [v_buf]"+r"(v_buf), // %[v_buf]
1773 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1774 [width]"+rm"(width) // %[width]
1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1776 : "memory", "cc", NACL_R14
1777 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// J422 variant of I422ToARGBRow: identical flow but uses the full-range
// JPEG coefficients (kYuvJConstants) instead of BT.601.
1781 void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
1787 "sub %[u_buf],%[v_buf] \n"
1788 "pcmpeqb %%xmm5,%%xmm5 \n"
1792 YUVTORGB(kYuvConstants)
1794 "sub $0x8,%[width] \n"
1796 : [y_buf]"+r"(y_buf), // %[y_buf]
1797 [u_buf]"+r"(u_buf), // %[u_buf]
1798 [v_buf]"+r"(v_buf), // %[v_buf]
1799 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1800 [width]"+rm"(width) // %[width]
1801 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
1802 : "memory", "cc", NACL_R14
1803 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Convert 8 I411 pixels (4x1-subsampled U,V) to 8 ARGB pixels per
// iteration using BT.601 kYuvConstants; xmm5 all-ones supplies 0xff alpha.
1807 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1813 "sub %[u_buf],%[v_buf] \n"
1814 "pcmpeqb %%xmm5,%%xmm5 \n"
1818 YUVTORGB(kYuvConstants)
1820 "sub $0x8,%[width] \n"
1822 : [y_buf]"+r"(y_buf), // %[y_buf]
1823 [u_buf]"+r"(u_buf), // %[u_buf]
1824 [v_buf]"+r"(v_buf), // %[v_buf]
1825 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1826 [width]"+rm"(width) // %[width]
1827 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1828 : "memory", "cc", NACL_R14
1829 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Convert 8 NV12 pixels (Y plane plus interleaved UV plane) to 8 ARGB
// pixels per iteration using BT.601 kYuvConstants.
1833 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1834 const uint8* uv_buf,
1838 "pcmpeqb %%xmm5,%%xmm5 \n"
1842 YUVTORGB(kYuvConstants)
1844 "sub $0x8,%[width] \n"
1846 : [y_buf]"+r"(y_buf), // %[y_buf]
1847 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1848 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1849 [width]"+rm"(width) // %[width]
1850 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1851 // Does not use r14.
1852 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// NV21 variant of NV12ToARGBRow: chroma plane is in VU order, which is
// handled by using kYvuConstants (swapped coefficient layout).
1856 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1857 const uint8* uv_buf,
1861 "pcmpeqb %%xmm5,%%xmm5 \n"
1865 YUVTORGB(kYuvConstants)
1867 "sub $0x8,%[width] \n"
1869 : [y_buf]"+r"(y_buf), // %[y_buf]
1870 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1871 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1872 [width]"+rm"(width) // %[width]
1873 : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
1874 // Does not use r14.
1875 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Convert 8 I422 pixels to 8 BGRA pixels per iteration (BT.601).
1879 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1885 "sub %[u_buf],%[v_buf] \n"
1886 "pcmpeqb %%xmm5,%%xmm5 \n"
1890 YUVTORGB(kYuvConstants)
1892 "sub $0x8,%[width] \n"
1894 : [y_buf]"+r"(y_buf), // %[y_buf]
1895 [u_buf]"+r"(u_buf), // %[u_buf]
1896 [v_buf]"+r"(v_buf), // %[v_buf]
1897 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
1898 [width]"+rm"(width) // %[width]
1899 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1900 : "memory", "cc", NACL_R14
1901 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Convert 8 I422 pixels to 8 ABGR pixels per iteration (BT.601).
1905 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
1911 "sub %[u_buf],%[v_buf] \n"
1912 "pcmpeqb %%xmm5,%%xmm5 \n"
1916 YUVTORGB(kYuvConstants)
1918 "sub $0x8,%[width] \n"
1920 : [y_buf]"+r"(y_buf), // %[y_buf]
1921 [u_buf]"+r"(u_buf), // %[u_buf]
1922 [v_buf]"+r"(v_buf), // %[v_buf]
1923 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
1924 [width]"+rm"(width) // %[width]
1925 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1926 : "memory", "cc", NACL_R14
1927 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Convert 8 I422 pixels to 8 RGBA pixels per iteration (BT.601).
1931 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1937 "sub %[u_buf],%[v_buf] \n"
1938 "pcmpeqb %%xmm5,%%xmm5 \n"
1942 YUVTORGB(kYuvConstants)
1944 "sub $0x8,%[width] \n"
1946 : [y_buf]"+r"(y_buf), // %[y_buf]
1947 [u_buf]"+r"(u_buf), // %[u_buf]
1948 [v_buf]"+r"(v_buf), // %[v_buf]
1949 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
1950 [width]"+rm"(width) // %[width]
1951 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1952 : "memory", "cc", NACL_R14
1953 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1957 #endif // HAS_I422TOARGBROW_SSSE3
1959 // Read 8 UV from 422, upsample to 16 UV.
1960 #define READYUV422_AVX2 \
1961 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1962 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1963 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1964 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1965 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1966 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1968 // Convert 16 pixels: 16 UV and 16 Y.
// AVX2 equivalent of YUVTORGB: same YuvConstants layout (coefficients at
// byte offsets 0/32/64, kUVBias at 96/128/160, kYToRgb at 192); results
// are left as packed bytes in ymm0 = B, ymm1 = G, ymm2 = R.
1969 #define YUVTORGB_AVX2(YuvConstants) \
1970 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
1971 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
1972 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
1973 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
1974 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1975 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
1976 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
1977 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
1978 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
1979 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1980 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
1981 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
1982 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
1983 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
1984 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
1985 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
1986 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
1987 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1988 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1989 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1990 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1991 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1992 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
1994 #if defined(HAS_I422TOBGRAROW_AVX2)
1996 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
1997 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
2003 "sub %[u_buf],%[v_buf] \n"
2004 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2008 YUVTORGB_AVX2(kYuvConstants)
2010 // Step 3: Weave into BGRA
2011 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
2012 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2013 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
2014 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2015 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
2016 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
2018 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
2019 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
2020 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
2021 "sub $0x10,%[width] \n"
2024 : [y_buf]"+r"(y_buf), // %[y_buf]
2025 [u_buf]"+r"(u_buf), // %[u_buf]
2026 [v_buf]"+r"(v_buf), // %[v_buf]
2027 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
2028 [width]"+rm"(width) // %[width]
2029 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2030 : "memory", "cc", NACL_R14
2031 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2034 #endif // HAS_I422TOBGRAROW_AVX2
2036 #if defined(HAS_I422TOARGBROW_AVX2)
2038 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2039 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2045 "sub %[u_buf],%[v_buf] \n"
2046 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2050 YUVTORGB_AVX2(kYuvConstants)
2052 // Step 3: Weave into ARGB
2053 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
2054 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2055 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
2056 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2057 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
2058 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
2060 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
2061 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2062 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2063 "sub $0x10,%[width] \n"
2066 : [y_buf]"+r"(y_buf), // %[y_buf]
2067 [u_buf]"+r"(u_buf), // %[u_buf]
2068 [v_buf]"+r"(v_buf), // %[v_buf]
2069 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2070 [width]"+rm"(width) // %[width]
2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2072 : "memory", "cc", NACL_R14
2073 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2076 #endif // HAS_I422TOARGBROW_AVX2
2078 #if defined(HAS_J422TOARGBROW_AVX2)
2080 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2081 void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
2087 "sub %[u_buf],%[v_buf] \n"
2088 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2092 YUVTORGB_AVX2(kYuvConstants)
2094 // Step 3: Weave into ARGB
2095 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
2096 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2097 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
2098 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2099 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
2100 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
2102 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
2103 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2104 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2105 "sub $0x10,%[width] \n"
2108 : [y_buf]"+r"(y_buf), // %[y_buf]
2109 [u_buf]"+r"(u_buf), // %[u_buf]
2110 [v_buf]"+r"(v_buf), // %[v_buf]
2111 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2112 [width]"+rm"(width) // %[width]
2113 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
2114 : "memory", "cc", NACL_R14
2115 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2118 #endif // HAS_J422TOARGBROW_AVX2
2120 #if defined(HAS_I422TOABGRROW_AVX2)
2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
2123 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
2129 "sub %[u_buf],%[v_buf] \n"
2130 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2134 YUVTORGB_AVX2(kYuvConstants)
2136 // Step 3: Weave into ABGR
2137 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
2138 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2139 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
2140 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2141 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
2142 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
2143 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2144 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2145 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2146 "sub $0x10,%[width] \n"
2149 : [y_buf]"+r"(y_buf), // %[y_buf]
2150 [u_buf]"+r"(u_buf), // %[u_buf]
2151 [v_buf]"+r"(v_buf), // %[v_buf]
2152 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2153 [width]"+rm"(width) // %[width]
2154 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2155 : "memory", "cc", NACL_R14
2156 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2159 #endif // HAS_I422TOABGRROW_AVX2
2161 #if defined(HAS_I422TORGBAROW_AVX2)
2163 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2164 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2170 "sub %[u_buf],%[v_buf] \n"
2171 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2175 YUVTORGB_AVX2(kYuvConstants)
2177 // Step 3: Weave into RGBA
2178 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2179 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2180 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2181 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2182 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2183 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2184 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2185 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2186 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2187 "sub $0x10,%[width] \n"
2190 : [y_buf]"+r"(y_buf), // %[y_buf]
2191 [u_buf]"+r"(u_buf), // %[u_buf]
2192 [v_buf]"+r"(v_buf), // %[v_buf]
2193 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2194 [width]"+rm"(width) // %[width]
2195 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2196 : "memory", "cc", NACL_R14
2197 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2200 #endif // HAS_I422TORGBAROW_AVX2
2202 #ifdef HAS_I400TOARGBROW_SSE2
// Convert width luma (Y-only) bytes to ARGB: R = G = B = (y - 16) * 1.164,
// alpha forced to 0xff via the 0xff000000 mask built in xmm4.
2203 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2205 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2206 "movd %%eax,%%xmm2 \n"
2207 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2208 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2209 "movd %%eax,%%xmm3 \n"
2210 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2211 "pcmpeqb %%xmm4,%%xmm4 \n"
2212 "pslld $0x18,%%xmm4 \n"
2215 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2216 "movq " MEMACCESS(0) ",%%xmm0 \n"
2217 "lea " MEMLEA(0x8,0) ",%0 \n"
2218 "punpcklbw %%xmm0,%%xmm0 \n"
2219 "pmulhuw %%xmm2,%%xmm0 \n"
2220 "psubusw %%xmm3,%%xmm0 \n"
2221 "psrlw $6, %%xmm0 \n"
2222 "packuswb %%xmm0,%%xmm0 \n"
2224 // Step 2: Weave into ARGB
2225 "punpcklbw %%xmm0,%%xmm0 \n"
2226 "movdqa %%xmm0,%%xmm1 \n"
2227 "punpcklwd %%xmm0,%%xmm0 \n"
2228 "punpckhwd %%xmm1,%%xmm1 \n"
2229 "por %%xmm4,%%xmm0 \n"
2230 "por %%xmm4,%%xmm1 \n"
2231 "movdqu %%xmm0," MEMACCESS(1) " \n"
2232 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2233 "lea " MEMLEA(0x20,1) ",%1 \n"
2237 : "+r"(y_buf), // %0
2238 "+r"(dst_argb), // %1
2241 : "memory", "cc", "eax"
2242 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2245 #endif // HAS_I400TOARGBROW_SSE2
2247 #ifdef HAS_I400TOARGBROW_AVX2
2248 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2249 // note: vpunpcklbw mutates and vpackuswb unmutates.
// AVX2 version of I400ToARGBRow: R = G = B = (y - 16) * 1.164, alpha 0xff.
// NOTE: the two constant comments below were previously swapped; they now
// match the pairing in I400ToARGBRow_SSE2.
2250 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2252 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2253 "vmovd %%eax,%%xmm2 \n"
2254 "vbroadcastss %%xmm2,%%ymm2 \n"
2255 "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2256 "vmovd %%eax,%%xmm3 \n"
2257 "vbroadcastss %%xmm3,%%ymm3 \n"
2258 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
2259 "vpslld $0x18,%%ymm4,%%ymm4 \n"
2263 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2264 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
2265 "lea " MEMLEA(0x10,0) ",%0 \n"
2266 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2267 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
2268 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
2269 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
2270 "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
2271 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2272 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
2273 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2274 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
2275 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
2276 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
2277 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
2278 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2279 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2280 "lea " MEMLEA(0x40,1) ",%1 \n"
2284 : "+r"(y_buf), // %0
2285 "+r"(dst_argb), // %1
2288 : "memory", "cc", "eax"
2289 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2292 #endif // HAS_I400TOARGBROW_AVX2
2294 #ifdef HAS_MIRRORROW_SSSE3
2295 // Shuffle table for reversing the bytes.
2296 static uvec8 kShuffleMirror = {
2297 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
// Mirror a row of width bytes: reads 16 bytes at a time from the end of
// src working backwards ((%0,%2) - 0x10), byte-reverses each chunk with
// the kShuffleMirror pshufb mask, and stores forward into dst.
2300 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2301 intptr_t temp_width = (intptr_t)(width);
2303 "movdqa %3,%%xmm5 \n"
2306 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2307 "pshufb %%xmm5,%%xmm0 \n"
2308 "movdqu %%xmm0," MEMACCESS(1) " \n"
2309 "lea " MEMLEA(0x10,1) ",%1 \n"
2314 "+r"(temp_width) // %2
2315 : "m"(kShuffleMirror) // %3
2316 : "memory", "cc", NACL_R14
2320 #endif // HAS_MIRRORROW_SSSE3
2322 #ifdef HAS_MIRRORROW_AVX2
// AVX2 mirror: 32 bytes per iteration. vpshufb reverses bytes within each
// 128-bit lane, then vpermq $0x4e swaps the two lanes to complete the
// full 32-byte reversal.
2323 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2324 intptr_t temp_width = (intptr_t)(width);
2326 "vbroadcastf128 %3,%%ymm5 \n"
2329 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
2330 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
2331 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
2332 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2333 "lea " MEMLEA(0x20,1) ",%1 \n"
2339 "+r"(temp_width) // %2
2340 : "m"(kShuffleMirror) // %3
2341 : "memory", "cc", NACL_R14
2345 #endif // HAS_MIRRORROW_AVX2
2347 #ifdef HAS_MIRRORROW_SSE2
// SSE2 fallback byte mirror (no pshufb available): swaps bytes within each
// word (psllw/psrlw/por), reverses words within each qword (pshuflw/pshufhw
// $0x1b), then swaps the two qwords (pshufd $0x4e).
2348 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2349 intptr_t temp_width = (intptr_t)(width);
2353 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2354 "movdqa %%xmm0,%%xmm1 \n"
2355 "psllw $0x8,%%xmm0 \n"
2356 "psrlw $0x8,%%xmm1 \n"
2357 "por %%xmm1,%%xmm0 \n"
2358 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2359 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2360 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2361 "movdqu %%xmm0," MEMACCESS(1) " \n"
2362 "lea " MEMLEA(0x10,1)",%1 \n"
2367 "+r"(temp_width) // %2
2369 : "memory", "cc", NACL_R14
2373 #endif // HAS_MIRRORROW_SSE2
2375 #ifdef HAS_MIRRORROW_UV_SSSE3
2376 // Shuffle table for reversing the bytes of UV channels.
// pshufb control that simultaneously reverses UV pairs and de-interleaves:
// even source bytes (U) land in the low 8 bytes, odd bytes (V) in the high 8.
2377 static uvec8 kShuffleMirrorUV = {
2378 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
// Mirror an interleaved UV row into separate U and V planes. Walks src
// backwards (lea -0x10); movlpd writes 8 U bytes to dst_u, movhpd writes
// 8 V bytes to dst_v (addressed as dst_u + %2 offset).
2380 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2382 intptr_t temp_width = (intptr_t)(width);
2384 "movdqa %4,%%xmm1 \n"
2385 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
2389 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2390 "lea " MEMLEA(-0x10,0) ",%0 \n"
2391 "pshufb %%xmm1,%%xmm0 \n"
2392 "movlpd %%xmm0," MEMACCESS(1) " \n"
2393 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
2394 "lea " MEMLEA(0x8,1) ",%1 \n"
2400 "+r"(temp_width) // %3
2401 : "m"(kShuffleMirrorUV) // %4
2402 : "memory", "cc", NACL_R14
2406 #endif // HAS_MIRRORROW_UV_SSSE3
2408 #ifdef HAS_ARGBMIRRORROW_SSE2
// Mirror a row of 32-bit ARGB pixels: starts at the last 16-byte group
// (src + width*4 - 16), reverses the four dwords with pshufd $0x1b, and
// walks src backwards while dst advances forwards.
2410 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2411 intptr_t temp_width = (intptr_t)(width);
2413 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
2416 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2417 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
2418 "lea " MEMLEA(-0x10,0) ",%0 \n"
2419 "movdqu %%xmm0," MEMACCESS(1) " \n"
2420 "lea " MEMLEA(0x10,1) ",%1 \n"
2425 "+r"(temp_width) // %2
2431 #endif // HAS_ARGBMIRRORROW_SSE2
2433 #ifdef HAS_ARGBMIRRORROW_AVX2
2434 // Shuffle table for reversing the bytes.
// vpermd control reversing the eight 32-bit pixels of a YMM register.
2435 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2436 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
// AVX2 ARGB mirror: a single cross-lane vpermd reverses 8 pixels loaded
// from the tail of src per iteration.
2438 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2439 intptr_t temp_width = (intptr_t)(width);
2441 "vmovdqu %3,%%ymm5 \n"
2444 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2445 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2446 "lea " MEMLEA(0x20,1) ",%1 \n"
2452 "+r"(temp_width) // %2
2453 : "m"(kARGBShuffleMirror_AVX2) // %3
2454 : "memory", "cc", NACL_R14
2458 #endif // HAS_ARGBMIRRORROW_AVX2
2460 #ifdef HAS_SPLITUVROW_AVX2
// De-interleave a packed UV row into separate U and V planes, 32 pairs per
// iteration. ymm5 = 0x00ff word mask; even bytes (U) are masked+packed to
// dst_u, odd bytes (V) are shifted down+packed to dst_v; vpermq $0xd8 fixes
// the lane order after the in-lane vpackuswb.
2461 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2463 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2464 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2468 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2469 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2470 "lea " MEMLEA(0x40,0) ",%0 \n"
2471 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
2472 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
2473 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2474 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2475 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2476 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
2477 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2478 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2479 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2480 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
2481 "lea " MEMLEA(0x20,1) ",%1 \n"
2485 : "+r"(src_uv), // %0
2490 : "memory", "cc", NACL_R14
2491 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2494 #endif // HAS_SPLITUVROW_AVX2
2496 #ifdef HAS_SPLITUVROW_SSE2
// SSE2 version of SplitUVRow: 16 UV pairs per iteration. Copies of the
// input are kept in xmm2/xmm3 so U (low bytes, masked) and V (high bytes,
// shifted) can each be packed and stored.
2497 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2499 "pcmpeqb %%xmm5,%%xmm5 \n"
2500 "psrlw $0x8,%%xmm5 \n"
2504 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2505 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2506 "lea " MEMLEA(0x20,0) ",%0 \n"
2507 "movdqa %%xmm0,%%xmm2 \n"
2508 "movdqa %%xmm1,%%xmm3 \n"
2509 "pand %%xmm5,%%xmm0 \n"
2510 "pand %%xmm5,%%xmm1 \n"
2511 "packuswb %%xmm1,%%xmm0 \n"
2512 "psrlw $0x8,%%xmm2 \n"
2513 "psrlw $0x8,%%xmm3 \n"
2514 "packuswb %%xmm3,%%xmm2 \n"
2515 "movdqu %%xmm0," MEMACCESS(1) " \n"
2516 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
2517 "lea " MEMLEA(0x10,1) ",%1 \n"
2520 : "+r"(src_uv), // %0
2525 : "memory", "cc", NACL_R14
2526 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2529 #endif // HAS_SPLITUVROW_SSE2
2531 #ifdef HAS_MERGEUVROW_AVX2
// Interleave separate U and V planes into packed UV. src_v is addressed as
// src_u + %1 offset. vpunpcklbw/hbw interleave within 128-bit lanes, and the
// four vextractf128 stores put the lane halves back in correct row order.
2532 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2538 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2539 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
2540 "lea " MEMLEA(0x20,0) ",%0 \n"
2541 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
2542 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
2543 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
2544 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2545 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2546 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2547 "lea " MEMLEA(0x40,2) ",%2 \n"
2551 : "+r"(src_u), // %0
2556 : "memory", "cc", NACL_R14
2557 "xmm0", "xmm1", "xmm2"
2560 #endif // HAS_MERGEUVROW_AVX2
2562 #ifdef HAS_MERGEUVROW_SSE2
// SSE2 UV merge: 16 U bytes + 16 V bytes in, 32 interleaved UV bytes out
// via punpcklbw/punpckhbw.
2563 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2569 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2570 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
2571 "lea " MEMLEA(0x10,0) ",%0 \n"
2572 "movdqa %%xmm0,%%xmm2 \n"
2573 "punpcklbw %%xmm1,%%xmm0 \n"
2574 "punpckhbw %%xmm1,%%xmm2 \n"
2575 "movdqu %%xmm0," MEMACCESS(2) " \n"
2576 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
2577 "lea " MEMLEA(0x20,2) ",%2 \n"
2580 : "+r"(src_u), // %0
2585 : "memory", "cc", NACL_R14
2586 "xmm0", "xmm1", "xmm2"
2589 #endif // HAS_MERGEUVROW_SSE2
2591 #ifdef HAS_COPYROW_SSE2
// Straight memcpy-style row copy, 32 bytes per iteration with unaligned
// SSE2 loads/stores.
2592 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2596 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2597 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2598 "lea " MEMLEA(0x20,0) ",%0 \n"
2599 "movdqu %%xmm0," MEMACCESS(1) " \n"
2600 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2601 "lea " MEMLEA(0x20,1) ",%1 \n"
2612 #endif // HAS_COPYROW_SSE2
2614 #ifdef HAS_COPYROW_AVX
// AVX row copy, 64 bytes per iteration.
2615 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2619 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2620 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2621 "lea " MEMLEA(0x40,0) ",%0 \n"
2622 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2623 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2624 "lea " MEMLEA(0x40,1) ",%1 \n"
2635 #endif // HAS_COPYROW_AVX
2637 #ifdef HAS_COPYROW_ERMS
// Row copy using "rep movsb" for CPUs with Enhanced Rep MovSB/StoSB;
// the byte count is passed in rcx/ecx via the "+c" constraint.
2639 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2640 size_t width_tmp = (size_t)(width);
2642 "rep movsb " MEMMOVESTRING(0,1) " \n"
2645 "+c"(width_tmp) // %2
2650 #endif // HAS_COPYROW_ERMS
2652 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
// Copy only the alpha channel from src to dst, preserving dst BGR.
// xmm0 = 0xff000000 alpha mask, xmm1 = 0x00ffffff color mask; src alpha is
// OR-merged over dst color, 8 pixels per iteration.
2654 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2656 "pcmpeqb %%xmm0,%%xmm0 \n"
2657 "pslld $0x18,%%xmm0 \n"
2658 "pcmpeqb %%xmm1,%%xmm1 \n"
2659 "psrld $0x8,%%xmm1 \n"
2662 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
2663 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
2664 "lea " MEMLEA(0x20,0) ",%0 \n"
2665 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2666 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2667 "pand %%xmm0,%%xmm2 \n"
2668 "pand %%xmm0,%%xmm3 \n"
2669 "pand %%xmm1,%%xmm4 \n"
2670 "pand %%xmm1,%%xmm5 \n"
2671 "por %%xmm4,%%xmm2 \n"
2672 "por %%xmm5,%%xmm3 \n"
2673 "movdqu %%xmm2," MEMACCESS(1) " \n"
2674 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2675 "lea " MEMLEA(0x20,1) ",%1 \n"
2683 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2686 #endif // HAS_ARGBCOPYALPHAROW_SSE2
2688 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
// AVX2 alpha copy: ymm0 = 0x00ffffff per-pixel mask used as the vpblendvb
// selector, so dst's BGR bytes are kept and src's alpha byte wins.
// 16 pixels per iteration.
2690 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2692 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2693 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2696 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
2697 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
2698 "lea " MEMLEA(0x40,0) ",%0 \n"
2699 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2700 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2701 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2702 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2703 "lea " MEMLEA(0x40,1) ",%1 \n"
2712 , "xmm0", "xmm1", "xmm2"
2715 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2717 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// Copy 8 Y (luma) bytes into the alpha channel of 8 ARGB dst pixels.
// Each Y byte is widened to a dword whose top byte is Y (punpcklbw then
// punpck*wd); the 0xff000000/0x00ffffff masks then merge Y-as-alpha over
// dst color. The garbage low words of xmm3 are removed by the alpha mask.
2719 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2721 "pcmpeqb %%xmm0,%%xmm0 \n"
2722 "pslld $0x18,%%xmm0 \n"
2723 "pcmpeqb %%xmm1,%%xmm1 \n"
2724 "psrld $0x8,%%xmm1 \n"
2727 "movq " MEMACCESS(0) ",%%xmm2 \n"
2728 "lea " MEMLEA(0x8,0) ",%0 \n"
2729 "punpcklbw %%xmm2,%%xmm2 \n"
2730 "punpckhwd %%xmm2,%%xmm3 \n"
2731 "punpcklwd %%xmm2,%%xmm2 \n"
2732 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2733 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2734 "pand %%xmm0,%%xmm2 \n"
2735 "pand %%xmm0,%%xmm3 \n"
2736 "pand %%xmm1,%%xmm4 \n"
2737 "pand %%xmm1,%%xmm5 \n"
2738 "por %%xmm4,%%xmm2 \n"
2739 "por %%xmm5,%%xmm3 \n"
2740 "movdqu %%xmm2," MEMACCESS(1) " \n"
2741 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2742 "lea " MEMLEA(0x20,1) ",%1 \n"
2750 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2753 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
2755 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// AVX2 Y-to-alpha copy: vpmovzxbd zero-extends 8 Y bytes to dwords,
// vpslld $0x18 moves Y into the alpha byte, and vpblendvb with the
// 0x00ffffff selector keeps dst BGR. 16 pixels per iteration.
2757 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2759 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2760 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2763 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
2764 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
2765 "lea " MEMLEA(0x10,0) ",%0 \n"
2766 "vpslld $0x18,%%ymm1,%%ymm1 \n"
2767 "vpslld $0x18,%%ymm2,%%ymm2 \n"
2768 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2769 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2770 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2771 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2772 "lea " MEMLEA(0x40,1) ",%1 \n"
2781 , "xmm0", "xmm1", "xmm2"
2784 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
2786 #ifdef HAS_SETROW_X86
// Fill a row with a byte value using "rep stosl" (dword stores), so the
// count is width/4; v8 is replicated into all four bytes of v32.
2787 void SetRow_X86(uint8* dst, uint8 v8, int width) {
2788 size_t width_tmp = (size_t)(width >> 2);
2789 const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
2791 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2793 "+c"(width_tmp) // %1
// Fill a row byte-by-byte with "rep stosb" (fast on ERMS-capable CPUs).
2798 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
2799 size_t width_tmp = (size_t)(width);
2801 "rep stosb " MEMSTORESTRING(al,0) " \n"
2803 "+c"(width_tmp) // %1
// Fill a row of ARGB pixels with a 32-bit value using "rep stosl";
// width is the pixel count (one dword store per pixel).
2808 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
2809 size_t width_tmp = (size_t)(width);
2811 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2812 : "+D"(dst_argb), // %0
2813 "+c"(width_tmp) // %1
2817 #endif // HAS_SETROW_X86
2819 #ifdef HAS_YUY2TOYROW_SSE2
// Extract luma from YUY2 (Y0 U Y1 V ...): Y is in the even bytes, so mask
// words with 0x00ff and pack. 16 Y bytes out per iteration.
2820 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2822 "pcmpeqb %%xmm5,%%xmm5 \n"
2823 "psrlw $0x8,%%xmm5 \n"
2826 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2827 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2828 "lea " MEMLEA(0x20,0) ",%0 \n"
2829 "pand %%xmm5,%%xmm0 \n"
2830 "pand %%xmm5,%%xmm1 \n"
2831 "packuswb %%xmm1,%%xmm0 \n"
2832 "movdqu %%xmm0," MEMACCESS(1) " \n"
2833 "lea " MEMLEA(0x10,1) ",%1 \n"
2836 : "+r"(src_yuy2), // %0
2841 , "xmm0", "xmm1", "xmm5"
// Extract chroma from YUY2, averaging two rows (current and +stride_yuy2
// via pavgb) for 4:2:0 subsampling. Odd bytes (UV) are shifted down and
// packed, then split: U (even) to dst_u, V (odd) to dst_v (dst_u + %2).
2845 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2846 uint8* dst_u, uint8* dst_v, int pix) {
2848 "pcmpeqb %%xmm5,%%xmm5 \n"
2849 "psrlw $0x8,%%xmm5 \n"
2853 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2854 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2855 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2856 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2857 "lea " MEMLEA(0x20,0) ",%0 \n"
2858 "pavgb %%xmm2,%%xmm0 \n"
2859 "pavgb %%xmm3,%%xmm1 \n"
2860 "psrlw $0x8,%%xmm0 \n"
2861 "psrlw $0x8,%%xmm1 \n"
2862 "packuswb %%xmm1,%%xmm0 \n"
2863 "movdqa %%xmm0,%%xmm1 \n"
2864 "pand %%xmm5,%%xmm0 \n"
2865 "packuswb %%xmm0,%%xmm0 \n"
2866 "psrlw $0x8,%%xmm1 \n"
2867 "packuswb %%xmm1,%%xmm1 \n"
2868 "movq %%xmm0," MEMACCESS(1) " \n"
2869 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2870 "lea " MEMLEA(0x8,1) ",%1 \n"
2873 : "+r"(src_yuy2), // %0
2877 : "r"((intptr_t)(stride_yuy2)) // %4
2878 : "memory", "cc", NACL_R14
2879 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Extract chroma from YUY2 for 4:2:2 output — same as YUY2ToUVRow_SSE2 but
// without the second-row average (single-row chroma).
2883 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2884 uint8* dst_u, uint8* dst_v, int pix) {
2886 "pcmpeqb %%xmm5,%%xmm5 \n"
2887 "psrlw $0x8,%%xmm5 \n"
2891 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2892 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2893 "lea " MEMLEA(0x20,0) ",%0 \n"
2894 "psrlw $0x8,%%xmm0 \n"
2895 "psrlw $0x8,%%xmm1 \n"
2896 "packuswb %%xmm1,%%xmm0 \n"
2897 "movdqa %%xmm0,%%xmm1 \n"
2898 "pand %%xmm5,%%xmm0 \n"
2899 "packuswb %%xmm0,%%xmm0 \n"
2900 "psrlw $0x8,%%xmm1 \n"
2901 "packuswb %%xmm1,%%xmm1 \n"
2902 "movq %%xmm0," MEMACCESS(1) " \n"
2903 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2904 "lea " MEMLEA(0x8,1) ",%1 \n"
2907 : "+r"(src_yuy2), // %0
2912 : "memory", "cc", NACL_R14
2913 "xmm0", "xmm1", "xmm5"
// Extract luma from UYVY (U Y0 V Y1 ...): Y is in the odd bytes, so shift
// each word right 8 and pack. 16 Y bytes out per iteration.
2917 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2921 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2922 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2923 "lea " MEMLEA(0x20,0) ",%0 \n"
2924 "psrlw $0x8,%%xmm0 \n"
2925 "psrlw $0x8,%%xmm1 \n"
2926 "packuswb %%xmm1,%%xmm0 \n"
2927 "movdqu %%xmm0," MEMACCESS(1) " \n"
2928 "lea " MEMLEA(0x10,1) ",%1 \n"
2931 : "+r"(src_uyvy), // %0
// Extract chroma from UYVY with two-row averaging (4:2:0). UV sits in the
// even bytes, so words are masked with 0x00ff instead of shifted; the split
// into U/V planes then mirrors the YUY2 variant.
2940 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2941 uint8* dst_u, uint8* dst_v, int pix) {
2943 "pcmpeqb %%xmm5,%%xmm5 \n"
2944 "psrlw $0x8,%%xmm5 \n"
2948 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2949 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2950 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2951 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2952 "lea " MEMLEA(0x20,0) ",%0 \n"
2953 "pavgb %%xmm2,%%xmm0 \n"
2954 "pavgb %%xmm3,%%xmm1 \n"
2955 "pand %%xmm5,%%xmm0 \n"
2956 "pand %%xmm5,%%xmm1 \n"
2957 "packuswb %%xmm1,%%xmm0 \n"
2958 "movdqa %%xmm0,%%xmm1 \n"
2959 "pand %%xmm5,%%xmm0 \n"
2960 "packuswb %%xmm0,%%xmm0 \n"
2961 "psrlw $0x8,%%xmm1 \n"
2962 "packuswb %%xmm1,%%xmm1 \n"
2963 "movq %%xmm0," MEMACCESS(1) " \n"
2964 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2965 "lea " MEMLEA(0x8,1) ",%1 \n"
2968 : "+r"(src_uyvy), // %0
2972 : "r"((intptr_t)(stride_uyvy)) // %4
2973 : "memory", "cc", NACL_R14
2974 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Extract chroma from UYVY for 4:2:2 output — single-row version of
// UYVYToUVRow_SSE2 (no stride/pavgb averaging).
2978 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2979 uint8* dst_u, uint8* dst_v, int pix) {
2981 "pcmpeqb %%xmm5,%%xmm5 \n"
2982 "psrlw $0x8,%%xmm5 \n"
2986 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2987 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2988 "lea " MEMLEA(0x20,0) ",%0 \n"
2989 "pand %%xmm5,%%xmm0 \n"
2990 "pand %%xmm5,%%xmm1 \n"
2991 "packuswb %%xmm1,%%xmm0 \n"
2992 "movdqa %%xmm0,%%xmm1 \n"
2993 "pand %%xmm5,%%xmm0 \n"
2994 "packuswb %%xmm0,%%xmm0 \n"
2995 "psrlw $0x8,%%xmm1 \n"
2996 "packuswb %%xmm1,%%xmm1 \n"
2997 "movq %%xmm0," MEMACCESS(1) " \n"
2998 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2999 "lea " MEMLEA(0x8,1) ",%1 \n"
3002 : "+r"(src_uyvy), // %0
3007 : "memory", "cc", NACL_R14
3008 "xmm0", "xmm1", "xmm5"
3011 #endif // HAS_YUY2TOYROW_SSE2
3013 #ifdef HAS_YUY2TOYROW_AVX2
// AVX2 luma extract from YUY2: mask even bytes, pack, and vpermq to restore
// cross-lane order. 32 Y bytes out per iteration.
3014 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
3016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3017 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3020 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3021 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3022 "lea " MEMLEA(0x40,0) ",%0 \n"
3023 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3024 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3025 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3026 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3027 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3028 "lea " MEMLEA(0x20,1) ",%1 \n"
3032 : "+r"(src_yuy2), // %0
3037 , "xmm0", "xmm1", "xmm5"
// AVX2 chroma extract from YUY2 with two-row averaging (4:2:0):
// vpavgb with the +stride row, shift out Y, pack, then separate U and V
// and store 16 bytes of each via vextractf128.
3041 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3042 uint8* dst_u, uint8* dst_v, int pix) {
3044 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3045 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3049 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3050 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3051 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3052 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3053 "lea " MEMLEA(0x40,0) ",%0 \n"
3054 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3055 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3056 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3057 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3058 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3059 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3060 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3061 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3062 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3063 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3064 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3065 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3066 "lea " MEMLEA(0x10,1) ",%1 \n"
3070 : "+r"(src_yuy2), // %0
3074 : "r"((intptr_t)(stride_yuy2)) // %4
3075 : "memory", "cc", NACL_R14
3076 "xmm0", "xmm1", "xmm5"
// AVX2 4:2:2 chroma extract from YUY2 — single-row version of
// YUY2ToUVRow_AVX2 (no stride/vpavgb averaging).
3080 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3081 uint8* dst_u, uint8* dst_v, int pix) {
3083 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3084 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3088 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3089 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3090 "lea " MEMLEA(0x40,0) ",%0 \n"
3091 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3092 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3093 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3094 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3095 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3096 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3097 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3098 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3099 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3100 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3101 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3102 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3103 "lea " MEMLEA(0x10,1) ",%1 \n"
3107 : "+r"(src_yuy2), // %0
3112 : "memory", "cc", NACL_R14
3113 "xmm0", "xmm1", "xmm5"
// AVX2 luma extract from UYVY: Y is in odd bytes, so shift words right 8,
// pack, and vpermq to restore cross-lane order.
3117 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
3121 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3122 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3123 "lea " MEMLEA(0x40,0) ",%0 \n"
3124 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3125 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3126 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3127 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3128 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3129 "lea " MEMLEA(0x20,1) ",%1 \n"
3133 : "+r"(src_uyvy), // %0
3138 , "xmm0", "xmm1", "xmm5"
// AVX2 chroma extract from UYVY with two-row averaging (4:2:0). UV is in
// the even bytes (vpand mask instead of shift); the U/V split and
// vextractf128 stores mirror YUY2ToUVRow_AVX2.
3141 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3142 uint8* dst_u, uint8* dst_v, int pix) {
3144 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3145 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3150 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3151 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3152 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3153 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3154 "lea " MEMLEA(0x40,0) ",%0 \n"
3155 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3156 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3157 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3158 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3159 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3160 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3161 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3162 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3163 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3164 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3165 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3166 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3167 "lea " MEMLEA(0x10,1) ",%1 \n"
3171 : "+r"(src_uyvy), // %0
3175 : "r"((intptr_t)(stride_uyvy)) // %4
3176 : "memory", "cc", NACL_R14
3177 "xmm0", "xmm1", "xmm5"
// AVX2 4:2:2 chroma extract from UYVY — single-row version of
// UYVYToUVRow_AVX2 (no stride/vpavgb averaging).
3181 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3182 uint8* dst_u, uint8* dst_v, int pix) {
3184 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3185 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3189 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3190 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3191 "lea " MEMLEA(0x40,0) ",%0 \n"
3192 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3193 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3194 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3195 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3196 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3197 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3198 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3199 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3200 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3201 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3202 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3203 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3204 "lea " MEMLEA(0x10,1) ",%1 \n"
3208 : "+r"(src_uyvy), // %0
3213 : "memory", "cc", NACL_R14
3214 "xmm0", "xmm1", "xmm5"
3217 #endif // HAS_YUY2TOYROW_AVX2
3219 #ifdef HAS_ARGBBLENDROW_SSE2
3220 // Blend 8 pixels at a time.
// Alpha-blend src_argb0 over src_argb1 into dst_argb:
//   dst = src0 + src1 * (255 - alpha0), with dst alpha forced to 255 (por xmm4).
// Constants: xmm7 = 0x0001 words (rounding), xmm6 = 0x00ff low-byte mask,
// xmm5 = 0xff00 high-byte mask, xmm4 = 0xff000000 alpha mask. The pxor
// inverts alpha; pshufhw/pshuflw $0xf5 splats (255-a) to all word lanes.
// Main loop does 4 pixels; the second loop body handles 1-pixel leftovers.
// NOTE(review): loop labels/counters are elided in this sampled chunk.
3221 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3222 uint8* dst_argb, int width) {
3224 "pcmpeqb %%xmm7,%%xmm7 \n"
3225 "psrlw $0xf,%%xmm7 \n"
3226 "pcmpeqb %%xmm6,%%xmm6 \n"
3227 "psrlw $0x8,%%xmm6 \n"
3228 "pcmpeqb %%xmm5,%%xmm5 \n"
3229 "psllw $0x8,%%xmm5 \n"
3230 "pcmpeqb %%xmm4,%%xmm4 \n"
3231 "pslld $0x18,%%xmm4 \n"
// Blend 4 pixels per iteration.
3238 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3239 "lea " MEMLEA(0x10,0) ",%0 \n"
3240 "movdqa %%xmm3,%%xmm0 \n"
3241 "pxor %%xmm4,%%xmm3 \n"
3242 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3243 "psrlw $0x8,%%xmm3 \n"
3244 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3245 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3246 "pand %%xmm6,%%xmm2 \n"
3247 "paddw %%xmm7,%%xmm3 \n"
3248 "pmullw %%xmm3,%%xmm2 \n"
3249 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3250 "lea " MEMLEA(0x10,1) ",%1 \n"
3251 "psrlw $0x8,%%xmm1 \n"
3252 "por %%xmm4,%%xmm0 \n"
3253 "pmullw %%xmm3,%%xmm1 \n"
3254 "psrlw $0x8,%%xmm2 \n"
3255 "paddusb %%xmm2,%%xmm0 \n"
3256 "pand %%xmm5,%%xmm1 \n"
3257 "paddusb %%xmm1,%%xmm0 \n"
3258 "movdqu %%xmm0," MEMACCESS(2) " \n"
3259 "lea " MEMLEA(0x10,2) ",%2 \n"
// Leftover: blend 1 pixel at a time with movd.
3269 "movd " MEMACCESS(0) ",%%xmm3 \n"
3270 "lea " MEMLEA(0x4,0) ",%0 \n"
3271 "movdqa %%xmm3,%%xmm0 \n"
3272 "pxor %%xmm4,%%xmm3 \n"
3273 "movd " MEMACCESS(1) ",%%xmm2 \n"
3274 "psrlw $0x8,%%xmm3 \n"
3275 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3276 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3277 "pand %%xmm6,%%xmm2 \n"
3278 "paddw %%xmm7,%%xmm3 \n"
3279 "pmullw %%xmm3,%%xmm2 \n"
3280 "movd " MEMACCESS(1) ",%%xmm1 \n"
3281 "lea " MEMLEA(0x4,1) ",%1 \n"
3282 "psrlw $0x8,%%xmm1 \n"
3283 "por %%xmm4,%%xmm0 \n"
3284 "pmullw %%xmm3,%%xmm1 \n"
3285 "psrlw $0x8,%%xmm2 \n"
3286 "paddusb %%xmm2,%%xmm0 \n"
3287 "pand %%xmm5,%%xmm1 \n"
3288 "paddusb %%xmm1,%%xmm0 \n"
3289 "movd %%xmm0," MEMACCESS(2) " \n"
3290 "lea " MEMLEA(0x4,2) ",%2 \n"
3294 : "+r"(src_argb0), // %0
3295 "+r"(src_argb1), // %1
3296 "+r"(dst_argb), // %2
3300 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3303 #endif // HAS_ARGBBLENDROW_SSE2
3305 #ifdef HAS_ARGBBLENDROW_SSSE3
3306 // Shuffle table for isolating alpha.
// pshufb control that splats each pixel's alpha byte (offsets 3/7/11/15)
// into the low byte of every word lane; 0x80 entries zero the high bytes.
3307 static uvec8 kShuffleAlpha = {
3308 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3309 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
// SSSE3 alpha blend — identical math to ARGBBlendRow_SSE2 but the
// three-instruction alpha splat is replaced by one pshufb (see note below).
3312 // Blend 8 pixels at a time
3313 // Shuffle table for reversing the bytes.
3315 // Same as SSE2, but replaces
3316 // psrlw xmm3, 8 // alpha
3317 // pshufhw xmm3, xmm3,0F5h // 8 alpha words
3318 // pshuflw xmm3, xmm3,0F5h
3320 // pshufb xmm3, kShuffleAlpha // alpha
3322 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3323 uint8* dst_argb, int width) {
3325 "pcmpeqb %%xmm7,%%xmm7 \n"
3326 "psrlw $0xf,%%xmm7 \n"
3327 "pcmpeqb %%xmm6,%%xmm6 \n"
3328 "psrlw $0x8,%%xmm6 \n"
3329 "pcmpeqb %%xmm5,%%xmm5 \n"
3330 "psllw $0x8,%%xmm5 \n"
3331 "pcmpeqb %%xmm4,%%xmm4 \n"
3332 "pslld $0x18,%%xmm4 \n"
// Blend 4 pixels per iteration.
3339 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3340 "lea " MEMLEA(0x10,0) ",%0 \n"
3341 "movdqa %%xmm3,%%xmm0 \n"
3342 "pxor %%xmm4,%%xmm3 \n"
3343 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3344 "pshufb %4,%%xmm3 \n"
3345 "pand %%xmm6,%%xmm2 \n"
3346 "paddw %%xmm7,%%xmm3 \n"
3347 "pmullw %%xmm3,%%xmm2 \n"
3348 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3349 "lea " MEMLEA(0x10,1) ",%1 \n"
3350 "psrlw $0x8,%%xmm1 \n"
3351 "por %%xmm4,%%xmm0 \n"
3352 "pmullw %%xmm3,%%xmm1 \n"
3353 "psrlw $0x8,%%xmm2 \n"
3354 "paddusb %%xmm2,%%xmm0 \n"
3355 "pand %%xmm5,%%xmm1 \n"
3356 "paddusb %%xmm1,%%xmm0 \n"
3357 "movdqu %%xmm0," MEMACCESS(2) " \n"
3358 "lea " MEMLEA(0x10,2) ",%2 \n"
// Leftover: blend 1 pixel at a time with movd.
3368 "movd " MEMACCESS(0) ",%%xmm3 \n"
3369 "lea " MEMLEA(0x4,0) ",%0 \n"
3370 "movdqa %%xmm3,%%xmm0 \n"
3371 "pxor %%xmm4,%%xmm3 \n"
3372 "movd " MEMACCESS(1) ",%%xmm2 \n"
3373 "pshufb %4,%%xmm3 \n"
3374 "pand %%xmm6,%%xmm2 \n"
3375 "paddw %%xmm7,%%xmm3 \n"
3376 "pmullw %%xmm3,%%xmm2 \n"
3377 "movd " MEMACCESS(1) ",%%xmm1 \n"
3378 "lea " MEMLEA(0x4,1) ",%1 \n"
3379 "psrlw $0x8,%%xmm1 \n"
3380 "por %%xmm4,%%xmm0 \n"
3381 "pmullw %%xmm3,%%xmm1 \n"
3382 "psrlw $0x8,%%xmm2 \n"
3383 "paddusb %%xmm2,%%xmm0 \n"
3384 "pand %%xmm5,%%xmm1 \n"
3385 "paddusb %%xmm1,%%xmm0 \n"
3386 "movd %%xmm0," MEMACCESS(2) " \n"
3387 "lea " MEMLEA(0x4,2) ",%2 \n"
3391 : "+r"(src_argb0), // %0
3392 "+r"(src_argb1), // %1
3393 "+r"(dst_argb), // %2
3395 : "m"(kShuffleAlpha) // %4
3397 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3400 #endif // HAS_ARGBBLENDROW_SSSE3
3402 #ifdef HAS_ARGBATTENUATEROW_SSE2
3403 // Attenuate 4 pixels at a time.
// Premultiply alpha into BGR (attenuate): each channel is multiplied by its
// pixel's alpha via pmulhuw on bytes widened with punpck*bw; the original
// alpha byte is preserved through the xmm4 mask. 4 pixels per iteration.
3404 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3406 "pcmpeqb %%xmm4,%%xmm4 \n"
3407 "pslld $0x18,%%xmm4 \n"
3408 "pcmpeqb %%xmm5,%%xmm5 \n"
3409 "psrld $0x8,%%xmm5 \n"
3414 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3415 "punpcklbw %%xmm0,%%xmm0 \n"
3416 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3417 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3418 "pmulhuw %%xmm2,%%xmm0 \n"
3419 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3420 "punpckhbw %%xmm1,%%xmm1 \n"
3421 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3422 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3423 "pmulhuw %%xmm2,%%xmm1 \n"
3424 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3425 "lea " MEMLEA(0x10,0) ",%0 \n"
3426 "psrlw $0x8,%%xmm0 \n"
3427 "pand %%xmm4,%%xmm2 \n"
3428 "psrlw $0x8,%%xmm1 \n"
3429 "packuswb %%xmm1,%%xmm0 \n"
3430 "pand %%xmm5,%%xmm0 \n"
3431 "por %%xmm2,%%xmm0 \n"
3432 "movdqu %%xmm0," MEMACCESS(1) " \n"
3433 "lea " MEMLEA(0x10,1) ",%1 \n"
3436 : "+r"(src_argb), // %0
3437 "+r"(dst_argb), // %1
3441 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3444 #endif // HAS_ARGBATTENUATEROW_SSE2
3446 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3447 // Shuffle table duplicating alpha
// pshufb controls splatting alpha (byte 3 / byte 7 and byte 11 / byte 15)
// across the first six bytes of each 8-byte half; 128u entries produce 0,
// leaving the alpha position itself unscaled.
3448 static uvec8 kShuffleAlpha0 = {
3449 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3451 static uvec8 kShuffleAlpha1 = {
3452 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3453 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
// SSSE3 attenuate: same premultiply as the SSE2 version but uses pshufb
// alpha splats (kShuffleAlpha0/1) instead of pshufhw/pshuflw pairs.
3455 // Attenuate 4 pixels at a time.
3456 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3458 "pcmpeqb %%xmm3,%%xmm3 \n"
3459 "pslld $0x18,%%xmm3 \n"
3460 "movdqa %3,%%xmm4 \n"
3461 "movdqa %4,%%xmm5 \n"
3466 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3467 "pshufb %%xmm4,%%xmm0 \n"
3468 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3469 "punpcklbw %%xmm1,%%xmm1 \n"
3470 "pmulhuw %%xmm1,%%xmm0 \n"
3471 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3472 "pshufb %%xmm5,%%xmm1 \n"
3473 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3474 "punpckhbw %%xmm2,%%xmm2 \n"
3475 "pmulhuw %%xmm2,%%xmm1 \n"
3476 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3477 "lea " MEMLEA(0x10,0) ",%0 \n"
3478 "pand %%xmm3,%%xmm2 \n"
3479 "psrlw $0x8,%%xmm0 \n"
3480 "psrlw $0x8,%%xmm1 \n"
3481 "packuswb %%xmm1,%%xmm0 \n"
3482 "por %%xmm2,%%xmm0 \n"
3483 "movdqu %%xmm0," MEMACCESS(1) " \n"
3484 "lea " MEMLEA(0x10,1) ",%1 \n"
3487 : "+r"(src_argb), // %0
3488 "+r"(dst_argb), // %1
3490 : "m"(kShuffleAlpha0), // %3
3491 "m"(kShuffleAlpha1) // %4
3493 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3496 #endif // HAS_ARGBATTENUATEROW_SSSE3
3498 #ifdef HAS_ARGBATTENUATEROW_AVX2
3499 // Shuffle table duplicating alpha.
// pshufb control splatting the alpha word (bytes 6-7 / 14-15 of the
// byte-widened pixel) across the B/G/R word lanes; 128u entries give 0.
3500 static const uvec8 kShuffleAlpha_AVX2 = {
3501 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
// AVX2 attenuate: widen 8 pixels with vpunpck*bw, splat alpha per pixel via
// vpshufb, premultiply with vpmulhuw, and restore the original alpha bytes
// through the 0xff000000 mask in ymm5.
3503 // Attenuate 8 pixels at a time.
3504 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3506 "vbroadcastf128 %3,%%ymm4 \n"
3507 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3508 "vpslld $0x18,%%ymm5,%%ymm5 \n"
3514 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3515 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3516 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3517 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
3518 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
3519 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3520 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3521 "vpand %%ymm5,%%ymm6,%%ymm6 \n"
3522 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3523 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3524 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3525 "vpor %%ymm6,%%ymm0,%%ymm0 \n"
3526 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3527 "lea " MEMLEA(0x20,0) ",%0 \n"
3531 : "+r"(src_argb), // %0
3532 "+r"(dst_argb), // %1
3534 : "m"(kShuffleAlpha_AVX2) // %3
3536 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3539 #endif // HAS_ARGBATTENUATEROW_AVX2
3541 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3542 // Unattenuate 4 pixels at a time.
// Un-premultiply alpha: each pixel's channels are multiplied by the
// fixed-point reciprocal of its alpha, looked up per pixel from
// fixed_invtbl8 (%4) using the alpha byte loaded with movzb into %3.
// pshuflw $0x40 + movlhps replicate the reciprocal across the channel
// words. 4 pixels per iteration.
3543 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3545 uintptr_t alpha = 0;
3550 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3551 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3552 "punpcklbw %%xmm0,%%xmm0 \n"
3553 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3554 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3555 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3556 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3557 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3558 "movlhps %%xmm3,%%xmm2 \n"
3559 "pmulhuw %%xmm2,%%xmm0 \n"
3560 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3561 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3562 "punpckhbw %%xmm1,%%xmm1 \n"
3563 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3564 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3565 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3566 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3567 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3568 "movlhps %%xmm3,%%xmm2 \n"
3569 "pmulhuw %%xmm2,%%xmm1 \n"
3570 "lea " MEMLEA(0x10,0) ",%0 \n"
3571 "packuswb %%xmm1,%%xmm0 \n"
3572 "movdqu %%xmm0," MEMACCESS(1) " \n"
3573 "lea " MEMLEA(0x10,1) ",%1 \n"
3576 : "+r"(src_argb), // %0
3577 "+r"(dst_argb), // %1
3580 : "r"(fixed_invtbl8) // %4
3581 : "memory", "cc", NACL_R14
3582 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3585 #endif // HAS_ARGBUNATTENUATEROW_SSE2
3587 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3588 // Shuffle table duplicating alpha.
// pshufb control spreading each pixel's reciprocal words over B/G/R while
// leaving the alpha word (offsets 6-7 / 14-15) in place.
3589 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3590 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
// AVX2 un-premultiply: gathers eight per-pixel alpha reciprocals from
// fixed_invtbl8 with scalar movzb/vmovd loads, assembles them into ymm3,
// then multiplies the widened pixels. 8 pixels per iteration.
3592 // Unattenuate 8 pixels at a time.
3593 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3595 uintptr_t alpha = 0;
3598 "vbroadcastf128 %5,%%ymm5 \n"
// Load 8 alpha reciprocals (one per pixel, alpha bytes at 0x03,0x07,...,0x1f).
3604 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3605 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3606 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3607 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3608 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3609 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
3610 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3611 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3612 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3613 "movzb " MEMACCESS2(0x13,0) ",%3 \n"
3614 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
3615 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3616 "movzb " MEMACCESS2(0x17,0) ",%3 \n"
3617 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3618 "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
3619 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
3620 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3621 "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
3622 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3623 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
3624 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
3625 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
3626 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// Widen pixels, spread reciprocals via kUnattenShuffleAlpha_AVX2, multiply.
3629 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3630 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3631 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3632 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
3633 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
3634 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
3635 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
3636 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3637 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3638 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3639 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3640 "lea " MEMLEA(0x20,0) ",%0 \n"
3644 : "+r"(src_argb), // %0
3645 "+r"(dst_argb), // %1
3648 : "r"(fixed_invtbl8), // %4
3649 "m"(kUnattenShuffleAlpha_AVX2) // %5
3650 : "memory", "cc", NACL_R14
3651 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3654 #endif // HAS_ARGBUNATTENUATEROW_AVX2
3656 #ifdef HAS_ARGBGRAYROW_SSSE3
3657 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Convert ARGB pixels to gray ARGB, 8 pixels (32 bytes) per loop iteration:
// luma is computed with pmaddubsw against kARGBToYJ (BT.601 full-range-ish
// weights) with rounding via %4, the alpha channel is extracted separately
// (psrld $0x18) and the gray value is replicated into B, G and R.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
3658 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3660     "movdqa    %3,%%xmm4                       \n"
3661     "movdqa    %4,%%xmm5                       \n"
3666     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3667     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3668     "pmaddubsw %%xmm4,%%xmm0                   \n"
3669     "pmaddubsw %%xmm4,%%xmm1                   \n"
3670     "phaddw    %%xmm1,%%xmm0                   \n"
3671     "paddw     %%xmm5,%%xmm0                   \n"
3672     "psrlw     $0x7,%%xmm0                     \n"
3673     "packuswb  %%xmm0,%%xmm0                   \n"
// Re-read the source to pull out alpha, then interleave gray and alpha
// back into 8 ARGB pixels.
3674     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3675     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
3676     "lea       " MEMLEA(0x20,0) ",%0           \n"
3677     "psrld     $0x18,%%xmm2                    \n"
3678     "psrld     $0x18,%%xmm3                    \n"
3679     "packuswb  %%xmm3,%%xmm2                   \n"
3680     "packuswb  %%xmm2,%%xmm2                   \n"
3681     "movdqa    %%xmm0,%%xmm3                   \n"
3682     "punpcklbw %%xmm0,%%xmm0                   \n"
3683     "punpcklbw %%xmm2,%%xmm3                   \n"
3684     "movdqa    %%xmm0,%%xmm1                   \n"
3685     "punpcklwd %%xmm3,%%xmm0                   \n"
3686     "punpckhwd %%xmm3,%%xmm1                   \n"
3687     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3688     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
3689     "lea       " MEMLEA(0x20,1) ",%1           \n"
3692   : "+r"(src_argb),   // %0
3693     "+r"(dst_argb),   // %1
3695   : "m"(kARGBToYJ),   // %3
3698     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3701 #endif // HAS_ARGBGRAYROW_SSSE3
3703 #ifdef HAS_ARGBSEPIAROW_SSSE3
3704 // b = (r * 35 + g * 68 + b * 17) >> 7
3705 // g = (r * 45 + g * 88 + b * 22) >> 7
3706 // r = (r * 50 + g * 98 + b * 24) >> 7
3707 // Constant for ARGB color to sepia tone
// Per-channel sepia weight vectors for pmaddubsw. Byte order within each
// 4-byte group is B,G,R,A (little-endian ARGB memory layout); the alpha
// weight is 0 so alpha does not contribute to the weighted sum.
3708 static vec8 kARGBToSepiaB = {
3709   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3712 static vec8 kARGBToSepiaG = {
3713   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3716 static vec8 kARGBToSepiaR = {
3717   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3720 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Apply a sepia tone in place, 8 ARGB pixels (32 bytes) per iteration.
// Each output channel is a pmaddubsw weighted sum of B/G/R (constants
// kARGBToSepiaB/G/R) shifted right by 7; source alpha is preserved.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
3721 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3723     "movdqa    %2,%%xmm2                       \n"
3724     "movdqa    %3,%%xmm3                       \n"
3725     "movdqa    %4,%%xmm4                       \n"
// Blue channel.
3730     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3731     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
3732     "pmaddubsw %%xmm2,%%xmm0                   \n"
3733     "pmaddubsw %%xmm2,%%xmm6                   \n"
3734     "phaddw    %%xmm6,%%xmm0                   \n"
3735     "psrlw     $0x7,%%xmm0                     \n"
3736     "packuswb  %%xmm0,%%xmm0                   \n"
// Green channel, interleaved with blue.
3737     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3738     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3739     "pmaddubsw %%xmm3,%%xmm5                   \n"
3740     "pmaddubsw %%xmm3,%%xmm1                   \n"
3741     "phaddw    %%xmm1,%%xmm5                   \n"
3742     "psrlw     $0x7,%%xmm5                     \n"
3743     "packuswb  %%xmm5,%%xmm5                   \n"
3744     "punpcklbw %%xmm5,%%xmm0                   \n"
// Red channel, then original alpha extracted with psrld $0x18.
3745     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3746     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3747     "pmaddubsw %%xmm4,%%xmm5                   \n"
3748     "pmaddubsw %%xmm4,%%xmm1                   \n"
3749     "phaddw    %%xmm1,%%xmm5                   \n"
3750     "psrlw     $0x7,%%xmm5                     \n"
3751     "packuswb  %%xmm5,%%xmm5                   \n"
3752     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3753     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3754     "psrld     $0x18,%%xmm6                    \n"
3755     "psrld     $0x18,%%xmm1                    \n"
3756     "packuswb  %%xmm1,%%xmm6                   \n"
3757     "packuswb  %%xmm6,%%xmm6                   \n"
3758     "punpcklbw %%xmm6,%%xmm5                   \n"
// Interleave BG with RA pairs and store back in place.
3759     "movdqa    %%xmm0,%%xmm1                   \n"
3760     "punpcklwd %%xmm5,%%xmm0                   \n"
3761     "punpckhwd %%xmm5,%%xmm1                   \n"
3762     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
3763     "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
3764     "lea       " MEMLEA(0x20,0) ",%0           \n"
3767   : "+r"(dst_argb),      // %0
3769   : "m"(kARGBToSepiaB),  // %2
3770     "m"(kARGBToSepiaG),  // %3
3771     "m"(kARGBToSepiaR)   // %4
3773     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3776 #endif // HAS_ARGBSEPIAROW_SSSE3
3778 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3779 // Transform 8 ARGB pixels (32 bytes) with color matrix.
3780 // Same as Sepia except matrix is provided.
// Transform ARGB pixels through a caller-supplied 4x4 signed byte color
// matrix, 8 pixels (32 bytes) per iteration. Works like the sepia routine
// but the four weight rows come from matrix_argb (broadcast via pshufd);
// sums use phaddsw/psraw $6, so matrix values are 6-bit fixed point.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
3781 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
3782                               const int8* matrix_argb, int width) {
// Broadcast the four 4-byte matrix rows into xmm2..xmm5.
3784     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
3785     "pshufd    $0x00,%%xmm5,%%xmm2             \n"
3786     "pshufd    $0x55,%%xmm5,%%xmm3             \n"
3787     "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
3788     "pshufd    $0xff,%%xmm5,%%xmm5             \n"
// B and G output channels.
3793     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3794     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3795     "pmaddubsw %%xmm2,%%xmm0                   \n"
3796     "pmaddubsw %%xmm2,%%xmm7                   \n"
3797     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3798     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3799     "pmaddubsw %%xmm3,%%xmm6                   \n"
3800     "pmaddubsw %%xmm3,%%xmm1                   \n"
3801     "phaddsw   %%xmm7,%%xmm0                   \n"
3802     "phaddsw   %%xmm1,%%xmm6                   \n"
3803     "psraw     $0x6,%%xmm0                     \n"
3804     "psraw     $0x6,%%xmm6                     \n"
3805     "packuswb  %%xmm0,%%xmm0                   \n"
3806     "packuswb  %%xmm6,%%xmm6                   \n"
3807     "punpcklbw %%xmm6,%%xmm0                   \n"
// R and A output channels.
3808     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3809     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3810     "pmaddubsw %%xmm4,%%xmm1                   \n"
3811     "pmaddubsw %%xmm4,%%xmm7                   \n"
3812     "phaddsw   %%xmm7,%%xmm1                   \n"
3813     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3814     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3815     "pmaddubsw %%xmm5,%%xmm6                   \n"
3816     "pmaddubsw %%xmm5,%%xmm7                   \n"
3817     "phaddsw   %%xmm7,%%xmm6                   \n"
3818     "psraw     $0x6,%%xmm1                     \n"
3819     "psraw     $0x6,%%xmm6                     \n"
3820     "packuswb  %%xmm1,%%xmm1                   \n"
3821     "packuswb  %%xmm6,%%xmm6                   \n"
3822     "punpcklbw %%xmm6,%%xmm1                   \n"
// Interleave BG and RA into final pixels and advance both pointers.
3823     "movdqa    %%xmm0,%%xmm6                   \n"
3824     "punpcklwd %%xmm1,%%xmm0                   \n"
3825     "punpckhwd %%xmm1,%%xmm6                   \n"
3826     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3827     "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
3828     "lea       " MEMLEA(0x20,0) ",%0           \n"
3829     "lea       " MEMLEA(0x20,1) ",%1           \n"
3832   : "+r"(src_argb),      // %0
3833     "+r"(dst_argb),      // %1
3835   : "r"(matrix_argb)     // %3
3837     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3840 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3842 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3843 // Quantize 4 ARGB pixels (16 bytes).
// Posterize (quantize) ARGB in place, 4 pixels (16 bytes) per iteration:
// channel = (channel * scale >> 16) * interval_size + interval_offset.
// Alpha bytes are preserved via the pand/por mask in xmm6.
// NOTE(review): this numbered listing elides some lines (initial movd loads
// of the scalar args, loop label, width branch), so the asm is not complete.
3844 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3845                           int interval_offset, int width) {
// Broadcast scale / interval_size / interval_offset to all 8 words.
3850     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3851     "pshufd    $0x44,%%xmm2,%%xmm2             \n"
3852     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3853     "pshufd    $0x44,%%xmm3,%%xmm3             \n"
3854     "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
3855     "pshufd    $0x44,%%xmm4,%%xmm4             \n"
3856     "pxor      %%xmm5,%%xmm5                   \n"
3857     "pcmpeqb   %%xmm6,%%xmm6                   \n"
3858     "pslld     $0x18,%%xmm6                    \n"
// Quantize low and high halves, re-merge the original alpha, store in place.
3863     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3864     "punpcklbw %%xmm5,%%xmm0                   \n"
3865     "pmulhuw   %%xmm2,%%xmm0                   \n"
3866     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3867     "punpckhbw %%xmm5,%%xmm1                   \n"
3868     "pmulhuw   %%xmm2,%%xmm1                   \n"
3869     "pmullw    %%xmm3,%%xmm0                   \n"
3870     "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
3871     "pmullw    %%xmm3,%%xmm1                   \n"
3872     "pand      %%xmm6,%%xmm7                   \n"
3873     "paddw     %%xmm4,%%xmm0                   \n"
3874     "paddw     %%xmm4,%%xmm1                   \n"
3875     "packuswb  %%xmm1,%%xmm0                   \n"
3876     "por       %%xmm7,%%xmm0                   \n"
3877     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
3878     "lea       " MEMLEA(0x10,0) ",%0           \n"
3881   : "+r"(dst_argb),       // %0
3884     "r"(interval_size),   // %3
3885     "r"(interval_offset)  // %4
3887     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3890 #endif // HAS_ARGBQUANTIZEROW_SSE2
3892 #ifdef HAS_ARGBSHADEROW_SSE2
3893 // Shade 4 pixels at a time by specified value.
// Shade (multiply) ARGB pixels by a single packed ARGB value, 4 pixels per
// iteration. Pixels are widened by self-unpack so pmulhuw computes
// (pixel * shade) >> 8 per channel after the psrlw $8 correction.
// NOTE(review): this numbered listing elides some lines (the movd of the
// shade value, loop label, width branch), so the asm is not complete.
3894 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3898     "punpcklbw %%xmm2,%%xmm2                   \n"
3899     "punpcklqdq %%xmm2,%%xmm2                  \n"
3904     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3905     "lea       " MEMLEA(0x10,0) ",%0           \n"
3906     "movdqa    %%xmm0,%%xmm1                   \n"
3907     "punpcklbw %%xmm0,%%xmm0                   \n"
3908     "punpckhbw %%xmm1,%%xmm1                   \n"
3909     "pmulhuw   %%xmm2,%%xmm0                   \n"
3910     "pmulhuw   %%xmm2,%%xmm1                   \n"
3911     "psrlw     $0x8,%%xmm0                     \n"
3912     "psrlw     $0x8,%%xmm1                     \n"
3913     "packuswb  %%xmm1,%%xmm0                   \n"
3914     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3915     "lea       " MEMLEA(0x10,1) ",%1           \n"
3918   : "+r"(src_argb),  // %0
3919     "+r"(dst_argb),  // %1
3923     , "xmm0", "xmm1", "xmm2"
3926 #endif // HAS_ARGBSHADEROW_SSE2
3928 #ifdef HAS_ARGBMULTIPLYROW_SSE2
3929 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Multiply two ARGB rows channel-by-channel, 4 pixels per iteration.
// Row 0 is widened by self-unpack and row 1 by zero-extension, so pmulhuw
// yields (a * b) / 256 per channel, approximating a*b/255.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
3930 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3931                           uint8* dst_argb, int width) {
3933     "pxor      %%xmm5,%%xmm5                  \n"
3938     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3939     "lea       " MEMLEA(0x10,0) ",%0           \n"
3940     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3941     "lea       " MEMLEA(0x10,1) ",%1           \n"
3942     "movdqu    %%xmm0,%%xmm1                   \n"
3943     "movdqu    %%xmm2,%%xmm3                   \n"
3944     "punpcklbw %%xmm0,%%xmm0                   \n"
3945     "punpckhbw %%xmm1,%%xmm1                   \n"
3946     "punpcklbw %%xmm5,%%xmm2                   \n"
3947     "punpckhbw %%xmm5,%%xmm3                   \n"
3948     "pmulhuw   %%xmm2,%%xmm0                   \n"
3949     "pmulhuw   %%xmm3,%%xmm1                   \n"
3950     "packuswb  %%xmm1,%%xmm0                   \n"
3951     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3952     "lea       " MEMLEA(0x10,2) ",%2           \n"
3955   : "+r"(src_argb0),  // %0
3956     "+r"(src_argb1),  // %1
3957     "+r"(dst_argb),   // %2
3961     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3964 #endif // HAS_ARGBMULTIPLYROW_SSE2
3966 #ifdef HAS_ARGBMULTIPLYROW_AVX2
3967 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 variant of ARGBMultiplyRow: multiplies two ARGB rows channel-wise,
// 8 pixels (32 bytes) per iteration, same widen/pmulhuw scheme as the SSE2
// version.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, vzeroupper, asm delimiters); the asm is not complete.
3968 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3969                           uint8* dst_argb, int width) {
3971     "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
3976     "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
3977     "lea        " MEMLEA(0x20,0) ",%0          \n"
3978     "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
3979     "lea        " MEMLEA(0x20,1) ",%1          \n"
3980     "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
3981     "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
3982     "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
3983     "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
3984     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3985     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3986     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3987     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
3988     "lea       " MEMLEA(0x20,2) ",%2           \n"
3992   : "+r"(src_argb0),  // %0
3993     "+r"(src_argb1),  // %1
3994     "+r"(dst_argb),   // %2
3998 #if defined(__AVX2__)
3999     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4003 #endif // HAS_ARGBMULTIPLYROW_AVX2
4005 #ifdef HAS_ARGBADDROW_SSE2
4006 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Saturating byte-wise add of two ARGB rows, 4 pixels (16 bytes) per loop.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
4007 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4008                      uint8* dst_argb, int width) {
4013     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4014     "lea       " MEMLEA(0x10,0) ",%0           \n"
4015     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4016     "lea       " MEMLEA(0x10,1) ",%1           \n"
4017     "paddusb   %%xmm1,%%xmm0                   \n"
4018     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4019     "lea       " MEMLEA(0x10,2) ",%2           \n"
4022   : "+r"(src_argb0),  // %0
4023     "+r"(src_argb1),  // %1
4024     "+r"(dst_argb),   // %2
4031 #endif // HAS_ARGBADDROW_SSE2
4033 #ifdef HAS_ARGBADDROW_AVX2
4034 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 saturating add of two ARGB rows, 8 pixels (32 bytes) per loop; the
// second operand is taken directly from memory by vpaddusb.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, vzeroupper), so the asm shown is not the full body.
4035 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4036                      uint8* dst_argb, int width) {
4041     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4042     "lea        " MEMLEA(0x20,0) ",%0          \n"
4043     "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4044     "lea        " MEMLEA(0x20,1) ",%1          \n"
4045     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4046     "lea        " MEMLEA(0x20,2) ",%2          \n"
4050   : "+r"(src_argb0),  // %0
4051     "+r"(src_argb1),  // %1
4052     "+r"(dst_argb),   // %2
4059 #endif // HAS_ARGBADDROW_AVX2
4061 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4062 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Saturating byte-wise subtract of row 1 from row 0, 4 pixels per loop.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
4063 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4064                           uint8* dst_argb, int width) {
4069     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4070     "lea       " MEMLEA(0x10,0) ",%0           \n"
4071     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4072     "lea       " MEMLEA(0x10,1) ",%1           \n"
4073     "psubusb   %%xmm1,%%xmm0                   \n"
4074     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4075     "lea       " MEMLEA(0x10,2) ",%2           \n"
4078   : "+r"(src_argb0),  // %0
4079     "+r"(src_argb1),  // %1
4080     "+r"(dst_argb),   // %2
4087 #endif // HAS_ARGBSUBTRACTROW_SSE2
4089 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4090 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// AVX2 saturating subtract of two ARGB rows, 8 pixels (32 bytes) per loop;
// the subtrahend is taken directly from memory by vpsubusb.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, vzeroupper), so the asm shown is not the full body.
4091 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4092                           uint8* dst_argb, int width) {
4097     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4098     "lea        " MEMLEA(0x20,0) ",%0          \n"
4099     "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4100     "lea        " MEMLEA(0x20,1) ",%1          \n"
4101     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4102     "lea        " MEMLEA(0x20,2) ",%2          \n"
4106   : "+r"(src_argb0),  // %0
4107     "+r"(src_argb1),  // %1
4108     "+r"(dst_argb),   // %2
4115 #endif // HAS_ARGBSUBTRACTROW_AVX2
4117 #ifdef HAS_SOBELXROW_SSE2
4118 // SobelX as a matrix is
// Compute the horizontal Sobel response for 8 pixels: weighted differences
// of columns x and x+2 across three source rows (middle row weighted 2x via
// the double paddw of xmm1), then absolute value (negate + pmaxsw) and pack.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
4122 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4123                     const uint8* src_y2, uint8* dst_sobelx, int width) {
4128     "pxor      %%xmm5,%%xmm5                   \n"
4133     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4134     "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
4135     "punpcklbw %%xmm5,%%xmm0                   \n"
4136     "punpcklbw %%xmm5,%%xmm1                   \n"
4137     "psubw     %%xmm1,%%xmm0                   \n"
4138     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4139     MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
4140     "punpcklbw %%xmm5,%%xmm1                   \n"
4141     "punpcklbw %%xmm5,%%xmm2                   \n"
4142     "psubw     %%xmm2,%%xmm1                   \n"
4143     MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
4144     MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
4145     "punpcklbw %%xmm5,%%xmm2                   \n"
4146     "punpcklbw %%xmm5,%%xmm3                   \n"
4147     "psubw     %%xmm3,%%xmm2                   \n"
4148     "paddw     %%xmm2,%%xmm0                   \n"
4149     "paddw     %%xmm1,%%xmm0                   \n"
4150     "paddw     %%xmm1,%%xmm0                   \n"
// abs(x) = max(x, -x), then saturate to bytes.
4151     "pxor      %%xmm1,%%xmm1                   \n"
4152     "psubw     %%xmm0,%%xmm1                   \n"
4153     "pmaxsw    %%xmm1,%%xmm0                   \n"
4154     "packuswb  %%xmm0,%%xmm0                   \n"
4155     MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
4156     "lea       " MEMLEA(0x8,0) ",%0            \n"
4159   : "+r"(src_y0),      // %0
4162     "+r"(dst_sobelx),  // %3
4165   : "memory", "cc", NACL_R14
4166     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4169 #endif // HAS_SOBELXROW_SSE2
4171 #ifdef HAS_SOBELYROW_SSE2
4172 // SobelY as a matrix is
// Compute the vertical Sobel response for 8 pixels: weighted differences of
// rows y0 and y1 at offsets 0, 1, 2 (center column weighted 2x via the
// double paddw of xmm1), absolute value, then pack to bytes.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
4176 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4177                     uint8* dst_sobely, int width) {
4181     "pxor      %%xmm5,%%xmm5                   \n"
4186     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4187     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4188     "punpcklbw %%xmm5,%%xmm0                   \n"
4189     "punpcklbw %%xmm5,%%xmm1                   \n"
4190     "psubw     %%xmm1,%%xmm0                   \n"
4191     "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
4192     MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
4193     "punpcklbw %%xmm5,%%xmm1                   \n"
4194     "punpcklbw %%xmm5,%%xmm2                   \n"
4195     "psubw     %%xmm2,%%xmm1                   \n"
4196     "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
4197     MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
4198     "punpcklbw %%xmm5,%%xmm2                   \n"
4199     "punpcklbw %%xmm5,%%xmm3                   \n"
4200     "psubw     %%xmm3,%%xmm2                   \n"
4201     "paddw     %%xmm2,%%xmm0                   \n"
4202     "paddw     %%xmm1,%%xmm0                   \n"
4203     "paddw     %%xmm1,%%xmm0                   \n"
// abs(x) = max(x, -x), then saturate to bytes.
4204     "pxor      %%xmm1,%%xmm1                   \n"
4205     "psubw     %%xmm0,%%xmm1                   \n"
4206     "pmaxsw    %%xmm1,%%xmm0                   \n"
4207     "packuswb  %%xmm0,%%xmm0                   \n"
4208     MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
4209     "lea       " MEMLEA(0x8,0) ",%0            \n"
4212   : "+r"(src_y0),      // %0
4214     "+r"(dst_sobely),  // %2
4217   : "memory", "cc", NACL_R14
4218     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4221 #endif // HAS_SOBELYROW_SSE2
4223 #ifdef HAS_SOBELROW_SSE2
4224 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// Combine Sobel X and Sobel Y planes (saturating add) and expand each gray
// result to an opaque ARGB pixel, 16 source bytes -> 64 ARGB bytes per loop.
// xmm5 holds the 0xFF000000 alpha mask OR'd into every pixel.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
4229 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4230                    uint8* dst_argb, int width) {
4233     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4234     "pslld     $0x18,%%xmm5                    \n"
4239     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4240     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4241     "lea       " MEMLEA(0x10,0) ",%0           \n"
4242     "paddusb   %%xmm1,%%xmm0                   \n"
// Replicate each sobel byte into B, G, R and set alpha opaque.
4243     "movdqa    %%xmm0,%%xmm2                   \n"
4244     "punpcklbw %%xmm0,%%xmm2                   \n"
4245     "punpckhbw %%xmm0,%%xmm0                   \n"
4246     "movdqa    %%xmm2,%%xmm1                   \n"
4247     "punpcklwd %%xmm2,%%xmm1                   \n"
4248     "punpckhwd %%xmm2,%%xmm2                   \n"
4249     "por       %%xmm5,%%xmm1                   \n"
4250     "por       %%xmm5,%%xmm2                   \n"
4251     "movdqa    %%xmm0,%%xmm3                   \n"
4252     "punpcklwd %%xmm0,%%xmm3                   \n"
4253     "punpckhwd %%xmm0,%%xmm0                   \n"
4254     "por       %%xmm5,%%xmm3                   \n"
4255     "por       %%xmm5,%%xmm0                   \n"
4256     "movdqu    %%xmm1," MEMACCESS(2) "         \n"
4257     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
4258     "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
4259     "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
4260     "lea       " MEMLEA(0x40,2) ",%2           \n"
4263   : "+r"(src_sobelx),  // %0
4264     "+r"(src_sobely),  // %1
4265     "+r"(dst_argb),    // %2
4268   : "memory", "cc", NACL_R14
4269     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4272 #endif // HAS_SOBELROW_SSE2
4274 #ifdef HAS_SOBELTOPLANEROW_SSE2
4275 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
// Combine Sobel X and Sobel Y planes with a saturating add into a single
// gray plane, 16 pixels (16 bytes) per iteration.
// NOTE(review): xmm5 is initialized to an alpha mask here but is not used in
// the visible body; this listing also elides the loop label/branch lines.
4276 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4277                           uint8* dst_y, int width) {
4280     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4281     "pslld     $0x18,%%xmm5                    \n"
4286     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4287     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4288     "lea       " MEMLEA(0x10,0) ",%0           \n"
4289     "paddusb   %%xmm1,%%xmm0                   \n"
4290     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4291     "lea       " MEMLEA(0x10,2) ",%2           \n"
4294   : "+r"(src_sobelx),  // %0
4295     "+r"(src_sobely),  // %1
4299   : "memory", "cc", NACL_R14
4303 #endif // HAS_SOBELTOPLANEROW_SSE2
4305 #ifdef HAS_SOBELXYROW_SSE2
4306 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
// Pack Sobel components into ARGB: alpha = 0xFF (xmm5), R = sobelx,
// G = sobelx + sobely (saturating), B = sobely; 16 source bytes -> 64 ARGB
// bytes per iteration.
// NOTE(review): this numbered listing elides some lines (loop label, width
// decrement/branch, asm delimiters), so the asm shown is not the full body.
4311 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4312                      uint8* dst_argb, int width) {
4315     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4320     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4321     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4322     "lea       " MEMLEA(0x10,0) ",%0           \n"
4323     "movdqa    %%xmm0,%%xmm2                   \n"
4324     "paddusb   %%xmm1,%%xmm2                   \n"
// Interleave: (sobelx, 0xFF) and (sobely, sum) pairs, then word-interleave
// to form B,G,R,A pixels.
4325     "movdqa    %%xmm0,%%xmm3                   \n"
4326     "punpcklbw %%xmm5,%%xmm3                   \n"
4327     "punpckhbw %%xmm5,%%xmm0                   \n"
4328     "movdqa    %%xmm1,%%xmm4                   \n"
4329     "punpcklbw %%xmm2,%%xmm4                   \n"
4330     "punpckhbw %%xmm2,%%xmm1                   \n"
4331     "movdqa    %%xmm4,%%xmm6                   \n"
4332     "punpcklwd %%xmm3,%%xmm6                   \n"
4333     "punpckhwd %%xmm3,%%xmm4                   \n"
4334     "movdqa    %%xmm1,%%xmm7                   \n"
4335     "punpcklwd %%xmm0,%%xmm7                   \n"
4336     "punpckhwd %%xmm0,%%xmm1                   \n"
4337     "movdqu    %%xmm6," MEMACCESS(2) "         \n"
4338     "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
4339     "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
4340     "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
4341     "lea       " MEMLEA(0x40,2) ",%2           \n"
4344   : "+r"(src_sobelx),  // %0
4345     "+r"(src_sobely),  // %1
4346     "+r"(dst_argb),    // %2
4349   : "memory", "cc", NACL_R14
4350     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4353 #endif // HAS_SOBELXYROW_SSE2
4355 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4356 // Creates a table of cumulative sums where each value is a sum of all values
4357 // above and to the left of the value, inclusive of the value.
// Build one row of a summed-area table: for each pixel, accumulate the
// running row sum (xmm0, four 32-bit channel sums) and add the value from
// previous_cumsum so each output is the inclusive sum above-and-left.
// Main loop handles 4 pixels (16 bytes); the trailing loop handles 1 pixel.
// NOTE(review): this numbered listing elides some lines (loop labels, width
// checks/branches, asm delimiters), so the asm shown is not the full body.
4358 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
4359                                   const int32* previous_cumsum, int width) {
4361     "pxor      %%xmm0,%%xmm0                   \n"
4362     "pxor      %%xmm1,%%xmm1                   \n"
// 4-pixel loop: widen bytes to 32-bit lanes, accumulate, add previous row.
4371     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
4372     "lea       " MEMLEA(0x10,0) ",%0           \n"
4373     "movdqa    %%xmm2,%%xmm4                   \n"
4374     "punpcklbw %%xmm1,%%xmm2                   \n"
4375     "movdqa    %%xmm2,%%xmm3                   \n"
4376     "punpcklwd %%xmm1,%%xmm2                   \n"
4377     "punpckhwd %%xmm1,%%xmm3                   \n"
4378     "punpckhbw %%xmm1,%%xmm4                   \n"
4379     "movdqa    %%xmm4,%%xmm5                   \n"
4380     "punpcklwd %%xmm1,%%xmm4                   \n"
4381     "punpckhwd %%xmm1,%%xmm5                   \n"
4382     "paddd     %%xmm2,%%xmm0                   \n"
4383     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4384     "paddd     %%xmm0,%%xmm2                   \n"
4385     "paddd     %%xmm3,%%xmm0                   \n"
4386     "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
4387     "paddd     %%xmm0,%%xmm3                   \n"
4388     "paddd     %%xmm4,%%xmm0                   \n"
4389     "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
4390     "paddd     %%xmm0,%%xmm4                   \n"
4391     "paddd     %%xmm5,%%xmm0                   \n"
4392     "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
4393     "lea       " MEMLEA(0x40,2) ",%2           \n"
4394     "paddd     %%xmm0,%%xmm5                   \n"
4395     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4396     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
4397     "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
4398     "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
4399     "lea       " MEMLEA(0x40,1) ",%1           \n"
// 1-pixel remainder loop.
4410     "movd      " MEMACCESS(0) ",%%xmm2         \n"
4411     "lea       " MEMLEA(0x4,0) ",%0            \n"
4412     "punpcklbw %%xmm1,%%xmm2                   \n"
4413     "punpcklwd %%xmm1,%%xmm2                   \n"
4414     "paddd     %%xmm2,%%xmm0                   \n"
4415     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4416     "lea       " MEMLEA(0x10,2) ",%2           \n"
4417     "paddd     %%xmm0,%%xmm2                   \n"
4418     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4419     "lea       " MEMLEA(0x10,1) ",%1           \n"
4426     "+r"(previous_cumsum),  // %2
4430     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4433 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
4435 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Turn summed-area-table rectangle sums into per-pixel averages: for each
// pixel, sum = topleft - topright - botleft + botright (the psubd/paddd
// pairs against offset %4), scaled by the reciprocal of area. Three loops:
// a fast 4-pixel path using a precomputed pmulhuw reciprocal (xmm5), a
// 4-pixel float path (mulps by 1/area in xmm4), and a 1-pixel remainder.
// NOTE(review): this numbered listing elides some lines (loop labels, width
// checks/branches, the initial loads, asm delimiters); asm is not complete.
4436 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4437                                     int width, int area, uint8* dst,
// Build the fixed-point reciprocal of area for the pmulhuw fast path.
4441     "cvtdq2ps  %%xmm5,%%xmm5                   \n"
4442     "rcpss     %%xmm5,%%xmm4                   \n"
4443     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
4449     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4450     "pcmpeqb   %%xmm6,%%xmm6                   \n"
4451     "psrld     $0x10,%%xmm6                    \n"
4452     "cvtdq2ps  %%xmm6,%%xmm6                   \n"
4453     "addps     %%xmm6,%%xmm5                   \n"
4454     "mulps     %%xmm4,%%xmm5                   \n"
4455     "cvtps2dq  %%xmm5,%%xmm5                   \n"
4456     "packssdw  %%xmm5,%%xmm5                   \n"
4458   // 4 pixel small loop                        \n"
4461     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4462     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4463     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4464     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4465     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4466     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4467     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4468     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4469     "lea       " MEMLEA(0x40,0) ",%0           \n"
4470     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4471     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4472     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4473     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4474     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4475     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4476     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4477     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4478     "lea       " MEMLEA(0x40,1) ",%1           \n"
4479     "packssdw  %%xmm1,%%xmm0                   \n"
4480     "packssdw  %%xmm3,%%xmm2                   \n"
4481     "pmulhuw   %%xmm5,%%xmm0                   \n"
4482     "pmulhuw   %%xmm5,%%xmm2                   \n"
4483     "packuswb  %%xmm2,%%xmm0                   \n"
4484     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4485     "lea       " MEMLEA(0x10,2) ",%2           \n"
// 4-pixel float path: same rectangle sums, averaged with mulps by 1/area.
4493     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4494     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4495     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4496     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4497     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4498     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4499     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4500     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4501     "lea       " MEMLEA(0x40,0) ",%0           \n"
4502     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4503     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4504     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4505     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4506     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4507     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4508     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4509     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4510     "lea       " MEMLEA(0x40,1) ",%1           \n"
4511     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4512     "cvtdq2ps  %%xmm1,%%xmm1                   \n"
4513     "mulps     %%xmm4,%%xmm0                   \n"
4514     "mulps     %%xmm4,%%xmm1                   \n"
4515     "cvtdq2ps  %%xmm2,%%xmm2                   \n"
4516     "cvtdq2ps  %%xmm3,%%xmm3                   \n"
4517     "mulps     %%xmm4,%%xmm2                   \n"
4518     "mulps     %%xmm4,%%xmm3                   \n"
4519     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4520     "cvtps2dq  %%xmm1,%%xmm1                   \n"
4521     "cvtps2dq  %%xmm2,%%xmm2                   \n"
4522     "cvtps2dq  %%xmm3,%%xmm3                   \n"
4523     "packssdw  %%xmm1,%%xmm0                   \n"
4524     "packssdw  %%xmm3,%%xmm2                   \n"
4525     "packuswb  %%xmm2,%%xmm0                   \n"
4526     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4527     "lea       " MEMLEA(0x10,2) ",%2           \n"
// 1-pixel remainder loop.
4538     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4539     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4540     "lea       " MEMLEA(0x10,0) ",%0           \n"
4541     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4542     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4543     "lea       " MEMLEA(0x10,1) ",%1           \n"
4544     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4545     "mulps     %%xmm4,%%xmm0                   \n"
4546     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4547     "packssdw  %%xmm0,%%xmm0                   \n"
4548     "packuswb  %%xmm0,%%xmm0                   \n"
4549     "movd      %%xmm0," MEMACCESS(2) "         \n"
4550     "lea       " MEMLEA(0x4,2) ",%2            \n"
4554   : "+r"(topleft),  // %0
4555     "+r"(botleft),  // %1
4558   : "r"((intptr_t)(width)),  // %4
4560   : "memory", "cc", NACL_R14
4561     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4564 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4566 #ifdef HAS_ARGBAFFINEROW_SSE2
4567 // Copy ARGB pixels from source image with slope to a row of destination.
// Gather ARGB pixels along an affine-transformed path: src_dudv holds the
// start (u,v) and per-pixel (du,dv); each output pixel is fetched from
// src_argb at offset u*4 + v*stride (pmaddwd %xmm5 combines x*4 and
// y*stride). Main loop copies 4 pixels per iteration, remainder loop 1.
// NOTE(review): this numbered listing elides some lines (loads of the uv/dudv
// values, loop labels/branches, asm delimiters); the asm is not complete.
4569 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4570                         uint8* dst_argb, const float* src_dudv, int width) {
4571   intptr_t src_argb_stride_temp = src_argb_stride;
4574     "movq      " MEMACCESS(3) ",%%xmm2         \n"
4575     "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
// Set up (u,v) for 4 consecutive pixels: xmm2 = pixels 0-1, xmm3 = 2-3,
// xmm4 = step of 4 pixels.
4582     "pshufd    $0x44,%%xmm7,%%xmm7             \n"
4583     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4584     "movdqa    %%xmm2,%%xmm0                   \n"
4585     "addps     %%xmm7,%%xmm0                   \n"
4586     "movlhps   %%xmm0,%%xmm2                   \n"
4587     "movdqa    %%xmm7,%%xmm4                   \n"
4588     "addps     %%xmm4,%%xmm4                   \n"
4589     "movdqa    %%xmm2,%%xmm3                   \n"
4590     "addps     %%xmm4,%%xmm3                   \n"
4591     "addps     %%xmm4,%%xmm4                   \n"
4596     "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
4597     "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
4598     "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
4599     "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
4600     "movd      %%xmm0,%k1                      \n"
4601     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4602     "movd      %%xmm0,%k5                      \n"
4603     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4604     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
4605     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4606     "punpckldq %%xmm6,%%xmm1                   \n"
4607     "addps     %%xmm4,%%xmm2                   \n"
4608     "movq      %%xmm1," MEMACCESS(2) "         \n"
4609     "movd      %%xmm0,%k1                      \n"
4610     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4611     "movd      %%xmm0,%k5                      \n"
4612     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4613     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4614     "punpckldq %%xmm6,%%xmm0                   \n"
4615     "addps     %%xmm4,%%xmm3                   \n"
4616     "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
4617     "lea       " MEMLEA(0x10,2) ",%2           \n"
// Single-pixel remainder loop.
4628     "cvttps2dq %%xmm2,%%xmm0                   \n"
4629     "packssdw  %%xmm0,%%xmm0                   \n"
4630     "pmaddwd   %%xmm5,%%xmm0                   \n"
4631     "addps     %%xmm7,%%xmm2                   \n"
4632     "movd      %%xmm0,%k1                      \n"
4633     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4634     "movd      %%xmm0," MEMACCESS(2) "         \n"
4635     "lea       " MEMLEA(0x04,2) ",%2           \n"
4639   : "+r"(src_argb),  // %0
4640     "+r"(src_argb_stride_temp),  // %1
4641     "+r"(dst_argb),  // %2
4642     "+r"(src_dudv),  // %3
4646   : "memory", "cc", NACL_R14
4647     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4650 #endif // HAS_ARGBAFFINEROW_SSE2
4652 #ifdef HAS_INTERPOLATEROW_SSSE3
4653 // Bilinear filter 16x2 -> 16x1
// Blend two rows (src_ptr and src_ptr + src_stride) with the given
// source_y_fraction, 16 bytes per iteration. Special-cases for fractions
// 0 (copy), 25%, 50% and 75% use pavgb; the general path uses pmaddubsw
// with the blend weights packed into xmm5.
// NOTE(review): this numbered listing elides the fraction-dispatch compares,
// loop labels and branches, so the asm shown is not the full body.
4654 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4655                           ptrdiff_t src_stride, int dst_width,
4656                           int source_y_fraction) {
// Pack (256-f, f) weight pairs and broadcast to all lanes.
4673     "punpcklbw %%xmm0,%%xmm5                   \n"
4674     "punpcklwd %%xmm5,%%xmm5                   \n"
4675     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4677     // General purpose row blend.
4680     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4681     MEMOPREG(movdqu,0x00,1,4,1,xmm2)
4682     "movdqa    %%xmm0,%%xmm1                   \n"
4683     "punpcklbw %%xmm2,%%xmm0                   \n"
4684     "punpckhbw %%xmm2,%%xmm1                   \n"
4685     "pmaddubsw %%xmm5,%%xmm0                   \n"
4686     "pmaddubsw %%xmm5,%%xmm1                   \n"
4687     "psrlw     $0x7,%%xmm0                     \n"
4688     "psrlw     $0x7,%%xmm1                     \n"
4689     "packuswb  %%xmm1,%%xmm0                   \n"
4690     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4691     "lea       " MEMLEA(0x10,1) ",%1           \n"
// Blend 25 / 75: two pavgb passes biased toward the second row.
4699     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4700     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4701     "pavgb     %%xmm1,%%xmm0                   \n"
4702     "pavgb     %%xmm1,%%xmm0                   \n"
4703     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4704     "lea       " MEMLEA(0x10,1) ",%1           \n"
// Blend 50 / 50: single pavgb.
4712     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4713     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4714     "pavgb     %%xmm1,%%xmm0                   \n"
4715     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4716     "lea       " MEMLEA(0x10,1) ",%1           \n"
// Blend 75 / 25: two pavgb passes biased toward the first row.
4724     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4725     MEMOPREG(movdqu,0x00,1,4,1,xmm0)
4726     "pavgb     %%xmm1,%%xmm0                   \n"
4727     "pavgb     %%xmm1,%%xmm0                   \n"
4728     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4729     "lea       " MEMLEA(0x10,1) ",%1           \n"
4734     // Blend 100 / 0 - Copy row unchanged.
4737     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4738     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4739     "lea       " MEMLEA(0x10,1) ",%1           \n"
4744   : "+r"(dst_ptr),     // %0
4745     "+r"(src_ptr),     // %1
4746     "+r"(dst_width),   // %2
4747     "+r"(source_y_fraction)  // %3
4748   : "r"((intptr_t)(src_stride))  // %4
4749   : "memory", "cc", NACL_R14
4750     "xmm0", "xmm1", "xmm2", "xmm5"
4753 #endif // HAS_INTERPOLATEROW_SSSE3
4755 #ifdef HAS_INTERPOLATEROW_AVX2
4756 // Bilinear filter 32x2 -> 32x1
// AVX2 variant of InterpolateRow: blends two rows 32 bytes per iteration.
// Same fraction special cases as the SSSE3 version; the 100/0 copy path
// uses rep movsb, which is why dst/src/count are pinned to edi/esi/ecx
// ("+D"/"+S"/"+c").
// NOTE(review): this numbered listing elides the fraction-dispatch compares,
// loop labels/branches and vzeroupper; the asm shown is not the full body.
4757 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
4758                          ptrdiff_t src_stride, int dst_width,
4759                          int source_y_fraction) {
// Broadcast the (256-f, f) weight pair across ymm5.
4772     "vmovd      %3,%%xmm0                      \n"
4775     "vmovd      %3,%%xmm5                      \n"
4776     "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
4777     "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
4778     "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
4779     "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"
4781     // General purpose row blend.
4784     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4785     MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
4786     "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
4787     "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
4788     "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"
4789     "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
4790     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
4791     "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
4792     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
4793     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4794     "lea        " MEMLEA(0x20,1) ",%1          \n"
// Blend 25 / 75.
4802     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4803     MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
4804     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4805     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4806     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4807     "lea        " MEMLEA(0x20,1) ",%1          \n"
// Blend 50 / 50.
4815     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4816     VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
4817     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4818     "lea        " MEMLEA(0x20,1) ",%1          \n"
// Blend 75 / 25.
4826     "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"
4827     MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
4828     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4829     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4830     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4831     "lea        " MEMLEA(0x20,1) ",%1          \n"
4836     // Blend 100 / 0 - Copy row unchanged.
4839     "rep movsb " MEMMOVESTRING(1,0) "          \n"
4845   : "+D"(dst_ptr),    // %0
4846     "+S"(src_ptr),    // %1
4847     "+c"(dst_width),  // %2
4848     "+r"(source_y_fraction)  // %3
4849   : "r"((intptr_t)(src_stride))  // %4
4850   : "memory", "cc", NACL_R14
4851     "xmm0", "xmm1", "xmm2", "xmm5"
4854 #endif // HAS_INTERPOLATEROW_AVX2
4856 #ifdef HAS_INTERPOLATEROW_SSE2
4857 // Bilinear filter 16x2 -> 16x1
// SSE2 fallback of InterpolateRow (no pmaddubsw): the general path widens
// both rows to 16 bits and computes row0 + ((row1 - row0) * 2f >> 16) with
// signed pmulhw. Same pavgb special cases for 0/25/50/75 as the SSSE3
// version, 16 bytes per iteration.
// NOTE(review): this numbered listing elides the fraction-dispatch compares,
// loop labels and branches, so the asm shown is not the full body.
4858 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
4859                          ptrdiff_t src_stride, int dst_width,
4860                          int source_y_fraction) {
4877     "punpcklbw %%xmm0,%%xmm5                   \n"
4878     "punpcklwd %%xmm5,%%xmm5                   \n"
4879     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4880     "pxor      %%xmm4,%%xmm4                   \n"
4882     // General purpose row blend.
4885     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4886     MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
4887     "movdqa    %%xmm0,%%xmm1                   \n"
4888     "movdqa    %%xmm2,%%xmm3                   \n"
4889     "punpcklbw %%xmm4,%%xmm2                   \n"
4890     "punpckhbw %%xmm4,%%xmm3                   \n"
4891     "punpcklbw %%xmm4,%%xmm0                   \n"
4892     "punpckhbw %%xmm4,%%xmm1                   \n"
4893     "psubw     %%xmm0,%%xmm2                   \n"
4894     "psubw     %%xmm1,%%xmm3                   \n"
4895     "paddw     %%xmm2,%%xmm2                   \n"
4896     "paddw     %%xmm3,%%xmm3                   \n"
4897     "pmulhw    %%xmm5,%%xmm2                   \n"
4898     "pmulhw    %%xmm5,%%xmm3                   \n"
4899     "paddw     %%xmm2,%%xmm0                   \n"
4900     "paddw     %%xmm3,%%xmm1                   \n"
4901     "packuswb  %%xmm1,%%xmm0                   \n"
4902     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4903     "lea       " MEMLEA(0x10,1) ",%1           \n"
// Blend 25 / 75.
4911     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4912     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
4913     "pavgb     %%xmm1,%%xmm0                   \n"
4914     "pavgb     %%xmm1,%%xmm0                   \n"
4915     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4916     "lea       " MEMLEA(0x10,1) ",%1           \n"
// Blend 50 / 50.
4924     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4925     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
4926     "pavgb     %%xmm1,%%xmm0                   \n"
4927     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4928     "lea       " MEMLEA(0x10,1) ",%1           \n"
// Blend 75 / 25.
4936     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4937     MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
4938     "pavgb     %%xmm1,%%xmm0                   \n"
4939     "pavgb     %%xmm1,%%xmm0                   \n"
4940     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4941     "lea       " MEMLEA(0x10,1) ",%1           \n"
4946     // Blend 100 / 0 - Copy row unchanged.
4949     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4950     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4951     "lea       " MEMLEA(0x10,1) ",%1           \n"
4956   : "+r"(dst_ptr),     // %0
4957     "+r"(src_ptr),     // %1
4958     "+r"(dst_width),   // %2
4959     "+r"(source_y_fraction)  // %3
4960   : "r"((intptr_t)(src_stride))  // %4
4961   : "memory", "cc", NACL_R14
4962     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4965 #endif // HAS_INTERPOLATEROW_SSE2
4967 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorder the 4 channels of each ARGB pixel using a 16-byte pshufb control
// mask (shuffler), 8 pixels (32 bytes) per iteration.
// NOTE(review): this numbered listing elides the loop label and width
// decrement/branch, so the asm shown is not the full body.
4969 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4970                           const uint8* shuffler, int pix) {
4972     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
4975     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4976     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4977     "lea       " MEMLEA(0x20,0) ",%0           \n"
4978     "pshufb    %%xmm5,%%xmm0                   \n"
4979     "pshufb    %%xmm5,%%xmm1                   \n"
4980     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4981     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
4982     "lea       " MEMLEA(0x20,1) ",%1           \n"
4985   : "+r"(src_argb),  // %0
4986     "+r"(dst_argb),  // %1
4988   : "r"(shuffler)    // %3
4990     , "xmm0", "xmm1", "xmm5"
4993 #endif // HAS_ARGBSHUFFLEROW_SSSE3
4995 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4996 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version of ARGBShuffleRow: the 16-byte |shuffler| mask is broadcast
// into both 128-bit lanes of ymm5, and 64 bytes (16 ARGB pixels) are
// shuffled per visible loop body. vpshufb operates within each 128-bit
// lane, so the 128-bit mask broadcast gives the same per-pixel result as
// the SSSE3 path.
// NOTE(review): asm opener, loop label, pix decrement, vzeroupper and
// closing are elided in this excerpt.
4997 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4998                          const uint8* shuffler, int pix) {
// Broadcast the 128-bit shuffle mask to both lanes of ymm5.
5000     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5              \n"
// Per-iteration body: 2 x 32-byte loads, per-lane byte shuffle, 2 x 32-byte
// stores; src (%0) and dst (%1) each advance by 0x40.
5003     "vmovdqu    " MEMACCESS(0) ",%%ymm0                  \n"
5004     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1            \n"
5005     "lea        " MEMLEA(0x40,0) ",%0                    \n"
5006     "vpshufb    %%ymm5,%%ymm0,%%ymm0                     \n"
5007     "vpshufb    %%ymm5,%%ymm1,%%ymm1                     \n"
5008     "vmovdqu    %%ymm0," MEMACCESS(1) "                  \n"
5009     "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "            \n"
5010     "lea        " MEMLEA(0x40,1) ",%1                    \n"
5014   : "+r"(src_argb),  // %0
5015     "+r"(dst_argb),  // %1
5017   : "r"(shuffler)    // %3
5019     , "xmm0", "xmm1", "xmm5"
5022 #endif // HAS_ARGBSHUFFLEROW_AVX2
5024 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5025 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback for ARGBShuffleRow (no pshufb available). The first dword
// of |shuffler| is compared against four known byte orders; each match
// presumably branches (branch instructions not visible in this excerpt) to
// a specialized path that widens bytes to words with punpck?bw and reorders
// words with pshufhw/pshuflw. Unrecognized masks fall back to a scalar
// per-byte table walk (4 movzb lookups per pixel).
// NOTE(review): asm opener, labels, jumps, loop counters and closing are
// elided here; the path-to-mask pairing below is inferred from the visible
// immediates — TODO confirm against the full source.
5026 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5027                          const uint8* shuffler, int pix) {
// Scratch byte register (constrained to %d so %b2 is addressable below).
5028   uintptr_t pixel_temp = 0u;
5030     "pxor      %%xmm5,%%xmm5                             \n"
// Load first 4 shuffler bytes as a dword and test against the supported
// fast-path orders.
5031     "mov       " MEMACCESS(4) ",%k2                      \n"
5032     "cmp       $0x3000102,%k2                            \n"
5034     "cmp       $0x10203,%k2                              \n"
5036     "cmp       $0x30201,%k2                              \n"
5038     "cmp       $0x2010003,%k2                            \n"
// Generic scalar path: for each of the 4 channels, read the shuffler index,
// fetch that source byte of the current pixel, store it to dst.
5043     "movzb     " MEMACCESS(4) ",%2                       \n"
5044     MEMOPARG(movzb,0x00,0,2,1,2) "                       \n"  //  movzb     (%0,%2,1),%2
5045     "mov       %b2," MEMACCESS(1) "                      \n"
5046     "movzb     " MEMACCESS2(0x1,4) ",%2                  \n"
5047     MEMOPARG(movzb,0x00,0,2,1,2) "                       \n"  //  movzb     (%0,%2,1),%2
5048     "mov       %b2," MEMACCESS2(0x1,1) "                 \n"
5049     "movzb     " MEMACCESS2(0x2,4) ",%2                  \n"
5050     MEMOPARG(movzb,0x00,0,2,1,2) "                       \n"  //  movzb     (%0,%2,1),%2
5051     "mov       %b2," MEMACCESS2(0x2,1) "                 \n"
5052     "movzb     " MEMACCESS2(0x3,4) ",%2                  \n"
5053     MEMOPARG(movzb,0x00,0,2,1,2) "                       \n"  //  movzb     (%0,%2,1),%2
5054     "mov       %b2," MEMACCESS2(0x3,1) "                 \n"
// Advance one pixel (4 bytes) on both pointers.
5055     "lea       " MEMLEA(0x4,0) ",%0                      \n"
5056     "lea       " MEMLEA(0x4,1) ",%1                      \n"
// Fast path, imm $0x1b: pshuf?w order (3,2,1,0) — reverses the 4 words of
// each half, i.e. reverses the 4 bytes of each pixel. 4 pixels/iteration.
5063     "movdqu    " MEMACCESS(0) ",%%xmm0                   \n"
5064     "lea       " MEMLEA(0x10,0) ",%0                     \n"
5065     "movdqa    %%xmm0,%%xmm1                             \n"
5066     "punpcklbw %%xmm5,%%xmm0                             \n"
5067     "punpckhbw %%xmm5,%%xmm1                             \n"
5068     "pshufhw   $0x1b,%%xmm0,%%xmm0                       \n"
5069     "pshuflw   $0x1b,%%xmm0,%%xmm0                       \n"
5070     "pshufhw   $0x1b,%%xmm1,%%xmm1                       \n"
5071     "pshuflw   $0x1b,%%xmm1,%%xmm1                       \n"
5072     "packuswb  %%xmm1,%%xmm0                             \n"
5073     "movdqu    %%xmm0," MEMACCESS(1) "                   \n"
5074     "lea       " MEMLEA(0x10,1) ",%1                     \n"
// Fast path, imm $0x39: word order (1,2,3,0) — rotates each pixel's bytes
// left by one position.
5081     "movdqu    " MEMACCESS(0) ",%%xmm0                   \n"
5082     "lea       " MEMLEA(0x10,0) ",%0                     \n"
5083     "movdqa    %%xmm0,%%xmm1                             \n"
5084     "punpcklbw %%xmm5,%%xmm0                             \n"
5085     "punpckhbw %%xmm5,%%xmm1                             \n"
5086     "pshufhw   $0x39,%%xmm0,%%xmm0                       \n"
5087     "pshuflw   $0x39,%%xmm0,%%xmm0                       \n"
5088     "pshufhw   $0x39,%%xmm1,%%xmm1                       \n"
5089     "pshuflw   $0x39,%%xmm1,%%xmm1                       \n"
5090     "packuswb  %%xmm1,%%xmm0                             \n"
5091     "movdqu    %%xmm0," MEMACCESS(1) "                   \n"
5092     "lea       " MEMLEA(0x10,1) ",%1                     \n"
// Fast path, imm $0x93: word order (3,0,1,2) — rotates each pixel's bytes
// right by one position.
5099     "movdqu    " MEMACCESS(0) ",%%xmm0                   \n"
5100     "lea       " MEMLEA(0x10,0) ",%0                     \n"
5101     "movdqa    %%xmm0,%%xmm1                             \n"
5102     "punpcklbw %%xmm5,%%xmm0                             \n"
5103     "punpckhbw %%xmm5,%%xmm1                             \n"
5104     "pshufhw   $0x93,%%xmm0,%%xmm0                       \n"
5105     "pshuflw   $0x93,%%xmm0,%%xmm0                       \n"
5106     "pshufhw   $0x93,%%xmm1,%%xmm1                       \n"
5107     "pshuflw   $0x93,%%xmm1,%%xmm1                       \n"
5108     "packuswb  %%xmm1,%%xmm0                             \n"
5109     "movdqu    %%xmm0," MEMACCESS(1) "                   \n"
5110     "lea       " MEMLEA(0x10,1) ",%1                     \n"
// Fast path, imm $0xc6: word order (2,1,0,3) — swaps bytes 0 and 2 of each
// pixel (e.g. R<->B), leaving bytes 1 and 3 in place.
5117     "movdqu    " MEMACCESS(0) ",%%xmm0                   \n"
5118     "lea       " MEMLEA(0x10,0) ",%0                     \n"
5119     "movdqa    %%xmm0,%%xmm1                             \n"
5120     "punpcklbw %%xmm5,%%xmm0                             \n"
5121     "punpckhbw %%xmm5,%%xmm1                             \n"
5122     "pshufhw   $0xc6,%%xmm0,%%xmm0                       \n"
5123     "pshuflw   $0xc6,%%xmm0,%%xmm0                       \n"
5124     "pshufhw   $0xc6,%%xmm1,%%xmm1                       \n"
5125     "pshuflw   $0xc6,%%xmm1,%%xmm1                       \n"
5126     "packuswb  %%xmm1,%%xmm0                             \n"
5127     "movdqu    %%xmm0," MEMACCESS(1) "                   \n"
5128     "lea       " MEMLEA(0x10,1) ",%1                     \n"
5133   : "+r"(src_argb),    // %0
5134     "+r"(dst_argb),    // %1
5135     "+d"(pixel_temp),  // %2
5137   : "r"(shuffler)      // %4
5138   : "memory", "cc", NACL_R14
5139     "xmm0", "xmm1", "xmm5"
5142 #endif // HAS_ARGBSHUFFLEROW_SSE2
5144 #ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (Y plus half-width U and V rows) into interleaved
// YUY2 (Y0 U0 Y1 V0 ...). Per visible loop body: 8 U and 8 V bytes are
// interleaved into a UV vector, then 16 Y bytes are interleaved with it,
// producing 32 output bytes.
// NOTE(review): asm opener, loop label, width decrement, the U/V pointer
// operands (%1/%2) declarations and closing are elided in this excerpt.
5145 void I422ToYUY2Row_SSE2(const uint8* src_y,
5148                         uint8* dst_frame, int width) {
// xmm2 = 8 U bytes; xmm3 = 8 V bytes loaded from the V row at (%1,%2,1)
// (%2 appears to hold the V-minus-U pointer offset — confirm in full source).
5153     "movq      " MEMACCESS(1) ",%%xmm2                   \n"
5154     MEMOPREG(movq,0x00,1,2,1,xmm3)                       //  movq      (%1,%2,1),%%xmm3
5155     "lea       " MEMLEA(0x8,1) ",%1                      \n"
// Interleave to U0 V0 U1 V1 ...
5156     "punpcklbw %%xmm3,%%xmm2                             \n"
// Load 16 Y bytes and interleave Y with UV: low half then high half.
5157     "movdqu    " MEMACCESS(0) ",%%xmm0                   \n"
5158     "lea       " MEMLEA(0x10,0) ",%0                     \n"
5159     "movdqa    %%xmm0,%%xmm1                             \n"
5160     "punpcklbw %%xmm2,%%xmm0                             \n"
5161     "punpckhbw %%xmm2,%%xmm1                             \n"
// Store 32 bytes of YUY2 and advance the destination.
5162     "movdqu    %%xmm0," MEMACCESS(3) "                   \n"
5163     "movdqu    %%xmm1," MEMACCESS2(0x10,3) "             \n"
5164     "lea       " MEMLEA(0x20,3) ",%3                     \n"
5167   : "+r"(src_y),  // %0
5170     "+r"(dst_frame),  // %3
5173   : "memory", "cc", NACL_R14
5174     "xmm0", "xmm1", "xmm2", "xmm3"
5177 #endif // HAS_I422TOYUY2ROW_SSE2
5179 #ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...). Identical
// structure to I422ToYUY2Row_SSE2 except the UV vector is placed first in
// each output pair: here Y is unpacked INTO the UV register order
// (punpcklbw %%xmm0 into xmm1/xmm2) rather than UV into Y.
// NOTE(review): asm opener, loop label, width decrement, U/V pointer
// operand declarations and closing are elided in this excerpt.
5180 void I422ToUYVYRow_SSE2(const uint8* src_y,
5183                         uint8* dst_frame, int width) {
// xmm2 = 8 U bytes; xmm3 = 8 V bytes from (%1,%2,1); interleave to UVUV...
5188     "movq      " MEMACCESS(1) ",%%xmm2                   \n"
5189     MEMOPREG(movq,0x00,1,2,1,xmm3)                       //  movq      (%1,%2,1),%%xmm3
5190     "lea       " MEMLEA(0x8,1) ",%1                      \n"
5191     "punpcklbw %%xmm3,%%xmm2                             \n"
// Load 16 Y bytes; interleave so UV bytes come first in each byte pair.
5192     "movdqu    " MEMACCESS(0) ",%%xmm0                   \n"
5193     "movdqa    %%xmm2,%%xmm1                             \n"
5194     "lea       " MEMLEA(0x10,0) ",%0                     \n"
5195     "punpcklbw %%xmm0,%%xmm1                             \n"
5196     "punpckhbw %%xmm0,%%xmm2                             \n"
// Store 32 bytes of UYVY and advance the destination.
5197     "movdqu    %%xmm1," MEMACCESS(3) "                   \n"
5198     "movdqu    %%xmm2," MEMACCESS2(0x10,3) "             \n"
5199     "lea       " MEMLEA(0x20,3) ",%3                     \n"
5202   : "+r"(src_y),  // %0
5205     "+r"(dst_frame),  // %3
5208   : "memory", "cc", NACL_R14
5209     "xmm0", "xmm1", "xmm2", "xmm3"
5212 #endif // HAS_I422TOUYVYROW_SSE2
5214 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a per-channel cubic polynomial to ARGB pixels:
//   out = C0 + C1*X + C2*X^2 + C3*X^3
// where the four coefficient vectors live at poly+0x00 (C0), +0x10 (C1),
// +0x20 (C2) and +0x30 (C3). Processes 2 pixels (8 channel bytes widened
// to 8 floats across xmm0/xmm4) per visible loop body; results are
// truncated (cvttps2dq) and saturated back to bytes.
// NOTE(review): asm opener, loop label, width operand/decrement and
// closing are elided in this excerpt.
5215 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5216                             uint8* dst_argb, const float* poly,
5219     "pxor      %%xmm3,%%xmm3                             \n"
// Load 2 pixels and widen bytes -> words -> dwords -> floats.
5224     "movq      " MEMACCESS(0) ",%%xmm0                   \n"
5225     "lea       " MEMLEA(0x8,0) ",%0                      \n"
5226     "punpcklbw %%xmm3,%%xmm0                             \n"
5227     "movdqa    %%xmm0,%%xmm4                             \n"
5228     "punpcklwd %%xmm3,%%xmm0                             \n"
5229     "punpckhwd %%xmm3,%%xmm4                             \n"
5230     "cvtdq2ps  %%xmm0,%%xmm0                             \n"
5231     "cvtdq2ps  %%xmm4,%%xmm4                             \n"
// Keep X in xmm1/xmm5; accumulate C0 + C1*X in xmm0/xmm4.
5232     "movdqa    %%xmm0,%%xmm1                             \n"
5233     "movdqa    %%xmm4,%%xmm5                             \n"
5234     "mulps     " MEMACCESS2(0x10,3) ",%%xmm0             \n"
5235     "mulps     " MEMACCESS2(0x10,3) ",%%xmm4             \n"
5236     "addps     " MEMACCESS(3) ",%%xmm0                   \n"
5237     "addps     " MEMACCESS(3) ",%%xmm4                   \n"
// xmm2/xmm6 = X^2; xmm1/xmm5 = X^3; scale by C2 and C3 respectively.
5238     "movdqa    %%xmm1,%%xmm2                             \n"
5239     "movdqa    %%xmm5,%%xmm6                             \n"
5240     "mulps     %%xmm1,%%xmm2                             \n"
5241     "mulps     %%xmm5,%%xmm6                             \n"
5242     "mulps     %%xmm2,%%xmm1                             \n"
5243     "mulps     %%xmm6,%%xmm5                             \n"
5244     "mulps     " MEMACCESS2(0x20,3) ",%%xmm2             \n"
5245     "mulps     " MEMACCESS2(0x20,3) ",%%xmm6             \n"
5246     "mulps     " MEMACCESS2(0x30,3) ",%%xmm1             \n"
5247     "mulps     " MEMACCESS2(0x30,3) ",%%xmm5             \n"
// Sum all terms, truncate to ints, saturate-pack back to 8 bytes, store.
5248     "addps     %%xmm2,%%xmm0                             \n"
5249     "addps     %%xmm6,%%xmm4                             \n"
5250     "addps     %%xmm1,%%xmm0                             \n"
5251     "addps     %%xmm5,%%xmm4                             \n"
5252     "cvttps2dq %%xmm0,%%xmm0                             \n"
5253     "cvttps2dq %%xmm4,%%xmm4                             \n"
5254     "packuswb  %%xmm4,%%xmm0                             \n"
5255     "packuswb  %%xmm0,%%xmm0                             \n"
5256     "movq      %%xmm0," MEMACCESS(1) "                   \n"
5257     "lea       " MEMLEA(0x8,1) ",%1                      \n"
5260   : "+r"(src_argb),  // %0
5261     "+r"(dst_argb),  // %1
5265     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5268 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
5270 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2+FMA version of ARGBPolynomialRow. The four 128-bit coefficient
// vectors (C0..C3 at poly+0x00/0x10/0x20/0x30) are broadcast to full ymm
// registers up front, then each iteration evaluates
//   out = C0 + C1*X + C2*X^2 + C3*X^3
// for 2 pixels (8 channels as floats) using fused multiply-adds.
// NOTE(review): asm opener, loop label, width operand/decrement,
// vzeroupper and closing are elided in this excerpt.
5271 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5272                             uint8* dst_argb, const float* poly,
5275     "vbroadcastf128 " MEMACCESS(3) ",%%ymm4              \n"
5276     "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5        \n"
5277     "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6        \n"
5278     "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7        \n"
// Per-iteration: zero-extend 8 bytes to 8 dwords, convert to floats,
// evaluate the cubic, then narrow with unsigned saturation back to bytes.
5283     "vpmovzxbd   " MEMACCESS(0) ",%%ymm0                 \n"  // 2 ARGB pixels
5284     "lea         " MEMLEA(0x8,0) ",%0                    \n"
5285     "vcvtdq2ps   %%ymm0,%%ymm0                           \n"  // X 8 floats
5286     "vmulps      %%ymm0,%%ymm0,%%ymm2                    \n"  // X * X
5287     "vmulps      %%ymm7,%%ymm0,%%ymm3                    \n"  // C3 * X
5288     "vfmadd132ps %%ymm5,%%ymm4,%%ymm0                    \n"  // result = C0 + C1 * X
5289     "vfmadd231ps %%ymm6,%%ymm2,%%ymm0                    \n"  // result += C2 * X * X
5290     "vfmadd231ps %%ymm3,%%ymm2,%%ymm0                    \n"  // result += C3 * X * X * X
5291     "vcvttps2dq  %%ymm0,%%ymm0                           \n"
// vpackusdw/vpermq/vpackuswb: dwords -> words -> fix lane order -> bytes.
5292     "vpackusdw   %%ymm0,%%ymm0,%%ymm0                    \n"
5293     "vpermq      $0xd8,%%ymm0,%%ymm0                     \n"
5294     "vpackuswb   %%xmm0,%%xmm0,%%xmm0                    \n"
5295     "vmovq       %%xmm0," MEMACCESS(1) "                 \n"
5296     "lea         " MEMLEA(0x8,1) ",%1                    \n"
5300   : "+r"(src_argb),  // %0
5301     "+r"(dst_argb),  // %1
5305     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5308 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
5310 #ifdef HAS_ARGBCOLORTABLEROW_X86
5311 // Tranform ARGB pixels with color table.
// Remaps each ARGB pixel in place through a color table: for each of the
// 4 channels c (0..3), dst[c] = table_argb[dst[c]*4 + c]. The table is
// laid out with 4 bytes per entry so each channel has its own column.
// Scalar x86 only (no SIMD); one pixel per visible loop body, dst pointer
// advanced by 4 up front and accessed at negative offsets afterwards.
// NOTE(review): asm opener, loop label, width operand/decrement and
// closing are elided in this excerpt.
5312 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
// Scratch register; "+d" constraint so %b1 (the low byte) is addressable.
5314   uintptr_t pixel_temp = 0u;
// Channel 0 (blue): index = old byte, fetch table[index*4+0], write back.
5319     "movzb     " MEMACCESS(0) ",%1                       \n"
5320     "lea       " MEMLEA(0x4,0) ",%0                      \n"
5321     MEMOPARG(movzb,0x00,3,1,4,1) "                       \n"  // movzb     (%3,%1,4),%1
5322     "mov       %b1," MEMACCESS2(-0x4,0) "                \n"
// Channel 1 (green): table[index*4+1].
5323     "movzb     " MEMACCESS2(-0x3,0) ",%1                 \n"
5324     MEMOPARG(movzb,0x01,3,1,4,1) "                       \n"  // movzb     0x1(%3,%1,4),%1
5325     "mov       %b1," MEMACCESS2(-0x3,0) "                \n"
// Channel 2 (red): table[index*4+2].
5326     "movzb     " MEMACCESS2(-0x2,0) ",%1                 \n"
5327     MEMOPARG(movzb,0x02,3,1,4,1) "                       \n"  // movzb     0x2(%3,%1,4),%1
5328     "mov       %b1," MEMACCESS2(-0x2,0) "                \n"
// Channel 3 (alpha): table[index*4+3].
5329     "movzb     " MEMACCESS2(-0x1,0) ",%1                 \n"
5330     MEMOPARG(movzb,0x03,3,1,4,1) "                       \n"  // movzb     0x3(%3,%1,4),%1
5331     "mov       %b1," MEMACCESS2(-0x1,0) "                \n"
5334   : "+r"(dst_argb),     // %0
5335     "+d"(pixel_temp),   // %1
5337   : "r"(table_argb)     // %3
5340 #endif // HAS_ARGBCOLORTABLEROW_X86
5342 #ifdef HAS_RGBCOLORTABLEROW_X86
5343 // Tranform RGB pixels with color table.
// Same per-channel table remap as ARGBColorTableRow_X86, but only the
// B, G and R channels (offsets -0x4/-0x3/-0x2 after the lea) are looked
// up; the alpha byte at -0x1 is left untouched. Table stride is still
// 4 bytes per entry.
// NOTE(review): asm opener, loop label, width operand/decrement and
// closing are elided in this excerpt.
5344 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
// Scratch register; "+d" constraint so %b1 (the low byte) is addressable.
5345   uintptr_t pixel_temp = 0u;
// Channel 0 (blue): dst[0] = table[dst[0]*4 + 0].
5350     "movzb     " MEMACCESS(0) ",%1                       \n"
5351     "lea       " MEMLEA(0x4,0) ",%0                      \n"
5352     MEMOPARG(movzb,0x00,3,1,4,1) "                       \n"  // movzb     (%3,%1,4),%1
5353     "mov       %b1," MEMACCESS2(-0x4,0) "                \n"
// Channel 1 (green): dst[1] = table[dst[1]*4 + 1].
5354     "movzb     " MEMACCESS2(-0x3,0) ",%1                 \n"
5355     MEMOPARG(movzb,0x01,3,1,4,1) "                       \n"  // movzb     0x1(%3,%1,4),%1
5356     "mov       %b1," MEMACCESS2(-0x3,0) "                \n"
// Channel 2 (red): dst[2] = table[dst[2]*4 + 2]. Alpha is not remapped.
5357     "movzb     " MEMACCESS2(-0x2,0) ",%1                 \n"
5358     MEMOPARG(movzb,0x02,3,1,4,1) "                       \n"  // movzb     0x2(%3,%1,4),%1
5359     "mov       %b1," MEMACCESS2(-0x2,0) "                \n"
5362   : "+r"(dst_argb),     // %0
5363     "+d"(pixel_temp),   // %1
5365   : "r"(table_argb)     // %3
5368 #endif // HAS_RGBCOLORTABLEROW_X86
5370 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5371 // Tranform RGB pixels with luma table.
// Remaps ARGB pixels through a luma-dependent color table: for each pixel
// a luma value is computed from its channels (pmaddubsw with the
// |lumacoeff|-derived weights in xmm3, horizontal add, then masked to a
// multiple of 256 via the psllw $8 mask in xmm4) and used as a row offset
// into |luma|. B, G and R are then looked up byte-by-byte in that row;
// alpha is copied through unchanged. Processes 4 pixels (16 bytes) per
// visible loop body, with one movd/pshufd rotation per pixel to extract
// the next 32-bit offset from xmm0.
// NOTE(review): the asm opener, the movd that loads lumacoeff into xmm3,
// the adds that combine the offset with the luma base pointer, loop label,
// width decrement and closing are elided in this excerpt — the offset
// arithmetic description is inferred; confirm against the full source.
5372 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5374                                  const uint8* luma, uint32 lumacoeff) {
5375   uintptr_t pixel_temp = 0u;
5376   uintptr_t table_temp = 0u;
// Setup: broadcast luma coefficients; build a 0xFF00 word mask in xmm4;
// zero xmm5 for widening.
5379     "pshufd    $0x0,%%xmm3,%%xmm3                        \n"
5380     "pcmpeqb   %%xmm4,%%xmm4                             \n"
5381     "psllw     $0x8,%%xmm4                               \n"
5382     "pxor      %%xmm5,%%xmm5                             \n"
// Per-iteration: weighted sum of channels for 4 pixels -> 4 dword offsets
// in xmm0 (each masked to a 256-aligned table-row offset).
5387     "movdqu    " MEMACCESS(2) ",%%xmm0                   \n"
5388     "pmaddubsw %%xmm3,%%xmm0                             \n"
5389     "phaddw    %%xmm0,%%xmm0                             \n"
5390     "pand      %%xmm4,%%xmm0                             \n"
5391     "punpcklwd %%xmm5,%%xmm0                             \n"
// Pixel 0: extract its offset, rotate xmm0 for the next pixel, then look
// up B, G, R in the selected luma row (%1) and copy alpha directly.
5392     "movd      %%xmm0,%k1                                \n"  // 32 bit offset
5394     "pshufd    $0x39,%%xmm0,%%xmm0                       \n"
5396     "movzb     " MEMACCESS(2) ",%0                       \n"
5397     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5398     "mov       %b0," MEMACCESS(3) "                      \n"
5399     "movzb     " MEMACCESS2(0x1,2) ",%0                  \n"
5400     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5401     "mov       %b0," MEMACCESS2(0x1,3) "                 \n"
5402     "movzb     " MEMACCESS2(0x2,2) ",%0                  \n"
5403     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5404     "mov       %b0," MEMACCESS2(0x2,3) "                 \n"
5405     "movzb     " MEMACCESS2(0x3,2) ",%0                  \n"
5406     "mov       %b0," MEMACCESS2(0x3,3) "                 \n"
// Pixel 1: same pattern at source/dest offsets 0x4..0x7.
5408     "movd      %%xmm0,%k1                                \n"  // 32 bit offset
5410     "pshufd    $0x39,%%xmm0,%%xmm0                       \n"
5412     "movzb     " MEMACCESS2(0x4,2) ",%0                  \n"
5413     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5414     "mov       %b0," MEMACCESS2(0x4,3) "                 \n"
5415     "movzb     " MEMACCESS2(0x5,2) ",%0                  \n"
5416     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5417     "mov       %b0," MEMACCESS2(0x5,3) "                 \n"
5418     "movzb     " MEMACCESS2(0x6,2) ",%0                  \n"
5419     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5420     "mov       %b0," MEMACCESS2(0x6,3) "                 \n"
5421     "movzb     " MEMACCESS2(0x7,2) ",%0                  \n"
5422     "mov       %b0," MEMACCESS2(0x7,3) "                 \n"
// Pixel 2: offsets 0x8..0xb.
5424     "movd      %%xmm0,%k1                                \n"  // 32 bit offset
5426     "pshufd    $0x39,%%xmm0,%%xmm0                       \n"
5428     "movzb     " MEMACCESS2(0x8,2) ",%0                  \n"
5429     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5430     "mov       %b0," MEMACCESS2(0x8,3) "                 \n"
5431     "movzb     " MEMACCESS2(0x9,2) ",%0                  \n"
5432     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5433     "mov       %b0," MEMACCESS2(0x9,3) "                 \n"
5434     "movzb     " MEMACCESS2(0xa,2) ",%0                  \n"
5435     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5436     "mov       %b0," MEMACCESS2(0xa,3) "                 \n"
5437     "movzb     " MEMACCESS2(0xb,2) ",%0                  \n"
5438     "mov       %b0," MEMACCESS2(0xb,3) "                 \n"
// Pixel 3: offsets 0xc..0xf; no pshufd rotation needed after the last movd.
5440     "movd      %%xmm0,%k1                                \n"  // 32 bit offset
5443     "movzb     " MEMACCESS2(0xc,2) ",%0                  \n"
5444     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5445     "mov       %b0," MEMACCESS2(0xc,3) "                 \n"
5446     "movzb     " MEMACCESS2(0xd,2) ",%0                  \n"
5447     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5448     "mov       %b0," MEMACCESS2(0xd,3) "                 \n"
5449     "movzb     " MEMACCESS2(0xe,2) ",%0                  \n"
5450     MEMOPARG(movzb,0x00,1,0,1,0) "                       \n"  // movzb     (%1,%0,1),%0
5451     "mov       %b0," MEMACCESS2(0xe,3) "                 \n"
5452     "movzb     " MEMACCESS2(0xf,2) ",%0                  \n"
5453     "mov       %b0," MEMACCESS2(0xf,3) "                 \n"
// Advance src and dst by 4 pixels.
5454     "lea       " MEMLEA(0x10,2) ",%2                     \n"
5455     "lea       " MEMLEA(0x10,3) ",%3                     \n"
5458   : "+d"(pixel_temp),  // %0
5459     "+a"(table_temp),  // %1
5460     "+r"(src_argb),    // %2
5461     "+r"(dst_argb),    // %3
5464     "rm"(lumacoeff)    // %6
5465   : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5468 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5470 #endif // defined(__x86_64__) || defined(__i386__)
5474 } // namespace libyuv