3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree.
12 #include "libyuv/row.h"
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Luma coefficients in B,G,R,A byte order, one set per ARGB pixel, used with
// pmaddubsw in ARGBToYRow_SSSE3 below. Result is shifted right 7 and biased
// by kAddY16 there, i.e. Y = (13*B + 65*G + 33*R) >> 7 + 16.
// NOTE(review): presumably BT.601 studio-range weights — confirm against spec.
25 static vec8 kARGBToY = {
26 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
// Full-range ("J" / JPEG-style) luma coefficients; ARGBToYJRow adds kAddYJ64
// (0.5 in 7-bit fixed point) before the >>7 instead of adding 16 afterwards.
30 static vec8 kARGBToYJ = {
31 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
33 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
35 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
// Chroma (U/V) and luma coefficient tables for the pmaddubsw-based row
// converters below. Each table holds the same 4-byte weight group repeated
// 4x to cover a 16-byte XMM register; the byte order of the weights matches
// the source pixel layout named in the constant (ARGB, BGRA, ABGR, RGBA).
37 static vec8 kARGBToU = {
38 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
// "J" (full-range) chroma variants, used with kAddUVJ128 rounding.
41 static vec8 kARGBToUJ = {
42 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
45 static vec8 kARGBToV = {
46 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
49 static vec8 kARGBToVJ = {
50 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
// BGRA order: alpha first, so the weight for the leading byte is 0.
54 static vec8 kBGRAToY = {
55 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
58 static vec8 kBGRAToU = {
59 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
62 static vec8 kBGRAToV = {
63 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
// ABGR order: same weights as ARGB with R and B swapped.
67 static vec8 kABGRToY = {
68 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
71 static vec8 kABGRToU = {
72 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
75 static vec8 kABGRToV = {
76 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
79 // Constants for RGBA.
80 static vec8 kRGBAToY = {
81 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
84 static vec8 kRGBAToU = {
85 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
88 static vec8 kRGBAToV = {
89 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
// +16 luma offset added (paddb) after the >>7 in the non-J Y converters.
92 static uvec8 kAddY16 = {
93 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
96 // 7 bit fixed point 0.5.
97 static vec16 kAddYJ64 = {
98 64, 64, 64, 64, 64, 64, 64, 64
// +128 chroma bias added after packsswb in the UV converters.
101 static uvec8 kAddUV128 = {
102 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
103 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
// Word-sized 0x8080 = rounding 0.5 plus the +128 bias, added before psraw
// in the "J" UV converters (see ARGBToUVJRow_SSSE3).
106 static uvec16 kAddUVJ128 = {
107 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
109 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
111 #ifdef HAS_RGB24TOARGBROW_SSSE3
113 // Shuffle table for converting RGB24 to ARGB.
// Index 128 in a pshufb mask zeroes the destination byte; the converters
// then OR in 0xff alpha (or leave the lane unused) as needed.
114 static uvec8 kShuffleMaskRGB24ToARGB = {
115 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
118 // Shuffle table for converting RAW to ARGB.
119 static uvec8 kShuffleMaskRAWToARGB = {
120 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
123 // Shuffle table for converting ARGB to RGB24.
124 static uvec8 kShuffleMaskARGBToRGB24 = {
125 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
128 // Shuffle table for converting ARGB to RAW.
129 static uvec8 kShuffleMaskARGBToRAW = {
130 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
133 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
134 static uvec8 kShuffleMaskARGBToRGB24_0 = {
135 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
// NOTE(review): comment below says "ARGB to RAW" but this is the split
// variant (_0) used like kShuffleMaskARGBToRGB24_0 — confirm intended use.
138 // Shuffle table for converting ARGB to RAW.
139 static uvec8 kShuffleMaskARGBToRAW_0 = {
140 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142 #endif // HAS_RGB24TOARGBROW_SSSE3
144 #if defined(TESTING) && defined(__x86_64__)
// Test-only scaffold (compiled only with -DTESTING on x86_64). The mov/lea
// sequences below touch every general-purpose register in both 64- and
// 32-bit forms; presumably this exercises the assembler/NaCl sandbox
// encodings rather than doing useful pixel work — confirm with maintainers.
// NOTE(review): the asm-statement opener, loop label and trailing operand
// lines are not visible in this chunk of the file.
145 void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
159 "mov %%r10d,%%r10d \n"
160 "mov %%r11d,%%r11d \n"
161 "mov %%r12d,%%r12d \n"
162 "mov %%r13d,%%r13d \n"
163 "mov %%r14d,%%r14d \n"
164 "mov %%r15d,%%r15d \n"
166 "lea (%%rax),%%eax \n"
167 "lea (%%rbx),%%ebx \n"
168 "lea (%%rcx),%%ecx \n"
169 "lea (%%rdx),%%edx \n"
170 "lea (%%rsi),%%esi \n"
171 "lea (%%rdi),%%edi \n"
172 "lea (%%rbp),%%ebp \n"
173 "lea (%%rsp),%%esp \n"
175 "lea (%%r8),%%r8d \n"
176 "lea (%%r9),%%r9d \n"
177 "lea (%%r10),%%r10d \n"
178 "lea (%%r11),%%r11d \n"
179 "lea (%%r12),%%r12d \n"
180 "lea (%%r13),%%r13d \n"
181 "lea (%%r14),%%r14d \n"
182 "lea (%%r15),%%r15d \n"
// Same register sweep with a 0x10 displacement.
185 "lea 0x10(%%rax),%%eax \n"
186 "lea 0x10(%%rbx),%%ebx \n"
187 "lea 0x10(%%rcx),%%ecx \n"
188 "lea 0x10(%%rdx),%%edx \n"
189 "lea 0x10(%%rsi),%%esi \n"
190 "lea 0x10(%%rdi),%%edi \n"
191 "lea 0x10(%%rbp),%%ebp \n"
192 "lea 0x10(%%rsp),%%esp \n"
194 "lea 0x10(%%r8),%%r8d \n"
195 "lea 0x10(%%r9),%%r9d \n"
196 "lea 0x10(%%r10),%%r10d \n"
197 "lea 0x10(%%r11),%%r11d \n"
198 "lea 0x10(%%r12),%%r12d \n"
199 "lea 0x10(%%r13),%%r13d \n"
200 "lea 0x10(%%r14),%%r14d \n"
201 "lea 0x10(%%r15),%%r15d \n"
// Copy loop: read 8 bytes from %0 (src_y), store 16 bytes to %1 (dst_argb).
224 "movq " MEMACCESS(0) ",%%xmm0 \n"
225 "lea " MEMLEA(0x8,0) ",%0 \n"
226 "movdqu %%xmm0," MEMACCESS(1) " \n"
227 "lea " MEMLEA(0x20,1) ",%1 \n"
231 "+r"(dst_argb), // %1
234 : "memory", "cc", "xmm0", "xmm1", "xmm5"
239 #ifdef HAS_I400TOARGBROW_SSE2
// Expand 8 grey (I400) pixels per iteration into 8 ARGB pixels:
// each Y byte is replicated into B, G and R via punpcklbw/punpcklwd,
// and xmm5 (0xff000000 per lane) supplies opaque alpha via por.
240 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
242 "pcmpeqb %%xmm5,%%xmm5 \n"
243 "pslld $0x18,%%xmm5 \n"
246 "movq " MEMACCESS(0) ",%%xmm0 \n"
247 "lea " MEMLEA(0x8,0) ",%0 \n"
248 "punpcklbw %%xmm0,%%xmm0 \n"
249 "movdqa %%xmm0,%%xmm1 \n"
250 "punpcklwd %%xmm0,%%xmm0 \n"
251 "punpckhwd %%xmm1,%%xmm1 \n"
252 "por %%xmm5,%%xmm0 \n"
253 "por %%xmm5,%%xmm1 \n"
254 "movdqu %%xmm0," MEMACCESS(1) " \n"
255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
256 "lea " MEMLEA(0x20,1) ",%1 \n"
260 "+r"(dst_argb), // %1
262 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
265 #endif // HAS_I400TOARGBROW_SSE2
267 #ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert 16 RGB24 pixels (48 bytes) per iteration to 16 ARGB pixels
// (64 bytes). palignr re-aligns the packed 3-byte pixels across the three
// input registers, pshufb (kShuffleMaskRGB24ToARGB in xmm4) spreads them to
// 4-byte lanes, and xmm5 ORs in the 0xff000000 alpha.
268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
271 "pslld $0x18,%%xmm5 \n"
272 "movdqa %3,%%xmm4 \n"
275 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
276 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
277 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
278 "lea " MEMLEA(0x30,0) ",%0 \n"
279 "movdqa %%xmm3,%%xmm2 \n"
280 "palignr $0x8,%%xmm1,%%xmm2 \n"
281 "pshufb %%xmm4,%%xmm2 \n"
282 "por %%xmm5,%%xmm2 \n"
283 "palignr $0xc,%%xmm0,%%xmm1 \n"
284 "pshufb %%xmm4,%%xmm0 \n"
285 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
286 "por %%xmm5,%%xmm0 \n"
287 "pshufb %%xmm4,%%xmm1 \n"
288 "movdqu %%xmm0," MEMACCESS(1) " \n"
289 "por %%xmm5,%%xmm1 \n"
290 "palignr $0x4,%%xmm3,%%xmm3 \n"
291 "pshufb %%xmm4,%%xmm3 \n"
292 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
293 "por %%xmm5,%%xmm3 \n"
294 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
295 "lea " MEMLEA(0x40,1) ",%1 \n"
298 : "+r"(src_rgb24), // %0
299 "+r"(dst_argb), // %1
301 : "m"(kShuffleMaskRGB24ToARGB) // %3
302 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Same structure as RGB24ToARGBRow_SSSE3 above, but with the RAW (BGR byte
// order) shuffle mask kShuffleMaskRAWToARGB so R and B are swapped during
// the pshufb; all other instructions are identical.
306 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
308 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
309 "pslld $0x18,%%xmm5 \n"
310 "movdqa %3,%%xmm4 \n"
313 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
314 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
315 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
316 "lea " MEMLEA(0x30,0) ",%0 \n"
317 "movdqa %%xmm3,%%xmm2 \n"
318 "palignr $0x8,%%xmm1,%%xmm2 \n"
319 "pshufb %%xmm4,%%xmm2 \n"
320 "por %%xmm5,%%xmm2 \n"
321 "palignr $0xc,%%xmm0,%%xmm1 \n"
322 "pshufb %%xmm4,%%xmm0 \n"
323 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
324 "por %%xmm5,%%xmm0 \n"
325 "pshufb %%xmm4,%%xmm1 \n"
326 "movdqu %%xmm0," MEMACCESS(1) " \n"
327 "por %%xmm5,%%xmm1 \n"
328 "palignr $0x4,%%xmm3,%%xmm3 \n"
329 "pshufb %%xmm4,%%xmm3 \n"
330 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
331 "por %%xmm5,%%xmm3 \n"
332 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
333 "lea " MEMLEA(0x40,1) ",%1 \n"
336 : "+r"(src_raw), // %0
337 "+r"(dst_argb), // %1
339 : "m"(kShuffleMaskRAWToARGB) // %3
340 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 8 RGB565 pixels (16 bytes) per iteration to 8 ARGB pixels.
// xmm5 = 0x01080108 replicated (pmulhuw scale to expand 5-bit channels to
// 8 bits), xmm6 = 0x20802080 (6-bit green scale), xmm3/xmm4 are the red and
// green field masks, xmm7 = 0xff00 per word for opaque alpha.
344 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
346 "mov $0x1080108,%%eax \n"
347 "movd %%eax,%%xmm5 \n"
348 "pshufd $0x0,%%xmm5,%%xmm5 \n"
349 "mov $0x20802080,%%eax \n"
350 "movd %%eax,%%xmm6 \n"
351 "pshufd $0x0,%%xmm6,%%xmm6 \n"
352 "pcmpeqb %%xmm3,%%xmm3 \n"
353 "psllw $0xb,%%xmm3 \n"
354 "pcmpeqb %%xmm4,%%xmm4 \n"
355 "psllw $0xa,%%xmm4 \n"
356 "psrlw $0x5,%%xmm4 \n"
357 "pcmpeqb %%xmm7,%%xmm7 \n"
358 "psllw $0x8,%%xmm7 \n"
363 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
364 "movdqa %%xmm0,%%xmm1 \n"
365 "movdqa %%xmm0,%%xmm2 \n"
366 "pand %%xmm3,%%xmm1 \n"
367 "psllw $0xb,%%xmm2 \n"
368 "pmulhuw %%xmm5,%%xmm1 \n"
369 "pmulhuw %%xmm5,%%xmm2 \n"
370 "psllw $0x8,%%xmm1 \n"
371 "por %%xmm2,%%xmm1 \n"
372 "pand %%xmm4,%%xmm0 \n"
373 "pmulhuw %%xmm6,%%xmm0 \n"
374 "por %%xmm7,%%xmm0 \n"
375 "movdqa %%xmm1,%%xmm2 \n"
376 "punpcklbw %%xmm0,%%xmm1 \n"
377 "punpckhbw %%xmm0,%%xmm2 \n"
// Destination is addressed as (%1,%0,2): dst advances at 2x the src pointer.
378 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
379 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
380 "lea " MEMLEA(0x10,0) ",%0 \n"
387 : "memory", "cc", "eax", NACL_R14
388 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Convert 8 ARGB1555 pixels (16 bytes) per iteration to 8 ARGB pixels.
// Similar scheme to RGB565ToARGBRow_SSE2 but with 5-bit green (xmm6 =
// 0x42004200 scale) and a real 1-bit alpha: psraw $0x8 sign-extends the top
// bit across the byte, masked by xmm7 to form 0x00 or 0xff alpha.
392 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
394 "mov $0x1080108,%%eax \n"
395 "movd %%eax,%%xmm5 \n"
396 "pshufd $0x0,%%xmm5,%%xmm5 \n"
397 "mov $0x42004200,%%eax \n"
398 "movd %%eax,%%xmm6 \n"
399 "pshufd $0x0,%%xmm6,%%xmm6 \n"
400 "pcmpeqb %%xmm3,%%xmm3 \n"
401 "psllw $0xb,%%xmm3 \n"
402 "movdqa %%xmm3,%%xmm4 \n"
403 "psrlw $0x6,%%xmm4 \n"
404 "pcmpeqb %%xmm7,%%xmm7 \n"
405 "psllw $0x8,%%xmm7 \n"
410 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
411 "movdqa %%xmm0,%%xmm1 \n"
412 "movdqa %%xmm0,%%xmm2 \n"
413 "psllw $0x1,%%xmm1 \n"
414 "psllw $0xb,%%xmm2 \n"
415 "pand %%xmm3,%%xmm1 \n"
416 "pmulhuw %%xmm5,%%xmm2 \n"
417 "pmulhuw %%xmm5,%%xmm1 \n"
418 "psllw $0x8,%%xmm1 \n"
419 "por %%xmm2,%%xmm1 \n"
420 "movdqa %%xmm0,%%xmm2 \n"
421 "pand %%xmm4,%%xmm0 \n"
422 "psraw $0x8,%%xmm2 \n"
423 "pmulhuw %%xmm6,%%xmm0 \n"
424 "pand %%xmm7,%%xmm2 \n"
425 "por %%xmm2,%%xmm0 \n"
426 "movdqa %%xmm1,%%xmm2 \n"
427 "punpcklbw %%xmm0,%%xmm1 \n"
428 "punpckhbw %%xmm0,%%xmm2 \n"
429 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
430 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
431 "lea " MEMLEA(0x10,0) ",%0 \n"
438 : "memory", "cc", "eax", NACL_R14
439 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Convert 8 ARGB4444 pixels (16 bytes) per iteration to 8 ARGB pixels.
// xmm4 masks the low nibbles (0x0f0f0f0f), xmm5 the high nibbles; each
// 4-bit channel is expanded to 8 bits by replicating the nibble
// (x | x<<4, resp. x | x>>4), then low/high halves are interleaved.
443 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
445 "mov $0xf0f0f0f,%%eax \n"
446 "movd %%eax,%%xmm4 \n"
447 "pshufd $0x0,%%xmm4,%%xmm4 \n"
448 "movdqa %%xmm4,%%xmm5 \n"
449 "pslld $0x4,%%xmm5 \n"
454 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
455 "movdqa %%xmm0,%%xmm2 \n"
456 "pand %%xmm4,%%xmm0 \n"
457 "pand %%xmm5,%%xmm2 \n"
458 "movdqa %%xmm0,%%xmm1 \n"
459 "movdqa %%xmm2,%%xmm3 \n"
460 "psllw $0x4,%%xmm1 \n"
461 "psrlw $0x4,%%xmm3 \n"
462 "por %%xmm1,%%xmm0 \n"
463 "por %%xmm3,%%xmm2 \n"
464 "movdqa %%xmm0,%%xmm1 \n"
465 "punpcklbw %%xmm2,%%xmm0 \n"
466 "punpckhbw %%xmm2,%%xmm1 \n"
467 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
468 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
469 "lea " MEMLEA(0x10,0) ",%0 \n"
476 : "memory", "cc", "eax", NACL_R14
477 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 16 ARGB pixels (64 bytes) per iteration to 16 RGB24 pixels
// (48 bytes). pshufb with kShuffleMaskARGBToRGB24 drops the alpha byte
// from each register; the pslldq/psrldq/por sequence re-packs the four
// 12-byte results into three contiguous 16-byte stores.
481 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
483 "movdqa %3,%%xmm6 \n"
486 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
487 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
488 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
489 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
490 "lea " MEMLEA(0x40,0) ",%0 \n"
491 "pshufb %%xmm6,%%xmm0 \n"
492 "pshufb %%xmm6,%%xmm1 \n"
493 "pshufb %%xmm6,%%xmm2 \n"
494 "pshufb %%xmm6,%%xmm3 \n"
495 "movdqa %%xmm1,%%xmm4 \n"
496 "psrldq $0x4,%%xmm1 \n"
497 "pslldq $0xc,%%xmm4 \n"
498 "movdqa %%xmm2,%%xmm5 \n"
499 "por %%xmm4,%%xmm0 \n"
500 "pslldq $0x8,%%xmm5 \n"
501 "movdqu %%xmm0," MEMACCESS(1) " \n"
502 "por %%xmm5,%%xmm1 \n"
503 "psrldq $0x8,%%xmm2 \n"
504 "pslldq $0x4,%%xmm3 \n"
505 "por %%xmm3,%%xmm2 \n"
506 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
507 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
508 "lea " MEMLEA(0x30,1) ",%1 \n"
514 : "m"(kShuffleMaskARGBToRGB24) // %3
515 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
// Identical instruction sequence to ARGBToRGB24Row_SSSE3 above, but using
// kShuffleMaskARGBToRAW so the output is BGR (RAW) byte order.
519 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
521 "movdqa %3,%%xmm6 \n"
524 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
525 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
526 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
527 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
528 "lea " MEMLEA(0x40,0) ",%0 \n"
529 "pshufb %%xmm6,%%xmm0 \n"
530 "pshufb %%xmm6,%%xmm1 \n"
531 "pshufb %%xmm6,%%xmm2 \n"
532 "pshufb %%xmm6,%%xmm3 \n"
533 "movdqa %%xmm1,%%xmm4 \n"
534 "psrldq $0x4,%%xmm1 \n"
535 "pslldq $0xc,%%xmm4 \n"
536 "movdqa %%xmm2,%%xmm5 \n"
537 "por %%xmm4,%%xmm0 \n"
538 "pslldq $0x8,%%xmm5 \n"
539 "movdqu %%xmm0," MEMACCESS(1) " \n"
540 "por %%xmm5,%%xmm1 \n"
541 "psrldq $0x8,%%xmm2 \n"
542 "pslldq $0x4,%%xmm3 \n"
543 "por %%xmm3,%%xmm2 \n"
544 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
545 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
546 "lea " MEMLEA(0x30,1) ",%1 \n"
552 : "m"(kShuffleMaskARGBToRAW) // %3
553 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 RGB565 pixels
// (8 bytes). xmm3/xmm4/xmm5 hold the 5-bit blue, 6-bit green and 5-bit red
// field masks; channels are shifted into place per 32-bit lane, then
// packssdw narrows the four lanes into 16-bit pixels.
557 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
559 "pcmpeqb %%xmm3,%%xmm3 \n"
560 "psrld $0x1b,%%xmm3 \n"
561 "pcmpeqb %%xmm4,%%xmm4 \n"
562 "psrld $0x1a,%%xmm4 \n"
563 "pslld $0x5,%%xmm4 \n"
564 "pcmpeqb %%xmm5,%%xmm5 \n"
565 "pslld $0xb,%%xmm5 \n"
568 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
569 "movdqa %%xmm0,%%xmm1 \n"
570 "movdqa %%xmm0,%%xmm2 \n"
571 "pslld $0x8,%%xmm0 \n"
572 "psrld $0x3,%%xmm1 \n"
573 "psrld $0x5,%%xmm2 \n"
// psrad (arithmetic) keeps packssdw from saturating the red+alpha field.
574 "psrad $0x10,%%xmm0 \n"
575 "pand %%xmm3,%%xmm1 \n"
576 "pand %%xmm4,%%xmm2 \n"
577 "pand %%xmm5,%%xmm0 \n"
578 "por %%xmm2,%%xmm1 \n"
579 "por %%xmm1,%%xmm0 \n"
580 "packssdw %%xmm0,%%xmm0 \n"
581 "lea " MEMLEA(0x10,0) ",%0 \n"
582 "movq %%xmm0," MEMACCESS(1) " \n"
583 "lea " MEMLEA(0x8,1) ",%1 \n"
589 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 ARGB1555 pixels
// (8 bytes). xmm4/xmm5/xmm6 are the 5-bit B/G/R field masks, xmm7 the
// 1-bit alpha mask (bit 15); same shift/mask/packssdw scheme as
// ARGBToRGB565Row_SSE2 above.
593 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
595 "pcmpeqb %%xmm4,%%xmm4 \n"
596 "psrld $0x1b,%%xmm4 \n"
597 "movdqa %%xmm4,%%xmm5 \n"
598 "pslld $0x5,%%xmm5 \n"
599 "movdqa %%xmm4,%%xmm6 \n"
600 "pslld $0xa,%%xmm6 \n"
601 "pcmpeqb %%xmm7,%%xmm7 \n"
602 "pslld $0xf,%%xmm7 \n"
605 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "movdqa %%xmm0,%%xmm2 \n"
608 "movdqa %%xmm0,%%xmm3 \n"
609 "psrad $0x10,%%xmm0 \n"
610 "psrld $0x3,%%xmm1 \n"
611 "psrld $0x6,%%xmm2 \n"
612 "psrld $0x9,%%xmm3 \n"
613 "pand %%xmm7,%%xmm0 \n"
614 "pand %%xmm4,%%xmm1 \n"
615 "pand %%xmm5,%%xmm2 \n"
616 "pand %%xmm6,%%xmm3 \n"
617 "por %%xmm1,%%xmm0 \n"
618 "por %%xmm3,%%xmm2 \n"
619 "por %%xmm2,%%xmm0 \n"
620 "packssdw %%xmm0,%%xmm0 \n"
621 "lea " MEMLEA(0x10,0) ",%0 \n"
622 "movq %%xmm0," MEMACCESS(1) " \n"
623 "lea " MEMLEA(0x8,1) ",%1 \n"
630 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 ARGB4444 pixels
// (8 bytes). xmm3/xmm4 mask the high nibble of alternating bytes; the two
// psrlq shifts align the nibbles so packuswb can emit 4-bit-per-channel
// pixels.
634 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
636 "pcmpeqb %%xmm4,%%xmm4 \n"
637 "psllw $0xc,%%xmm4 \n"
638 "movdqa %%xmm4,%%xmm3 \n"
639 "psrlw $0x8,%%xmm3 \n"
642 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
643 "movdqa %%xmm0,%%xmm1 \n"
644 "pand %%xmm3,%%xmm0 \n"
645 "pand %%xmm4,%%xmm1 \n"
646 "psrlq $0x4,%%xmm0 \n"
647 "psrlq $0x8,%%xmm1 \n"
648 "por %%xmm1,%%xmm0 \n"
649 "packuswb %%xmm0,%%xmm0 \n"
650 "lea " MEMLEA(0x10,0) ",%0 \n"
651 "movq %%xmm0," MEMACCESS(1) " \n"
652 "lea " MEMLEA(0x8,1) ",%1 \n"
658 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
663 #ifdef HAS_ARGBTOYROW_SSSE3
664 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Per iteration: pmaddubsw with kARGBToY (xmm4) forms B*13+G*65 and
// R*33+A*0 word pairs, phaddw sums them, >>7 scales back from fixed point,
// packuswb narrows to bytes and paddb adds the +16 offset (kAddY16, xmm5).
665 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
667 "movdqa %3,%%xmm4 \n"
668 "movdqa %4,%%xmm5 \n"
671 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
672 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
673 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
674 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
675 "pmaddubsw %%xmm4,%%xmm0 \n"
676 "pmaddubsw %%xmm4,%%xmm1 \n"
677 "pmaddubsw %%xmm4,%%xmm2 \n"
678 "pmaddubsw %%xmm4,%%xmm3 \n"
679 "lea " MEMLEA(0x40,0) ",%0 \n"
680 "phaddw %%xmm1,%%xmm0 \n"
681 "phaddw %%xmm3,%%xmm2 \n"
682 "psrlw $0x7,%%xmm0 \n"
683 "psrlw $0x7,%%xmm2 \n"
684 "packuswb %%xmm2,%%xmm0 \n"
685 "paddb %%xmm5,%%xmm0 \n"
686 "movdqu %%xmm0," MEMACCESS(1) " \n"
687 "lea " MEMLEA(0x10,1) ",%1 \n"
690 : "+r"(src_argb), // %0
693 : "m"(kARGBToY), // %3
695 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
698 #endif // HAS_ARGBTOYROW_SSSE3
700 #ifdef HAS_ARGBTOYJROW_SSSE3
701 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
702 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// Rounding: paddw of kAddYJ64 (64 = 0.5 in 7-bit fixed point, xmm5) is done
// before the >>7, replacing the post-shift +16 of the non-J variant.
703 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
705 "movdqa %3,%%xmm4 \n"
706 "movdqa %4,%%xmm5 \n"
709 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
710 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
711 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
712 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
713 "pmaddubsw %%xmm4,%%xmm0 \n"
714 "pmaddubsw %%xmm4,%%xmm1 \n"
715 "pmaddubsw %%xmm4,%%xmm2 \n"
716 "pmaddubsw %%xmm4,%%xmm3 \n"
717 "lea " MEMLEA(0x40,0) ",%0 \n"
718 "phaddw %%xmm1,%%xmm0 \n"
719 "phaddw %%xmm3,%%xmm2 \n"
720 "paddw %%xmm5,%%xmm0 \n"
721 "paddw %%xmm5,%%xmm2 \n"
722 "psrlw $0x7,%%xmm0 \n"
723 "psrlw $0x7,%%xmm2 \n"
724 "packuswb %%xmm2,%%xmm0 \n"
725 "movdqu %%xmm0," MEMACCESS(1) " \n"
726 "lea " MEMLEA(0x10,1) ",%1 \n"
729 : "+r"(src_argb), // %0
732 : "m"(kARGBToYJ), // %3
734 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
737 #endif // HAS_ARGBTOYJROW_SSSE3
739 #ifdef HAS_ARGBTOYROW_AVX2
740 // vpermd for vphaddw + vpackuswb vpermd.
// 256-bit vphaddw/vpackuswb operate per 128-bit lane, so results come out
// interleaved; this permutation restores linear dword order afterwards.
741 static const lvec32 kPermdARGBToY_AVX = {
742 0, 4, 1, 5, 2, 6, 3, 7
745 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3: same coefficients (kARGBToY broadcast
// into ymm4) and +16 bias (kAddY16 in ymm5), twice the pixels per loop.
746 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
748 "vbroadcastf128 %3,%%ymm4 \n"
749 "vbroadcastf128 %4,%%ymm5 \n"
750 "vmovdqu %5,%%ymm6 \n"
753 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
754 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
755 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
756 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
757 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
758 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
759 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
760 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
761 "lea " MEMLEA(0x80,0) ",%0 \n"
762 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
763 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
764 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
765 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
766 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
767 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
768 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
769 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
770 "lea " MEMLEA(0x20,1) ",%1 \n"
774 : "+r"(src_argb), // %0
777 : "m"(kARGBToY), // %3
779 "m"(kPermdARGBToY_AVX) // %5
780 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
783 #endif // HAS_ARGBTOYROW_AVX2
785 #ifdef HAS_ARGBTOYJROW_AVX2
786 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYJRow_SSSE3: kARGBToYJ coefficients, kAddYJ64
// rounding before the >>7, no +16 offset; vpermd undoes lane interleave.
787 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
789 "vbroadcastf128 %3,%%ymm4 \n"
790 "vbroadcastf128 %4,%%ymm5 \n"
791 "vmovdqu %5,%%ymm6 \n"
794 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
795 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
796 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
797 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
798 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
799 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
800 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
801 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
802 "lea " MEMLEA(0x80,0) ",%0 \n"
803 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
804 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
805 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
806 "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
807 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
808 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
809 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
810 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
811 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
812 "lea " MEMLEA(0x20,1) ",%1 \n"
816 : "+r"(src_argb), // %0
819 : "m"(kARGBToYJ), // %3
821 "m"(kPermdARGBToY_AVX) // %5
822 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
825 #endif // HAS_ARGBTOYJROW_AVX2
827 #ifdef HAS_ARGBTOUVROW_SSSE3
// Compute one row of U and V from 16 ARGB pixels, 2x2 subsampled:
// each pair of source rows (row 0 and row 0 + src_stride_argb, %4) is
// averaged with pavgb, then horizontally-adjacent pixels are averaged via
// the shufps 0x88/0xdd split. pmaddubsw applies kARGBToV (xmm3) and
// kARGBToU (xmm4); psraw $8 scales, packsswb narrows, and kAddUV128 (xmm5)
// biases to unsigned. 8 U bytes go to %1 (dst_u), 8 V bytes to (%1,%2).
828 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
829 uint8* dst_u, uint8* dst_v, int width) {
831 "movdqa %5,%%xmm3 \n"
832 "movdqa %6,%%xmm4 \n"
833 "movdqa %7,%%xmm5 \n"
837 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
838 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
839 "pavgb %%xmm7,%%xmm0 \n"
840 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
841 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
842 "pavgb %%xmm7,%%xmm1 \n"
843 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
844 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
845 "pavgb %%xmm7,%%xmm2 \n"
846 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
847 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
848 "pavgb %%xmm7,%%xmm6 \n"
850 "lea " MEMLEA(0x40,0) ",%0 \n"
851 "movdqa %%xmm0,%%xmm7 \n"
852 "shufps $0x88,%%xmm1,%%xmm0 \n"
853 "shufps $0xdd,%%xmm1,%%xmm7 \n"
854 "pavgb %%xmm7,%%xmm0 \n"
855 "movdqa %%xmm2,%%xmm7 \n"
856 "shufps $0x88,%%xmm6,%%xmm2 \n"
857 "shufps $0xdd,%%xmm6,%%xmm7 \n"
858 "pavgb %%xmm7,%%xmm2 \n"
859 "movdqa %%xmm0,%%xmm1 \n"
860 "movdqa %%xmm2,%%xmm6 \n"
861 "pmaddubsw %%xmm4,%%xmm0 \n"
862 "pmaddubsw %%xmm4,%%xmm2 \n"
863 "pmaddubsw %%xmm3,%%xmm1 \n"
864 "pmaddubsw %%xmm3,%%xmm6 \n"
865 "phaddw %%xmm2,%%xmm0 \n"
866 "phaddw %%xmm6,%%xmm1 \n"
867 "psraw $0x8,%%xmm0 \n"
868 "psraw $0x8,%%xmm1 \n"
869 "packsswb %%xmm1,%%xmm0 \n"
870 "paddb %%xmm5,%%xmm0 \n"
871 "movlps %%xmm0," MEMACCESS(1) " \n"
872 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
873 "lea " MEMLEA(0x8,1) ",%1 \n"
876 : "+r"(src_argb0), // %0
880 : "r"((intptr_t)(src_stride_argb)), // %4
884 : "memory", "cc", NACL_R14
885 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
888 #endif // HAS_ARGBTOUVROW_SSSE3
890 #ifdef HAS_ARGBTOUVROW_AVX2
891 // vpshufb for vphaddw + vpackuswb packed to shorts.
// Per-lane word reorder that undoes the interleaving introduced by the
// 256-bit vphaddw/vpacksswb pair below.
892 static const lvec8 kShufARGBToUV_AVX = {
893 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
894 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
// AVX2 version of ARGBToUVRow_SSSE3: 32 pixels per iteration, same two-row
// pavgb + horizontal vshufps averaging, kAddUV128/kARGBToU/kARGBToV
// broadcast into ymm5/ymm6/ymm7. Writes 16 U bytes to %1 and 16 V bytes
// to (%1,%2) via the two vextractf128 stores.
896 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
897 uint8* dst_u, uint8* dst_v, int width) {
899 "vbroadcastf128 %5,%%ymm5 \n"
900 "vbroadcastf128 %6,%%ymm6 \n"
901 "vbroadcastf128 %7,%%ymm7 \n"
905 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
906 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
907 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
908 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
909 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
910 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
911 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
912 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
913 "lea " MEMLEA(0x80,0) ",%0 \n"
914 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
915 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
916 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
917 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
918 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
919 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
921 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
922 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
923 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
924 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
925 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
926 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
927 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
928 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
929 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
930 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
931 "vpshufb %8,%%ymm0,%%ymm0 \n"
932 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
934 "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
935 VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
936 "lea " MEMLEA(0x10,1) ",%1 \n"
940 : "+r"(src_argb0), // %0
944 : "r"((intptr_t)(src_stride_argb)), // %4
945 "m"(kAddUV128), // %5
948 "m"(kShufARGBToUV_AVX) // %8
949 : "memory", "cc", NACL_R14
950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
953 #endif // HAS_ARGBTOUVROW_AVX2
955 #ifdef HAS_ARGBTOUVJROW_SSSE3
956 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
// Full-range ("J") variant of ARGBToUVRow_SSSE3: identical structure, but
// uses kARGBToVJ/kARGBToUJ coefficients and adds the word-sized kAddUVJ128
// (0x8080 = bias + rounding) with paddw BEFORE the psraw, instead of a
// paddb bias after packing.
957 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
958 uint8* dst_u, uint8* dst_v, int width) {
960 "movdqa %5,%%xmm3 \n"
961 "movdqa %6,%%xmm4 \n"
962 "movdqa %7,%%xmm5 \n"
966 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
967 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
968 "pavgb %%xmm7,%%xmm0 \n"
969 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
970 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
971 "pavgb %%xmm7,%%xmm1 \n"
972 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
973 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
974 "pavgb %%xmm7,%%xmm2 \n"
975 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
976 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
977 "pavgb %%xmm7,%%xmm6 \n"
979 "lea " MEMLEA(0x40,0) ",%0 \n"
980 "movdqa %%xmm0,%%xmm7 \n"
981 "shufps $0x88,%%xmm1,%%xmm0 \n"
982 "shufps $0xdd,%%xmm1,%%xmm7 \n"
983 "pavgb %%xmm7,%%xmm0 \n"
984 "movdqa %%xmm2,%%xmm7 \n"
985 "shufps $0x88,%%xmm6,%%xmm2 \n"
986 "shufps $0xdd,%%xmm6,%%xmm7 \n"
987 "pavgb %%xmm7,%%xmm2 \n"
988 "movdqa %%xmm0,%%xmm1 \n"
989 "movdqa %%xmm2,%%xmm6 \n"
990 "pmaddubsw %%xmm4,%%xmm0 \n"
991 "pmaddubsw %%xmm4,%%xmm2 \n"
992 "pmaddubsw %%xmm3,%%xmm1 \n"
993 "pmaddubsw %%xmm3,%%xmm6 \n"
994 "phaddw %%xmm2,%%xmm0 \n"
995 "phaddw %%xmm6,%%xmm1 \n"
996 "paddw %%xmm5,%%xmm0 \n"
997 "paddw %%xmm5,%%xmm1 \n"
998 "psraw $0x8,%%xmm0 \n"
999 "psraw $0x8,%%xmm1 \n"
1000 "packsswb %%xmm1,%%xmm0 \n"
1001 "movlps %%xmm0," MEMACCESS(1) " \n"
1002 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1003 "lea " MEMLEA(0x8,1) ",%1 \n"
1006 : "+r"(src_argb0), // %0
1010 : "r"((intptr_t)(src_stride_argb)), // %4
1011 "m"(kARGBToVJ), // %5
1012 "m"(kARGBToUJ), // %6
1013 "m"(kAddUVJ128) // %7
1014 : "memory", "cc", NACL_R14
1015 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1018 #endif // HAS_ARGBTOUVJROW_SSSE3
1020 #ifdef HAS_ARGBTOUV444ROW_SSSE3
// 4:4:4 chroma: one U and one V per source pixel, no row or column
// averaging. The 64-byte input is read twice — first pass applies
// kARGBToV (xmm3) and stores 16 U... 
// NOTE(review): first pass stores to %1 (dst_u) using xmm3 which is loaded
// from %4 = kARGBToV, while the second pass uses xmm4 = kARGBToU for the
// (%1,%2) store — U/V operand order looks swapped relative to the SSSE3
// UV422 function; confirm against the upstream source.
1021 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1024 "movdqa %4,%%xmm3 \n"
1025 "movdqa %5,%%xmm4 \n"
1026 "movdqa %6,%%xmm5 \n"
1030 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1031 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1032 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1033 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1034 "pmaddubsw %%xmm4,%%xmm0 \n"
1035 "pmaddubsw %%xmm4,%%xmm1 \n"
1036 "pmaddubsw %%xmm4,%%xmm2 \n"
1037 "pmaddubsw %%xmm4,%%xmm6 \n"
1038 "phaddw %%xmm1,%%xmm0 \n"
1039 "phaddw %%xmm6,%%xmm2 \n"
1040 "psraw $0x8,%%xmm0 \n"
1041 "psraw $0x8,%%xmm2 \n"
1042 "packsswb %%xmm2,%%xmm0 \n"
1043 "paddb %%xmm5,%%xmm0 \n"
1044 "movdqu %%xmm0," MEMACCESS(1) " \n"
1045 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1046 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1047 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1048 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1049 "pmaddubsw %%xmm3,%%xmm0 \n"
1050 "pmaddubsw %%xmm3,%%xmm1 \n"
1051 "pmaddubsw %%xmm3,%%xmm2 \n"
1052 "pmaddubsw %%xmm3,%%xmm6 \n"
1053 "phaddw %%xmm1,%%xmm0 \n"
1054 "phaddw %%xmm6,%%xmm2 \n"
1055 "psraw $0x8,%%xmm0 \n"
1056 "psraw $0x8,%%xmm2 \n"
1057 "packsswb %%xmm2,%%xmm0 \n"
1058 "paddb %%xmm5,%%xmm0 \n"
1059 "lea " MEMLEA(0x40,0) ",%0 \n"
1060 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
1061 "lea " MEMLEA(0x10,1) ",%1 \n"
1064 : "+r"(src_argb), // %0
1068 : "m"(kARGBToV), // %4
1069 "m"(kARGBToU), // %5
1070 "m"(kAddUV128) // %6
1071 : "memory", "cc", NACL_R14
1072 "xmm0", "xmm1", "xmm2", "xmm6"
1075 #endif // HAS_ARGBTOUV444ROW_SSSE3
1077 #ifdef HAS_ARGBTOUV422ROW_SSSE3
// 4:2:2 chroma: like ARGBToUVRow_SSSE3 but single-row — no second-row
// pavgb (hence no stride parameter); only horizontally-adjacent pixels are
// averaged via the shufps 0x88/0xdd split before the coefficient multiply.
1078 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1079 uint8* dst_u, uint8* dst_v, int width) {
1081 "movdqa %4,%%xmm3 \n"
1082 "movdqa %5,%%xmm4 \n"
1083 "movdqa %6,%%xmm5 \n"
1087 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1088 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1089 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1090 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1091 "lea " MEMLEA(0x40,0) ",%0 \n"
1092 "movdqa %%xmm0,%%xmm7 \n"
1093 "shufps $0x88,%%xmm1,%%xmm0 \n"
1094 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1095 "pavgb %%xmm7,%%xmm0 \n"
1096 "movdqa %%xmm2,%%xmm7 \n"
1097 "shufps $0x88,%%xmm6,%%xmm2 \n"
1098 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1099 "pavgb %%xmm7,%%xmm2 \n"
1100 "movdqa %%xmm0,%%xmm1 \n"
1101 "movdqa %%xmm2,%%xmm6 \n"
1102 "pmaddubsw %%xmm4,%%xmm0 \n"
1103 "pmaddubsw %%xmm4,%%xmm2 \n"
1104 "pmaddubsw %%xmm3,%%xmm1 \n"
1105 "pmaddubsw %%xmm3,%%xmm6 \n"
1106 "phaddw %%xmm2,%%xmm0 \n"
1107 "phaddw %%xmm6,%%xmm1 \n"
1108 "psraw $0x8,%%xmm0 \n"
1109 "psraw $0x8,%%xmm1 \n"
1110 "packsswb %%xmm1,%%xmm0 \n"
1111 "paddb %%xmm5,%%xmm0 \n"
1112 "movlps %%xmm0," MEMACCESS(1) " \n"
1113 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1114 "lea " MEMLEA(0x8,1) ",%1 \n"
1117 : "+r"(src_argb0), // %0
1121 : "m"(kARGBToV), // %4
1122 "m"(kARGBToU), // %5
1123 "m"(kAddUV128) // %6
1124 : "memory", "cc", NACL_R14
1125 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1128 #endif // HAS_ARGBTOUV422ROW_SSSE3
// Convert 16 BGRA pixels (64 bytes) to 16 Y values; identical pipeline to
// ARGBToYRow_SSSE3 with the kBGRAToY coefficient table for the BGRA byte
// order. xmm5 = kAddY16 (+16 bias after the >>7).
1130 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
1132 "movdqa %4,%%xmm5 \n"
1133 "movdqa %3,%%xmm4 \n"
1136 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1137 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1138 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1139 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1140 "pmaddubsw %%xmm4,%%xmm0 \n"
1141 "pmaddubsw %%xmm4,%%xmm1 \n"
1142 "pmaddubsw %%xmm4,%%xmm2 \n"
1143 "pmaddubsw %%xmm4,%%xmm3 \n"
1144 "lea " MEMLEA(0x40,0) ",%0 \n"
1145 "phaddw %%xmm1,%%xmm0 \n"
1146 "phaddw %%xmm3,%%xmm2 \n"
1147 "psrlw $0x7,%%xmm0 \n"
1148 "psrlw $0x7,%%xmm2 \n"
1149 "packuswb %%xmm2,%%xmm0 \n"
1150 "paddb %%xmm5,%%xmm0 \n"
1151 "movdqu %%xmm0," MEMACCESS(1) " \n"
1152 "lea " MEMLEA(0x10,1) ",%1 \n"
1155 : "+r"(src_bgra), // %0
1158 : "m"(kBGRAToY), // %3
1160 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// BGRA variant of ARGBToUVRow_SSSE3: same two-row pavgb + horizontal
// averaging pipeline, with kBGRAToV (xmm3) / kBGRAToU (xmm4) coefficient
// tables and the kAddUV128 bias (xmm5). %4 is the row stride in bytes;
// U goes to %1 (dst_u) and V to (%1,%2).
1164 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1165 uint8* dst_u, uint8* dst_v, int width) {
1167 "movdqa %5,%%xmm3 \n"
1168 "movdqa %6,%%xmm4 \n"
1169 "movdqa %7,%%xmm5 \n"
1173 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1174 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1175 "pavgb %%xmm7,%%xmm0 \n"
1176 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1177 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1178 "pavgb %%xmm7,%%xmm1 \n"
1179 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1180 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1181 "pavgb %%xmm7,%%xmm2 \n"
1182 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1183 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1184 "pavgb %%xmm7,%%xmm6 \n"
1186 "lea " MEMLEA(0x40,0) ",%0 \n"
1187 "movdqa %%xmm0,%%xmm7 \n"
1188 "shufps $0x88,%%xmm1,%%xmm0 \n"
1189 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1190 "pavgb %%xmm7,%%xmm0 \n"
1191 "movdqa %%xmm2,%%xmm7 \n"
1192 "shufps $0x88,%%xmm6,%%xmm2 \n"
1193 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1194 "pavgb %%xmm7,%%xmm2 \n"
1195 "movdqa %%xmm0,%%xmm1 \n"
1196 "movdqa %%xmm2,%%xmm6 \n"
1197 "pmaddubsw %%xmm4,%%xmm0 \n"
1198 "pmaddubsw %%xmm4,%%xmm2 \n"
1199 "pmaddubsw %%xmm3,%%xmm1 \n"
1200 "pmaddubsw %%xmm3,%%xmm6 \n"
1201 "phaddw %%xmm2,%%xmm0 \n"
1202 "phaddw %%xmm6,%%xmm1 \n"
1203 "psraw $0x8,%%xmm0 \n"
1204 "psraw $0x8,%%xmm1 \n"
1205 "packsswb %%xmm1,%%xmm0 \n"
1206 "paddb %%xmm5,%%xmm0 \n"
1207 "movlps %%xmm0," MEMACCESS(1) " \n"
1208 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1209 "lea " MEMLEA(0x8,1) ",%1 \n"
1212 : "+r"(src_bgra0), // %0
1216 : "r"((intptr_t)(src_stride_bgra)), // %4
1217 "m"(kBGRAToV), // %5
1218 "m"(kBGRAToU), // %6
1219 "m"(kAddUV128) // %7
1220 : "memory", "cc", NACL_R14
1221 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
// Converts 16 ABGR pixels per iteration to 16 Y (luma) bytes using the
// kABGRToY coefficients (%3) and the +16 luma bias (%4).
1225 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1227     "movdqa    %4,%%xmm5                       \n"
1228     "movdqa    %3,%%xmm4                       \n"
1231     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1232     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1233     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1234     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
// Per-pixel dot product with coefficients, then pairwise sum and scale by 7 bits.
1235     "pmaddubsw %%xmm4,%%xmm0                   \n"
1236     "pmaddubsw %%xmm4,%%xmm1                   \n"
1237     "pmaddubsw %%xmm4,%%xmm2                   \n"
1238     "pmaddubsw %%xmm4,%%xmm3                   \n"
1239     "lea       " MEMLEA(0x40,0) ",%0           \n"
1240     "phaddw    %%xmm1,%%xmm0                   \n"
1241     "phaddw    %%xmm3,%%xmm2                   \n"
1242     "psrlw     $0x7,%%xmm0                     \n"
1243     "psrlw     $0x7,%%xmm2                     \n"
1244     "packuswb  %%xmm2,%%xmm0                   \n"
1245     "paddb     %%xmm5,%%xmm0                   \n"
1246     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1247     "lea       " MEMLEA(0x10,1) ",%1           \n"
1250   : "+r"(src_abgr),  // %0
1253   : "m"(kABGRToY),   // %3
1255   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Converts 16 RGBA pixels per iteration to 16 Y (luma) bytes; identical
// structure to ABGRToYRow_SSSE3 but with kRGBAToY coefficients (%3).
1259 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1261     "movdqa    %4,%%xmm5                       \n"
1262     "movdqa    %3,%%xmm4                       \n"
1265     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1266     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1267     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1268     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1269     "pmaddubsw %%xmm4,%%xmm0                   \n"
1270     "pmaddubsw %%xmm4,%%xmm1                   \n"
1271     "pmaddubsw %%xmm4,%%xmm2                   \n"
1272     "pmaddubsw %%xmm4,%%xmm3                   \n"
1273     "lea       " MEMLEA(0x40,0) ",%0           \n"
1274     "phaddw    %%xmm1,%%xmm0                   \n"
1275     "phaddw    %%xmm3,%%xmm2                   \n"
1276     "psrlw     $0x7,%%xmm0                     \n"
1277     "psrlw     $0x7,%%xmm2                     \n"
1278     "packuswb  %%xmm2,%%xmm0                   \n"
1279     "paddb     %%xmm5,%%xmm0                   \n"
1280     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1281     "lea       " MEMLEA(0x10,1) ",%1           \n"
1284   : "+r"(src_rgba),  // %0
1287   : "m"(kRGBAToY),   // %3
1289   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// Converts 32 ABGR pixels (two rows of 16) to 8 U and 8 V bytes (2x2 subsample).
// Same structure as BGRAToUVRow_SSSE3 with kABGRToV/kABGRToU coefficients.
1293 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1294                        uint8* dst_u, uint8* dst_v, int width) {
1296     "movdqa    %5,%%xmm3                       \n"
1297     "movdqa    %6,%%xmm4                       \n"
1298     "movdqa    %7,%%xmm5                       \n"
// Vertically average two rows of 16 pixels.
1302     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1303     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1304     "pavgb     %%xmm7,%%xmm0                   \n"
1305     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1306     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1307     "pavgb     %%xmm7,%%xmm1                   \n"
1308     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1309     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1310     "pavgb     %%xmm7,%%xmm2                   \n"
1311     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1312     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1313     "pavgb     %%xmm7,%%xmm6                   \n"
1315     "lea       " MEMLEA(0x40,0) ",%0           \n"
// Horizontally average adjacent pixel pairs.
1316     "movdqa    %%xmm0,%%xmm7                   \n"
1317     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1318     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1319     "pavgb     %%xmm7,%%xmm0                   \n"
1320     "movdqa    %%xmm2,%%xmm7                   \n"
1321     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1322     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1323     "pavgb     %%xmm7,%%xmm2                   \n"
// U (xmm4) and V (xmm3) dot products, scale, bias, then split to dst_u/dst_v.
1324     "movdqa    %%xmm0,%%xmm1                   \n"
1325     "movdqa    %%xmm2,%%xmm6                   \n"
1326     "pmaddubsw %%xmm4,%%xmm0                   \n"
1327     "pmaddubsw %%xmm4,%%xmm2                   \n"
1328     "pmaddubsw %%xmm3,%%xmm1                   \n"
1329     "pmaddubsw %%xmm3,%%xmm6                   \n"
1330     "phaddw    %%xmm2,%%xmm0                   \n"
1331     "phaddw    %%xmm6,%%xmm1                   \n"
1332     "psraw     $0x8,%%xmm0                     \n"
1333     "psraw     $0x8,%%xmm1                     \n"
1334     "packsswb  %%xmm1,%%xmm0                   \n"
1335     "paddb     %%xmm5,%%xmm0                   \n"
1336     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1337     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
1338     "lea       " MEMLEA(0x8,1) ",%1            \n"
1341   : "+r"(src_abgr0),       // %0
1345   : "r"((intptr_t)(src_stride_abgr)), // %4
1346     "m"(kABGRToV),  // %5
1347     "m"(kABGRToU),  // %6
1348     "m"(kAddUV128)  // %7
1349   : "memory", "cc", NACL_R14
1350     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
// Converts 32 RGBA pixels (two rows of 16) to 8 U and 8 V bytes (2x2 subsample).
// Same structure as BGRAToUVRow_SSSE3 with kRGBAToV/kRGBAToU coefficients.
1354 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1355                        uint8* dst_u, uint8* dst_v, int width) {
1357     "movdqa    %5,%%xmm3                       \n"
1358     "movdqa    %6,%%xmm4                       \n"
1359     "movdqa    %7,%%xmm5                       \n"
// Vertically average two rows of 16 pixels.
1363     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1364     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1365     "pavgb     %%xmm7,%%xmm0                   \n"
1366     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1367     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1368     "pavgb     %%xmm7,%%xmm1                   \n"
1369     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1370     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1371     "pavgb     %%xmm7,%%xmm2                   \n"
1372     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1373     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1374     "pavgb     %%xmm7,%%xmm6                   \n"
1376     "lea       " MEMLEA(0x40,0) ",%0           \n"
// Horizontally average adjacent pixel pairs.
1377     "movdqa    %%xmm0,%%xmm7                   \n"
1378     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1379     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1380     "pavgb     %%xmm7,%%xmm0                   \n"
1381     "movdqa    %%xmm2,%%xmm7                   \n"
1382     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1383     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1384     "pavgb     %%xmm7,%%xmm2                   \n"
// U (xmm4) and V (xmm3) dot products, scale, bias, then split to dst_u/dst_v.
1385     "movdqa    %%xmm0,%%xmm1                   \n"
1386     "movdqa    %%xmm2,%%xmm6                   \n"
1387     "pmaddubsw %%xmm4,%%xmm0                   \n"
1388     "pmaddubsw %%xmm4,%%xmm2                   \n"
1389     "pmaddubsw %%xmm3,%%xmm1                   \n"
1390     "pmaddubsw %%xmm3,%%xmm6                   \n"
1391     "phaddw    %%xmm2,%%xmm0                   \n"
1392     "phaddw    %%xmm6,%%xmm1                   \n"
1393     "psraw     $0x8,%%xmm0                     \n"
1394     "psraw     $0x8,%%xmm1                     \n"
1395     "packsswb  %%xmm1,%%xmm0                   \n"
1396     "paddb     %%xmm5,%%xmm0                   \n"
1397     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1398     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
1399     "lea       " MEMLEA(0x8,1) ",%1            \n"
1402   : "+r"(src_rgba0),       // %0
1406   : "r"((intptr_t)(src_stride_rgba)), // %4
1407     "m"(kRGBAToV),  // %5
1408     "m"(kRGBAToU),  // %6
1409     "m"(kAddUV128)  // %7
1410   : "memory", "cc", NACL_R14
1411     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1415 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1417 // YUV to RGB conversion constants.
1418 // Y contribution to R,G,B. Scale and bias.
1419 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
1420 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
1422 // U and V contributions to R,G,B.
1423 #define UB -128 /* -min(128, round(2.018 * 64)) */
1424 #define UG 25 /* -round(-0.391 * 64) */
1425 #define VG 52 /* -round(-0.813 * 64) */
1426 #define VR -102 /* -round(1.596 * 64) */
1428 // Bias values to subtract 16 from Y and 128 from U and V.
1429 #define BB (UB * 128 - YGB)
1430 #define BG (UG * 128 + VG * 128 - YGB)
1431 #define BR (VR * 128 - YGB)
1433 struct YuvConstants {
1437 lvec16 kUVBiasB; // 96
1438 lvec16 kUVBiasG; // 128
1439 lvec16 kUVBiasR; // 160
1440 lvec16 kYToRgb; // 192
1443 // BT601 constants for YUV to RGB.
1444 static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
1445 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
1446 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
1447 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
1448 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1449 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
1450 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
1451 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1452 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1453 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1454 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1457 // BT601 constants for NV21 where chroma plane is VU instead of UV.
1458 static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
1459 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
1460 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
1461 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
1462 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1463 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
1464 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
1465 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1466 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1467 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1468 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1471 // Read 8 UV from 444
1472 #define READYUV444 \
1473 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1474 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1475 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1476 "punpcklbw %%xmm1,%%xmm0 \n"
1478 // Read 4 UV from 422, upsample to 8 UV
1479 #define READYUV422 \
1480 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1481 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1482 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
1483 "punpcklbw %%xmm1,%%xmm0 \n" \
1484 "punpcklwd %%xmm0,%%xmm0 \n"
1486 // Read 2 UV from 411, upsample to 8 UV
1487 #define READYUV411 \
1488 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1489 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1490 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
1491 "punpcklbw %%xmm1,%%xmm0 \n" \
1492 "punpcklwd %%xmm0,%%xmm0 \n" \
1493 "punpckldq %%xmm0,%%xmm0 \n"
1495 // Read 4 UV from NV12, upsample to 8 UV
1497 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
1498 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
1499 "punpcklwd %%xmm0,%%xmm0 \n"
1501 // Convert 8 pixels: 8 UV and 8 Y
1502 #define YUVTORGB(YuvConstants) \
1503 "movdqa %%xmm0,%%xmm1 \n" \
1504 "movdqa %%xmm0,%%xmm2 \n" \
1505 "movdqa %%xmm0,%%xmm3 \n" \
1506 "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
1507 "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
1508 "psubw %%xmm1,%%xmm0 \n" \
1509 "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
1510 "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
1511 "psubw %%xmm2,%%xmm1 \n" \
1512 "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
1513 "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
1514 "psubw %%xmm3,%%xmm2 \n" \
1515 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1516 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
1517 "punpcklbw %%xmm3,%%xmm3 \n" \
1518 "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
1519 "paddsw %%xmm3,%%xmm0 \n" \
1520 "paddsw %%xmm3,%%xmm1 \n" \
1521 "paddsw %%xmm3,%%xmm2 \n" \
1522 "psraw $0x6,%%xmm0 \n" \
1523 "psraw $0x6,%%xmm1 \n" \
1524 "psraw $0x6,%%xmm2 \n" \
1525 "packuswb %%xmm0,%%xmm0 \n" \
1526 "packuswb %%xmm1,%%xmm1 \n" \
1527 "packuswb %%xmm2,%%xmm2 \n"
1529 // Store 8 ARGB values. Assumes XMM5 is zero.
1531 "punpcklbw %%xmm1,%%xmm0 \n" \
1532 "punpcklbw %%xmm5,%%xmm2 \n" \
1533 "movdqa %%xmm0,%%xmm1 \n" \
1534 "punpcklwd %%xmm2,%%xmm0 \n" \
1535 "punpckhwd %%xmm2,%%xmm1 \n" \
1536 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1537 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \
1538 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
1540 // Store 8 BGRA values. Assumes XMM5 is zero.
1542 "pcmpeqb %%xmm5,%%xmm5 \n" \
1543 "punpcklbw %%xmm0,%%xmm1 \n" \
1544 "punpcklbw %%xmm2,%%xmm5 \n" \
1545 "movdqa %%xmm5,%%xmm0 \n" \
1546 "punpcklwd %%xmm1,%%xmm5 \n" \
1547 "punpckhwd %%xmm1,%%xmm0 \n" \
1548 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
1549 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
1550 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
1552 // Store 8 ABGR values. Assumes XMM5 is zero.
1554 "punpcklbw %%xmm1,%%xmm2 \n" \
1555 "punpcklbw %%xmm5,%%xmm0 \n" \
1556 "movdqa %%xmm2,%%xmm1 \n" \
1557 "punpcklwd %%xmm0,%%xmm2 \n" \
1558 "punpckhwd %%xmm0,%%xmm1 \n" \
1559 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
1560 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
1561 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
1563 // Store 8 RGBA values. Assumes XMM5 is zero.
1565 "pcmpeqb %%xmm5,%%xmm5 \n" \
1566 "punpcklbw %%xmm2,%%xmm1 \n" \
1567 "punpcklbw %%xmm0,%%xmm5 \n" \
1568 "movdqa %%xmm5,%%xmm0 \n" \
1569 "punpcklwd %%xmm1,%%xmm5 \n" \
1570 "punpckhwd %%xmm1,%%xmm0 \n" \
1571 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
1572 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \
1573 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
// Converts 8 pixels of I444 (one UV per pixel) to ARGB per iteration.
// v_buf is rebased relative to u_buf so one register indexes both planes.
1575 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1581     "sub       %[u_buf],%[v_buf]               \n"
1582     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1586     YUVTORGB(kYuvConstants)
1588     "sub       $0x8,%[width]                   \n"
1590   : [y_buf]"+r"(y_buf),    // %[y_buf]
1591     [u_buf]"+r"(u_buf),    // %[u_buf]
1592     [v_buf]"+r"(v_buf),    // %[v_buf]
1593     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1594     [width]"+rm"(width)    // %[width]
1595   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1596   : "memory", "cc", NACL_R14
1597     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1601 // TODO(fbarchard): Consider putting masks into constants.
// Converts 8 pixels of I422 to packed RGB24 (24 bytes) per iteration.
// Builds ARGB in registers, then pshufb-drops the alpha bytes before storing.
1602 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1608     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1609     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
1610     "sub       %[u_buf],%[v_buf]               \n"
1614     YUVTORGB(kYuvConstants)
1615     "punpcklbw %%xmm1,%%xmm0                   \n"
1616     "punpcklbw %%xmm2,%%xmm2                   \n"
1617     "movdqa    %%xmm0,%%xmm1                   \n"
1618     "punpcklwd %%xmm2,%%xmm0                   \n"
1619     "punpckhwd %%xmm2,%%xmm1                   \n"
// Shuffle ARGB to RGB24 and splice the two halves with palignr.
1620     "pshufb    %%xmm5,%%xmm0                   \n"
1621     "pshufb    %%xmm6,%%xmm1                   \n"
1622     "palignr   $0xc,%%xmm0,%%xmm1              \n"
1623     "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1624     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1625     "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1626     "subl      $0x8,%[width]                   \n"
1628   : [y_buf]"+r"(y_buf),    // %[y_buf]
1629     [u_buf]"+r"(u_buf),    // %[u_buf]
1630     [v_buf]"+r"(v_buf),    // %[v_buf]
1631     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
1632 // TODO(fbarchard): Make width a register for 32 bit.
1633 #if defined(__i386__) && defined(__pic__)
1634     [width]"+m"(width)     // %[width]
1636     [width]"+rm"(width)    // %[width]
1638   : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1639     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1640     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1641   : "memory", "cc", NACL_R14
1642     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
// Converts 8 pixels of I422 to packed RAW (BGR order, 24 bytes) per iteration.
// Identical structure to I422ToRGB24Row_SSSE3 with the RAW shuffle masks.
1646 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1652     "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1653     "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
1654     "sub       %[u_buf],%[v_buf]               \n"
1658     YUVTORGB(kYuvConstants)
1659     "punpcklbw %%xmm1,%%xmm0                   \n"
1660     "punpcklbw %%xmm2,%%xmm2                   \n"
1661     "movdqa    %%xmm0,%%xmm1                   \n"
1662     "punpcklwd %%xmm2,%%xmm0                   \n"
1663     "punpckhwd %%xmm2,%%xmm1                   \n"
1664     "pshufb    %%xmm5,%%xmm0                   \n"
1665     "pshufb    %%xmm6,%%xmm1                   \n"
1666     "palignr   $0xc,%%xmm0,%%xmm1              \n"
1667     "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
1668     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
1669     "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
1670     "subl      $0x8,%[width]                   \n"
1672   : [y_buf]"+r"(y_buf),    // %[y_buf]
1673     [u_buf]"+r"(u_buf),    // %[u_buf]
1674     [v_buf]"+r"(v_buf),    // %[v_buf]
1675     [dst_raw]"+r"(dst_raw),  // %[dst_raw]
1676 // TODO(fbarchard): Make width a register for 32 bit.
1677 #if defined(__i386__) && defined(__pic__)
1678     [width]"+m"(width)     // %[width]
1680     [width]"+rm"(width)    // %[width]
1682   : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1683     [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1684     [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
1685   : "memory", "cc", NACL_R14
1686     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
// Converts 8 pixels of I422 (one UV per 2 pixels) to ARGB per iteration.
// xmm5 is set to all-ones to provide the 0xff alpha channel.
1690 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1696     "sub       %[u_buf],%[v_buf]               \n"
1697     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1701     YUVTORGB(kYuvConstants)
1703     "sub       $0x8,%[width]                   \n"
1705   : [y_buf]"+r"(y_buf),    // %[y_buf]
1706     [u_buf]"+r"(u_buf),    // %[u_buf]
1707     [v_buf]"+r"(v_buf),    // %[v_buf]
1708     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1709     [width]"+rm"(width)    // %[width]
1710   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1711   : "memory", "cc", NACL_R14
1712     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Converts 8 pixels of I411 (one UV per 4 pixels) to ARGB per iteration.
1716 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1722     "sub       %[u_buf],%[v_buf]               \n"
1723     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1727     YUVTORGB(kYuvConstants)
1729     "sub       $0x8,%[width]                   \n"
1731   : [y_buf]"+r"(y_buf),    // %[y_buf]
1732     [u_buf]"+r"(u_buf),    // %[u_buf]
1733     [v_buf]"+r"(v_buf),    // %[v_buf]
1734     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1735     [width]"+rm"(width)    // %[width]
1736   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1737   : "memory", "cc", NACL_R14
1738     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Converts 8 pixels of NV12 (interleaved UV plane) to ARGB per iteration.
1742 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1743                                 const uint8* uv_buf,
1747     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1751     YUVTORGB(kYuvConstants)
1753     "sub       $0x8,%[width]                   \n"
1755   : [y_buf]"+r"(y_buf),    // %[y_buf]
1756     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1757     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1758     [width]"+rm"(width)    // %[width]
1759   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1760   // Does not use r14.
1761   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Converts 8 pixels of NV21 (interleaved VU plane) to ARGB per iteration.
// Uses kYvuConstants, whose coefficients are pre-swapped for VU byte order.
1765 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1766                                 const uint8* uv_buf,
1770     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1774     YUVTORGB(kYuvConstants)
1776     "sub       $0x8,%[width]                   \n"
1778   : [y_buf]"+r"(y_buf),    // %[y_buf]
1779     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1780     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1781     [width]"+rm"(width)    // %[width]
1782   : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
1783   // Does not use r14.
1784   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Converts 8 pixels of I422 to BGRA per iteration.
1788 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1794     "sub       %[u_buf],%[v_buf]               \n"
1795     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1799     YUVTORGB(kYuvConstants)
1801     "sub       $0x8,%[width]                   \n"
1803   : [y_buf]"+r"(y_buf),    // %[y_buf]
1804     [u_buf]"+r"(u_buf),    // %[u_buf]
1805     [v_buf]"+r"(v_buf),    // %[v_buf]
1806     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
1807     [width]"+rm"(width)    // %[width]
1808   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1809   : "memory", "cc", NACL_R14
1810     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Converts 8 pixels of I422 to ABGR per iteration.
1814 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
1820     "sub       %[u_buf],%[v_buf]               \n"
1821     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1825     YUVTORGB(kYuvConstants)
1827     "sub       $0x8,%[width]                   \n"
1829   : [y_buf]"+r"(y_buf),    // %[y_buf]
1830     [u_buf]"+r"(u_buf),    // %[u_buf]
1831     [v_buf]"+r"(v_buf),    // %[v_buf]
1832     [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
1833     [width]"+rm"(width)    // %[width]
1834   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1835   : "memory", "cc", NACL_R14
1836     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Converts 8 pixels of I422 to RGBA per iteration.
1840 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1846     "sub       %[u_buf],%[v_buf]               \n"
1847     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1851     YUVTORGB(kYuvConstants)
1853     "sub       $0x8,%[width]                   \n"
1855   : [y_buf]"+r"(y_buf),    // %[y_buf]
1856     [u_buf]"+r"(u_buf),    // %[u_buf]
1857     [v_buf]"+r"(v_buf),    // %[v_buf]
1858     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
1859     [width]"+rm"(width)    // %[width]
1860   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1861   : "memory", "cc", NACL_R14
1862     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1866 #endif // HAS_I422TOARGBROW_SSSE3
1868 // Read 8 UV from 422, upsample to 16 UV.
1869 #define READYUV422_AVX2 \
1870 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1871 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1872 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1873 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1874 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1875 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1877 // Convert 16 pixels: 16 UV and 16 Y.
1878 #define YUVTORGB_AVX2(YuvConstants) \
1879 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
1880 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
1881 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
1882 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
1883 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1884 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \
1885 "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \
1886 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \
1887 "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \
1888 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1889 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
1890 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
1891 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
1892 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
1893 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
1894 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
1895 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
1896 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1897 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1898 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1899 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1900 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1901 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
1903 #if defined(HAS_I422TOBGRAROW_AVX2)
1905 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
// Converts 16 pixels of I422 to BGRA per iteration using AVX2 (64 bytes out).
1906 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
1912     "sub       %[u_buf],%[v_buf]               \n"
1913     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
1917     YUVTORGB_AVX2(kYuvConstants)
1919     // Step 3: Weave into BGRA
1920     "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB
1921     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
1922     "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR
1923     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
1924     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels
1925     "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels
1927     "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"
1928     "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
1929     "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
1930     "sub       $0x10,%[width]                  \n"
1933   : [y_buf]"+r"(y_buf),    // %[y_buf]
1934     [u_buf]"+r"(u_buf),    // %[u_buf]
1935     [v_buf]"+r"(v_buf),    // %[v_buf]
1936     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
1937     [width]"+rm"(width)    // %[width]
1938   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1939   : "memory", "cc", NACL_R14
1940     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1943 #endif // HAS_I422TOBGRAROW_AVX2
1945 #if defined(HAS_I422TOARGBROW_AVX2)
1947 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Converts 16 pixels of I422 to ARGB per iteration using AVX2 (64 bytes out).
1948 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
1954     "sub       %[u_buf],%[v_buf]               \n"
1955     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
1959     YUVTORGB_AVX2(kYuvConstants)
1961     // Step 3: Weave into ARGB
1962     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
1963     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
1964     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
1965     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
1966     "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
1967     "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
1969     "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
1970     "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
1971     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
1972     "sub       $0x10,%[width]                  \n"
1975   : [y_buf]"+r"(y_buf),    // %[y_buf]
1976     [u_buf]"+r"(u_buf),    // %[u_buf]
1977     [v_buf]"+r"(v_buf),    // %[v_buf]
1978     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1979     [width]"+rm"(width)    // %[width]
1980   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1981   : "memory", "cc", NACL_R14
1982     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1985 #endif // HAS_I422TOARGBROW_AVX2
1987 #if defined(HAS_I422TOABGRROW_AVX2)
1989 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
// Converts 16 pixels of I422 to ABGR per iteration using AVX2 (64 bytes out).
1990 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
1996     "sub       %[u_buf],%[v_buf]               \n"
1997     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2001     YUVTORGB_AVX2(kYuvConstants)
2003     // Step 3: Weave into ABGR
2004     "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG
2005     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2006     "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA
2007     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2008     "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels
2009     "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels
2010     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2011     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2012     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2013     "sub       $0x10,%[width]                  \n"
2016   : [y_buf]"+r"(y_buf),    // %[y_buf]
2017     [u_buf]"+r"(u_buf),    // %[u_buf]
2018     [v_buf]"+r"(v_buf),    // %[v_buf]
2019     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2020     [width]"+rm"(width)    // %[width]
2021   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2022   : "memory", "cc", NACL_R14
2023     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2026 #endif // HAS_I422TOABGRROW_AVX2
2028 #if defined(HAS_I422TORGBAROW_AVX2)
2030 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Converts 16 pixels of I422 to RGBA per iteration using AVX2 (64 bytes out).
2031 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2037     "sub       %[u_buf],%[v_buf]               \n"
2038     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2042     YUVTORGB_AVX2(kYuvConstants)
2044     // Step 3: Weave into RGBA
2045     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
2046     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2047     "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
2048     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2049     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
2050     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
2051     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2052     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2053     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2054     "sub       $0x10,%[width]                  \n"
2057   : [y_buf]"+r"(y_buf),    // %[y_buf]
2058     [u_buf]"+r"(u_buf),    // %[u_buf]
2059     [v_buf]"+r"(v_buf),    // %[v_buf]
2060     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2061     [width]"+rm"(width)    // %[width]
2062   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2063   : "memory", "cc", NACL_R14
2064     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2067 #endif // HAS_I422TORGBAROW_AVX2
2069 #ifdef HAS_YTOARGBROW_SSE2
// Converts 8 Y values to 8 grey ARGB pixels per iteration (alpha = 0xff).
// xmm2 holds the Y scale, xmm3 the Y bias, xmm4 the alpha mask (0xff000000).
2070 void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2072     "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
2073     "movd      %%eax,%%xmm2                    \n"
2074     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
2075     "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
2076     "movd      %%eax,%%xmm3                    \n"
2077     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
2078     "pcmpeqb   %%xmm4,%%xmm4                   \n"
2079     "pslld     $0x18,%%xmm4                    \n"
2082     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2083     "movq      " MEMACCESS(0) ",%%xmm0         \n"
2084     "lea       " MEMLEA(0x8,0) ",%0            \n"
2085     "punpcklbw %%xmm0,%%xmm0                   \n"
2086     "pmulhuw   %%xmm2,%%xmm0                   \n"
2087     "psubusw   %%xmm3,%%xmm0                   \n"
2088     "psrlw     $6, %%xmm0                      \n"
2089     "packuswb  %%xmm0,%%xmm0                   \n"
2091     // Step 2: Weave into ARGB
2092     "punpcklbw %%xmm0,%%xmm0                   \n"
2093     "movdqa    %%xmm0,%%xmm1                   \n"
2094     "punpcklwd %%xmm0,%%xmm0                   \n"
2095     "punpckhwd %%xmm1,%%xmm1                   \n"
// OR in the opaque alpha byte before storing 32 bytes of ARGB.
2096     "por       %%xmm4,%%xmm0                   \n"
2097     "por       %%xmm4,%%xmm1                   \n"
2098     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2099     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2100     "lea       " MEMLEA(0x20,1) ",%1           \n"
2104   : "+r"(y_buf),     // %0
2105     "+r"(dst_argb),  // %1
2108   : "memory", "cc", "eax"
2109     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2112 #endif // HAS_YTOARGBROW_SSE2
2114 #ifdef HAS_YTOARGBROW_AVX2
2115 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2116 // note: vpunpcklbw mutates and vpackuswb unmutates.
// Converts 16 Y values to 16 grey ARGB pixels per iteration (alpha = 0xff).
// NOTE(review): the two immediate-value comments below were swapped in the
// original; corrected here. ymm2 = Y scale (18997), ymm3 = Y bias (1160).
2117 void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2119     "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
2120     "vmovd      %%eax,%%xmm2                   \n"
2121     "vbroadcastss %%xmm2,%%ymm2                \n"
2122     "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
2123     "vmovd      %%eax,%%xmm3                   \n"
2124     "vbroadcastss %%xmm3,%%ymm3                \n"
2125     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
2126     "vpslld     $0x18,%%ymm4,%%ymm4            \n"
2130     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2131     "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
2132     "lea        " MEMLEA(0x10,0) ",%0          \n"
2133     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2134     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
2135     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
2136     "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
2137     "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
2138     "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
// Weave grey bytes into ARGB and OR in the opaque alpha.
2139     "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
2140     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2141     "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
2142     "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
2143     "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
2144     "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
2145     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2146     "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
2147     "lea       " MEMLEA(0x40,1) ",%1           \n"
2151   : "+r"(y_buf),     // %0
2152     "+r"(dst_argb),  // %1
2155   : "memory", "cc", "eax"
2156     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2159 #endif // HAS_YTOARGBROW_AVX2
2161 #ifdef HAS_MIRRORROW_SSSE3
2162 // Shuffle table for reversing the bytes.
2163 static uvec8 kShuffleMirror = {
2164 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
// Reverses a row of bytes, 16 at a time, reading the source back-to-front
// via (src + width - 16) and byte-reversing each block with pshufb.
2167 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2168   intptr_t temp_width = (intptr_t)(width);
2170     "movdqa    %3,%%xmm5                       \n"
2173     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2174     "pshufb    %%xmm5,%%xmm0                   \n"
2175     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2176     "lea       " MEMLEA(0x10,1) ",%1           \n"
2181     "+r"(temp_width)  // %2
2182   : "m"(kShuffleMirror) // %3
2183   : "memory", "cc", NACL_R14
2187 #endif // HAS_MIRRORROW_SSSE3
2189 #ifdef HAS_MIRRORROW_AVX2
// Reverses a row of bytes, 32 at a time; vpshufb reverses within each lane
// and vpermq $0x4e swaps the two 128-bit lanes.
2190 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2191   intptr_t temp_width = (intptr_t)(width);
2193     "vbroadcastf128 %3,%%ymm5                  \n"
2196     MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
2197     "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
2198     "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
2199     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2200     "lea       " MEMLEA(0x20,1) ",%1           \n"
2206     "+r"(temp_width)  // %2
2207   : "m"(kShuffleMirror) // %3
2208   : "memory", "cc", NACL_R14
2212 #endif // HAS_MIRRORROW_AVX2
2214 #ifdef HAS_MIRRORROW_SSE2
// Reverses a row of bytes, 16 at a time, without SSSE3: swaps bytes within
// words (shift+or), then reverses word order with pshuflw/pshufhw/pshufd.
2215 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2216   intptr_t temp_width = (intptr_t)(width);
2220     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2221     "movdqa    %%xmm0,%%xmm1                   \n"
2222     "psllw     $0x8,%%xmm0                     \n"
2223     "psrlw     $0x8,%%xmm1                     \n"
2224     "por       %%xmm1,%%xmm0                   \n"
2225     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
2226     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
2227     "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
2228     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2229     "lea       " MEMLEA(0x10,1)",%1            \n"
2234     "+r"(temp_width)  // %2
2236   : "memory", "cc", NACL_R14
2240 #endif // HAS_MIRRORROW_SSE2
2242 #ifdef HAS_MIRRORROW_UV_SSSE3
2243 // Shuffle table for reversing the bytes of UV channels.
2244 static uvec8 kShuffleMirrorUV = {
2245 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
// Reverses an interleaved UV row while splitting it: 8 U bytes to dst_u,
// 8 V bytes to dst_v per iteration, walking the source backwards.
2247 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2249   intptr_t temp_width = (intptr_t)(width);
2251     "movdqa    %4,%%xmm1                       \n"
2252     "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
2256     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2257     "lea       " MEMLEA(-0x10,0) ",%0          \n"
// kShuffleMirrorUV both reverses and de-interleaves: U bytes to the low
// half, V bytes to the high half.
2258     "pshufb    %%xmm1,%%xmm0                   \n"
2259     "movlpd    %%xmm0," MEMACCESS(1) "         \n"
2260     MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
2261     "lea       " MEMLEA(0x8,1) ",%1            \n"
2267     "+r"(temp_width)  // %3
2268   : "m"(kShuffleMirrorUV)  // %4
2269   : "memory", "cc", NACL_R14
2273 #endif // HAS_MIRRORROW_UV_SSSE3
2275 #ifdef HAS_ARGBMIRRORROW_SSE2
// Reverses a row of ARGB pixels, 4 at a time; pshufd $0x1b reverses the
// four 32-bit pixels within the register.
2277 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2278   intptr_t temp_width = (intptr_t)(width);
2280     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
2283     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2284     "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
2285     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2286     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2287     "lea       " MEMLEA(0x10,1) ",%1           \n"
2292     "+r"(temp_width)  // %2
2298 #endif // HAS_ARGBMIRRORROW_SSE2
2300 #ifdef HAS_ARGBMIRRORROW_AVX2
2301 // Shuffle table for reversing the bytes.
2302 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2303 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
// Reverses a row of ARGB pixels, 8 at a time, using a single cross-lane
// vpermd with the reversed-index table kARGBShuffleMirror_AVX2.
2305 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2306   intptr_t temp_width = (intptr_t)(width);
2308     "vmovdqu    %3,%%ymm5                      \n"
2311     VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0)    //  vpermd -0x20(%0,%2,4),ymm5,ymm0
2312     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2313     "lea        " MEMLEA(0x20,1) ",%1          \n"
2319     "+r"(temp_width)  // %2
2320   : "m"(kARGBShuffleMirror_AVX2) // %3
2321   : "memory", "cc", NACL_R14
2325 #endif // HAS_ARGBMIRRORROW_AVX2
2327 #ifdef HAS_SPLITUVROW_AVX2
// De-interleaves 32 UV pairs per iteration: even bytes (U) to dst_u, odd
// bytes (V) to dst_v. ymm5 is a 0x00ff word mask; vpermq fixes lane order
// after vpackuswb.
2328 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2330     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
2331     "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
2335     "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
2336     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
2337     "lea        " MEMLEA(0x40,0) ",%0            \n"
2338     "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
2339     "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
2340     "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
2341     "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
2342     "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
2343     "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
2344     "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
2345     "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
2346     "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
2347     MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
2348     "lea        " MEMLEA(0x20,1) ",%1            \n"
2352   : "+r"(src_uv),     // %0
2357   : "memory", "cc", NACL_R14
2358     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2361 #endif // HAS_SPLITUVROW_AVX2
2363 #ifdef HAS_SPLITUVROW_SSE2
// SSE2 version of SplitUVRow: de-interleaves 16 UV pairs per pass.
// xmm5 = 0x00FF word mask; AND keeps U (even) bytes, shift-right-8 keeps
// V (odd) bytes; V row is stored at dst_u + (dst_v - dst_u) via MEMOPMEM.
2364 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2366 "pcmpeqb %%xmm5,%%xmm5 \n"
2367 "psrlw $0x8,%%xmm5 \n"
2371 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2372 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2373 "lea " MEMLEA(0x20,0) ",%0 \n"
2374 "movdqa %%xmm0,%%xmm2 \n"
2375 "movdqa %%xmm1,%%xmm3 \n"
2376 "pand %%xmm5,%%xmm0 \n"
2377 "pand %%xmm5,%%xmm1 \n"
2378 "packuswb %%xmm1,%%xmm0 \n"
2379 "psrlw $0x8,%%xmm2 \n"
2380 "psrlw $0x8,%%xmm3 \n"
2381 "packuswb %%xmm3,%%xmm2 \n"
2382 "movdqu %%xmm0," MEMACCESS(1) " \n"
2383 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
2384 "lea " MEMLEA(0x10,1) ",%1 \n"
2387 : "+r"(src_uv), // %0
2392 : "memory", "cc", NACL_R14
2393 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2396 #endif // HAS_SPLITUVROW_SSE2
2398 #ifdef HAS_MERGEUVROW_AVX2
// Interleaves separate U and V rows into a packed UV plane, 32 pairs per
// pass. vpunpcklbw/hbw interleave within 128-bit lanes, so the four
// vextractf128 stores re-serialize the lanes into the correct order.
2399 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2405 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2406 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
2407 "lea " MEMLEA(0x20,0) ",%0 \n"
2408 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
2409 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
2410 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
2411 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2412 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2413 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2414 "lea " MEMLEA(0x40,2) ",%2 \n"
2418 : "+r"(src_u), // %0
2423 : "memory", "cc", NACL_R14
2424 "xmm0", "xmm1", "xmm2"
2427 #endif // HAS_MERGEUVROW_AVX2
2429 #ifdef HAS_MERGEUVROW_SSE2
// SSE2 version of MergeUVRow: interleaves 16 U and 16 V bytes into 32
// bytes of packed UV per pass. The V row is read at src_u + (src_v - src_u).
2430 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2436 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2437 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
2438 "lea " MEMLEA(0x10,0) ",%0 \n"
2439 "movdqa %%xmm0,%%xmm2 \n"
2440 "punpcklbw %%xmm1,%%xmm0 \n"
2441 "punpckhbw %%xmm1,%%xmm2 \n"
2442 "movdqu %%xmm0," MEMACCESS(2) " \n"
2443 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
2444 "lea " MEMLEA(0x20,2) ",%2 \n"
2447 : "+r"(src_u), // %0
2452 : "memory", "cc", NACL_R14
2453 "xmm0", "xmm1", "xmm2"
2456 #endif // HAS_MERGEUVROW_SSE2
2458 #ifdef HAS_COPYROW_SSE2
// Copies a row of bytes, 32 per pass, with unaligned SSE2 loads/stores.
2459 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2463 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2464 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2465 "lea " MEMLEA(0x20,0) ",%0 \n"
2466 "movdqu %%xmm0," MEMACCESS(1) " \n"
2467 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2468 "lea " MEMLEA(0x20,1) ",%1 \n"
2479 #endif // HAS_COPYROW_SSE2
2481 #ifdef HAS_COPYROW_AVX
// Copies a row of bytes, 64 per pass, with unaligned AVX loads/stores.
2482 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2486 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2487 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2488 "lea " MEMLEA(0x40,0) ",%0 \n"
2489 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2490 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2491 "lea " MEMLEA(0x40,1) ",%1 \n"
2502 #endif // HAS_COPYROW_AVX
2504 #ifdef HAS_COPYROW_ERMS
// Byte copy via "rep movsb"; fast on CPUs with Enhanced REP MOVSB (ERMS).
// width is bound to rcx/ecx via the "+c" constraint for the rep prefix.
2506 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2507 size_t width_tmp = (size_t)(width);
2509 "rep movsb " MEMMOVESTRING(0,1) " \n"
2512 "+c"(width_tmp) // %2
2517 #endif // HAS_COPYROW_ERMS
2519 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
// Copies only the alpha channel of src ARGB pixels into dst, preserving
// dst's RGB, 8 pixels per pass. xmm0 = 0xFF000000 alpha mask (pslld 24),
// xmm1 = 0x00FFFFFF RGB mask (psrld 8); the two masked halves are OR'd.
2521 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2523 "pcmpeqb %%xmm0,%%xmm0 \n"
2524 "pslld $0x18,%%xmm0 \n"
2525 "pcmpeqb %%xmm1,%%xmm1 \n"
2526 "psrld $0x8,%%xmm1 \n"
2529 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
2530 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
2531 "lea " MEMLEA(0x20,0) ",%0 \n"
2532 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2533 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2534 "pand %%xmm0,%%xmm2 \n"
2535 "pand %%xmm0,%%xmm3 \n"
2536 "pand %%xmm1,%%xmm4 \n"
2537 "pand %%xmm1,%%xmm5 \n"
2538 "por %%xmm4,%%xmm2 \n"
2539 "por %%xmm5,%%xmm3 \n"
2540 "movdqu %%xmm2," MEMACCESS(1) " \n"
2541 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2542 "lea " MEMLEA(0x20,1) ",%1 \n"
2550 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2553 #endif // HAS_ARGBCOPYALPHAROW_SSE2
2555 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
// AVX2 version of ARGBCopyAlphaRow: 16 pixels per pass. ymm0 = 0x00FFFFFF
// per-dword mask; vpblendvb selects dst's RGB bytes (mask high bit set)
// and src's alpha byte (mask clear), so only alpha is replaced in dst.
2557 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2559 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2560 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2563 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
2564 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
2565 "lea " MEMLEA(0x40,0) ",%0 \n"
2566 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2567 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2568 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2569 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2570 "lea " MEMLEA(0x40,1) ",%1 \n"
2579 , "xmm0", "xmm1", "xmm2"
2582 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2584 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// Copies 8 Y (luma) bytes per pass into the alpha channel of 8 dst ARGB
// pixels, preserving dst's RGB. xmm0 = 0xFF000000 mask, xmm1 = 0x00FFFFFF.
2586 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2588 "pcmpeqb %%xmm0,%%xmm0 \n"
2589 "pslld $0x18,%%xmm0 \n"
2590 "pcmpeqb %%xmm1,%%xmm1 \n"
2591 "psrld $0x8,%%xmm1 \n"
2594 "movq " MEMACCESS(0) ",%%xmm2 \n"
2595 "lea " MEMLEA(0x8,0) ",%0 \n"
2596 "punpcklbw %%xmm2,%%xmm2 \n"
// NOTE(review): xmm3's prior contents feed the low words here, but the
// subsequent "pand %%xmm0" masks to the high (alpha) byte of each dword,
// which comes from xmm2 — so stale xmm3 bits appear to be don't-care.
2597 "punpckhwd %%xmm2,%%xmm3 \n"
2598 "punpcklwd %%xmm2,%%xmm2 \n"
2599 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2600 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2601 "pand %%xmm0,%%xmm2 \n"
2602 "pand %%xmm0,%%xmm3 \n"
2603 "pand %%xmm1,%%xmm4 \n"
2604 "pand %%xmm1,%%xmm5 \n"
2605 "por %%xmm4,%%xmm2 \n"
2606 "por %%xmm5,%%xmm3 \n"
2607 "movdqu %%xmm2," MEMACCESS(1) " \n"
2608 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2609 "lea " MEMLEA(0x20,1) ",%1 \n"
2617 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2620 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
2622 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// AVX2 version: 16 Y bytes per pass. vpmovzxbd widens each Y byte to a
// dword, vpslld 24 moves it into the alpha position, then vpblendvb with
// the 0x00FFFFFF mask merges it over dst's RGB.
2624 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2626 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2627 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2630 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
2631 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
2632 "lea " MEMLEA(0x10,0) ",%0 \n"
2633 "vpslld $0x18,%%ymm1,%%ymm1 \n"
2634 "vpslld $0x18,%%ymm2,%%ymm2 \n"
2635 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2636 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2637 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2638 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2639 "lea " MEMLEA(0x40,1) ",%1 \n"
2648 , "xmm0", "xmm1", "xmm2"
2651 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
2653 #ifdef HAS_SETROW_X86
// Fills width bytes with v8 using "rep stosl", 4 bytes at a time; v8 is
// replicated into all four bytes of a dword. width is assumed a multiple
// of 4 here (width >> 2 stores).
2654 void SetRow_X86(uint8* dst, uint8 v8, int width) {
2655 size_t width_tmp = (size_t)(width >> 2);
2656 const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
2658 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2660 "+c"(width_tmp) // %1
// Byte-granular fill via "rep stosb" (Enhanced REP STOSB path).
2665 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
2666 size_t width_tmp = (size_t)(width);
2668 "rep stosb " MEMSTORESTRING(al,0) " \n"
2670 "+c"(width_tmp) // %1
// Fills width ARGB pixels with the 32-bit value v32 using "rep stosl".
2675 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
2676 size_t width_tmp = (size_t)(width);
2678 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2679 : "+D"(dst_argb), // %0
2680 "+c"(width_tmp) // %1
2684 #endif // HAS_SETROW_X86
2686 #ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y channel from packed YUY2 (Y0 U Y1 V), 16 Y per pass:
// AND with the 0x00FF word mask keeps the even (Y) bytes, then packuswb
// compresses the words back to bytes.
2687 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2689 "pcmpeqb %%xmm5,%%xmm5 \n"
2690 "psrlw $0x8,%%xmm5 \n"
2693 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2694 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2695 "lea " MEMLEA(0x20,0) ",%0 \n"
2696 "pand %%xmm5,%%xmm0 \n"
2697 "pand %%xmm5,%%xmm1 \n"
2698 "packuswb %%xmm1,%%xmm0 \n"
2699 "movdqu %%xmm0," MEMACCESS(1) " \n"
2700 "lea " MEMLEA(0x10,1) ",%1 \n"
2703 : "+r"(src_yuy2), // %0
2708 , "xmm0", "xmm1", "xmm5"
// Extracts U and V from YUY2, averaging two source rows (this row and the
// row at +stride_yuy2 via pavgb) for vertical 2x chroma subsampling.
// After the row average, shift-right-8 isolates the U/V (odd) bytes; the
// final mask/shift pair splits them into separate U and V outputs.
2712 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2713 uint8* dst_u, uint8* dst_v, int pix) {
2715 "pcmpeqb %%xmm5,%%xmm5 \n"
2716 "psrlw $0x8,%%xmm5 \n"
2720 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2721 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2722 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2723 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2724 "lea " MEMLEA(0x20,0) ",%0 \n"
2725 "pavgb %%xmm2,%%xmm0 \n"
2726 "pavgb %%xmm3,%%xmm1 \n"
2727 "psrlw $0x8,%%xmm0 \n"
2728 "psrlw $0x8,%%xmm1 \n"
2729 "packuswb %%xmm1,%%xmm0 \n"
2730 "movdqa %%xmm0,%%xmm1 \n"
2731 "pand %%xmm5,%%xmm0 \n"
2732 "packuswb %%xmm0,%%xmm0 \n"
2733 "psrlw $0x8,%%xmm1 \n"
2734 "packuswb %%xmm1,%%xmm1 \n"
2735 "movq %%xmm0," MEMACCESS(1) " \n"
2736 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2737 "lea " MEMLEA(0x8,1) ",%1 \n"
2740 : "+r"(src_yuy2), // %0
2744 : "r"((intptr_t)(stride_yuy2)) // %4
2745 : "memory", "cc", NACL_R14
2746 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Extracts U and V from a single YUY2 row (no vertical averaging — the
// 422 variant), otherwise identical to YUY2ToUVRow_SSE2.
2750 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2751 uint8* dst_u, uint8* dst_v, int pix) {
2753 "pcmpeqb %%xmm5,%%xmm5 \n"
2754 "psrlw $0x8,%%xmm5 \n"
2758 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2759 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2760 "lea " MEMLEA(0x20,0) ",%0 \n"
2761 "psrlw $0x8,%%xmm0 \n"
2762 "psrlw $0x8,%%xmm1 \n"
2763 "packuswb %%xmm1,%%xmm0 \n"
2764 "movdqa %%xmm0,%%xmm1 \n"
2765 "pand %%xmm5,%%xmm0 \n"
2766 "packuswb %%xmm0,%%xmm0 \n"
2767 "psrlw $0x8,%%xmm1 \n"
2768 "packuswb %%xmm1,%%xmm1 \n"
2769 "movq %%xmm0," MEMACCESS(1) " \n"
2770 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2771 "lea " MEMLEA(0x8,1) ",%1 \n"
2774 : "+r"(src_yuy2), // %0
2779 : "memory", "cc", NACL_R14
2780 "xmm0", "xmm1", "xmm5"
// Extracts Y from packed UYVY (U Y0 V Y1): Y is in the odd bytes, so a
// plain shift-right-8 isolates it (no mask register needed).
2784 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2788 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2789 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2790 "lea " MEMLEA(0x20,0) ",%0 \n"
2791 "psrlw $0x8,%%xmm0 \n"
2792 "psrlw $0x8,%%xmm1 \n"
2793 "packuswb %%xmm1,%%xmm0 \n"
2794 "movdqu %%xmm0," MEMACCESS(1) " \n"
2795 "lea " MEMLEA(0x10,1) ",%1 \n"
2798 : "+r"(src_uyvy), // %0
// Extracts U and V from UYVY with two-row vertical averaging (pavgb with
// the row at +stride_uyvy). U/V occupy the even bytes in UYVY, so the
// 0x00FF mask (rather than a shift) isolates them before the split.
2807 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2808 uint8* dst_u, uint8* dst_v, int pix) {
2810 "pcmpeqb %%xmm5,%%xmm5 \n"
2811 "psrlw $0x8,%%xmm5 \n"
2815 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2816 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2817 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2818 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2819 "lea " MEMLEA(0x20,0) ",%0 \n"
2820 "pavgb %%xmm2,%%xmm0 \n"
2821 "pavgb %%xmm3,%%xmm1 \n"
2822 "pand %%xmm5,%%xmm0 \n"
2823 "pand %%xmm5,%%xmm1 \n"
2824 "packuswb %%xmm1,%%xmm0 \n"
2825 "movdqa %%xmm0,%%xmm1 \n"
2826 "pand %%xmm5,%%xmm0 \n"
2827 "packuswb %%xmm0,%%xmm0 \n"
2828 "psrlw $0x8,%%xmm1 \n"
2829 "packuswb %%xmm1,%%xmm1 \n"
2830 "movq %%xmm0," MEMACCESS(1) " \n"
2831 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2832 "lea " MEMLEA(0x8,1) ",%1 \n"
2835 : "+r"(src_uyvy), // %0
2839 : "r"((intptr_t)(stride_uyvy)) // %4
2840 : "memory", "cc", NACL_R14
2841 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Extracts U and V from a single UYVY row (no vertical averaging),
// otherwise identical to UYVYToUVRow_SSE2.
2845 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2846 uint8* dst_u, uint8* dst_v, int pix) {
2848 "pcmpeqb %%xmm5,%%xmm5 \n"
2849 "psrlw $0x8,%%xmm5 \n"
2853 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2854 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2855 "lea " MEMLEA(0x20,0) ",%0 \n"
2856 "pand %%xmm5,%%xmm0 \n"
2857 "pand %%xmm5,%%xmm1 \n"
2858 "packuswb %%xmm1,%%xmm0 \n"
2859 "movdqa %%xmm0,%%xmm1 \n"
2860 "pand %%xmm5,%%xmm0 \n"
2861 "packuswb %%xmm0,%%xmm0 \n"
2862 "psrlw $0x8,%%xmm1 \n"
2863 "packuswb %%xmm1,%%xmm1 \n"
2864 "movq %%xmm0," MEMACCESS(1) " \n"
2865 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2866 "lea " MEMLEA(0x8,1) ",%1 \n"
2869 : "+r"(src_uyvy), // %0
2874 : "memory", "cc", NACL_R14
2875 "xmm0", "xmm1", "xmm5"
2878 #endif // HAS_YUY2TOYROW_SSE2
2880 #ifdef HAS_YUY2TOYROW_AVX2
// AVX2 version of YUY2ToYRow: 32 Y per pass; mask keeps even (Y) bytes,
// vpermq fixes lane order after the cross-lane vpackuswb.
2881 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2883 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2884 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2887 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2888 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2889 "lea " MEMLEA(0x40,0) ",%0 \n"
2890 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2891 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2892 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2893 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2894 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2895 "lea " MEMLEA(0x20,1) ",%1 \n"
2899 : "+r"(src_yuy2), // %0
2904 , "xmm0", "xmm1", "xmm5"
// AVX2 version of YUY2ToUVRow: averages two rows (vpavgb with +stride),
// isolates the chroma bytes, then splits them into U and V halves; the
// 16-byte halves are written with vextractf128.
2908 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
2909 uint8* dst_u, uint8* dst_v, int pix) {
2911 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2912 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2916 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2917 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2918 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
2919 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
2920 "lea " MEMLEA(0x40,0) ",%0 \n"
2921 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2922 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
2923 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2924 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2925 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
2926 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2927 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
2928 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2929 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2930 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2931 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
2932 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
2933 "lea " MEMLEA(0x10,1) ",%1 \n"
2937 : "+r"(src_yuy2), // %0
2941 : "r"((intptr_t)(stride_yuy2)) // %4
2942 : "memory", "cc", NACL_R14
2943 "xmm0", "xmm1", "xmm5"
// AVX2 422 variant: single-row U/V extraction from YUY2 (no vertical
// averaging), otherwise identical to YUY2ToUVRow_AVX2.
2947 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
2948 uint8* dst_u, uint8* dst_v, int pix) {
2950 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2951 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2955 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2956 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2957 "lea " MEMLEA(0x40,0) ",%0 \n"
2958 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2959 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
2960 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2961 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2962 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
2963 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2964 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
2965 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2966 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2967 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2968 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
2969 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
2970 "lea " MEMLEA(0x10,1) ",%1 \n"
2974 : "+r"(src_yuy2), // %0
2979 : "memory", "cc", NACL_R14
2980 "xmm0", "xmm1", "xmm5"
// AVX2 version of UYVYToYRow: Y is in the odd bytes of UYVY, isolated by
// shift-right-8; vpermq fixes lane order after the cross-lane pack.
2984 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2988 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2989 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2990 "lea " MEMLEA(0x40,0) ",%0 \n"
2991 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2992 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
2993 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2994 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2995 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2996 "lea " MEMLEA(0x20,1) ",%1 \n"
3000 : "+r"(src_uyvy), // %0
3005 , "xmm0", "xmm1", "xmm5"
// AVX2 version of UYVYToUVRow: averages two rows (vpavgb with +stride),
// masks the even (chroma) bytes, then splits them into U and V halves.
3008 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3009 uint8* dst_u, uint8* dst_v, int pix) {
3011 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3012 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3017 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3018 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3019 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3020 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3021 "lea " MEMLEA(0x40,0) ",%0 \n"
3022 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3023 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3024 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3025 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3026 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3027 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3028 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3029 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3030 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3031 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3032 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3033 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3034 "lea " MEMLEA(0x10,1) ",%1 \n"
3038 : "+r"(src_uyvy), // %0
3042 : "r"((intptr_t)(stride_uyvy)) // %4
3043 : "memory", "cc", NACL_R14
3044 "xmm0", "xmm1", "xmm5"
// AVX2 422 variant: single-row U/V extraction from UYVY (no vertical
// averaging), otherwise identical to UYVYToUVRow_AVX2.
3048 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3049 uint8* dst_u, uint8* dst_v, int pix) {
3051 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3052 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3056 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3057 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3058 "lea " MEMLEA(0x40,0) ",%0 \n"
3059 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3060 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3061 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3062 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3063 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3064 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3065 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3066 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3067 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3068 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3069 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3070 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3071 "lea " MEMLEA(0x10,1) ",%1 \n"
3075 : "+r"(src_uyvy), // %0
3080 : "memory", "cc", NACL_R14
3081 "xmm0", "xmm1", "xmm5"
3084 #endif // HAS_YUY2TOYROW_AVX2
3086 #ifdef HAS_ARGBBLENDROW_SSE2
3087 // Blend 8 pixels at a time.
// Alpha-blends src_argb0 over src_argb1 into dst_argb. Register setup:
// xmm7 = 0x0001 per word (rounding bias), xmm6 = 0x00FF word mask,
// xmm5 = 0xFF00 word mask, xmm4 = 0xFF000000 alpha mask. The alpha of
// each src0 pixel is inverted (pxor xmm4) and broadcast to all channels
// (pshufhw/pshuflw $0xf5) to weight the background; output alpha is
// forced opaque (por xmm4). Three visible phases: leading 1-pixel loop,
// main 4-pixel loop, trailing 1-pixel loop.
3088 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3089 uint8* dst_argb, int width) {
3091 "pcmpeqb %%xmm7,%%xmm7 \n"
3092 "psrlw $0xf,%%xmm7 \n"
3093 "pcmpeqb %%xmm6,%%xmm6 \n"
3094 "psrlw $0x8,%%xmm6 \n"
3095 "pcmpeqb %%xmm5,%%xmm5 \n"
3096 "psllw $0x8,%%xmm5 \n"
3097 "pcmpeqb %%xmm4,%%xmm4 \n"
3098 "pslld $0x18,%%xmm4 \n"
3103 // 1 pixel loop until destination pointer is aligned.
3107 "movd " MEMACCESS(0) ",%%xmm3 \n"
3108 "lea " MEMLEA(0x4,0) ",%0 \n"
3109 "movdqa %%xmm3,%%xmm0 \n"
3110 "pxor %%xmm4,%%xmm3 \n"
3111 "movd " MEMACCESS(1) ",%%xmm2 \n"
3112 "psrlw $0x8,%%xmm3 \n"
3113 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3114 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3115 "pand %%xmm6,%%xmm2 \n"
3116 "paddw %%xmm7,%%xmm3 \n"
3117 "pmullw %%xmm3,%%xmm2 \n"
3118 "movd " MEMACCESS(1) ",%%xmm1 \n"
3119 "lea " MEMLEA(0x4,1) ",%1 \n"
3120 "psrlw $0x8,%%xmm1 \n"
3121 "por %%xmm4,%%xmm0 \n"
3122 "pmullw %%xmm3,%%xmm1 \n"
3123 "psrlw $0x8,%%xmm2 \n"
3124 "paddusb %%xmm2,%%xmm0 \n"
3125 "pand %%xmm5,%%xmm1 \n"
3126 "paddusb %%xmm1,%%xmm0 \n"
3127 "movd %%xmm0," MEMACCESS(2) " \n"
3128 "lea " MEMLEA(0x4,2) ",%2 \n"
// Main loop: 4 pixels (16 bytes) per pass, same math as the 1-pixel loop.
3139 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3140 "lea " MEMLEA(0x10,0) ",%0 \n"
3141 "movdqa %%xmm3,%%xmm0 \n"
3142 "pxor %%xmm4,%%xmm3 \n"
3143 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3144 "psrlw $0x8,%%xmm3 \n"
3145 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3146 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3147 "pand %%xmm6,%%xmm2 \n"
3148 "paddw %%xmm7,%%xmm3 \n"
3149 "pmullw %%xmm3,%%xmm2 \n"
3150 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3151 "lea " MEMLEA(0x10,1) ",%1 \n"
3152 "psrlw $0x8,%%xmm1 \n"
3153 "por %%xmm4,%%xmm0 \n"
3154 "pmullw %%xmm3,%%xmm1 \n"
3155 "psrlw $0x8,%%xmm2 \n"
3156 "paddusb %%xmm2,%%xmm0 \n"
3157 "pand %%xmm5,%%xmm1 \n"
3158 "paddusb %%xmm1,%%xmm0 \n"
3159 "movdqu %%xmm0," MEMACCESS(2) " \n"
3160 "lea " MEMLEA(0x10,2) ",%2 \n"
// Trailing loop: remaining pixels one at a time.
3170 "movd " MEMACCESS(0) ",%%xmm3 \n"
3171 "lea " MEMLEA(0x4,0) ",%0 \n"
3172 "movdqa %%xmm3,%%xmm0 \n"
3173 "pxor %%xmm4,%%xmm3 \n"
3174 "movd " MEMACCESS(1) ",%%xmm2 \n"
3175 "psrlw $0x8,%%xmm3 \n"
3176 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3177 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3178 "pand %%xmm6,%%xmm2 \n"
3179 "paddw %%xmm7,%%xmm3 \n"
3180 "pmullw %%xmm3,%%xmm2 \n"
3181 "movd " MEMACCESS(1) ",%%xmm1 \n"
3182 "lea " MEMLEA(0x4,1) ",%1 \n"
3183 "psrlw $0x8,%%xmm1 \n"
3184 "por %%xmm4,%%xmm0 \n"
3185 "pmullw %%xmm3,%%xmm1 \n"
3186 "psrlw $0x8,%%xmm2 \n"
3187 "paddusb %%xmm2,%%xmm0 \n"
3188 "pand %%xmm5,%%xmm1 \n"
3189 "paddusb %%xmm1,%%xmm0 \n"
3190 "movd %%xmm0," MEMACCESS(2) " \n"
3191 "lea " MEMLEA(0x4,2) ",%2 \n"
3195 : "+r"(src_argb0), // %0
3196 "+r"(src_argb1), // %1
3197 "+r"(dst_argb), // %2
3201 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3204 #endif // HAS_ARGBBLENDROW_SSE2
3206 #ifdef HAS_ARGBBLENDROW_SSSE3
3207 // Shuffle table for isolating alpha.
3208 static uvec8 kShuffleAlpha = {
3209 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3210 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3213 // Blend 8 pixels at a time
3214 // Shuffle table for reversing the bytes.
3216 // Same as SSE2, but replaces
3217 // psrlw xmm3, 8 // alpha
3218 // pshufhw xmm3, xmm3,0F5h // 8 alpha words
3219 // pshuflw xmm3, xmm3,0F5h
3221 // pshufb xmm3, kShuffleAlpha // alpha
// SSSE3 blend: identical math to ARGBBlendRow_SSE2, but a single pshufb
// against kShuffleAlpha replaces the three-instruction alpha broadcast.
3223 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3224 uint8* dst_argb, int width) {
3226 "pcmpeqb %%xmm7,%%xmm7 \n"
3227 "psrlw $0xf,%%xmm7 \n"
3228 "pcmpeqb %%xmm6,%%xmm6 \n"
3229 "psrlw $0x8,%%xmm6 \n"
3230 "pcmpeqb %%xmm5,%%xmm5 \n"
3231 "psllw $0x8,%%xmm5 \n"
3232 "pcmpeqb %%xmm4,%%xmm4 \n"
3233 "pslld $0x18,%%xmm4 \n"
3238 // 1 pixel loop until destination pointer is aligned.
3242 "movd " MEMACCESS(0) ",%%xmm3 \n"
3243 "lea " MEMLEA(0x4,0) ",%0 \n"
3244 "movdqa %%xmm3,%%xmm0 \n"
3245 "pxor %%xmm4,%%xmm3 \n"
3246 "movd " MEMACCESS(1) ",%%xmm2 \n"
3247 "pshufb %4,%%xmm3 \n"
3248 "pand %%xmm6,%%xmm2 \n"
3249 "paddw %%xmm7,%%xmm3 \n"
3250 "pmullw %%xmm3,%%xmm2 \n"
3251 "movd " MEMACCESS(1) ",%%xmm1 \n"
3252 "lea " MEMLEA(0x4,1) ",%1 \n"
3253 "psrlw $0x8,%%xmm1 \n"
3254 "por %%xmm4,%%xmm0 \n"
3255 "pmullw %%xmm3,%%xmm1 \n"
3256 "psrlw $0x8,%%xmm2 \n"
3257 "paddusb %%xmm2,%%xmm0 \n"
3258 "pand %%xmm5,%%xmm1 \n"
3259 "paddusb %%xmm1,%%xmm0 \n"
3260 "movd %%xmm0," MEMACCESS(2) " \n"
3261 "lea " MEMLEA(0x4,2) ",%2 \n"
// Main loop: 4 pixels (16 bytes) per pass.
3272 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3273 "lea " MEMLEA(0x10,0) ",%0 \n"
3274 "movdqa %%xmm3,%%xmm0 \n"
3275 "pxor %%xmm4,%%xmm3 \n"
3276 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3277 "pshufb %4,%%xmm3 \n"
3278 "pand %%xmm6,%%xmm2 \n"
3279 "paddw %%xmm7,%%xmm3 \n"
3280 "pmullw %%xmm3,%%xmm2 \n"
3281 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3282 "lea " MEMLEA(0x10,1) ",%1 \n"
3283 "psrlw $0x8,%%xmm1 \n"
3284 "por %%xmm4,%%xmm0 \n"
3285 "pmullw %%xmm3,%%xmm1 \n"
3286 "psrlw $0x8,%%xmm2 \n"
3287 "paddusb %%xmm2,%%xmm0 \n"
3288 "pand %%xmm5,%%xmm1 \n"
3289 "paddusb %%xmm1,%%xmm0 \n"
3290 "movdqu %%xmm0," MEMACCESS(2) " \n"
3291 "lea " MEMLEA(0x10,2) ",%2 \n"
// Trailing loop: remaining pixels one at a time.
3301 "movd " MEMACCESS(0) ",%%xmm3 \n"
3302 "lea " MEMLEA(0x4,0) ",%0 \n"
3303 "movdqa %%xmm3,%%xmm0 \n"
3304 "pxor %%xmm4,%%xmm3 \n"
3305 "movd " MEMACCESS(1) ",%%xmm2 \n"
3306 "pshufb %4,%%xmm3 \n"
3307 "pand %%xmm6,%%xmm2 \n"
3308 "paddw %%xmm7,%%xmm3 \n"
3309 "pmullw %%xmm3,%%xmm2 \n"
3310 "movd " MEMACCESS(1) ",%%xmm1 \n"
3311 "lea " MEMLEA(0x4,1) ",%1 \n"
3312 "psrlw $0x8,%%xmm1 \n"
3313 "por %%xmm4,%%xmm0 \n"
3314 "pmullw %%xmm3,%%xmm1 \n"
3315 "psrlw $0x8,%%xmm2 \n"
3316 "paddusb %%xmm2,%%xmm0 \n"
3317 "pand %%xmm5,%%xmm1 \n"
3318 "paddusb %%xmm1,%%xmm0 \n"
3319 "movd %%xmm0," MEMACCESS(2) " \n"
3320 "lea " MEMLEA(0x4,2) ",%2 \n"
3324 : "+r"(src_argb0), // %0
3325 "+r"(src_argb1), // %1
3326 "+r"(dst_argb), // %2
3328 : "m"(kShuffleAlpha) // %4
3330 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3333 #endif // HAS_ARGBBLENDROW_SSSE3
3335 #ifdef HAS_ARGBATTENUATEROW_SSE2
3336 // Attenuate 4 pixels at a time.
// Premultiplies RGB by alpha (dst = src.rgb * src.a, alpha preserved).
// Alpha words are broadcast with pshufhw/pshuflw $0xff and multiplied in
// via pmulhuw on byte-doubled channels; xmm4 = 0xFF000000 keeps alpha,
// xmm5 = 0x00FFFFFF masks the computed RGB before the OR.
3337 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3339 "pcmpeqb %%xmm4,%%xmm4 \n"
3340 "pslld $0x18,%%xmm4 \n"
3341 "pcmpeqb %%xmm5,%%xmm5 \n"
3342 "psrld $0x8,%%xmm5 \n"
3347 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3348 "punpcklbw %%xmm0,%%xmm0 \n"
3349 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3350 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3351 "pmulhuw %%xmm2,%%xmm0 \n"
3352 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3353 "punpckhbw %%xmm1,%%xmm1 \n"
3354 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3355 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3356 "pmulhuw %%xmm2,%%xmm1 \n"
3357 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3358 "lea " MEMLEA(0x10,0) ",%0 \n"
3359 "psrlw $0x8,%%xmm0 \n"
3360 "pand %%xmm4,%%xmm2 \n"
3361 "psrlw $0x8,%%xmm1 \n"
3362 "packuswb %%xmm1,%%xmm0 \n"
3363 "pand %%xmm5,%%xmm0 \n"
3364 "por %%xmm2,%%xmm0 \n"
3365 "movdqu %%xmm0," MEMACCESS(1) " \n"
3366 "lea " MEMLEA(0x10,1) ",%1 \n"
3369 : "+r"(src_argb), // %0
3370 "+r"(dst_argb), // %1
3374 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3377 #endif // HAS_ARGBATTENUATEROW_SSE2
3379 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3380 // Shuffle table duplicating alpha
3381 static uvec8 kShuffleAlpha0 = {
3382 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3384 static uvec8 kShuffleAlpha1 = {
3385 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3386 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
3388 // Attenuate 4 pixels at a time.
// SSSE3 premultiply: pshufb against the two alpha-duplication tables
// replaces the SSE2 shuffle sequence; xmm3 = 0xFF000000 preserves the
// original alpha bytes in the output.
3389 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3391 "pcmpeqb %%xmm3,%%xmm3 \n"
3392 "pslld $0x18,%%xmm3 \n"
3393 "movdqa %3,%%xmm4 \n"
3394 "movdqa %4,%%xmm5 \n"
3399 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3400 "pshufb %%xmm4,%%xmm0 \n"
3401 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3402 "punpcklbw %%xmm1,%%xmm1 \n"
3403 "pmulhuw %%xmm1,%%xmm0 \n"
3404 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3405 "pshufb %%xmm5,%%xmm1 \n"
3406 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3407 "punpckhbw %%xmm2,%%xmm2 \n"
3408 "pmulhuw %%xmm2,%%xmm1 \n"
3409 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3410 "lea " MEMLEA(0x10,0) ",%0 \n"
3411 "pand %%xmm3,%%xmm2 \n"
3412 "psrlw $0x8,%%xmm0 \n"
3413 "psrlw $0x8,%%xmm1 \n"
3414 "packuswb %%xmm1,%%xmm0 \n"
3415 "por %%xmm2,%%xmm0 \n"
3416 "movdqu %%xmm0," MEMACCESS(1) " \n"
3417 "lea " MEMLEA(0x10,1) ",%1 \n"
3420 : "+r"(src_argb), // %0
3421 "+r"(dst_argb), // %1
3423 : "m"(kShuffleAlpha0), // %3
3424 "m"(kShuffleAlpha1) // %4
3426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3429 #endif // HAS_ARGBATTENUATEROW_SSSE3
3431 #ifdef HAS_ARGBATTENUATEROW_AVX2
3432 // Shuffle table duplicating alpha.
3433 static const uvec8 kShuffleAlpha_AVX2 = {
3434 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
3436 // Attenuate 8 pixels at a time.
// AVX2 premultiply: operates on word-widened pixels (vpunpcklbw/hbw of a
// register with itself), broadcasts the alpha words via vpshufb, and
// restores the original alpha bytes with the ymm5 = 0xFF000000 mask.
// Note dst is addressed as src + (dst - src) via MEMOPMEM on %0/%1.
3437 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3439 "vbroadcastf128 %3,%%ymm4 \n"
3440 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3441 "vpslld $0x18,%%ymm5,%%ymm5 \n"
3447 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3448 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3449 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3450 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
3451 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
3452 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3453 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3454 "vpand %%ymm5,%%ymm6,%%ymm6 \n"
3455 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3456 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3457 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3458 "vpor %%ymm6,%%ymm0,%%ymm0 \n"
3459 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3460 "lea " MEMLEA(0x20,0) ",%0 \n"
3464 : "+r"(src_argb), // %0
3465 "+r"(dst_argb), // %1
3467 : "m"(kShuffleAlpha_AVX2) // %3
3469 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3472 #endif // HAS_ARGBATTENUATEROW_AVX2
3474 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3475 // Unattenuate 4 pixels at a time.
// Reverses premultiplied alpha: each pixel's alpha byte (offsets 0x03,
// 0x07, 0x0b, 0x0f) indexes fixed_invtbl8 to fetch a fixed-point 1/alpha
// reciprocal, which scales the RGB words via pmulhuw. The movzb scratch
// register is the "alpha" local (%3).
3476 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3478 uintptr_t alpha = 0;
3483 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3484 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3485 "punpcklbw %%xmm0,%%xmm0 \n"
3486 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3487 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3488 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3489 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3490 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3491 "movlhps %%xmm3,%%xmm2 \n"
3492 "pmulhuw %%xmm2,%%xmm0 \n"
3493 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3494 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3495 "punpckhbw %%xmm1,%%xmm1 \n"
3496 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3497 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3498 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3499 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3500 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3501 "movlhps %%xmm3,%%xmm2 \n"
3502 "pmulhuw %%xmm2,%%xmm1 \n"
3503 "lea " MEMLEA(0x10,0) ",%0 \n"
3504 "packuswb %%xmm1,%%xmm0 \n"
3505 "movdqu %%xmm0," MEMACCESS(1) " \n"
3506 "lea " MEMLEA(0x10,1) ",%1 \n"
3509 : "+r"(src_argb), // %0
3510 "+r"(dst_argb), // %1
3513 : "r"(fixed_invtbl8) // %4
3514 : "memory", "cc", NACL_R14
3515 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3518 #endif // HAS_ARGBUNATTENUATEROW_SSE2
3520 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3521 // Shuffle table duplicating alpha.
3522 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3523 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
3525 // Unattenuate 8 pixels at a time.
// AVX2 unattenuate: gathers the eight 1/alpha reciprocals from
// fixed_invtbl8 one vmovd at a time (alpha bytes at 0x03,0x07,...,0x1f),
// packs them into ymm3, then widens/duplicates them per channel with
// vpunpck + vpshufb before the vpmulhuw scale. dst is addressed as
// src + (dst - src) via MEMOPMEM on %0/%1.
3526 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3528 uintptr_t alpha = 0;
3531 "vbroadcastf128 %5,%%ymm5 \n"
3537 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3538 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3539 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3540 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3541 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3542 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
3543 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3544 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3545 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3546 "movzb " MEMACCESS2(0x13,0) ",%3 \n"
3547 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
3548 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3549 "movzb " MEMACCESS2(0x17,0) ",%3 \n"
3550 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3551 "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
3552 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
3553 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3554 "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
3555 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3556 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
3557 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
3558 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
3559 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// Apply the gathered reciprocals to the 8 pixels.
3562 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3563 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3564 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3565 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
3566 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
3567 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
3568 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
3569 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3570 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3571 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3572 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3573 "lea " MEMLEA(0x20,0) ",%0 \n"
3577 : "+r"(src_argb), // %0
3578 "+r"(dst_argb), // %1
3581 : "r"(fixed_invtbl8), // %4
3582 "m"(kUnattenShuffleAlpha_AVX2) // %5
3583 : "memory", "cc", NACL_R14
3584 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3587 #endif // HAS_ARGBUNATTENUATEROW_AVX2
3589 #ifdef HAS_ARGBGRAYROW_SSSE3
3590 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Computes luma via pmaddubsw against the kARGBToYJ coefficients (%3),
// rounds with %4 and >>7, then rebuilds ARGB as (G,G,G,original A) by
// interleaving the gray byte with itself and the extracted alpha bytes.
3591 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3593 "movdqa %3,%%xmm4 \n"
3594 "movdqa %4,%%xmm5 \n"
3599 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3600 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3601 "pmaddubsw %%xmm4,%%xmm0 \n"
3602 "pmaddubsw %%xmm4,%%xmm1 \n"
3603 "phaddw %%xmm1,%%xmm0 \n"
3604 "paddw %%xmm5,%%xmm0 \n"
3605 "psrlw $0x7,%%xmm0 \n"
3606 "packuswb %%xmm0,%%xmm0 \n"
3607 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3608 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
3609 "lea " MEMLEA(0x20,0) ",%0 \n"
3610 "psrld $0x18,%%xmm2 \n"
3611 "psrld $0x18,%%xmm3 \n"
3612 "packuswb %%xmm3,%%xmm2 \n"
3613 "packuswb %%xmm2,%%xmm2 \n"
3614 "movdqa %%xmm0,%%xmm3 \n"
3615 "punpcklbw %%xmm0,%%xmm0 \n"
3616 "punpcklbw %%xmm2,%%xmm3 \n"
3617 "movdqa %%xmm0,%%xmm1 \n"
3618 "punpcklwd %%xmm3,%%xmm0 \n"
3619 "punpckhwd %%xmm3,%%xmm1 \n"
3620 "movdqu %%xmm0," MEMACCESS(1) " \n"
3621 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
3622 "lea " MEMLEA(0x20,1) ",%1 \n"
3625 : "+r"(src_argb), // %0
3626 "+r"(dst_argb), // %1
3628 : "m"(kARGBToYJ), // %3
3631 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3634 #endif // HAS_ARGBGRAYROW_SSSE3
3636 #ifdef HAS_ARGBSEPIAROW_SSSE3
3637 // b = (r * 35 + g * 68 + b * 17) >> 7
3638 // g = (r * 45 + g * 88 + b * 22) >> 7
3639 // r = (r * 50 + g * 98 + b * 24) >> 7
3640 // Constant for ARGB color to sepia tone
3641 static vec8 kARGBToSepiaB = {
3642 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3645 static vec8 kARGBToSepiaG = {
3646 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3649 static vec8 kARGBToSepiaR = {
3650 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3653 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place transform: computes the new B, G and R channels with
// pmaddubsw/phaddw against the three coefficient tables above (>>7),
// extracts the original alpha with psrld $0x18, and re-interleaves all
// four channels back into dst_argb.
3654 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3656 "movdqa %2,%%xmm2 \n"
3657 "movdqa %3,%%xmm3 \n"
3658 "movdqa %4,%%xmm4 \n"
3663 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3664 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
3665 "pmaddubsw %%xmm2,%%xmm0 \n"
3666 "pmaddubsw %%xmm2,%%xmm6 \n"
3667 "phaddw %%xmm6,%%xmm0 \n"
3668 "psrlw $0x7,%%xmm0 \n"
3669 "packuswb %%xmm0,%%xmm0 \n"
3670 "movdqu " MEMACCESS(0) ",%%xmm5 \n"
3671 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3672 "pmaddubsw %%xmm3,%%xmm5 \n"
3673 "pmaddubsw %%xmm3,%%xmm1 \n"
3674 "phaddw %%xmm1,%%xmm5 \n"
3675 "psrlw $0x7,%%xmm5 \n"
3676 "packuswb %%xmm5,%%xmm5 \n"
3677 "punpcklbw %%xmm5,%%xmm0 \n"
3678 "movdqu " MEMACCESS(0) ",%%xmm5 \n"
3679 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3680 "pmaddubsw %%xmm4,%%xmm5 \n"
3681 "pmaddubsw %%xmm4,%%xmm1 \n"
3682 "phaddw %%xmm1,%%xmm5 \n"
3683 "psrlw $0x7,%%xmm5 \n"
3684 "packuswb %%xmm5,%%xmm5 \n"
3685 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3686 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3687 "psrld $0x18,%%xmm6 \n"
3688 "psrld $0x18,%%xmm1 \n"
3689 "packuswb %%xmm1,%%xmm6 \n"
3690 "packuswb %%xmm6,%%xmm6 \n"
3691 "punpcklbw %%xmm6,%%xmm5 \n"
3692 "movdqa %%xmm0,%%xmm1 \n"
3693 "punpcklwd %%xmm5,%%xmm0 \n"
3694 "punpckhwd %%xmm5,%%xmm1 \n"
3695 "movdqu %%xmm0," MEMACCESS(0) " \n"
3696 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
3697 "lea " MEMLEA(0x20,0) ",%0 \n"
3700 : "+r"(dst_argb), // %0
3702 : "m"(kARGBToSepiaB), // %2
3703 "m"(kARGBToSepiaG), // %3
3704 "m"(kARGBToSepiaR) // %4
3706 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3709 #endif // HAS_ARGBSEPIAROW_SSSE3
3711 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3712 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
3713 // Same as Sepia except matrix is provided.
// matrix_argb: 16 signed bytes; the four 4-byte rows are broadcast to
// xmm2..xmm5 via pshufd and used as pmaddubsw coefficients for B,G,R,A.
// Results use phaddsw (saturating) and >>6 rather than Sepia's >>7.
3714 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
3715 const int8* matrix_argb, int width) {
3717 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
3718 "pshufd $0x00,%%xmm5,%%xmm2 \n"
3719 "pshufd $0x55,%%xmm5,%%xmm3 \n"
3720 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
3721 "pshufd $0xff,%%xmm5,%%xmm5 \n"
// B channel -> xmm0, G channel -> xmm6.
3726 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3727 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3728 "pmaddubsw %%xmm2,%%xmm0 \n"
3729 "pmaddubsw %%xmm2,%%xmm7 \n"
3730 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3731 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3732 "pmaddubsw %%xmm3,%%xmm6 \n"
3733 "pmaddubsw %%xmm3,%%xmm1 \n"
3734 "phaddsw %%xmm7,%%xmm0 \n"
3735 "phaddsw %%xmm1,%%xmm6 \n"
3736 "psraw $0x6,%%xmm0 \n"
3737 "psraw $0x6,%%xmm6 \n"
3738 "packuswb %%xmm0,%%xmm0 \n"
3739 "packuswb %%xmm6,%%xmm6 \n"
3740 "punpcklbw %%xmm6,%%xmm0 \n"
// R channel -> xmm1, A channel -> xmm6 (alpha is matrixed too, unlike Sepia).
3741 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3742 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3743 "pmaddubsw %%xmm4,%%xmm1 \n"
3744 "pmaddubsw %%xmm4,%%xmm7 \n"
3745 "phaddsw %%xmm7,%%xmm1 \n"
3746 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3747 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3748 "pmaddubsw %%xmm5,%%xmm6 \n"
3749 "pmaddubsw %%xmm5,%%xmm7 \n"
3750 "phaddsw %%xmm7,%%xmm6 \n"
3751 "psraw $0x6,%%xmm1 \n"
3752 "psraw $0x6,%%xmm6 \n"
3753 "packuswb %%xmm1,%%xmm1 \n"
3754 "packuswb %%xmm6,%%xmm6 \n"
3755 "punpcklbw %%xmm6,%%xmm1 \n"
// Interleave BG and RA lanes into final pixels; store to dst.
3756 "movdqa %%xmm0,%%xmm6 \n"
3757 "punpcklwd %%xmm1,%%xmm0 \n"
3758 "punpckhwd %%xmm1,%%xmm6 \n"
3759 "movdqu %%xmm0," MEMACCESS(1) " \n"
3760 "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
3761 "lea " MEMLEA(0x20,0) ",%0 \n"
3762 "lea " MEMLEA(0x20,1) ",%1 \n"
3765 : "+r"(src_argb), // %0
3766 "+r"(dst_argb), // %1
3768 : "r"(matrix_argb) // %3
3770 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3773 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3775 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3776 // Quantize 4 ARGB pixels (16 bytes).
// In-place posterize: c = (c * scale >> 16) * interval_size + interval_offset,
// per the pmulhuw/pmullw/paddw sequence below.  Alpha is preserved via the
// 0xFF000000 mask built in xmm6.  Broadcast setup lines for scale (%2) are
// not visible here (extraction gap before the pshuflw block).
3777 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3778 int interval_offset, int width) {
// Broadcast the three scalar params across all word lanes.
3783 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3784 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3785 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3786 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3787 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3788 "pshufd $0x44,%%xmm4,%%xmm4 \n"
3789 "pxor %%xmm5,%%xmm5 \n"
3790 "pcmpeqb %%xmm6,%%xmm6 \n"
3791 "pslld $0x18,%%xmm6 \n"
// Widen bytes to words, quantize low and high halves, preserve alpha bits.
3796 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3797 "punpcklbw %%xmm5,%%xmm0 \n"
3798 "pmulhuw %%xmm2,%%xmm0 \n"
3799 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3800 "punpckhbw %%xmm5,%%xmm1 \n"
3801 "pmulhuw %%xmm2,%%xmm1 \n"
3802 "pmullw %%xmm3,%%xmm0 \n"
3803 "movdqu " MEMACCESS(0) ",%%xmm7 \n"
3804 "pmullw %%xmm3,%%xmm1 \n"
3805 "pand %%xmm6,%%xmm7 \n"
3806 "paddw %%xmm4,%%xmm0 \n"
3807 "paddw %%xmm4,%%xmm1 \n"
3808 "packuswb %%xmm1,%%xmm0 \n"
3809 "por %%xmm7,%%xmm0 \n"
3810 "movdqu %%xmm0," MEMACCESS(0) " \n"
3811 "lea " MEMLEA(0x10,0) ",%0 \n"
3814 : "+r"(dst_argb), // %0
3817 "r"(interval_size), // %3
3818 "r"(interval_offset) // %4
3820 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3823 #endif // HAS_ARGBQUANTIZEROW_SSE2
3825 #ifdef HAS_ARGBSHADEROW_SSE2
3826 // Shade 4 pixels at a time by specified value.
// value (final param, signature truncated by extraction) is a packed ARGB
// scale; each channel becomes (c * dup(c) scale) via pmulhuw on 8.8 fixed
// point: punpcklbw duplicates each byte to form c*257 words first.
3827 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// Broadcast the 4 shade bytes into word lanes of xmm2.
3831 "punpcklbw %%xmm2,%%xmm2 \n"
3832 "punpcklqdq %%xmm2,%%xmm2 \n"
3837 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3838 "lea " MEMLEA(0x10,0) ",%0 \n"
3839 "movdqa %%xmm0,%%xmm1 \n"
3840 "punpcklbw %%xmm0,%%xmm0 \n"
3841 "punpckhbw %%xmm1,%%xmm1 \n"
3842 "pmulhuw %%xmm2,%%xmm0 \n"
3843 "pmulhuw %%xmm2,%%xmm1 \n"
3844 "psrlw $0x8,%%xmm0 \n"
3845 "psrlw $0x8,%%xmm1 \n"
3846 "packuswb %%xmm1,%%xmm0 \n"
3847 "movdqu %%xmm0," MEMACCESS(1) " \n"
3848 "lea " MEMLEA(0x10,1) ",%1 \n"
3851 : "+r"(src_argb), // %0
3852 "+r"(dst_argb), // %1
3856 , "xmm0", "xmm1", "xmm2"
3859 #endif // HAS_ARGBSHADEROW_SSE2
3861 #ifdef HAS_ARGBMULTIPLYROW_SSE2
3862 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// dst = (src0 * src1) / 255 approximated as pmulhuw(src0*257, src1):
// src0 bytes are duplicated (punpcklbw with self), src1 zero-extended.
3863 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3864 uint8* dst_argb, int width) {
3866 "pxor %%xmm5,%%xmm5 \n"
3871 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3872 "lea " MEMLEA(0x10,0) ",%0 \n"
3873 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3874 "lea " MEMLEA(0x10,1) ",%1 \n"
3875 "movdqu %%xmm0,%%xmm1 \n"
3876 "movdqu %%xmm2,%%xmm3 \n"
3877 "punpcklbw %%xmm0,%%xmm0 \n"
3878 "punpckhbw %%xmm1,%%xmm1 \n"
3879 "punpcklbw %%xmm5,%%xmm2 \n"
3880 "punpckhbw %%xmm5,%%xmm3 \n"
3881 "pmulhuw %%xmm2,%%xmm0 \n"
3882 "pmulhuw %%xmm3,%%xmm1 \n"
3883 "packuswb %%xmm1,%%xmm0 \n"
3884 "movdqu %%xmm0," MEMACCESS(2) " \n"
3885 "lea " MEMLEA(0x10,2) ",%2 \n"
3888 : "+r"(src_argb0), // %0
3889 "+r"(src_argb1), // %1
3890 "+r"(dst_argb), // %2
3894 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3897 #endif // HAS_ARGBMULTIPLYROW_SSE2
3899 #ifdef HAS_ARGBMULTIPLYROW_AVX2
3900 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 version of ARGBMultiplyRow_SSE2: same (a*257)*b >> 16 trick, 32 bytes
// per iteration, three-operand VEX forms avoid the extra register copies.
3901 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3902 uint8* dst_argb, int width) {
3904 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
3909 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
3910 "lea " MEMLEA(0x20,0) ",%0 \n"
3911 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
3912 "lea " MEMLEA(0x20,1) ",%1 \n"
3913 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
3914 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
3915 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
3916 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
3917 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3918 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3919 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3920 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
3921 "lea " MEMLEA(0x20,2) ",%2 \n"
3925 : "+r"(src_argb0), // %0
3926 "+r"(src_argb1), // %1
3927 "+r"(dst_argb), // %2
// Only name xmm clobbers when the compiler itself targets AVX2.
3931 #if defined(__AVX2__)
3932 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3936 #endif // HAS_ARGBMULTIPLYROW_AVX2
3938 #ifdef HAS_ARGBADDROW_SSE2
3939 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Saturating per-byte add (paddusb); alpha is added like any other channel.
3940 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3941 uint8* dst_argb, int width) {
3946 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3947 "lea " MEMLEA(0x10,0) ",%0 \n"
3948 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3949 "lea " MEMLEA(0x10,1) ",%1 \n"
3950 "paddusb %%xmm1,%%xmm0 \n"
3951 "movdqu %%xmm0," MEMACCESS(2) " \n"
3952 "lea " MEMLEA(0x10,2) ",%2 \n"
3955 : "+r"(src_argb0), // %0
3956 "+r"(src_argb1), // %1
3957 "+r"(dst_argb), // %2
3964 #endif // HAS_ARGBADDROW_SSE2
3966 #ifdef HAS_ARGBADDROW_AVX2
3967 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
// (Comment fixed: 0x20-byte stride = 8 ARGB pixels per iteration.)
// vpaddusb with a memory operand on src1 saves a separate load.
3968 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3969 uint8* dst_argb, int width) {
3974 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3975 "lea " MEMLEA(0x20,0) ",%0 \n"
3976 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
3977 "lea " MEMLEA(0x20,1) ",%1 \n"
3978 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
3979 "lea " MEMLEA(0x20,2) ",%2 \n"
3983 : "+r"(src_argb0), // %0
3984 "+r"(src_argb1), // %1
3985 "+r"(dst_argb), // %2
3992 #endif // HAS_ARGBADDROW_AVX2
3994 #ifdef HAS_ARGBSUBTRACTROW_SSE2
3995 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// dst = saturate(src0 - src1) per byte (psubusb clamps at 0).
3996 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3997 uint8* dst_argb, int width) {
4002 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4003 "lea " MEMLEA(0x10,0) ",%0 \n"
4004 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4005 "lea " MEMLEA(0x10,1) ",%1 \n"
4006 "psubusb %%xmm1,%%xmm0 \n"
4007 "movdqu %%xmm0," MEMACCESS(2) " \n"
4008 "lea " MEMLEA(0x10,2) ",%2 \n"
4011 : "+r"(src_argb0), // %0
4012 "+r"(src_argb1), // %1
4013 "+r"(dst_argb), // %2
4020 #endif // HAS_ARGBSUBTRACTROW_SSE2
4022 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4023 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// AVX2 mirror of ARGBSubtractRow_SSE2; vpsubusb takes src1 from memory.
4024 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4025 uint8* dst_argb, int width) {
4030 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4031 "lea " MEMLEA(0x20,0) ",%0 \n"
4032 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4033 "lea " MEMLEA(0x20,1) ",%1 \n"
4034 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
4035 "lea " MEMLEA(0x20,2) ",%2 \n"
4039 : "+r"(src_argb0), // %0
4040 "+r"(src_argb1), // %1
4041 "+r"(dst_argb), // %2
4048 #endif // HAS_ARGBSUBTRACTROW_AVX2
4050 #ifdef HAS_SOBELXROW_SSE2
4051 // SobelX as a matrix is
// (kernel rows of the comment lost to extraction; the code computes, per
// pixel: |(y0[0]-y0[2]) + 2*(y1[0]-y1[2]) + (y2[0]-y2[2])| for 8 pixels.)
// NOTE(review): src_y1/src_y2 are passed as byte OFFSETS from src_y0 via the
// (%0,%N,1) addressing — confirm against callers.
4055 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4056 const uint8* src_y2, uint8* dst_sobelx, int width) {
4061 "pxor %%xmm5,%%xmm5 \n"
// Row 0 horizontal difference.
4066 "movq " MEMACCESS(0) ",%%xmm0 \n"
4067 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
4068 "punpcklbw %%xmm5,%%xmm0 \n"
4069 "punpcklbw %%xmm5,%%xmm1 \n"
4070 "psubw %%xmm1,%%xmm0 \n"
// Row 1 difference (weighted x2 by the double paddw below).
4071 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4072 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
4073 "punpcklbw %%xmm5,%%xmm1 \n"
4074 "punpcklbw %%xmm5,%%xmm2 \n"
4075 "psubw %%xmm2,%%xmm1 \n"
// Row 2 difference.
4076 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
4077 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
4078 "punpcklbw %%xmm5,%%xmm2 \n"
4079 "punpcklbw %%xmm5,%%xmm3 \n"
4080 "psubw %%xmm3,%%xmm2 \n"
4081 "paddw %%xmm2,%%xmm0 \n"
4082 "paddw %%xmm1,%%xmm0 \n"
4083 "paddw %%xmm1,%%xmm0 \n"
// Absolute value: max(x, -x), then saturate to bytes.
4084 "pxor %%xmm1,%%xmm1 \n"
4085 "psubw %%xmm0,%%xmm1 \n"
4086 "pmaxsw %%xmm1,%%xmm0 \n"
4087 "packuswb %%xmm0,%%xmm0 \n"
4088 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
4089 "lea " MEMLEA(0x8,0) ",%0 \n"
4092 : "+r"(src_y0), // %0
4095 "+r"(dst_sobelx), // %3
4098 : "memory", "cc", NACL_R14
4099 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4102 #endif // HAS_SOBELXROW_SSE2
4104 #ifdef HAS_SOBELYROW_SSE2
4105 // SobelY as a matrix is
// (kernel rows lost to extraction; computes the vertical gradient:
// |(y0[c]-y1[c]) + 2*(y0[c+1]-y1[c+1]) + (y0[c+2]-y1[c+2])| for 8 pixels,
// so only two source rows are needed.)
4109 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4110 uint8* dst_sobely, int width) {
4114 "pxor %%xmm5,%%xmm5 \n"
// Column 0 vertical difference.
4119 "movq " MEMACCESS(0) ",%%xmm0 \n"
4120 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4121 "punpcklbw %%xmm5,%%xmm0 \n"
4122 "punpcklbw %%xmm5,%%xmm1 \n"
4123 "psubw %%xmm1,%%xmm0 \n"
// Column 1 difference (weighted x2 by the double paddw below).
4124 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
4125 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
4126 "punpcklbw %%xmm5,%%xmm1 \n"
4127 "punpcklbw %%xmm5,%%xmm2 \n"
4128 "psubw %%xmm2,%%xmm1 \n"
// Column 2 difference.
4129 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
4130 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
4131 "punpcklbw %%xmm5,%%xmm2 \n"
4132 "punpcklbw %%xmm5,%%xmm3 \n"
4133 "psubw %%xmm3,%%xmm2 \n"
4134 "paddw %%xmm2,%%xmm0 \n"
4135 "paddw %%xmm1,%%xmm0 \n"
4136 "paddw %%xmm1,%%xmm0 \n"
// Absolute value via max(x, -x); saturate and store 8 bytes.
4137 "pxor %%xmm1,%%xmm1 \n"
4138 "psubw %%xmm0,%%xmm1 \n"
4139 "pmaxsw %%xmm1,%%xmm0 \n"
4140 "packuswb %%xmm0,%%xmm0 \n"
4141 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
4142 "lea " MEMLEA(0x8,0) ",%0 \n"
4145 : "+r"(src_y0), // %0
4147 "+r"(dst_sobely), // %2
4150 : "memory", "cc", NACL_R14
4151 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4154 #endif // HAS_SOBELYROW_SSE2
4156 #ifdef HAS_SOBELROW_SSE2
4157 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// s = saturate(sobelx + sobely); output pixel is (s,s,s,0xFF): the gray value
// replicated via unpacks, alpha forced on with the 0xFF000000 mask in xmm5.
// src_sobely is a byte offset from src_sobelx ((%0,%1,1) addressing).
4162 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4163 uint8* dst_argb, int width) {
4166 "pcmpeqb %%xmm5,%%xmm5 \n"
4167 "pslld $0x18,%%xmm5 \n"
4172 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4173 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4174 "lea " MEMLEA(0x10,0) ",%0 \n"
4175 "paddusb %%xmm1,%%xmm0 \n"
// Fan 16 gray bytes out to 16 ARGB pixels (64 bytes).
4176 "movdqa %%xmm0,%%xmm2 \n"
4177 "punpcklbw %%xmm0,%%xmm2 \n"
4178 "punpckhbw %%xmm0,%%xmm0 \n"
4179 "movdqa %%xmm2,%%xmm1 \n"
4180 "punpcklwd %%xmm2,%%xmm1 \n"
4181 "punpckhwd %%xmm2,%%xmm2 \n"
4182 "por %%xmm5,%%xmm1 \n"
4183 "por %%xmm5,%%xmm2 \n"
4184 "movdqa %%xmm0,%%xmm3 \n"
4185 "punpcklwd %%xmm0,%%xmm3 \n"
4186 "punpckhwd %%xmm0,%%xmm0 \n"
4187 "por %%xmm5,%%xmm3 \n"
4188 "por %%xmm5,%%xmm0 \n"
4189 "movdqu %%xmm1," MEMACCESS(2) " \n"
4190 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
4191 "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
4192 "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
4193 "lea " MEMLEA(0x40,2) ",%2 \n"
4196 : "+r"(src_sobelx), // %0
4197 "+r"(src_sobely), // %1
4198 "+r"(dst_argb), // %2
4201 : "memory", "cc", NACL_R14
4202 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4205 #endif // HAS_SOBELROW_SSE2
4207 #ifdef HAS_SOBELTOPLANEROW_SSE2
4208 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
// Plain saturating add of the two gradient planes, 16 bytes per iteration.
// NOTE(review): the xmm5 alpha-mask setup below appears unused in this
// function (no por against xmm5 here) — looks copied from SobelRow_SSE2.
4209 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4210 uint8* dst_y, int width) {
4213 "pcmpeqb %%xmm5,%%xmm5 \n"
4214 "pslld $0x18,%%xmm5 \n"
4219 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4220 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4221 "lea " MEMLEA(0x10,0) ",%0 \n"
4222 "paddusb %%xmm1,%%xmm0 \n"
4223 "movdqu %%xmm0," MEMACCESS(2) " \n"
4224 "lea " MEMLEA(0x10,2) ",%2 \n"
4227 : "+r"(src_sobelx), // %0
4228 "+r"(src_sobely), // %1
4232 : "memory", "cc", NACL_R14
4236 #endif // HAS_SOBELTOPLANEROW_SSE2
4238 #ifdef HAS_SOBELXYROW_SSE2
4239 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
// (channel-assignment comment lines lost to extraction; from the unpack
// order: B = sobelx+sobely, G = sobelx, R = sobely, A = 0xFF via xmm5.)
4244 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4245 uint8* dst_argb, int width) {
4248 "pcmpeqb %%xmm5,%%xmm5 \n"
4253 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4254 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4255 "lea " MEMLEA(0x10,0) ",%0 \n"
4256 "movdqa %%xmm0,%%xmm2 \n"
4257 "paddusb %%xmm1,%%xmm2 \n"
// Interleave the four per-channel byte planes into 16 ARGB pixels.
4258 "movdqa %%xmm0,%%xmm3 \n"
4259 "punpcklbw %%xmm5,%%xmm3 \n"
4260 "punpckhbw %%xmm5,%%xmm0 \n"
4261 "movdqa %%xmm1,%%xmm4 \n"
4262 "punpcklbw %%xmm2,%%xmm4 \n"
4263 "punpckhbw %%xmm2,%%xmm1 \n"
4264 "movdqa %%xmm4,%%xmm6 \n"
4265 "punpcklwd %%xmm3,%%xmm6 \n"
4266 "punpckhwd %%xmm3,%%xmm4 \n"
4267 "movdqa %%xmm1,%%xmm7 \n"
4268 "punpcklwd %%xmm0,%%xmm7 \n"
4269 "punpckhwd %%xmm0,%%xmm1 \n"
4270 "movdqu %%xmm6," MEMACCESS(2) " \n"
4271 "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
4272 "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
4273 "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
4274 "lea " MEMLEA(0x40,2) ",%2 \n"
4277 : "+r"(src_sobelx), // %0
4278 "+r"(src_sobely), // %1
4279 "+r"(dst_argb), // %2
4282 : "memory", "cc", NACL_R14
4283 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4286 #endif // HAS_SOBELXYROW_SSE2
4288 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4289 // Creates a table of cumulative sums where each value is a sum of all values
4290 // above and to the left of the value, inclusive of the value.
// cumsum[x] = previous_cumsum[x] + running row sum; xmm0 carries the running
// per-channel (4 x int32) row accumulator across iterations.  Main loop does
// 4 pixels (16 source bytes / 64 dest bytes); the trailer after the gap does
// 1 pixel at a time.
4291 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
4292 const int32* previous_cumsum, int width) {
4294 "pxor %%xmm0,%%xmm0 \n"
4295 "pxor %%xmm1,%%xmm1 \n"
// 4-pixel loop: widen 16 bytes to 4x4 int32 lanes (xmm2..xmm5).
4304 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
4305 "lea " MEMLEA(0x10,0) ",%0 \n"
4306 "movdqa %%xmm2,%%xmm4 \n"
4307 "punpcklbw %%xmm1,%%xmm2 \n"
4308 "movdqa %%xmm2,%%xmm3 \n"
4309 "punpcklwd %%xmm1,%%xmm2 \n"
4310 "punpckhwd %%xmm1,%%xmm3 \n"
4311 "punpckhbw %%xmm1,%%xmm4 \n"
4312 "movdqa %%xmm4,%%xmm5 \n"
4313 "punpcklwd %%xmm1,%%xmm4 \n"
4314 "punpckhwd %%xmm1,%%xmm5 \n"
// Prefix-sum the 4 pixels into xmm0 while adding previous_cumsum.
4315 "paddd %%xmm2,%%xmm0 \n"
4316 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
4317 "paddd %%xmm0,%%xmm2 \n"
4318 "paddd %%xmm3,%%xmm0 \n"
4319 "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
4320 "paddd %%xmm0,%%xmm3 \n"
4321 "paddd %%xmm4,%%xmm0 \n"
4322 "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
4323 "paddd %%xmm0,%%xmm4 \n"
4324 "paddd %%xmm5,%%xmm0 \n"
4325 "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
4326 "lea " MEMLEA(0x40,2) ",%2 \n"
4327 "paddd %%xmm0,%%xmm5 \n"
4328 "movdqu %%xmm2," MEMACCESS(1) " \n"
4329 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
4330 "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
4331 "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
4332 "lea " MEMLEA(0x40,1) ",%1 \n"
// 1-pixel trailer for widths not a multiple of 4.
4343 "movd " MEMACCESS(0) ",%%xmm2 \n"
4344 "lea " MEMLEA(0x4,0) ",%0 \n"
4345 "punpcklbw %%xmm1,%%xmm2 \n"
4346 "punpcklwd %%xmm1,%%xmm2 \n"
4347 "paddd %%xmm2,%%xmm0 \n"
4348 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
4349 "lea " MEMLEA(0x10,2) ",%2 \n"
4350 "paddd %%xmm0,%%xmm2 \n"
4351 "movdqu %%xmm2," MEMACCESS(1) " \n"
4352 "lea " MEMLEA(0x10,1) ",%1 \n"
4359 "+r"(previous_cumsum), // %2
4363 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4366 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
4368 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Box-filter average from an integral image: per pixel,
// sum = topleft - topright - botleft + botright (the psubd/paddd pairs),
// then divide by area.  Three code paths survive the extraction gaps:
// a 4-pixel fixed-point path (pmulhuw by a precomputed 16.16 reciprocal in
// xmm5), a 4-pixel float path (mulps by 1/area in xmm4), and a 1-pixel
// trailer.  %4 is the column offset in int32 units.
4369 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4370 int width, int area, uint8* dst,
// Build xmm4 = broadcast rcp(area); xmm5 = word-packed (65536+area)/area
// rounding reciprocal for the pmulhuw path.
4374 "cvtdq2ps %%xmm5,%%xmm5 \n"
4375 "rcpss %%xmm5,%%xmm4 \n"
4376 "pshufd $0x0,%%xmm4,%%xmm4 \n"
4382 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4383 "pcmpeqb %%xmm6,%%xmm6 \n"
4384 "psrld $0x10,%%xmm6 \n"
4385 "cvtdq2ps %%xmm6,%%xmm6 \n"
4386 "addps %%xmm6,%%xmm5 \n"
4387 "mulps %%xmm4,%%xmm5 \n"
4388 "cvtps2dq %%xmm5,%%xmm5 \n"
4389 "packssdw %%xmm5,%%xmm5 \n"
4391 // 4 pixel small loop \n"
4394 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4395 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4396 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
4397 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
4398 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4399 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
4400 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
4401 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
4402 "lea " MEMLEA(0x40,0) ",%0 \n"
4403 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4404 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
4405 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
4406 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
4407 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4408 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
4409 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
4410 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
4411 "lea " MEMLEA(0x40,1) ",%1 \n"
// Fixed-point divide by area: pack to words, pmulhuw by reciprocal.
4412 "packssdw %%xmm1,%%xmm0 \n"
4413 "packssdw %%xmm3,%%xmm2 \n"
4414 "pmulhuw %%xmm5,%%xmm0 \n"
4415 "pmulhuw %%xmm5,%%xmm2 \n"
4416 "packuswb %%xmm2,%%xmm0 \n"
4417 "movdqu %%xmm0," MEMACCESS(2) " \n"
4418 "lea " MEMLEA(0x10,2) ",%2 \n"
// Float path: same box sums, divide by area via mulps with rcp(area).
4426 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4427 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4428 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
4429 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
4430 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4431 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
4432 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
4433 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
4434 "lea " MEMLEA(0x40,0) ",%0 \n"
4435 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4436 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
4437 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
4438 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
4439 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4440 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
4441 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
4442 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
4443 "lea " MEMLEA(0x40,1) ",%1 \n"
4444 "cvtdq2ps %%xmm0,%%xmm0 \n"
4445 "cvtdq2ps %%xmm1,%%xmm1 \n"
4446 "mulps %%xmm4,%%xmm0 \n"
4447 "mulps %%xmm4,%%xmm1 \n"
4448 "cvtdq2ps %%xmm2,%%xmm2 \n"
4449 "cvtdq2ps %%xmm3,%%xmm3 \n"
4450 "mulps %%xmm4,%%xmm2 \n"
4451 "mulps %%xmm4,%%xmm3 \n"
4452 "cvtps2dq %%xmm0,%%xmm0 \n"
4453 "cvtps2dq %%xmm1,%%xmm1 \n"
4454 "cvtps2dq %%xmm2,%%xmm2 \n"
4455 "cvtps2dq %%xmm3,%%xmm3 \n"
4456 "packssdw %%xmm1,%%xmm0 \n"
4457 "packssdw %%xmm3,%%xmm2 \n"
4458 "packuswb %%xmm2,%%xmm0 \n"
4459 "movdqu %%xmm0," MEMACCESS(2) " \n"
4460 "lea " MEMLEA(0x10,2) ",%2 \n"
// 1-pixel trailer (float path).
4471 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4472 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4473 "lea " MEMLEA(0x10,0) ",%0 \n"
4474 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4475 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4476 "lea " MEMLEA(0x10,1) ",%1 \n"
4477 "cvtdq2ps %%xmm0,%%xmm0 \n"
4478 "mulps %%xmm4,%%xmm0 \n"
4479 "cvtps2dq %%xmm0,%%xmm0 \n"
4480 "packssdw %%xmm0,%%xmm0 \n"
4481 "packuswb %%xmm0,%%xmm0 \n"
4482 "movd %%xmm0," MEMACCESS(2) " \n"
4483 "lea " MEMLEA(0x4,2) ",%2 \n"
4487 : "+r"(topleft), // %0
4488 "+r"(botleft), // %1
4491 : "r"((intptr_t)(width)), // %4
4493 : "memory", "cc", NACL_R14
4494 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4497 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4499 #ifdef HAS_ARGBAFFINEROW_SSE2
4500 // Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv: {u, v, du, dv} floats.  (u,v) stepped per pixel; each pixel's
// byte offset is computed as x*4 + y*stride via pmaddwd with xmm5, then the
// pixel is gathered with indexed movd loads.  Main loop handles 4 pixels,
// trailer (after the gap) 1 pixel.
4502 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4503 uint8* dst_argb, const float* src_dudv, int width) {
4504 intptr_t src_argb_stride_temp = src_argb_stride;
// xmm2 = (u,v), xmm7 = (du,dv); setup lines loading xmm5 (the 4,stride
// multiplier pair) are lost to extraction.
4507 "movq " MEMACCESS(3) ",%%xmm2 \n"
4508 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
4515 "pshufd $0x44,%%xmm7,%%xmm7 \n"
4516 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// Expand to 4 starting coords: xmm2 = {uv, uv+dudv}, xmm3 = xmm2 + 2*dudv;
// xmm4 = 4*dudv step per loop iteration.
4517 "movdqa %%xmm2,%%xmm0 \n"
4518 "addps %%xmm7,%%xmm0 \n"
4519 "movlhps %%xmm0,%%xmm2 \n"
4520 "movdqa %%xmm7,%%xmm4 \n"
4521 "addps %%xmm4,%%xmm4 \n"
4522 "movdqa %%xmm2,%%xmm3 \n"
4523 "addps %%xmm4,%%xmm3 \n"
4524 "addps %%xmm4,%%xmm4 \n"
4529 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
4530 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
4531 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
4532 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
// Gather 4 source pixels two at a time through %1/%5 scratch registers.
4533 "movd %%xmm0,%k1 \n"
4534 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4535 "movd %%xmm0,%k5 \n"
4536 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4537 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
4538 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
4539 "punpckldq %%xmm6,%%xmm1 \n"
4540 "addps %%xmm4,%%xmm2 \n"
4541 "movq %%xmm1," MEMACCESS(2) " \n"
4542 "movd %%xmm0,%k1 \n"
4543 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4544 "movd %%xmm0,%k5 \n"
4545 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
4546 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
4547 "punpckldq %%xmm6,%%xmm0 \n"
4548 "addps %%xmm4,%%xmm3 \n"
4549 "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
4550 "lea " MEMLEA(0x10,2) ",%2 \n"
// 1-pixel trailer.
4561 "cvttps2dq %%xmm2,%%xmm0 \n"
4562 "packssdw %%xmm0,%%xmm0 \n"
4563 "pmaddwd %%xmm5,%%xmm0 \n"
4564 "addps %%xmm7,%%xmm2 \n"
4565 "movd %%xmm0,%k1 \n"
4566 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
4567 "movd %%xmm0," MEMACCESS(2) " \n"
4568 "lea " MEMLEA(0x04,2) ",%2 \n"
4572 : "+r"(src_argb), // %0
4573 "+r"(src_argb_stride_temp), // %1
4574 "+r"(dst_argb), // %2
4575 "+r"(src_dudv), // %3
4579 : "memory", "cc", NACL_R14
4580 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4583 #endif // HAS_ARGBAFFINEROW_SSE2
4585 #ifdef HAS_INTERPOLATEROW_SSSE3
4586 // Bilinear filter 16x2 -> 16x1
// Blends src_ptr with src_ptr+src_stride by source_y_fraction.  The dispatch
// compares/branches selecting among these paths are lost to extraction; the
// surviving bodies are: general pmaddubsw blend, 75/25, 50/50, 25/75 pavgb
// specializations, and a straight copy for fraction 0.
4587 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4588 ptrdiff_t src_stride, int dst_width,
4589 int source_y_fraction) {
// Broadcast (256-f, f) byte pair to all lanes of xmm5 for pmaddubsw.
4606 "punpcklbw %%xmm0,%%xmm5 \n"
4607 "punpcklwd %%xmm5,%%xmm5 \n"
4608 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4610 // General purpose row blend.
4613 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4614 MEMOPREG(movdqu,0x00,1,4,1,xmm2)
4615 "movdqa %%xmm0,%%xmm1 \n"
4616 "punpcklbw %%xmm2,%%xmm0 \n"
4617 "punpckhbw %%xmm2,%%xmm1 \n"
4618 "pmaddubsw %%xmm5,%%xmm0 \n"
4619 "pmaddubsw %%xmm5,%%xmm1 \n"
4620 "psrlw $0x7,%%xmm0 \n"
4621 "psrlw $0x7,%%xmm1 \n"
4622 "packuswb %%xmm1,%%xmm0 \n"
4623 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4624 "lea " MEMLEA(0x10,1) ",%1 \n"
// Blend 75 / 25: double pavgb weights toward row 0.
4632 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4633 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4634 "pavgb %%xmm1,%%xmm0 \n"
4635 "pavgb %%xmm1,%%xmm0 \n"
4636 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4637 "lea " MEMLEA(0x10,1) ",%1 \n"
// Blend 50 / 50: single pavgb.
4645 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4646 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4647 "pavgb %%xmm1,%%xmm0 \n"
4648 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4649 "lea " MEMLEA(0x10,1) ",%1 \n"
// Blend 25 / 75: operand order swapped so row 1 dominates.
4657 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4658 MEMOPREG(movdqu,0x00,1,4,1,xmm0)
4659 "pavgb %%xmm1,%%xmm0 \n"
4660 "pavgb %%xmm1,%%xmm0 \n"
4661 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4662 "lea " MEMLEA(0x10,1) ",%1 \n"
4667 // Blend 100 / 0 - Copy row unchanged.
4670 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4671 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4672 "lea " MEMLEA(0x10,1) ",%1 \n"
4677 : "+r"(dst_ptr), // %0
4678 "+r"(src_ptr), // %1
4679 "+r"(dst_width), // %2
4680 "+r"(source_y_fraction) // %3
4681 : "r"((intptr_t)(src_stride)) // %4
4682 : "memory", "cc", NACL_R14
4683 "xmm0", "xmm1", "xmm2", "xmm5"
4686 #endif // HAS_INTERPOLATEROW_SSSE3
4688 #ifdef HAS_INTERPOLATEROW_AVX2
4689 // Bilinear filter 32x2 -> 32x1
// AVX2 version of InterpolateRow_SSSE3: same fraction-specialized paths,
// 32 bytes per iteration.  The copy path uses rep movsb, which is why the
// pointer/count operands are pinned to rdi/rsi/rcx ("+D"/"+S"/"+c") below.
4690 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
4691 ptrdiff_t src_stride, int dst_width,
4692 int source_y_fraction) {
// Broadcast the (256-f, f) byte pair across ymm5 via vpermd.
4705 "vmovd %3,%%xmm0 \n"
4708 "vmovd %3,%%xmm5 \n"
4709 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
4710 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
4711 "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
4712 "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
4714 // General purpose row blend.
4717 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4718 MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
4719 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
4720 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
4721 "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
4722 "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
4723 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
4724 "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
4725 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4726 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4727 "lea " MEMLEA(0x20,1) ",%1 \n"
// Blend 75 / 25.
4735 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4736 MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
4737 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4738 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4739 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4740 "lea " MEMLEA(0x20,1) ",%1 \n"
// Blend 50 / 50.
4748 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4749 VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
4750 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4751 "lea " MEMLEA(0x20,1) ",%1 \n"
// Blend 25 / 75.
4759 "vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
4760 MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
4761 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4762 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4763 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4764 "lea " MEMLEA(0x20,1) ",%1 \n"
4769 // Blend 100 / 0 - Copy row unchanged.
4772 "rep movsb " MEMMOVESTRING(1,0) " \n"
4778 : "+D"(dst_ptr), // %0
4779 "+S"(src_ptr), // %1
4780 "+c"(dst_width), // %2
4781 "+r"(source_y_fraction) // %3
4782 : "r"((intptr_t)(src_stride)) // %4
4783 : "memory", "cc", NACL_R14
4784 "xmm0", "xmm1", "xmm2", "xmm5"
4787 #endif // HAS_INTERPOLATEROW_AVX2
4789 #ifdef HAS_INTERPOLATEROW_SSE2
4790 // Bilinear filter 16x2 -> 16x1
// SSE2-only fallback for InterpolateRow_SSSE3: the general blend path cannot
// use pmaddubsw, so it widens to words and computes r0 + ((r1-r0)*2f >> 16)
// via pmulhw.  Same fraction-specialized pavgb paths and copy path.
4791 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
4792 ptrdiff_t src_stride, int dst_width,
4793 int source_y_fraction) {
// Broadcast the fraction to word lanes of xmm5; xmm4 = zero for unpacking.
4810 "punpcklbw %%xmm0,%%xmm5 \n"
4811 "punpcklwd %%xmm5,%%xmm5 \n"
4812 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4813 "pxor %%xmm4,%%xmm4 \n"
4815 // General purpose row blend.
4818 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4819 MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
4820 "movdqa %%xmm0,%%xmm1 \n"
4821 "movdqa %%xmm2,%%xmm3 \n"
4822 "punpcklbw %%xmm4,%%xmm2 \n"
4823 "punpckhbw %%xmm4,%%xmm3 \n"
4824 "punpcklbw %%xmm4,%%xmm0 \n"
4825 "punpckhbw %%xmm4,%%xmm1 \n"
4826 "psubw %%xmm0,%%xmm2 \n"
4827 "psubw %%xmm1,%%xmm3 \n"
4828 "paddw %%xmm2,%%xmm2 \n"
4829 "paddw %%xmm3,%%xmm3 \n"
4830 "pmulhw %%xmm5,%%xmm2 \n"
4831 "pmulhw %%xmm5,%%xmm3 \n"
4832 "paddw %%xmm2,%%xmm0 \n"
4833 "paddw %%xmm3,%%xmm1 \n"
4834 "packuswb %%xmm1,%%xmm0 \n"
4835 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4836 "lea " MEMLEA(0x10,1) ",%1 \n"
// Blend 75 / 25.
4844 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4845 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
4846 "pavgb %%xmm1,%%xmm0 \n"
4847 "pavgb %%xmm1,%%xmm0 \n"
4848 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4849 "lea " MEMLEA(0x10,1) ",%1 \n"
// Blend 50 / 50.
4857 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4858 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
4859 "pavgb %%xmm1,%%xmm0 \n"
4860 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4861 "lea " MEMLEA(0x10,1) ",%1 \n"
// Blend 25 / 75.
4869 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4870 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
4871 "pavgb %%xmm1,%%xmm0 \n"
4872 "pavgb %%xmm1,%%xmm0 \n"
4873 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4874 "lea " MEMLEA(0x10,1) ",%1 \n"
4879 // Blend 100 / 0 - Copy row unchanged.
4882 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4883 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4884 "lea " MEMLEA(0x10,1) ",%1 \n"
4889 : "+r"(dst_ptr), // %0
4890 "+r"(src_ptr), // %1
4891 "+r"(dst_width), // %2
4892 "+r"(source_y_fraction) // %3
4893 : "r"((intptr_t)(src_stride)) // %4
4894 : "memory", "cc", NACL_R14
4895 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4898 #endif // HAS_INTERPOLATEROW_SSE2
4900 #ifdef HAS_ARGBTOBAYERGGROW_SSE2
// Extracts the G channel of 8 ARGB pixels (psrld 8 + byte-0 mask) into 8
// bytes of Bayer output.  'selector' is accepted for interface parity but
// not referenced in the visible asm — presumably the GG layout is fixed.
4901 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
4902 uint32 selector, int pix) {
// xmm5 = 0x000000FF mask per dword.
4904 "pcmpeqb %%xmm5,%%xmm5 \n"
4905 "psrld $0x18,%%xmm5 \n"
4908 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4909 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4910 "lea " MEMLEA(0x20,0) ",%0 \n"
4911 "psrld $0x8,%%xmm0 \n"
4912 "psrld $0x8,%%xmm1 \n"
4913 "pand %%xmm5,%%xmm0 \n"
4914 "pand %%xmm5,%%xmm1 \n"
4915 "packssdw %%xmm1,%%xmm0 \n"
4916 "packuswb %%xmm1,%%xmm0 \n"
4917 "movq %%xmm0," MEMACCESS(1) " \n"
4918 "lea " MEMLEA(0x8,1) ",%1 \n"
4921 : "+r"(src_argb), // %0
4922 "+r"(dst_bayer), // %1
4926 , "xmm0", "xmm1", "xmm5"
4929 #endif // HAS_ARGBTOBAYERGGROW_SSE2
4931 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4932 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Permutes the 4 bytes of each pixel per the 16-byte 'shuffler' control
// loaded into xmm5; 8 pixels (32 bytes) per iteration via pshufb.
4933 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4934 const uint8* shuffler, int pix) {
4936 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
4939 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4940 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4941 "lea " MEMLEA(0x20,0) ",%0 \n"
4942 "pshufb %%xmm5,%%xmm0 \n"
4943 "pshufb %%xmm5,%%xmm1 \n"
4944 "movdqu %%xmm0," MEMACCESS(1) " \n"
4945 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
4946 "lea " MEMLEA(0x20,1) ",%1 \n"
4949 : "+r"(src_argb), // %0
4950 "+r"(dst_argb), // %1
4952 : "r"(shuffler) // %3
4954 , "xmm0", "xmm1", "xmm5"
4957 #endif // HAS_ARGBSHUFFLEROW_SSSE3
4959 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4960 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version: vbroadcastf128 replicates the 16-byte shuffler into both
// ymm lanes (vpshufb shuffles within 128-bit lanes); 16 pixels per loop.
4961 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4962 const uint8* shuffler, int pix) {
4964 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
4967 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4968 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
4969 "lea " MEMLEA(0x40,0) ",%0 \n"
4970 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
4971 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
4972 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
4973 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
4974 "lea " MEMLEA(0x40,1) ",%1 \n"
4978 : "+r"(src_argb), // %0
4979 "+r"(dst_argb), // %1
4981 : "r"(shuffler) // %3
4983 , "xmm0", "xmm1", "xmm5"
4986 #endif // HAS_ARGBSHUFFLEROW_AVX2
4988 #ifdef HAS_ARGBSHUFFLEROW_SSE2
4989 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback for ARGBShuffleRow (no pshufb available).  It reads the
// first 4 bytes of 'shuffler' and compares them against four recognized
// channel orders; each match dispatches to a vector path that realizes
// that order via pshufhw/pshuflw on zero-extended words.  Any other mask
// falls through to a generic scalar per-byte table-lookup path.
4990 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4991 const uint8* shuffler, int pix) {
// Temp must live in a register with a byte-addressable low part (%b2);
// bound below with the "d" constraint.
4992 uintptr_t pixel_temp = 0u;
// xmm5 = zero, used to unpack bytes to words in the vector paths.
4994 "pxor %%xmm5,%%xmm5 \n"
// Load the first 4 mask bytes as a little-endian dword and dispatch.
4995 "mov " MEMACCESS(4) ",%k2 \n"
4996 "cmp $0x3000102,%k2 \n"
4998 "cmp $0x10203,%k2 \n"
5000 "cmp $0x30201,%k2 \n"
5002 "cmp $0x2010003,%k2 \n"
// Generic scalar path: for each output byte, read the mask byte (index),
// fetch src_argb[index], and store it; one 4-byte pixel per iteration.
5007 "movzb " MEMACCESS(4) ",%2 \n"
5008 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5009 "mov %b2," MEMACCESS(1) " \n"
5010 "movzb " MEMACCESS2(0x1,4) ",%2 \n"
5011 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5012 "mov %b2," MEMACCESS2(0x1,1) " \n"
5013 "movzb " MEMACCESS2(0x2,4) ",%2 \n"
5014 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5015 "mov %b2," MEMACCESS2(0x2,1) " \n"
5016 "movzb " MEMACCESS2(0x3,4) ",%2 \n"
5017 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5018 "mov %b2," MEMACCESS2(0x3,1) " \n"
5019 "lea " MEMLEA(0x4,0) ",%0 \n"
5020 "lea " MEMLEA(0x4,1) ",%1 \n"
// Vector path for mask dword 0x3000102: unpack bytes to words against
// zero, shuffle words within each half with immediate 0x1b, repack.
// Processes 16 bytes (4 pixels) per iteration.
5027 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5028 "lea " MEMLEA(0x10,0) ",%0 \n"
5029 "movdqa %%xmm0,%%xmm1 \n"
5030 "punpcklbw %%xmm5,%%xmm0 \n"
5031 "punpckhbw %%xmm5,%%xmm1 \n"
5032 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
5033 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
5034 "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
5035 "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
5036 "packuswb %%xmm1,%%xmm0 \n"
5037 "movdqu %%xmm0," MEMACCESS(1) " \n"
5038 "lea " MEMLEA(0x10,1) ",%1 \n"
// Vector path for mask dword 0x10203: same structure, word shuffle 0x39.
5045 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5046 "lea " MEMLEA(0x10,0) ",%0 \n"
5047 "movdqa %%xmm0,%%xmm1 \n"
5048 "punpcklbw %%xmm5,%%xmm0 \n"
5049 "punpckhbw %%xmm5,%%xmm1 \n"
5050 "pshufhw $0x39,%%xmm0,%%xmm0 \n"
5051 "pshuflw $0x39,%%xmm0,%%xmm0 \n"
5052 "pshufhw $0x39,%%xmm1,%%xmm1 \n"
5053 "pshuflw $0x39,%%xmm1,%%xmm1 \n"
5054 "packuswb %%xmm1,%%xmm0 \n"
5055 "movdqu %%xmm0," MEMACCESS(1) " \n"
5056 "lea " MEMLEA(0x10,1) ",%1 \n"
// Vector path for mask dword 0x30201: word shuffle immediate 0x93.
5063 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5064 "lea " MEMLEA(0x10,0) ",%0 \n"
5065 "movdqa %%xmm0,%%xmm1 \n"
5066 "punpcklbw %%xmm5,%%xmm0 \n"
5067 "punpckhbw %%xmm5,%%xmm1 \n"
5068 "pshufhw $0x93,%%xmm0,%%xmm0 \n"
5069 "pshuflw $0x93,%%xmm0,%%xmm0 \n"
5070 "pshufhw $0x93,%%xmm1,%%xmm1 \n"
5071 "pshuflw $0x93,%%xmm1,%%xmm1 \n"
5072 "packuswb %%xmm1,%%xmm0 \n"
5073 "movdqu %%xmm0," MEMACCESS(1) " \n"
5074 "lea " MEMLEA(0x10,1) ",%1 \n"
// Vector path for mask dword 0x2010003: word shuffle immediate 0xc6.
5081 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5082 "lea " MEMLEA(0x10,0) ",%0 \n"
5083 "movdqa %%xmm0,%%xmm1 \n"
5084 "punpcklbw %%xmm5,%%xmm0 \n"
5085 "punpckhbw %%xmm5,%%xmm1 \n"
5086 "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
5087 "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
5088 "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
5089 "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
5090 "packuswb %%xmm1,%%xmm0 \n"
5091 "movdqu %%xmm0," MEMACCESS(1) " \n"
5092 "lea " MEMLEA(0x10,1) ",%1 \n"
5097 : "+r"(src_argb), // %0
5098 "+r"(dst_argb), // %1
// "d" pins pixel_temp to edx/rdx so %b2 (dl) is valid in the scalar path.
5099 "+d"(pixel_temp), // %2
5101 : "r"(shuffler) // %4
5102 : "memory", "cc", NACL_R14
5103 "xmm0", "xmm1", "xmm5"
5106 #endif // HAS_ARGBSHUFFLEROW_SSE2
5108 #ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (separate Y, U, V rows) into interleaved YUY2
// (Y0 U0 Y1 V0 ...).  Each iteration consumes 16 Y plus 8 U and 8 V
// samples and emits 32 output bytes.
5109 void I422ToYUY2Row_SSE2(const uint8* src_y,
5112 uint8* dst_frame, int width) {
// xmm2 = 8 U bytes; xmm3 = 8 V bytes.  %2 is used as a displacement from
// %1 here — presumably src_v is rebased relative to src_u by a 'sub' in
// setup lines not visible in this excerpt; verify against upstream.
5117 "movq " MEMACCESS(1) ",%%xmm2 \n"
5118 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5119 "lea " MEMLEA(0x8,1) ",%1 \n"
// Interleave U and V: xmm2 = U0 V0 U1 V1 ...
5120 "punpcklbw %%xmm3,%%xmm2 \n"
// Load 16 Y bytes and advance the Y pointer.
5121 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5122 "lea " MEMLEA(0x10,0) ",%0 \n"
5123 "movdqa %%xmm0,%%xmm1 \n"
// Interleave Y with the UV pairs: low half -> Y0 U0 Y1 V0 ..., high half
// likewise -- i.e. YUY2 byte order, Y first.
5124 "punpcklbw %%xmm2,%%xmm0 \n"
5125 "punpckhbw %%xmm2,%%xmm1 \n"
// Store 32 bytes of packed output and advance dst.
5126 "movdqu %%xmm0," MEMACCESS(3) " \n"
5127 "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
5128 "lea " MEMLEA(0x20,3) ",%3 \n"
5131 : "+r"(src_y), // %0
5134 "+r"(dst_frame), // %3
5137 : "memory", "cc", NACL_R14
5138 "xmm0", "xmm1", "xmm2", "xmm3"
5141 #endif // HAS_I422TOYUY2ROW_SSE2
5143 #ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...).  Mirror of
// I422ToYUY2Row_SSE2 except chroma bytes come first in each output pair.
// Each iteration consumes 16 Y plus 8 U and 8 V and emits 32 bytes.
5144 void I422ToUYVYRow_SSE2(const uint8* src_y,
5147 uint8* dst_frame, int width) {
// xmm2 = 8 U bytes; xmm3 = 8 V bytes (V addressed relative to %1 via %2 —
// presumably rebased in setup lines not visible here; verify).
5152 "movq " MEMACCESS(1) ",%%xmm2 \n"
5153 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5154 "lea " MEMLEA(0x8,1) ",%1 \n"
// xmm2 = U0 V0 U1 V1 ... interleaved chroma.
5155 "punpcklbw %%xmm3,%%xmm2 \n"
5156 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
// Keep a copy of the UV pairs before the low half is consumed.
5157 "movdqa %%xmm2,%%xmm1 \n"
5158 "lea " MEMLEA(0x10,0) ",%0 \n"
// Note reversed unpack order vs YUY2: chroma is the destination, so the
// output byte order becomes U0 Y0 V0 Y1 ... (UYVY).
5159 "punpcklbw %%xmm0,%%xmm1 \n"
5160 "punpckhbw %%xmm0,%%xmm2 \n"
5161 "movdqu %%xmm1," MEMACCESS(3) " \n"
5162 "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
5163 "lea " MEMLEA(0x20,3) ",%3 \n"
5166 : "+r"(src_y), // %0
5169 "+r"(dst_frame), // %3
5172 : "memory", "cc", NACL_R14
5173 "xmm0", "xmm1", "xmm2", "xmm3"
5176 #endif // HAS_I422TOUYVYROW_SSE2
5178 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a per-channel cubic polynomial to ARGB pixels:
//   out = clamp(C0 + C1*x + C2*x^2 + C3*x^3)
// where 'poly' points to four 4-float coefficient vectors (C0 at +0x00,
// C1 at +0x10, C2 at +0x20, C3 at +0x30), one float per channel.
// Processes 2 pixels (8 channels) per iteration.
5179 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5180 uint8* dst_argb, const float* poly,
// xmm3 = zero, used for zero-extension of bytes/words.
5183 "pxor %%xmm3,%%xmm3 \n"
// Load 8 source bytes (2 ARGB pixels) and widen to 8 floats:
// bytes -> words (xmm0), words -> dwords (xmm0 low 4 / xmm4 high 4).
5188 "movq " MEMACCESS(0) ",%%xmm0 \n"
5189 "lea " MEMLEA(0x8,0) ",%0 \n"
5190 "punpcklbw %%xmm3,%%xmm0 \n"
5191 "movdqa %%xmm0,%%xmm4 \n"
5192 "punpcklwd %%xmm3,%%xmm0 \n"
5193 "punpckhwd %%xmm3,%%xmm4 \n"
5194 "cvtdq2ps %%xmm0,%%xmm0 \n"
5195 "cvtdq2ps %%xmm4,%%xmm4 \n"
// Keep x in xmm1/xmm5; accumulate the result in xmm0/xmm4.
5196 "movdqa %%xmm0,%%xmm1 \n"
5197 "movdqa %%xmm4,%%xmm5 \n"
// result = C0 + C1*x
5198 "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
5199 "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
5200 "addps " MEMACCESS(3) ",%%xmm0 \n"
5201 "addps " MEMACCESS(3) ",%%xmm4 \n"
// xmm2/xmm6 = x^2; xmm1/xmm5 = x^3.
5202 "movdqa %%xmm1,%%xmm2 \n"
5203 "movdqa %%xmm5,%%xmm6 \n"
5204 "mulps %%xmm1,%%xmm2 \n"
5205 "mulps %%xmm5,%%xmm6 \n"
5206 "mulps %%xmm2,%%xmm1 \n"
5207 "mulps %%xmm6,%%xmm5 \n"
// result += C2*x^2 + C3*x^3
5208 "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
5209 "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
5210 "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
5211 "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
5212 "addps %%xmm2,%%xmm0 \n"
5213 "addps %%xmm6,%%xmm4 \n"
5214 "addps %%xmm1,%%xmm0 \n"
5215 "addps %%xmm5,%%xmm4 \n"
// Truncate to ints and pack back down to 8 bytes with unsigned saturation.
5216 "cvttps2dq %%xmm0,%%xmm0 \n"
5217 "cvttps2dq %%xmm4,%%xmm4 \n"
5218 "packuswb %%xmm4,%%xmm0 \n"
5219 "packuswb %%xmm0,%%xmm0 \n"
5220 "movq %%xmm0," MEMACCESS(1) " \n"
5221 "lea " MEMLEA(0x8,1) ",%1 \n"
5224 : "+r"(src_argb), // %0
5225 "+r"(dst_argb), // %1
5229 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5232 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
5234 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2 + FMA3 version of ARGBPolynomialRow: same cubic-per-channel contract
// as the SSE2 variant, evaluated with fused multiply-adds.  Processes
// 2 pixels (8 channels) per iteration.
5235 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5236 uint8* dst_argb, const float* poly,
// Broadcast the four 4-float coefficient vectors (C0..C3) into both lanes.
5239 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
5240 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5241 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5242 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
// Zero-extend 8 source bytes directly to 8 dwords, then to floats.
5247 "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
5248 "lea " MEMLEA(0x8,0) ",%0 \n"
5249 "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
5250 "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
5251 "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
5252 "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
5253 "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
5254 "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
// Truncate and saturate back down to 8 bytes; vpermq fixes the lane
// interleave left by the in-lane vpackusdw.
5255 "vcvttps2dq %%ymm0,%%ymm0 \n"
5256 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
5257 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
5258 "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
5259 "vmovq %%xmm0," MEMACCESS(1) " \n"
5260 "lea " MEMLEA(0x8,1) ",%1 \n"
5264 : "+r"(src_argb), // %0
5265 "+r"(dst_argb), // %1
5269 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5272 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
5274 #ifdef HAS_ARGBCOLORTABLEROW_X86
5275 // Transform ARGB pixels with color table.
// Remaps each channel of each ARGB pixel IN PLACE through a lookup table:
// for channel c (0..3), dst[c] = table_argb[value*4 + c] — i.e. the table
// is interleaved with a stride of 4 bytes per entry, one column per channel.
// Scalar code; one pixel per iteration.
5276 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
// Temp must have a byte-addressable low part (%b1); bound with "d" below.
5278 uintptr_t pixel_temp = 0u;
// Advance the pixel pointer up front, then read/write at -0x4..-0x1 so a
// single register serves as both cursor and base.
5283 "movzb " MEMACCESS(0) ",%1 \n"
5284 "lea " MEMLEA(0x4,0) ",%0 \n"
5285 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
5286 "mov %b1," MEMACCESS2(-0x4,0) " \n"
5287 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
5288 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
5289 "mov %b1," MEMACCESS2(-0x3,0) " \n"
5290 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
5291 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
5292 "mov %b1," MEMACCESS2(-0x2,0) " \n"
5293 "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
5294 MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
5295 "mov %b1," MEMACCESS2(-0x1,0) " \n"
5298 : "+r"(dst_argb), // %0
5299 "+d"(pixel_temp), // %1
5301 : "r"(table_argb) // %3
5304 #endif // HAS_ARGBCOLORTABLEROW_X86
5306 #ifdef HAS_RGBCOLORTABLEROW_X86
5307 // Transform RGB pixels with color table.
// Same in-place table remap as ARGBColorTableRow_X86, but only the first
// three channels (offsets 0..2) are looked up; the fourth byte (alpha) of
// each pixel is left untouched.  Scalar code; one pixel per iteration.
5308 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
// Temp must have a byte-addressable low part (%b1); bound with "d" below.
5309 uintptr_t pixel_temp = 0u;
// Advance the pixel pointer up front, then read/write at -0x4..-0x2.
5314 "movzb " MEMACCESS(0) ",%1 \n"
5315 "lea " MEMLEA(0x4,0) ",%0 \n"
5316 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
5317 "mov %b1," MEMACCESS2(-0x4,0) " \n"
5318 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
5319 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
5320 "mov %b1," MEMACCESS2(-0x3,0) " \n"
5321 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
5322 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
5323 "mov %b1," MEMACCESS2(-0x2,0) " \n"
5326 : "+r"(dst_argb), // %0
5327 "+d"(pixel_temp), // %1
5329 : "r"(table_argb) // %3
5332 #endif // HAS_RGBCOLORTABLEROW_X86
5334 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5335 // Transform RGB pixels with luma table.
// Remaps the B/G/R channels of each ARGB pixel through a luma-dependent
// lookup table: a weighted luma is computed per pixel (weights from
// 'lumacoeff'), its high byte selects a row of the 'luma' table, and each
// color byte is looked up in that row.  Alpha bytes are copied through
// unmodified.  Processes 4 pixels per iteration.
5336 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5338 const uint8* luma, uint32 lumacoeff) {
5339 uintptr_t pixel_temp = 0u;
5340 uintptr_t table_temp = 0u;
// Broadcast lumacoeff into all 4 dwords of xmm3 (the initial movd of %6 is
// in a setup line not visible in this excerpt).
5343 "pshufd $0x0,%%xmm3,%%xmm3 \n"
// xmm4 = 0xFF00 in every word: keeps only the high byte of each luma.
5344 "pcmpeqb %%xmm4,%%xmm4 \n"
5345 "psllw $0x8,%%xmm4 \n"
5346 "pxor %%xmm5,%%xmm5 \n"
// Per iteration: load 4 ARGB pixels and compute 4 luma values.
// pmaddubsw forms weighted byte-pair sums; phaddw reduces to one word per
// pixel; pand keeps the high byte so offsets step in multiples of 0x100
// (presumably one 256-byte table row per luma level — verify).
5351 "movdqu " MEMACCESS(2) ",%%xmm0 \n"
5352 "pmaddubsw %%xmm3,%%xmm0 \n"
5353 "phaddw %%xmm0,%%xmm0 \n"
5354 "pand %%xmm4,%%xmm0 \n"
// Zero-extend the 4 luma offsets to dwords for extraction via movd.
5355 "punpcklwd %%xmm5,%%xmm0 \n"
// Pixel 0: extract its table-row offset into %1 (combined with the 'luma'
// base in lines not visible here), rotate xmm0 to expose the next offset.
5356 "movd %%xmm0,%k1 \n" // 32 bit offset
5358 "pshufd $0x39,%%xmm0,%%xmm0 \n"
// Look up B, G, R through the selected row; copy alpha (byte 3) directly.
5360 "movzb " MEMACCESS(2) ",%0 \n"
5361 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5362 "mov %b0," MEMACCESS(3) " \n"
5363 "movzb " MEMACCESS2(0x1,2) ",%0 \n"
5364 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5365 "mov %b0," MEMACCESS2(0x1,3) " \n"
5366 "movzb " MEMACCESS2(0x2,2) ",%0 \n"
5367 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5368 "mov %b0," MEMACCESS2(0x2,3) " \n"
5369 "movzb " MEMACCESS2(0x3,2) ",%0 \n"
5370 "mov %b0," MEMACCESS2(0x3,3) " \n"
// Pixel 1 (src/dst offsets 0x4..0x7): same lookup pattern.
5372 "movd %%xmm0,%k1 \n" // 32 bit offset
5374 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5376 "movzb " MEMACCESS2(0x4,2) ",%0 \n"
5377 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5378 "mov %b0," MEMACCESS2(0x4,3) " \n"
5379 "movzb " MEMACCESS2(0x5,2) ",%0 \n"
5380 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5381 "mov %b0," MEMACCESS2(0x5,3) " \n"
5382 "movzb " MEMACCESS2(0x6,2) ",%0 \n"
5383 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5384 "mov %b0," MEMACCESS2(0x6,3) " \n"
5385 "movzb " MEMACCESS2(0x7,2) ",%0 \n"
5386 "mov %b0," MEMACCESS2(0x7,3) " \n"
// Pixel 2 (offsets 0x8..0xb).
5388 "movd %%xmm0,%k1 \n" // 32 bit offset
5390 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5392 "movzb " MEMACCESS2(0x8,2) ",%0 \n"
5393 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5394 "mov %b0," MEMACCESS2(0x8,3) " \n"
5395 "movzb " MEMACCESS2(0x9,2) ",%0 \n"
5396 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5397 "mov %b0," MEMACCESS2(0x9,3) " \n"
5398 "movzb " MEMACCESS2(0xa,2) ",%0 \n"
5399 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5400 "mov %b0," MEMACCESS2(0xa,3) " \n"
5401 "movzb " MEMACCESS2(0xb,2) ",%0 \n"
5402 "mov %b0," MEMACCESS2(0xb,3) " \n"
// Pixel 3 (offsets 0xc..0xf), then advance src/dst by 16 bytes.
5404 "movd %%xmm0,%k1 \n" // 32 bit offset
5407 "movzb " MEMACCESS2(0xc,2) ",%0 \n"
5408 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5409 "mov %b0," MEMACCESS2(0xc,3) " \n"
5410 "movzb " MEMACCESS2(0xd,2) ",%0 \n"
5411 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5412 "mov %b0," MEMACCESS2(0xd,3) " \n"
5413 "movzb " MEMACCESS2(0xe,2) ",%0 \n"
5414 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5415 "mov %b0," MEMACCESS2(0xe,3) " \n"
5416 "movzb " MEMACCESS2(0xf,2) ",%0 \n"
5417 "mov %b0," MEMACCESS2(0xf,3) " \n"
5418 "lea " MEMLEA(0x10,2) ",%2 \n"
5419 "lea " MEMLEA(0x10,3) ",%3 \n"
// %0 ("d") needs a byte-addressable low part (%b0); %1 ("a") holds the
// current luma-row pointer.
5422 : "+d"(pixel_temp), // %0
5423 "+a"(table_temp), // %1
5424 "+r"(src_argb), // %2
5425 "+r"(dst_argb), // %3
5428 "rm"(lumacoeff) // %6
5429 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5432 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5434 #endif // defined(__x86_64__) || defined(__i386__)
5438 } // namespace libyuv