// Source: third_party/libyuv/source/row_gcc.cc (libyuv r1456, via libvpx mirror).
1 // VERSION 2
2 /*
3  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
4  *
5  *  Use of this source code is governed by a BSD-style license
6  *  that can be found in the LICENSE file in the root of the source
7  *  tree. An additional intellectual property rights grant can be found
8  *  in the file PATENTS. All contributing project authors may
9  *  be found in the AUTHORS file in the root of the source tree.
10  */
11
12 #include "libyuv/row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23
// Constants for ARGB.
// 7-bit fixed-point luma coefficients, laid out in ARGB memory byte order
// (B, G, R, A); the A slot is 0 so alpha never contributes.  Consumers
// multiply with pmaddubsw and shift right by 7 (see ARGBToYRow_SSSE3).
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
// Same layout as kARGBToY but full-range (JPEG) weights; paired with the
// +64 rounding bias kAddYJ64 instead of the +16 video-range offset kAddY16.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
33 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
34
35 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
36
// Chroma coefficients for ARGB input, byte order (B, G, R, A), A unused.
// Signed 8-bit fixed-point weights; the UV row functions that consume
// these are not visible in this chunk.
static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Full-range (JPEG) variant of kARGBToU.
static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Full-range (JPEG) variant of kARGBToV.
static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
52
// Constants for BGRA.
// Same weights as the ARGB tables, permuted for BGRA memory byte order
// (A, R, G, B); the A slot is 0.
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
65
// Constants for ABGR.
// Same weights permuted for ABGR memory byte order (R, G, B, A); A is 0.
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
78
// Constants for RGBA.
// Same weights permuted for RGBA memory byte order (A, B, G, R); A is 0.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};
91
// Video-range luma offset: +16 added bytewise after packing
// (see ARGBToYRow_SSSE3).
static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
// Rounding bias added to 16-bit intermediates before the >>7 shift
// (see ARGBToYJRow_SSSE3).
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// +128 bytewise bias that re-centers signed chroma into unsigned bytes;
// used by the UV row functions (not visible in this chunk).
static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Word-sized variant: 0x8080 per 16-bit lane biases both byte halves by 128.
static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
109 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
110
111 #ifdef HAS_RGB24TOARGBROW_SSSE3
112
// pshufb control tables.  Index values select source bytes; entries with
// the high bit set (128u) make pshufb write a zero byte into that lane.

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24 (drops every 4th byte;
// the last 4 output lanes are zeroed).
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW (drops alpha, swaps R and B).
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW.  Same split layout as above.
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
142 #endif  // HAS_RGB24TOARGBROW_SSSE3
143
144 #if defined(TESTING) && defined(__x86_64__)
// Test-only scaffold (compiled under TESTING on x86_64 only): emits aligned
// runs of mov/lea/add over every general-purpose register — presumably to
// probe instruction encodings/alignment (NOTE(review): exact intent not
// documented upstream; confirm before relying on it) — then runs a simple
// copy loop: 8 bytes read from src_y per iteration while dst_argb advances
// 0x20.  pix is decremented by 8 each pass; loop is do-while style (runs at
// least once).
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align  5                               \n"
    "mov       %%eax,%%eax                     \n"
    "mov       %%ebx,%%ebx                     \n"
    "mov       %%ecx,%%ecx                     \n"
    "mov       %%edx,%%edx                     \n"
    "mov       %%esi,%%esi                     \n"
    "mov       %%edi,%%edi                     \n"
    "mov       %%ebp,%%ebp                     \n"
    "mov       %%esp,%%esp                     \n"
    ".p2align  5                               \n"
    "mov       %%r8d,%%r8d                     \n"
    "mov       %%r9d,%%r9d                     \n"
    "mov       %%r10d,%%r10d                   \n"
    "mov       %%r11d,%%r11d                   \n"
    "mov       %%r12d,%%r12d                   \n"
    "mov       %%r13d,%%r13d                   \n"
    "mov       %%r14d,%%r14d                   \n"
    "mov       %%r15d,%%r15d                   \n"
    ".p2align  5                               \n"
    "lea       (%%rax),%%eax                   \n"
    "lea       (%%rbx),%%ebx                   \n"
    "lea       (%%rcx),%%ecx                   \n"
    "lea       (%%rdx),%%edx                   \n"
    "lea       (%%rsi),%%esi                   \n"
    "lea       (%%rdi),%%edi                   \n"
    "lea       (%%rbp),%%ebp                   \n"
    "lea       (%%rsp),%%esp                   \n"
    ".p2align  5                               \n"
    "lea       (%%r8),%%r8d                    \n"
    "lea       (%%r9),%%r9d                    \n"
    "lea       (%%r10),%%r10d                  \n"
    "lea       (%%r11),%%r11d                  \n"
    "lea       (%%r12),%%r12d                  \n"
    "lea       (%%r13),%%r13d                  \n"
    "lea       (%%r14),%%r14d                  \n"
    "lea       (%%r15),%%r15d                  \n"

    ".p2align  5                               \n"
    "lea       0x10(%%rax),%%eax               \n"
    "lea       0x10(%%rbx),%%ebx               \n"
    "lea       0x10(%%rcx),%%ecx               \n"
    "lea       0x10(%%rdx),%%edx               \n"
    "lea       0x10(%%rsi),%%esi               \n"
    "lea       0x10(%%rdi),%%edi               \n"
    "lea       0x10(%%rbp),%%ebp               \n"
    "lea       0x10(%%rsp),%%esp               \n"
    ".p2align  5                               \n"
    "lea       0x10(%%r8),%%r8d                \n"
    "lea       0x10(%%r9),%%r9d                \n"
    "lea       0x10(%%r10),%%r10d              \n"
    "lea       0x10(%%r11),%%r11d              \n"
    "lea       0x10(%%r12),%%r12d              \n"
    "lea       0x10(%%r13),%%r13d              \n"
    "lea       0x10(%%r14),%%r14d              \n"
    "lea       0x10(%%r15),%%r15d              \n"

    ".p2align  5                               \n"
    "add       0x10,%%eax                      \n"
    "add       0x10,%%ebx                      \n"
    "add       0x10,%%ecx                      \n"
    "add       0x10,%%edx                      \n"
    "add       0x10,%%esi                      \n"
    "add       0x10,%%edi                      \n"
    "add       0x10,%%ebp                      \n"
    "add       0x10,%%esp                      \n"
    ".p2align  5                               \n"
    "add       0x10,%%r8d                      \n"
    "add       0x10,%%r9d                      \n"
    "add       0x10,%%r10d                     \n"
    "add       0x10,%%r11d                     \n"
    "add       0x10,%%r12d                     \n"
    "add       0x10,%%r13d                     \n"
    "add       0x10,%%r14d                     \n"
    "add       0x10,%%r15d                     \n"

    ".p2align  2                               \n"
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 bytes of src
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 bytes to dst
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
237 #endif  // TESTING
238
239 #ifdef HAS_J400TOARGBROW_SSE2
// Expand one row of grey (J400) pixels to ARGB: each Y byte is replicated
// into B, G and R, and alpha is forced to 0xff.  Processes 8 pixels per
// iteration; loop is do-while style, so pix is assumed to be a positive
// multiple of 8 (standard row-function contract — TODO confirm at callers).
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // read 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // Y duplicated to words
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // first 4 pixels: YYYY
    "punpckhwd %%xmm1,%%xmm1                   \n"  // last 4 pixels: YYYY
    "por       %%xmm5,%%xmm0                   \n"  // set alpha bytes to 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
265 #endif  // HAS_J400TOARGBROW_SSE2
266
267 #ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert one row of RGB24 (3 bytes/pixel) to ARGB (4 bytes/pixel) with
// alpha forced to 0xff.  Processes 16 pixels per iteration: reads 48 bytes,
// realigns the three 16-byte loads into four pixel groups with palignr,
// permutes bytes via kShuffleMaskRGB24ToARGB, then ORs in the alpha mask.
// pix is assumed to be a positive multiple of 16 (do-while loop).
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // shuffle control
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"  // advance src by 48 bytes
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"  // advance dst by 64 bytes
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
305
// Convert one row of RAW (RGB byte order, 3 bytes/pixel) to ARGB with
// alpha = 0xff.  Identical instruction sequence to RGB24ToARGBRow_SSSE3,
// differing only in the shuffle table (kShuffleMaskRAWToARGB swaps R/B).
// Processes 16 pixels per iteration; pix assumed positive multiple of 16.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // shuffle control
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"  // advance src by 48 bytes
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"  // advance dst by 64 bytes
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
343
// Convert one row of RGB565 (16 bits/pixel) to ARGB.  8 pixels/iteration.
// dst is rebased to dst - 2*src up front so the (%1,%0,2) stores land at the
// right place while only src (%0) is advanced in the loop.
// pix assumed positive multiple of 8 (do-while loop).
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // multiplier to expand 5 bits to 8
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"  // multiplier to expand 6 bits to 8
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // mask for the red field (top 5 bits)
    "psllw     $0xb,%%xmm3                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // mask for the green field (middle 6 bits)
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // alpha mask 0xff00 per word
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so that
    "sub       %0,%1                           \n"  // (%1,%0,2) addresses dst
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 RGB565 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"  // isolate red
    "psllw     $0xb,%%xmm2                     \n"  // isolate blue at top
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // words: R<<8 | B
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "por       %%xmm7,%%xmm0                   \n"  // words: A(0xff)<<8 | G
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to B,G,R,A
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
391
// Convert one row of ARGB1555 (1-bit alpha, 5 bits per color) to ARGB.
// 8 pixels/iteration; dst is rebased to dst - 2*src like RGB565ToARGBRow so
// only src advances in the loop.  The 1-bit alpha is sign-extended (psraw)
// to 0x00 or 0xff.  pix assumed positive multiple of 8 (do-while loop).
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // multiplier to expand 5 bits to 8
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"  // multiplier for the green field
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // mask 0xf800 (top 5 bits)
    "psllw     $0xb,%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm4                   \n"  // mask 0x03e0 (green field)
    "psrlw     $0x6,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // alpha mask 0xff00 per word
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so that
    "sub       %0,%1                           \n"  // (%1,%0,2) addresses dst
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB1555 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"  // red field up to the top
    "psllw     $0xb,%%xmm2                     \n"  // blue field up to the top
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // words: R<<8 | B
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "psraw     $0x8,%%xmm2                     \n"  // sign-extend alpha bit
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "pand      %%xmm7,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // words: A<<8 | G
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to B,G,R,A
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
442
// Convert one row of ARGB4444 (4 bits per channel) to ARGB: each nibble is
// duplicated into both halves of its output byte (x -> x*17).
// 8 pixels/iteration; dst is rebased to dst - 2*src like the other 16-bpp
// converters.  pix assumed positive multiple of 8 (do-while loop).
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"  // low-nibble mask
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // high-nibble mask 0xf0f0f0f0
    "pslld     $0x4,%%xmm5                     \n"
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so that
    "sub       %0,%1                           \n"  // (%1,%0,2) addresses dst
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB4444 pixels
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // low nibbles
    "pand      %%xmm5,%%xmm2                   \n"  // high nibbles
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // low nibble replicated
    "por       %%xmm3,%%xmm2                   \n"  // high nibble replicated
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave to 4 bytes/pixel
    "punpckhbw %%xmm2,%%xmm1                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
480
// Convert one row of ARGB to RGB24: drops the alpha byte via pshufb
// (kShuffleMaskARGBToRGB24), then stitches the four 12-byte results into
// three contiguous 16-byte stores with shift/or.  Processes 16 pixels per
// iteration (reads 64 bytes, writes 48).  pix assumed positive multiple
// of 16 (do-while loop).
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // shuffle control
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 RGB bytes + 4 zeros
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"  // pack group 1 into group 0's tail
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"  // pack group 2 into group 1's tail
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"  // pack group 3 into group 2's tail
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"  // advance dst by 48 bytes
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
518
// Convert one row of ARGB to RAW (RGB byte order): identical instruction
// sequence to ARGBToRGB24Row_SSSE3, differing only in the shuffle table
// (kShuffleMaskARGBToRAW also swaps R and B).  16 pixels per iteration;
// pix assumed positive multiple of 16 (do-while loop).
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // shuffle control
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 RGB bytes + 4 zeros
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"  // pack group 1 into group 0's tail
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"  // pack group 2 into group 1's tail
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"  // pack group 3 into group 2's tail
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"  // advance dst by 48 bytes
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
556
// Convert one row of ARGB to RGB565 by truncation: B>>3, G>>2, R>>3 packed
// into 5/6/5 bit fields (no dithering/rounding).  Processes 4 pixels per
// iteration (reads 16 bytes, writes 8).  pix assumed positive multiple of 4
// (do-while loop).
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // blue mask 0x0000001f
    "psrld     $0x1b,%%xmm3                    \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // green mask 0x000007e0
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // red mask 0xfffff800
    "pslld     $0xb,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"
    "psrld     $0x3,%%xmm1                     \n"  // blue into position
    "psrld     $0x5,%%xmm2                     \n"  // green into position
    "psrad     $0x10,%%xmm0                    \n"  // red into position
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "por       %%xmm1,%%xmm0                   \n"  // dwords hold sign-extended 565
    "packssdw  %%xmm0,%%xmm0                   \n"  // pack to 4 16-bit pixels
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
592
// Convert one row of ARGB to ARGB1555: alpha reduced to its top bit,
// each color channel truncated to 5 bits.  Processes 4 pixels per
// iteration (reads 16 bytes, writes 8).  pix assumed positive multiple
// of 4 (do-while loop).
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // blue mask 0x0000001f
    "psrld     $0x1b,%%xmm4                    \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // green mask 0x000003e0
    "pslld     $0x5,%%xmm5                     \n"
    "movdqa    %%xmm4,%%xmm6                   \n"  // red mask 0x00007c00
    "pslld     $0xa,%%xmm6                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // alpha mask 0xffff8000
    "pslld     $0xf,%%xmm7                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "psrad     $0x10,%%xmm0                    \n"  // alpha top bit into position
    "psrld     $0x3,%%xmm1                     \n"  // blue into position
    "psrld     $0x6,%%xmm2                     \n"  // green into position
    "psrld     $0x9,%%xmm3                     \n"  // red into position
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // dwords hold sign-extended 1555
    "packssdw  %%xmm0,%%xmm0                   \n"  // pack to 4 16-bit pixels
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
633
// Convert one row of ARGB to ARGB4444: keeps the high nibble of every
// channel and packs pairs of bytes into one byte.  Processes 4 pixels per
// iteration (reads 16 bytes, writes 8).  pix assumed positive multiple
// of 4 (do-while loop).
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // mask 0xf000 per word
    "psllw     $0xc,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm3                   \n"  // mask 0x00f0 per word
    "psrlw     $0x8,%%xmm3                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // even channels' high nibbles
    "pand      %%xmm4,%%xmm1                   \n"  // odd channels' high nibbles
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // words hold packed 4444
    "packuswb  %%xmm0,%%xmm0                   \n"  // pack to 4 16-bit pixels
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
661 #endif  // HAS_RGB24TOARGBROW_SSSE3
662
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Uses pmaddubsw with the 7-bit fixed-point kARGBToY weights, horizontal
// add, >>7, then adds the video-range +16 offset (kAddY16).  Truncating
// shift — no rounding bias (contrast with ARGBToYJRow_SSSE3).
// pix assumed positive multiple of 16 (do-while loop).
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // kARGBToY coefficients
    "movdqa    %4,%%xmm5                       \n"  // kAddY16 offset
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted B+G and R+A pairs
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish the per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // 7-bit fixed point -> int
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // + 16 video-range offset
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3
699
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding:
// the +64 bias (kAddYJ64, i.e. 0.5 in 7-bit fixed point) is added to the
// 16-bit sums before the >>7 shift.  pix assumed positive multiple of 16.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // kARGBToYJ coefficients
    "movdqa    %4,%%xmm5                       \n"  // kAddYJ64 rounding bias
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted B+G and R+A pairs
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish the per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before shifting
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes, full range
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3
738
739 #ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Dword permutation that restores linear pixel order after the in-lane
// element reordering ("mutation") done by vphaddw and vpackuswb on AVX2.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};
744
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow: same kARGBToY coefficients and kAddY16
// offset, processing twice as many pixels per loop iteration.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToY in both lanes
    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddY16 in both lanes
    "vmovdqu    %5,%%ymm6                      \n"  // kPermdARGBToY_AVX
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // read 32 ARGB pixels
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // apply Y coefficients
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"  // advance src by 128 bytes
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // scale back to 8 bits
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y values
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"  // 32 pixels per iteration
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
783 #endif  // HAS_ARGBTOYROW_AVX2
784
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// JPeg (full range) variant: kARGBToYJ coefficients, rounds by adding
// kAddYJ64 before the shift, and does not add the 16 offset.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToYJ in both lanes
    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddYJ64 in both lanes
    "vmovdqu    %5,%%ymm6                      \n"  // kPermdARGBToY_AVX
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // read 32 ARGB pixels
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // apply YJ coefficients
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"  // advance src by 128 bytes
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // scale back to 8 bits
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y values
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"  // 32 pixels per iteration
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYJROW_AVX2
825 #endif  // HAS_ARGBTOYJROW_AVX2
826
827 #ifdef HAS_ARGBTOUVROW_SSSE3
828 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
829                        uint8* dst_u, uint8* dst_v, int width) {
830   asm volatile (
831     "movdqa    %5,%%xmm3                       \n"
832     "movdqa    %6,%%xmm4                       \n"
833     "movdqa    %7,%%xmm5                       \n"
834     "sub       %1,%2                           \n"
835     LABELALIGN
836   "1:                                          \n"
837     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
838     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
839     "pavgb     %%xmm7,%%xmm0                   \n"
840     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
841     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
842     "pavgb     %%xmm7,%%xmm1                   \n"
843     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
844     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
845     "pavgb     %%xmm7,%%xmm2                   \n"
846     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
847     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
848     "pavgb     %%xmm7,%%xmm6                   \n"
849
850     "lea       " MEMLEA(0x40,0) ",%0           \n"
851     "movdqa    %%xmm0,%%xmm7                   \n"
852     "shufps    $0x88,%%xmm1,%%xmm0             \n"
853     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
854     "pavgb     %%xmm7,%%xmm0                   \n"
855     "movdqa    %%xmm2,%%xmm7                   \n"
856     "shufps    $0x88,%%xmm6,%%xmm2             \n"
857     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
858     "pavgb     %%xmm7,%%xmm2                   \n"
859     "movdqa    %%xmm0,%%xmm1                   \n"
860     "movdqa    %%xmm2,%%xmm6                   \n"
861     "pmaddubsw %%xmm4,%%xmm0                   \n"
862     "pmaddubsw %%xmm4,%%xmm2                   \n"
863     "pmaddubsw %%xmm3,%%xmm1                   \n"
864     "pmaddubsw %%xmm3,%%xmm6                   \n"
865     "phaddw    %%xmm2,%%xmm0                   \n"
866     "phaddw    %%xmm6,%%xmm1                   \n"
867     "psraw     $0x8,%%xmm0                     \n"
868     "psraw     $0x8,%%xmm1                     \n"
869     "packsswb  %%xmm1,%%xmm0                   \n"
870     "paddb     %%xmm5,%%xmm0                   \n"
871     "movlps    %%xmm0," MEMACCESS(1) "         \n"
872     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
873     "lea       " MEMLEA(0x8,1) ",%1            \n"
874     "sub       $0x10,%3                        \n"
875     "jg        1b                              \n"
876   : "+r"(src_argb0),       // %0
877     "+r"(dst_u),           // %1
878     "+r"(dst_v),           // %2
879     "+rm"(width)           // %3
880   : "r"((intptr_t)(src_stride_argb)), // %4
881     "m"(kARGBToV),  // %5
882     "m"(kARGBToU),  // %6
883     "m"(kAddUV128)  // %7
884   : "memory", "cc", NACL_R14
885     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
886   );
887 }
888 #endif  // HAS_ARGBTOUVROW_SSSE3
889
890 #ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
// Byte shuffle (same pattern in both 128-bit lanes) that restores linear
// order of the short pairs mutated by vphaddw + vpackuswb.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// Convert 32 ARGB pixels from each of two rows to 16 U and 16 V values
// (2x2 box subsample). AVX2 version of ARGBToUVRow_SSSE3: rows averaged
// with vpavgb, horizontal pairs averaged via vshufps/vpavgb, coefficients
// applied with vpmaddubsw, then vpermq+vpshufb undo the lane mutation.
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // kAddUV128
    "vbroadcastf128 %6,%%ymm6                  \n"  // kARGBToV
    "vbroadcastf128 %7,%%ymm7                  \n"  // kARGBToU
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // read 32 ARGB pixels
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea       " MEMLEA(0x80,0) ",%0           \n"  // advance src by 128 bytes
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"  // even pixels
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"  // odd pixels
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"  // average horizontal pairs
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"  // U coefficients
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"  // V coefficients
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // unmutate pack
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // unmutate phaddw
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // bias to unsigned range

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // 16 U bytes
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x20,%3                        \n"  // 32 pixels per iteration
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),   // %6
    "m"(kARGBToU),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
953 #endif  // HAS_ARGBTOUVROW_AVX2
954
#ifdef HAS_ARGBTOUVJROW_SSSE3
// JPeg (full range) variant of ARGBToUVRow_SSSE3: uses kARGBToUJ/kARGBToVJ
// coefficients and rounds with a paddw of kAddUVJ128 before the shift
// instead of a paddb bias afterwards.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // kARGBToVJ
    "movdqa    %6,%%xmm4                       \n"  // kARGBToUJ
    "movdqa    %7,%%xmm5                       \n"  // kAddUVJ128
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average rows 0 and 1
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels
    "pavgb     %%xmm7,%%xmm0                   \n"  // average horizontal pairs
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // UJ coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // VJ coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before shift
    "paddw     %%xmm5,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // 8 U bytes
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToVJ),  // %5
    "m"(kARGBToUJ),  // %6
    "m"(kAddUVJ128)  // %7
  // xmm3/xmm4/xmm5 are written by the coefficient loads above, so they
  // must appear in the clobber list; the previous list omitted them.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVJROW_SSSE3
1017 #endif  // HAS_ARGBTOUVJROW_SSSE3
1018
#ifdef HAS_ARGBTOUV444ROW_SSSE3
// Convert 16 ARGB pixels to 16 U and 16 V values (no subsampling).
// The 64 source bytes are read twice: once with the U coefficients and
// once with the V coefficients, each pass biased with kAddUV128.
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa    %4,%%xmm3                       \n"  // kARGBToV
    "movdqa    %5,%%xmm4                       \n"  // kARGBToU
    "movdqa    %6,%%xmm5                       \n"  // kAddUV128
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // U pass
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned range
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // 16 U bytes
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // V pass: re-read source
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  // xmm3/xmm4/xmm5 are written by the coefficient loads above, so they
  // must appear in the clobber list; the previous list omitted them.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOUV444ROW_SSSE3
1074 #endif  // HAS_ARGBTOUV444ROW_SSSE3
1075
#ifdef HAS_ARGBTOUV422ROW_SSSE3
// Convert 16 ARGB pixels from a single row to 8 U and 8 V values
// (2x1 subsample): like ARGBToUVRow_SSSE3 but without the vertical
// row averaging.
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %4,%%xmm3                       \n"  // kARGBToV
    "movdqa    %5,%%xmm4                       \n"  // kARGBToU
    "movdqa    %6,%%xmm5                       \n"  // kAddUV128
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels
    "pavgb     %%xmm7,%%xmm0                   \n"  // average horizontal pairs
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned range
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // 8 U bytes
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  // xmm3/xmm4/xmm5 are written by the coefficient loads above, so they
  // must appear in the clobber list; the previous list omitted them.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUV422ROW_SSSE3
1127 #endif  // HAS_ARGBTOUV422ROW_SSSE3
1128
// Convert 16 BGRA pixels (64 bytes) to 16 Y values using the kBGRAToY
// coefficients, then add 16 (kAddY16) for limited-range Y.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16
    "movdqa    %3,%%xmm4                       \n"  // kBGRAToY coefficients
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // read 16 BGRA pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply Y coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "phaddw    %%xmm1,%%xmm0                   \n"  // complete per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // scale back to 8 bits
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // pack 16 words to 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 for Y
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1162
1163 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1164                        uint8* dst_u, uint8* dst_v, int width) {
1165   asm volatile (
1166     "movdqa    %5,%%xmm3                       \n"
1167     "movdqa    %6,%%xmm4                       \n"
1168     "movdqa    %7,%%xmm5                       \n"
1169     "sub       %1,%2                           \n"
1170     LABELALIGN
1171   "1:                                          \n"
1172     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1173     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1174     "pavgb     %%xmm7,%%xmm0                   \n"
1175     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1176     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1177     "pavgb     %%xmm7,%%xmm1                   \n"
1178     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1179     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1180     "pavgb     %%xmm7,%%xmm2                   \n"
1181     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1182     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1183     "pavgb     %%xmm7,%%xmm6                   \n"
1184
1185     "lea       " MEMLEA(0x40,0) ",%0           \n"
1186     "movdqa    %%xmm0,%%xmm7                   \n"
1187     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1188     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1189     "pavgb     %%xmm7,%%xmm0                   \n"
1190     "movdqa    %%xmm2,%%xmm7                   \n"
1191     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1192     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1193     "pavgb     %%xmm7,%%xmm2                   \n"
1194     "movdqa    %%xmm0,%%xmm1                   \n"
1195     "movdqa    %%xmm2,%%xmm6                   \n"
1196     "pmaddubsw %%xmm4,%%xmm0                   \n"
1197     "pmaddubsw %%xmm4,%%xmm2                   \n"
1198     "pmaddubsw %%xmm3,%%xmm1                   \n"
1199     "pmaddubsw %%xmm3,%%xmm6                   \n"
1200     "phaddw    %%xmm2,%%xmm0                   \n"
1201     "phaddw    %%xmm6,%%xmm1                   \n"
1202     "psraw     $0x8,%%xmm0                     \n"
1203     "psraw     $0x8,%%xmm1                     \n"
1204     "packsswb  %%xmm1,%%xmm0                   \n"
1205     "paddb     %%xmm5,%%xmm0                   \n"
1206     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1207     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1208     "lea       " MEMLEA(0x8,1) ",%1            \n"
1209     "sub       $0x10,%3                        \n"
1210     "jg        1b                              \n"
1211   : "+r"(src_bgra0),       // %0
1212     "+r"(dst_u),           // %1
1213     "+r"(dst_v),           // %2
1214     "+rm"(width)           // %3
1215   : "r"((intptr_t)(src_stride_bgra)), // %4
1216     "m"(kBGRAToV),  // %5
1217     "m"(kBGRAToU),  // %6
1218     "m"(kAddUV128)  // %7
1219   : "memory", "cc", NACL_R14
1220     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1221   );
1222 }
1223
// Convert 16 ABGR pixels (64 bytes) to 16 Y values using the kABGRToY
// coefficients, then add 16 (kAddY16) for limited-range Y.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16
    "movdqa    %3,%%xmm4                       \n"  // kABGRToY coefficients
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // read 16 ABGR pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply Y coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "phaddw    %%xmm1,%%xmm0                   \n"  // complete per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // scale back to 8 bits
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // pack 16 words to 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 for Y
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1257
// Convert 16 RGBA pixels (64 bytes) to 16 Y values using the kRGBAToY
// coefficients, then add 16 (kAddY16) for limited-range Y.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16
    "movdqa    %3,%%xmm4                       \n"  // kRGBAToY coefficients
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // read 16 RGBA pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply Y coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "phaddw    %%xmm1,%%xmm0                   \n"  // complete per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // scale back to 8 bits
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // pack 16 words to 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 for Y
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1291
1292 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1293                        uint8* dst_u, uint8* dst_v, int width) {
1294   asm volatile (
1295     "movdqa    %5,%%xmm3                       \n"
1296     "movdqa    %6,%%xmm4                       \n"
1297     "movdqa    %7,%%xmm5                       \n"
1298     "sub       %1,%2                           \n"
1299     LABELALIGN
1300   "1:                                          \n"
1301     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1302     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1303     "pavgb     %%xmm7,%%xmm0                   \n"
1304     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1305     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1306     "pavgb     %%xmm7,%%xmm1                   \n"
1307     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1308     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1309     "pavgb     %%xmm7,%%xmm2                   \n"
1310     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1311     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1312     "pavgb     %%xmm7,%%xmm6                   \n"
1313
1314     "lea       " MEMLEA(0x40,0) ",%0           \n"
1315     "movdqa    %%xmm0,%%xmm7                   \n"
1316     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1317     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1318     "pavgb     %%xmm7,%%xmm0                   \n"
1319     "movdqa    %%xmm2,%%xmm7                   \n"
1320     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1321     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1322     "pavgb     %%xmm7,%%xmm2                   \n"
1323     "movdqa    %%xmm0,%%xmm1                   \n"
1324     "movdqa    %%xmm2,%%xmm6                   \n"
1325     "pmaddubsw %%xmm4,%%xmm0                   \n"
1326     "pmaddubsw %%xmm4,%%xmm2                   \n"
1327     "pmaddubsw %%xmm3,%%xmm1                   \n"
1328     "pmaddubsw %%xmm3,%%xmm6                   \n"
1329     "phaddw    %%xmm2,%%xmm0                   \n"
1330     "phaddw    %%xmm6,%%xmm1                   \n"
1331     "psraw     $0x8,%%xmm0                     \n"
1332     "psraw     $0x8,%%xmm1                     \n"
1333     "packsswb  %%xmm1,%%xmm0                   \n"
1334     "paddb     %%xmm5,%%xmm0                   \n"
1335     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1336     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1337     "lea       " MEMLEA(0x8,1) ",%1            \n"
1338     "sub       $0x10,%3                        \n"
1339     "jg        1b                              \n"
1340   : "+r"(src_abgr0),       // %0
1341     "+r"(dst_u),           // %1
1342     "+r"(dst_v),           // %2
1343     "+rm"(width)           // %3
1344   : "r"((intptr_t)(src_stride_abgr)), // %4
1345     "m"(kABGRToV),  // %5
1346     "m"(kABGRToU),  // %6
1347     "m"(kAddUV128)  // %7
1348   : "memory", "cc", NACL_R14
1349     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1350   );
1351 }
1352
1353 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1354                        uint8* dst_u, uint8* dst_v, int width) {
1355   asm volatile (
1356     "movdqa    %5,%%xmm3                       \n"
1357     "movdqa    %6,%%xmm4                       \n"
1358     "movdqa    %7,%%xmm5                       \n"
1359     "sub       %1,%2                           \n"
1360     LABELALIGN
1361   "1:                                          \n"
1362     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1363     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1364     "pavgb     %%xmm7,%%xmm0                   \n"
1365     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1366     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1367     "pavgb     %%xmm7,%%xmm1                   \n"
1368     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1369     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1370     "pavgb     %%xmm7,%%xmm2                   \n"
1371     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1372     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1373     "pavgb     %%xmm7,%%xmm6                   \n"
1374
1375     "lea       " MEMLEA(0x40,0) ",%0           \n"
1376     "movdqa    %%xmm0,%%xmm7                   \n"
1377     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1378     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1379     "pavgb     %%xmm7,%%xmm0                   \n"
1380     "movdqa    %%xmm2,%%xmm7                   \n"
1381     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1382     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1383     "pavgb     %%xmm7,%%xmm2                   \n"
1384     "movdqa    %%xmm0,%%xmm1                   \n"
1385     "movdqa    %%xmm2,%%xmm6                   \n"
1386     "pmaddubsw %%xmm4,%%xmm0                   \n"
1387     "pmaddubsw %%xmm4,%%xmm2                   \n"
1388     "pmaddubsw %%xmm3,%%xmm1                   \n"
1389     "pmaddubsw %%xmm3,%%xmm6                   \n"
1390     "phaddw    %%xmm2,%%xmm0                   \n"
1391     "phaddw    %%xmm6,%%xmm1                   \n"
1392     "psraw     $0x8,%%xmm0                     \n"
1393     "psraw     $0x8,%%xmm1                     \n"
1394     "packsswb  %%xmm1,%%xmm0                   \n"
1395     "paddb     %%xmm5,%%xmm0                   \n"
1396     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1397     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1398     "lea       " MEMLEA(0x8,1) ",%1            \n"
1399     "sub       $0x10,%3                        \n"
1400     "jg        1b                              \n"
1401   : "+r"(src_rgba0),       // %0
1402     "+r"(dst_u),           // %1
1403     "+r"(dst_v),           // %2
1404     "+rm"(width)           // %3
1405   : "r"((intptr_t)(src_stride_rgba)), // %4
1406     "m"(kRGBAToV),  // %5
1407     "m"(kRGBAToU),  // %6
1408     "m"(kAddUV128)  // %7
1409   : "memory", "cc", NACL_R14
1410     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1411   );
1412 }
1413
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// YUV-to-RGB conversion tables, laid out so the asm can address each table
// by a fixed byte offset from the struct base (offsets noted per field).
// Field order, types and alignment must not change without updating every
// MEMACCESS2(offset, ...) in YUVTORGB / YUVTORGB_AVX2 below.
struct YuvConstants {
  lvec8 kUVToB;     // 0
  lvec8 kUVToG;     // 32
  lvec8 kUVToR;     // 64
  lvec16 kUVBiasB;  // 96
  lvec16 kUVBiasG;  // 128
  lvec16 kUVBiasR;  // 160
  lvec16 kYToRgb;   // 192
};
1425
// BT.601 YUV to RGB reference
//  R = (Y - 16) * 1.164              - V * -1.596
//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
//  B = (Y - 16) * 1.164 - U * -2.018

// Y contribution to R,G,B.  Scale and bias.
// Coefficients are 6-bit fixed point (scaled by 64).
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
// UB's true value at this precision would be -129; it is clamped to fit int8.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)

// BT601 constants for YUV to RGB.  Each {U coef, V coef} byte pair lines up
// with the interleaved U,V bytes produced by the READYUV* macros; values are
// replicated across all SIMD lanes.  Field layout must match struct
// YuvConstants above.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
1460
// BT601 constants for NV21 where chroma plane is VU instead of UV.
// Same coefficients as kYuvConstants but with each byte pair swapped, so the
// READNV12 macro can be reused unchanged for NV21 input.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

// The scratch coefficient macros are scoped to the two tables above.
#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
1484
// JPEG YUV to RGB reference
// *  R = Y                - V * -1.40200
// *  G = Y - U *  0.34414 - V *  0.71414
// *  B = Y - U * -1.77200
// Full-range variant: Y is not offset by 16 and uses unit gain.

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32  /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414  * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BBJ (UBJ * 128             + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ             (VRJ * 128 + YGBJ)

// JPEG constants for YUV to RGB.  Same layout rules as kYuvConstants.
// NOTE(review): not static, unlike the other two tables — presumably
// referenced from another translation unit; confirm before adding static.
YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};

#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
1535
// Read 8 UV from 444 (full-resolution chroma): loads 8 U and 8 V bytes and
// interleaves them into xmm0 as U0V0..U7V7.  v_buf must have been rebased as
// an offset from u_buf (caller does "sub %[u_buf],%[v_buf]").
// (Original comment said "from 411", which was wrong for this macro.)
#define READYUV444                                                             \
    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"
1542
// Read 4 UV from 422 (half-width chroma), upsample to 8 UV by duplicating
// each interleaved UV word horizontally (punpcklwd against itself).
// v_buf must have been rebased as an offset from u_buf.
#define READYUV422                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"
1550
// Read 2 UV from 411 (quarter-width chroma), upsample to 8 UV by duplicating
// each UV pair four times (word then dword self-unpack).
// v_buf must have been rebased as an offset from u_buf.
#define READYUV411                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "punpckldq  %%xmm0,%%xmm0                                   \n"
1559
// Read 4 UV pairs from an interleaved biplanar chroma plane (NV12 order
// U,V), upsample to 8 UV.  Also used for NV21 (V,U order) by pairing it with
// kYvuConstants, whose coefficient byte pairs are swapped to compensate.
#define READNV12                                                               \
    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"
1565
// Convert 8 pixels: 8 UV (xmm0, interleaved U,V byte pairs) and 8 Y, leaving
// 8 B bytes in xmm0, 8 G in xmm1, 8 R in xmm2 (each packed down by
// packuswb).  YuvConstants is addressed by fixed byte offsets: 0 kUVToB,
// 32 kUVToG, 64 kUVToR, 96/128/160 per-channel bias, 192 kYToRgb (see struct
// YuvConstants).  Per channel: the UV term (pmaddubsw) is subtracted from
// the bias, the scaled Y (pmulhuw) is added with saturation, then the 6-bit
// fixed-point result is shifted down and unsigned-saturated to bytes.
// Clobbers xmm0-xmm3 and advances y_buf by 8.
#define YUVTORGB(YuvConstants)                                                 \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "movdqa     %%xmm0,%%xmm3                                   \n"            \
    "movdqa     " MEMACCESS2(96, [YuvConstants]) ",%%xmm0       \n"            \
    "pmaddubsw  " MEMACCESS([YuvConstants]) ",%%xmm1            \n"            \
    "psubw      %%xmm1,%%xmm0                                   \n"            \
    "movdqa     " MEMACCESS2(128, [YuvConstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%xmm2       \n"            \
    "psubw      %%xmm2,%%xmm1                                   \n"            \
    "movdqa     " MEMACCESS2(160, [YuvConstants]) ",%%xmm2      \n"            \
    "pmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%xmm3       \n"            \
    "psubw      %%xmm3,%%xmm2                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "punpcklbw  %%xmm3,%%xmm3                                   \n"            \
    "pmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%xmm3      \n"            \
    "paddsw     %%xmm3,%%xmm0                                   \n"            \
    "paddsw     %%xmm3,%%xmm1                                   \n"            \
    "paddsw     %%xmm3,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
1593
// Store 8 ARGB values from B (xmm0), G (xmm1), R (xmm2).  Assumes XMM5
// holds 0xff bytes (set by the caller's pcmpeqb) which become the alpha
// channel.  (The old "Assumes XMM5 is zero" comment was stale: zero would
// yield transparent pixels.)  Advances dst_argb by 32 bytes.
#define STOREARGB                                                              \
    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
    "movdqa     %%xmm0,%%xmm1                                    \n"           \
    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
1604
// Store 8 BGRA values from B (xmm0), G (xmm1), R (xmm2).  Regenerates the
// 0xff alpha bytes itself (pcmpeqb), clobbering XMM5 regardless of its
// previous contents.  Advances dst_bgra by 32 bytes.
#define STOREBGRA                                                              \
    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
    "punpcklbw %%xmm0,%%xmm1                                     \n"           \
    "punpcklbw %%xmm2,%%xmm5                                     \n"           \
    "movdqa    %%xmm5,%%xmm0                                     \n"           \
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \
    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_bgra]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra]         \n"
1616
// Store 8 ABGR values from B (xmm0), G (xmm1), R (xmm2).  Assumes XMM5
// holds 0xff bytes (set by the caller's pcmpeqb) which become the alpha
// channel.  Advances dst_abgr by 32 bytes.
#define STOREABGR                                                              \
    "punpcklbw %%xmm1,%%xmm2                                     \n"           \
    "punpcklbw %%xmm5,%%xmm0                                     \n"           \
    "movdqa    %%xmm2,%%xmm1                                     \n"           \
    "punpcklwd %%xmm0,%%xmm2                                     \n"           \
    "punpckhwd %%xmm0,%%xmm1                                     \n"           \
    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \
    "movdqu    %%xmm1," MEMACCESS2(0x10, [dst_abgr]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr]         \n"
1627
// Store 8 RGBA values from B (xmm0), G (xmm1), R (xmm2).  Regenerates the
// 0xff alpha bytes itself (pcmpeqb), clobbering XMM5 regardless of its
// previous contents.  Advances dst_rgba by 32 bytes.
#define STORERGBA                                                              \
    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
    "movdqa    %%xmm5,%%xmm0                                     \n"           \
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
1639
// Convert a row of I444 (planar Y, full-resolution U and V) to ARGB using
// BT.601 constants.  Processes 8 pixels per loop iteration.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV444 can index both
    // chroma planes from one base register.
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff bytes, consumed as alpha by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1665
// TODO(fbarchard): Consider putting masks into constants.
// Convert a row of I422 to packed 24-bit RGB24, 8 pixels (24 output bytes)
// per iteration.  The pshufb masks drop the alpha byte from the 32-bit
// intermediate pixels and palignr stitches the two halves together.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 int width) {
  asm volatile (
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    // Weave B/G/R into 32-bit pixels, then shuffle away the 4th byte.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    // subl: width may be a memory operand on 32-bit PIC builds (see below).
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
// TODO(fbarchard): Make width a register for 32 bit.
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
  );
}
1710
// Convert a row of I422 to packed 24-bit RAW (RGB24 with R and B swapped),
// 8 pixels (24 output bytes) per iteration.  Identical structure to
// I422ToRGB24Row_SSSE3 but with the RAW shuffle masks.
void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_raw,
                               int width) {
  asm volatile (
    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    // Weave B/G/R into 32-bit pixels, then shuffle away the 4th byte.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
    // subl: width may be a memory operand on 32-bit PIC builds (see below).
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
// TODO(fbarchard): Make width a register for 32 bit.
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)    // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
    [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
  );
}
1754
// Convert a row of I422 (planar Y, half-width U and V) to ARGB using BT.601
// constants.  Processes 8 pixels per loop iteration.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff bytes, consumed as alpha by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1780
// Convert a row of J422 (full-range JPEG YUV 4:2:2) to ARGB.  Same loop as
// I422ToARGBRow_SSSE3 but uses the kYuvJConstants table.
void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff bytes, consumed as alpha by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1806
// Convert a row of I411 (planar Y, quarter-width U and V) to ARGB using
// BT.601 constants.  Processes 8 pixels per loop iteration.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff bytes, consumed as alpha by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV411
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1832
// Convert a row of NV12 (planar Y, interleaved UV) to ARGB using BT.601
// constants.  Processes 8 pixels per loop iteration.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // xmm5 = all 0xff bytes, consumed as alpha by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  // Does not use r14.
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1855
// Convert a row of NV21 (planar Y, interleaved VU) to ARGB.  Reuses the
// READNV12 loader unchanged; the swapped V,U order is compensated by
// kYvuConstants, whose coefficient byte pairs are swapped.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // xmm5 = all 0xff bytes, consumed as alpha by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
  // Does not use r14.
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1878
// Convert a row of I422 to BGRA using BT.601 constants, 8 pixels per
// iteration.  STOREBGRA regenerates the alpha bytes itself.
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff bytes (STOREBGRA overwrites it anyway).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREBGRA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1904
// Convert a row of I422 to ABGR using BT.601 constants, 8 pixels per
// iteration.
void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff bytes, consumed as alpha by STOREABGR.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREABGR
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1930
// Convert a row of I422 to RGBA using BT.601 constants, 8 pixels per
// iteration.  STORERGBA regenerates the alpha bytes itself.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff bytes (STORERGBA overwrites it anyway).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STORERGBA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1956
1957 #endif  // HAS_I422TOARGBROW_SSSE3
1958
// Read 8 UV from 422, upsample to 16 UV.  AVX2 version: interleaves 8 U and
// 8 V bytes, fixes lane order with vpermq, then duplicates each UV word.
// v_buf must have been rebased as an offset from u_buf.
#define READYUV422_AVX2                                                        \
    "vmovq       " MEMACCESS([u_buf]) ",%%xmm0                      \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"
1967
// Convert 16 pixels: 16 UV (ymm0, interleaved U,V byte pairs) and 16 Y,
// leaving 16 B bytes in ymm0, 16 G in ymm1, 16 R in ymm2.  Same math and
// YuvConstants byte offsets as the SSSE3 YUVTORGB (0/32/64 coefficients,
// 96/128/160 bias, 192 Y scale); clobbers ymm0-ymm3 and advances y_buf by 16.
#define YUVTORGB_AVX2(YuvConstants)                                            \
    "vpmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2   \n"        \
    "vpmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1   \n"        \
    "vpmaddubsw  " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0        \n"        \
    "vmovdqu     " MEMACCESS2(160, [YuvConstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
    "vmovdqu     " MEMACCESS2(128, [YuvConstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
    "vmovdqu     " MEMACCESS2(96, [YuvConstants]) ",%%ymm3          \n"        \
    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
    "vmovdqu     " MEMACCESS([y_buf]) ",%%xmm3                      \n"        \
    "lea         " MEMLEA(0x10, [y_buf]) ",%[y_buf]                 \n"        \
    "vpermq      $0xd8,%%ymm3,%%ymm3                                \n"        \
    "vpunpcklbw  %%ymm3,%%ymm3,%%ymm3                               \n"        \
    "vpmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3  \n"        \
    "vpaddsw     %%ymm3,%%ymm0,%%ymm0           \n"                            \
    "vpaddsw     %%ymm3,%%ymm1,%%ymm1           \n"                            \
    "vpaddsw     %%ymm3,%%ymm2,%%ymm2           \n"                            \
    "vpsraw      $0x6,%%ymm0,%%ymm0             \n"                            \
    "vpsraw      $0x6,%%ymm1,%%ymm1             \n"                            \
    "vpsraw      $0x6,%%ymm2,%%ymm2             \n"                            \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0           \n"                            \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1           \n"                            \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2           \n"
1993
#if defined(HAS_I422TOBGRAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_bgra,
                               int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf (shared base for both planes).
    "sub       %[u_buf],%[v_buf]               \n"
    // ymm5 = all 0xff bytes, used as the alpha channel.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvConstants)

    // Step 3: Weave into BGRA
    "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels

    "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"
    "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
    "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Restore SSE performance after 256-bit AVX use.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_I422TOBGRAROW_AVX2
2035
2036 #if defined(HAS_I422TOARGBROW_AVX2)
2037 // 16 pixels
2038 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// After YUVTORGB_AVX2, ymm0/ymm1/ymm2 hold packed B/G/R bytes; they are
// interleaved here with the 0xff alpha bytes from ymm5 into BGRA memory order.
2039 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2040                                const uint8* u_buf,
2041                                const uint8* v_buf,
2042                                uint8* dst_argb,
2043                                int width) {
2044   asm volatile (
2045     "sub       %[u_buf],%[v_buf]               \n"  // make v_buf an offset from u_buf
2046     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff: opaque alpha
2047     LABELALIGN
2048   "1:                                          \n"
2049     READYUV422_AVX2
2050     YUVTORGB_AVX2(kYuvConstants)
2051
2052     // Step 3: Weave into ARGB
2053     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
2054     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2055     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
2056     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2057     "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
2058     "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
2059
2060     "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
2061     "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2062     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2063     "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration
2064     "jg        1b                              \n"
2065     "vzeroupper                                \n"
2066   : [y_buf]"+r"(y_buf),    // %[y_buf]
2067     [u_buf]"+r"(u_buf),    // %[u_buf]
2068     [v_buf]"+r"(v_buf),    // %[v_buf]
2069     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2070     [width]"+rm"(width)    // %[width]
2071   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2072   : "memory", "cc", NACL_R14
2073     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2074   );
2075 }
2076 #endif  // HAS_I422TOARGBROW_AVX2
2077
2078 #if defined(HAS_J422TOARGBROW_AVX2)
2079 // 16 pixels
2080 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Identical weave to I422ToARGBRow_AVX2; the only difference is the constant
// table: kYuvJConstants (full-range JPeg coefficients) instead of kYuvConstants.
2081 void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
2082                                const uint8* u_buf,
2083                                const uint8* v_buf,
2084                                uint8* dst_argb,
2085                                int width) {
2086   asm volatile (
2087     "sub       %[u_buf],%[v_buf]               \n"  // make v_buf an offset from u_buf
2088     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff: opaque alpha
2089     LABELALIGN
2090   "1:                                          \n"
2091     READYUV422_AVX2
2092     YUVTORGB_AVX2(kYuvConstants)
2093
2094     // Step 3: Weave into ARGB
2095     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
2096     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2097     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
2098     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2099     "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
2100     "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
2101
2102     "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
2103     "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2104     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2105     "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration
2106     "jg        1b                              \n"
2107     "vzeroupper                                \n"
2108   : [y_buf]"+r"(y_buf),    // %[y_buf]
2109     [u_buf]"+r"(u_buf),    // %[u_buf]
2110     [v_buf]"+r"(v_buf),    // %[v_buf]
2111     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2112     [width]"+rm"(width)    // %[width]
2113   : [kYuvConstants]"r"(&kYuvJConstants.kUVToB)  // %[kYuvConstants]
2114   : "memory", "cc", NACL_R14
2115     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2116   );
2117 }
2118 #endif  // HAS_J422TOARGBROW_AVX2
2119
2120 #if defined(HAS_I422TOABGRROW_AVX2)
2121 // 16 pixels
2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
// NOTE(review): the destination parameter is named dst_argb but receives ABGR
// byte order, per the weave below and the function name.
2123 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
2124                                const uint8* u_buf,
2125                                const uint8* v_buf,
2126                                uint8* dst_argb,
2127                                int width) {
2128   asm volatile (
2129     "sub       %[u_buf],%[v_buf]               \n"  // make v_buf an offset from u_buf
2130     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff: opaque alpha
2131     LABELALIGN
2132   "1:                                          \n"
2133     READYUV422_AVX2
2134     YUVTORGB_AVX2(kYuvConstants)
2135
2136     // Step 3: Weave into ABGR
2137     "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG
2138     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2139     "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA
2140     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2141     "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels
2142     "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels
2143     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2144     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2145     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2146     "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration
2147     "jg        1b                              \n"
2148     "vzeroupper                                \n"
2149   : [y_buf]"+r"(y_buf),    // %[y_buf]
2150     [u_buf]"+r"(u_buf),    // %[u_buf]
2151     [v_buf]"+r"(v_buf),    // %[v_buf]
2152     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2153     [width]"+rm"(width)    // %[width]
2154   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2155   : "memory", "cc", NACL_R14
2156     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2157   );
2158 }
2159 #endif  // HAS_I422TOABGRROW_AVX2
2160
2161 #if defined(HAS_I422TORGBAROW_AVX2)
2162 // 16 pixels
2163 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// NOTE(review): the destination parameter is named dst_argb but receives RGBA
// byte order (alpha in the low byte of each pixel, from ymm5).
2164 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2165                                const uint8* u_buf,
2166                                const uint8* v_buf,
2167                                uint8* dst_argb,
2168                                int width) {
2169   asm volatile (
2170     "sub       %[u_buf],%[v_buf]               \n"  // make v_buf an offset from u_buf
2171     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff: opaque alpha
2172     LABELALIGN
2173   "1:                                          \n"
2174     READYUV422_AVX2
2175     YUVTORGB_AVX2(kYuvConstants)
2176
2177     // Step 3: Weave into RGBA
2178     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"  // GR
2179     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2180     "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"  // AB
2181     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2182     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ABGR first 8 pixels
2183     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"  // ABGR next 8 pixels
2184     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2185     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2186     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2187     "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration
2188     "jg        1b                              \n"
2189     "vzeroupper                                \n"
2190   : [y_buf]"+r"(y_buf),    // %[y_buf]
2191     [u_buf]"+r"(u_buf),    // %[u_buf]
2192     [v_buf]"+r"(v_buf),    // %[v_buf]
2193     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2194     [width]"+rm"(width)    // %[width]
2195   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2196   : "memory", "cc", NACL_R14
2197     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2198   );
2199 }
2200 #endif  // HAS_I422TORGBAROW_AVX2
2201
2202 #ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y expanded to 8 gray ARGB pixels (Y duplicated to B, G and R,
// alpha forced to 0xff) with BT.601 video-range scaling: (Y - 16) * 1.164.
2203 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2204   asm volatile (
2205     "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
2206     "movd      %%eax,%%xmm2                    \n"
2207     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
2208     "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
2209     "movd      %%eax,%%xmm3                    \n"
2210     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
2211     "pcmpeqb   %%xmm4,%%xmm4                   \n"
2212     "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000: alpha mask
2213     LABELALIGN
2214   "1:                                          \n"
2215     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2216     "movq      " MEMACCESS(0) ",%%xmm0         \n"
2217     "lea       " MEMLEA(0x8,0) ",%0            \n"
2218     "punpcklbw %%xmm0,%%xmm0                   \n"
2219     "pmulhuw   %%xmm2,%%xmm0                   \n"
2220     "psubusw   %%xmm3,%%xmm0                   \n"  // unsigned saturating sub clamps at 0
2221     "psrlw     $6, %%xmm0                      \n"
2222     "packuswb  %%xmm0,%%xmm0                   \n"
2223
2224     // Step 2: Weave into ARGB
2225     "punpcklbw %%xmm0,%%xmm0                   \n"
2226     "movdqa    %%xmm0,%%xmm1                   \n"
2227     "punpcklwd %%xmm0,%%xmm0                   \n"
2228     "punpckhwd %%xmm1,%%xmm1                   \n"
2229     "por       %%xmm4,%%xmm0                   \n"  // set alpha = 0xff
2230     "por       %%xmm4,%%xmm1                   \n"
2231     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2232     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2233     "lea       " MEMLEA(0x20,1) ",%1           \n"
2234
2235     "sub       $0x8,%2                         \n"  // 8 pixels per iteration
2236     "jg        1b                              \n"
2237   : "+r"(y_buf),     // %0
2238     "+r"(dst_argb),  // %1
2239     "+rm"(width)     // %2
2240   :
2241   : "memory", "cc", "eax"
2242     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2243   );
2244 }
2245 #endif  // HAS_I400TOARGBROW_SSE2
2246
2247 #ifdef HAS_I400TOARGBROW_AVX2
2248 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2249 // note: vpunpcklbw mutates and vpackuswb unmutates.
// AVX2 twin of I400ToARGBRow_SSE2.  The two constant comments below were
// previously swapped; fixed to match the SSE2 version (0x4a35 is the 1.164
// multiplier, 0x0488 the 1.164*16 bias).
2250 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2251   asm volatile (
2252     "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
2253     "vmovd      %%eax,%%xmm2                   \n"
2254     "vbroadcastss %%xmm2,%%ymm2                \n"
2255     "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
2256     "vmovd      %%eax,%%xmm3                   \n"
2257     "vbroadcastss %%xmm3,%%ymm3                \n"
2258     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
2259     "vpslld     $0x18,%%ymm4,%%ymm4            \n"  // ymm4 = 0xff000000: alpha mask
2260
2261     LABELALIGN
2262   "1:                                          \n"
2263     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2264     "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
2265     "lea        " MEMLEA(0x10,0) ",%0          \n"
2266     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2267     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
2268     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
2269     "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"  // unsigned saturating sub clamps at 0
2270     "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
2271     "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
2272     "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
2273     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2274     "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
2275     "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
2276     "vpor       %%ymm4,%%ymm0,%%ymm0           \n"  // set alpha = 0xff
2277     "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
2278     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2279     "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
2280     "lea       " MEMLEA(0x40,1) ",%1           \n"
2281     "sub        $0x10,%2                       \n"  // 16 pixels per iteration
2282     "jg        1b                              \n"
2283     "vzeroupper                                \n"
2284   : "+r"(y_buf),     // %0
2285     "+r"(dst_argb),  // %1
2286     "+rm"(width)     // %2
2287   :
2288   : "memory", "cc", "eax"
2289     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2290   );
2291 }
2292 #endif  // HAS_I400TOARGBROW_AVX2
2293
2294 #ifdef HAS_MIRRORROW_SSSE3
2295 // Shuffle table for reversing the bytes.
2296 static uvec8 kShuffleMirror = {
2297   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2298 };
2299
// Reverses a row of bytes.  Reads the source backwards: the load address is
// src + temp_width - 16, and temp_width counts down by 16 each iteration while
// dst advances forwards.
2300 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2301   intptr_t temp_width = (intptr_t)(width);
2302   asm volatile (
2303     "movdqa    %3,%%xmm5                       \n"
2304     LABELALIGN
2305   "1:                                          \n"
2306     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2307     "pshufb    %%xmm5,%%xmm0                   \n"  // reverse 16 bytes in-register
2308     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2309     "lea       " MEMLEA(0x10,1) ",%1           \n"
2310     "sub       $0x10,%2                        \n"
2311     "jg        1b                              \n"
2312   : "+r"(src),  // %0
2313     "+r"(dst),  // %1
2314     "+r"(temp_width)  // %2
2315   : "m"(kShuffleMirror) // %3
2316   : "memory", "cc", NACL_R14
2317     "xmm0", "xmm5"
2318   );
2319 }
2320 #endif  // HAS_MIRRORROW_SSSE3
2321
2322 #ifdef HAS_MIRRORROW_AVX2
// AVX2 byte-reverse of a row, 32 bytes per iteration.  Reuses the 16-byte
// kShuffleMirror table (broadcast to both lanes): vpshufb reverses within each
// 128-bit lane, then vpermq $0x4e swaps the two lanes to finish the reversal.
2323 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2324   intptr_t temp_width = (intptr_t)(width);
2325   asm volatile (
2326     "vbroadcastf128 %3,%%ymm5                  \n"
2327     LABELALIGN
2328   "1:                                          \n"
2329     MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
2330     "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
2331     "vpermq     $0x4e,%%ymm0,%%ymm0            \n"  // swap the 128-bit halves
2332     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2333     "lea       " MEMLEA(0x20,1) ",%1           \n"
2334     "sub       $0x20,%2                        \n"
2335     "jg        1b                              \n"
2336     "vzeroupper                                \n"
2337   : "+r"(src),  // %0
2338     "+r"(dst),  // %1
2339     "+r"(temp_width)  // %2
2340   : "m"(kShuffleMirror) // %3
2341   : "memory", "cc", NACL_R14
2342     "xmm0", "xmm5"
2343   );
2344 }
2345 #endif  // HAS_MIRRORROW_AVX2
2346
2347 #ifdef HAS_MIRRORROW_SSE2
// SSE2 fallback byte-reverse (no pshufb available): swap bytes within each
// word via shifts+or, reverse words within each qword (pshuflw/pshufhw $0x1b),
// then swap the two qwords (pshufd $0x4e).
2348 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2349   intptr_t temp_width = (intptr_t)(width);
2350   asm volatile (
2351     LABELALIGN
2352   "1:                                          \n"
2353     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2354     "movdqa    %%xmm0,%%xmm1                   \n"
2355     "psllw     $0x8,%%xmm0                     \n"
2356     "psrlw     $0x8,%%xmm1                     \n"
2357     "por       %%xmm1,%%xmm0                   \n"  // bytes swapped within each word
2358     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
2359     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
2360     "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
2361     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2362     "lea       " MEMLEA(0x10,1)",%1            \n"
2363     "sub       $0x10,%2                        \n"
2364     "jg        1b                              \n"
2365   : "+r"(src),  // %0
2366     "+r"(dst),  // %1
2367     "+r"(temp_width)  // %2
2368   :
2369   : "memory", "cc", NACL_R14
2370     "xmm0", "xmm1"
2371   );
2372 }
2373 #endif  // HAS_MIRRORROW_SSE2
2374
2375 #ifdef HAS_MIRRORROW_UV_SSSE3
2376 // Shuffle table for reversing the bytes of UV channels.
2377 static uvec8 kShuffleMirrorUV = {
2378   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2379 };
// Mirrors an interleaved UV row while splitting it into separate U and V rows:
// the shuffle gathers reversed U bytes into the low qword and reversed V bytes
// into the high qword.  src is pre-positioned at the last 16 bytes and walks
// backwards; dst_v is rebased relative to dst_u ("sub" below).
2380 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2381                        int width) {
2382   intptr_t temp_width = (intptr_t)(width);
2383   asm volatile (
2384     "movdqa    %4,%%xmm1                       \n"
2385     "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"  // src += width*2 - 16
2386     "sub       %1,%2                           \n"
2387     LABELALIGN
2388   "1:                                          \n"
2389     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2390     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2391     "pshufb    %%xmm1,%%xmm0                   \n"
2392     "movlpd    %%xmm0," MEMACCESS(1) "         \n"  // low qword -> dst_u
2393     MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
2394     "lea       " MEMLEA(0x8,1) ",%1            \n"
2395     "sub       $8,%3                           \n"  // 8 UV pairs per iteration
2396     "jg        1b                              \n"
2397   : "+r"(src),      // %0
2398     "+r"(dst_u),    // %1
2399     "+r"(dst_v),    // %2
2400     "+r"(temp_width)  // %3
2401   : "m"(kShuffleMirrorUV)  // %4
2402   : "memory", "cc", NACL_R14
2403     "xmm0", "xmm1"
2404   );
2405 }
2406 #endif  // HAS_MIRRORROW_UV_SSSE3
2407
2408 #ifdef HAS_ARGBMIRRORROW_SSE2
2409
// Mirrors a row of ARGB pixels (4-byte units, bytes within a pixel keep their
// order).  src is pre-positioned at the last 16 bytes and walks backwards;
// pshufd $0x1b reverses the 4 pixels within the register.
2410 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2411   intptr_t temp_width = (intptr_t)(width);
2412   asm volatile (
2413     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"  // src += width*4 - 16
2414     LABELALIGN
2415   "1:                                          \n"
2416     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2417     "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
2418     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2419     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2420     "lea       " MEMLEA(0x10,1) ",%1           \n"
2421     "sub       $0x4,%2                         \n"  // 4 pixels per iteration
2422     "jg        1b                              \n"
2423   : "+r"(src),  // %0
2424     "+r"(dst),  // %1
2425     "+r"(temp_width)  // %2
2426   :
2427   : "memory", "cc"
2428     , "xmm0"
2429   );
2430 }
2431 #endif  // HAS_ARGBMIRRORROW_SSE2
2432
2433 #ifdef HAS_ARGBMIRRORROW_AVX2
2434 // Shuffle table for reversing the bytes.
2435 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2436   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2437 };
// Mirrors 8 ARGB pixels at a time.  vpermd with the reversed-index table loads
// straight from the tail of the row (-0x20(%0,%2,4)) and reverses the 8 dwords
// in a single instruction.
2438 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2439   intptr_t temp_width = (intptr_t)(width);
2440   asm volatile (
2441     "vmovdqu    %3,%%ymm5                      \n"
2442     LABELALIGN
2443   "1:                                          \n"
2444     VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2445     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2446     "lea        " MEMLEA(0x20,1) ",%1          \n"
2447     "sub        $0x8,%2                        \n"  // 8 pixels per iteration
2448     "jg         1b                             \n"
2449     "vzeroupper                                \n"
2450   : "+r"(src),  // %0
2451     "+r"(dst),  // %1
2452     "+r"(temp_width)  // %2
2453   : "m"(kARGBShuffleMirror_AVX2) // %3
2454   : "memory", "cc", NACL_R14
2455     "xmm0", "xmm5"
2456   );
2457 }
2458 #endif  // HAS_ARGBMIRRORROW_AVX2
2459
2460 #ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a UV row into separate U and V planes, 32 pairs per
// iteration.  ymm5 masks the even (U) bytes; the odd (V) bytes are isolated
// with a right shift.  dst_v is rebased relative to dst_u ("sub" below) so one
// store index serves both outputs.
2461 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2462   asm volatile (
2463     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
2464     "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"  // ymm5 = 0x00ff per word
2465     "sub        %1,%2                            \n"
2466     LABELALIGN
2467   "1:                                            \n"
2468     "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
2469     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
2470     "lea        " MEMLEA(0x40,0) ",%0            \n"
2471     "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"  // V bytes to low byte of word
2472     "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
2473     "vpand      %%ymm5,%%ymm0,%%ymm0             \n"  // keep U bytes
2474     "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
2475     "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
2476     "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
2477     "vpermq     $0xd8,%%ymm0,%%ymm0              \n"  // undo cross-lane pack mutation
2478     "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
2479     "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
2480     MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
2481     "lea        " MEMLEA(0x20,1) ",%1            \n"
2482     "sub        $0x20,%3                         \n"
2483     "jg         1b                               \n"
2484     "vzeroupper                                  \n"
2485   : "+r"(src_uv),     // %0
2486     "+r"(dst_u),      // %1
2487     "+r"(dst_v),      // %2
2488     "+r"(pix)         // %3
2489   :
2490   : "memory", "cc", NACL_R14
2491     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2492   );
2493 }
2494 #endif  // HAS_SPLITUVROW_AVX2
2495
2496 #ifdef HAS_SPLITUVROW_SSE2
// SSE2 version of SplitUVRow: de-interleaves 16 UV pairs per iteration into
// separate U and V planes.  xmm5 = 0x00ff word mask selects U (even) bytes;
// V (odd) bytes are shifted down before packing.  dst_v is rebased relative
// to dst_u ("sub" below).
2497 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2498   asm volatile (
2499     "pcmpeqb    %%xmm5,%%xmm5                    \n"
2500     "psrlw      $0x8,%%xmm5                      \n"
2501     "sub        %1,%2                            \n"
2502     LABELALIGN
2503   "1:                                            \n"
2504     "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
2505     "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
2506     "lea        " MEMLEA(0x20,0) ",%0            \n"
2507     "movdqa     %%xmm0,%%xmm2                    \n"
2508     "movdqa     %%xmm1,%%xmm3                    \n"
2509     "pand       %%xmm5,%%xmm0                    \n"  // keep U bytes
2510     "pand       %%xmm5,%%xmm1                    \n"
2511     "packuswb   %%xmm1,%%xmm0                    \n"
2512     "psrlw      $0x8,%%xmm2                      \n"  // V bytes to low byte of word
2513     "psrlw      $0x8,%%xmm3                      \n"
2514     "packuswb   %%xmm3,%%xmm2                    \n"
2515     "movdqu     %%xmm0," MEMACCESS(1) "          \n"
2516     MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
2517     "lea        " MEMLEA(0x10,1) ",%1            \n"
2518     "sub        $0x10,%3                         \n"
2519     "jg         1b                               \n"
2520   : "+r"(src_uv),     // %0
2521     "+r"(dst_u),      // %1
2522     "+r"(dst_v),      // %2
2523     "+r"(pix)         // %3
2524   :
2525   : "memory", "cc", NACL_R14
2526     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2527   );
2528 }
2529 #endif  // HAS_SPLITUVROW_SSE2
2530
2531 #ifdef HAS_MERGEUVROW_AVX2
// Interleaves separate U and V planes into a UV plane, 32 pairs per
// iteration.  src_v is rebased relative to src_u ("sub" below); the
// vextractf128 stores put the lane halves back in source order, since
// vpunpck[lh]bw on ymm interleaves within each 128-bit lane.
2532 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2533                      int width) {
2534   asm volatile (
2535     "sub       %0,%1                             \n"
2536     LABELALIGN
2537   "1:                                            \n"
2538     "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
2539     MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
2540     "lea       " MEMLEA(0x20,0) ",%0             \n"
2541     "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
2542     "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
2543     "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
2544     "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2545     "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2546     "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2547     "lea       " MEMLEA(0x40,2) ",%2             \n"
2548     "sub       $0x20,%3                          \n"
2549     "jg        1b                                \n"
2550     "vzeroupper                                  \n"
2551   : "+r"(src_u),     // %0
2552     "+r"(src_v),     // %1
2553     "+r"(dst_uv),    // %2
2554     "+r"(width)      // %3
2555   :
2556   : "memory", "cc", NACL_R14
2557     "xmm0", "xmm1", "xmm2"
2558   );
2559 }
2560 #endif  // HAS_MERGEUVROW_AVX2
2561
2562 #ifdef HAS_MERGEUVROW_SSE2
// SSE2 version of MergeUVRow: interleaves 16 U and 16 V bytes per iteration
// into 32 bytes of UV.  src_v is rebased relative to src_u ("sub" below).
2563 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2564                      int width) {
2565   asm volatile (
2566     "sub       %0,%1                             \n"
2567     LABELALIGN
2568   "1:                                            \n"
2569     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
2570     MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
2571     "lea       " MEMLEA(0x10,0) ",%0             \n"
2572     "movdqa    %%xmm0,%%xmm2                     \n"
2573     "punpcklbw %%xmm1,%%xmm0                     \n"  // low 8 UV pairs
2574     "punpckhbw %%xmm1,%%xmm2                     \n"  // high 8 UV pairs
2575     "movdqu    %%xmm0," MEMACCESS(2) "           \n"
2576     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
2577     "lea       " MEMLEA(0x20,2) ",%2             \n"
2578     "sub       $0x10,%3                          \n"
2579     "jg        1b                                \n"
2580   : "+r"(src_u),     // %0
2581     "+r"(src_v),     // %1
2582     "+r"(dst_uv),    // %2
2583     "+r"(width)      // %3
2584   :
2585   : "memory", "cc", NACL_R14
2586     "xmm0", "xmm1", "xmm2"
2587   );
2588 }
2589 #endif  // HAS_MERGEUVROW_SSE2
2590
2591 #ifdef HAS_COPYROW_SSE2
// Copies a row 32 bytes at a time with unaligned SSE2 loads/stores.
// NOTE(review): no scalar tail — count is expected to be a multiple of 32;
// confirm against callers.
2592 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2593   asm volatile (
2594     LABELALIGN
2595   "1:                                          \n"
2596     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2597     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2598     "lea       " MEMLEA(0x20,0) ",%0           \n"
2599     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2600     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2601     "lea       " MEMLEA(0x20,1) ",%1           \n"
2602     "sub       $0x20,%2                        \n"
2603     "jg        1b                              \n"
2604   : "+r"(src),   // %0
2605     "+r"(dst),   // %1
2606     "+r"(count)  // %2
2607   :
2608   : "memory", "cc"
2609     , "xmm0", "xmm1"
2610   );
2611 }
2612 #endif  // HAS_COPYROW_SSE2
2613
2614 #ifdef HAS_COPYROW_AVX
// Copies a row 64 bytes at a time with unaligned AVX loads/stores.
// NOTE(review): no scalar tail — count is expected to be a multiple of 64;
// confirm against callers.
2615 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2616   asm volatile (
2617     LABELALIGN
2618   "1:                                          \n"
2619     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2620     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2621     "lea       " MEMLEA(0x40,0) ",%0           \n"
2622     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
2623     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
2624     "lea       " MEMLEA(0x40,1) ",%1           \n"
2625     "sub       $0x40,%2                        \n"
2626     "jg        1b                              \n"
2627   : "+r"(src),   // %0
2628     "+r"(dst),   // %1
2629     "+r"(count)  // %2
2630   :
2631   : "memory", "cc"
2632     , "xmm0", "xmm1"
2633   );
2634 }
2635 #endif  // HAS_COPYROW_AVX
2636
2637 #ifdef HAS_COPYROW_ERMS
2638 // Multiple of 1.
// Byte copy via "rep movsb", which is fast on CPUs with Enhanced REP MOVSB
// (ERMS).  src/dst/width are bound to the fixed rsi/rdi/rcx registers the
// string instruction requires.
2639 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2640   size_t width_tmp = (size_t)(width);
2641   asm volatile (
2642     "rep movsb " MEMMOVESTRING(0,1) "          \n"
2643   : "+S"(src),  // %0
2644     "+D"(dst),  // %1
2645     "+c"(width_tmp) // %2
2646   :
2647   : "memory", "cc"
2648   );
2649 }
2650 #endif  // HAS_COPYROW_ERMS
2651
2652 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2653 // width in pixels
// Copies only the alpha channel of src ARGB pixels into dst, preserving dst's
// BGR bytes: (src & 0xff000000) | (dst & 0x00ffffff), 8 pixels per iteration.
2654 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2655   asm volatile (
2656     "pcmpeqb   %%xmm0,%%xmm0                   \n"
2657     "pslld     $0x18,%%xmm0                    \n"  // xmm0 = 0xff000000: alpha mask
2658     "pcmpeqb   %%xmm1,%%xmm1                   \n"
2659     "psrld     $0x8,%%xmm1                     \n"  // xmm1 = 0x00ffffff: color mask
2660     LABELALIGN
2661   "1:                                          \n"
2662     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
2663     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
2664     "lea       " MEMLEA(0x20,0) ",%0           \n"
2665     "movdqu    " MEMACCESS(1) ",%%xmm4         \n"  // read-modify-write on dst
2666     "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
2667     "pand      %%xmm0,%%xmm2                   \n"
2668     "pand      %%xmm0,%%xmm3                   \n"
2669     "pand      %%xmm1,%%xmm4                   \n"
2670     "pand      %%xmm1,%%xmm5                   \n"
2671     "por       %%xmm4,%%xmm2                   \n"
2672     "por       %%xmm5,%%xmm3                   \n"
2673     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2674     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2675     "lea       " MEMLEA(0x20,1) ",%1           \n"
2676     "sub       $0x8,%2                         \n"
2677     "jg        1b                              \n"
2678   : "+r"(src),   // %0
2679     "+r"(dst),   // %1
2680     "+r"(width)  // %2
2681   :
2682   : "memory", "cc"
2683     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2684   );
2685 }
2686 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
2687
2688 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2689 // width in pixels
// AVX2 version of ARGBCopyAlphaRow: replaces the mask/or sequence with
// vpblendvb, using ymm0 = 0x00ffffff per pixel so dst's BGR bytes are blended
// over src while src's alpha byte survives.  16 pixels per iteration.
2690 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2691   asm volatile (
2692     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
2693     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = 0x00ffffff blend mask
2694     LABELALIGN
2695   "1:                                          \n"
2696     "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
2697     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
2698     "lea       " MEMLEA(0x40,0) ",%0           \n"
2699     "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
2700     "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
2701     "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
2702     "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
2703     "lea       " MEMLEA(0x40,1) ",%1           \n"
2704     "sub       $0x10,%2                        \n"
2705     "jg        1b                              \n"
2706     "vzeroupper                                \n"
2707   : "+r"(src),   // %0
2708     "+r"(dst),   // %1
2709     "+r"(width)  // %2
2710   :
2711   : "memory", "cc"
2712     , "xmm0", "xmm1", "xmm2"
2713   );
2714 }
2715 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
2716
2717 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2718 // width in pixels
// Writes 8 Y bytes into the alpha channel of 8 dst ARGB pixels, preserving
// dst's BGR bytes.  The Y bytes are widened so each pixel's top byte holds Y,
// then masked/OR'd exactly like ARGBCopyAlphaRow_SSE2.
2719 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2720   asm volatile (
2721     "pcmpeqb   %%xmm0,%%xmm0                   \n"
2722     "pslld     $0x18,%%xmm0                    \n"  // xmm0 = 0xff000000: alpha mask
2723     "pcmpeqb   %%xmm1,%%xmm1                   \n"
2724     "psrld     $0x8,%%xmm1                     \n"  // xmm1 = 0x00ffffff: color mask
2725     LABELALIGN
2726   "1:                                          \n"
2727     "movq      " MEMACCESS(0) ",%%xmm2         \n"
2728     "lea       " MEMLEA(0x8,0) ",%0            \n"
2729     "punpcklbw %%xmm2,%%xmm2                   \n"
// NOTE(review): xmm3 is read here without being initialized; only the high
// word of each dword it contributes survives the 0xff000000 mask below, and
// that word comes from xmm2, so the garbage never reaches the output.
2730     "punpckhwd %%xmm2,%%xmm3                   \n"
2731     "punpcklwd %%xmm2,%%xmm2                   \n"
2732     "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
2733     "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
2734     "pand      %%xmm0,%%xmm2                   \n"
2735     "pand      %%xmm0,%%xmm3                   \n"
2736     "pand      %%xmm1,%%xmm4                   \n"
2737     "pand      %%xmm1,%%xmm5                   \n"
2738     "por       %%xmm4,%%xmm2                   \n"
2739     "por       %%xmm5,%%xmm3                   \n"
2740     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2741     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2742     "lea       " MEMLEA(0x20,1) ",%1           \n"
2743     "sub       $0x8,%2                         \n"
2744     "jg        1b                              \n"
2745   : "+r"(src),   // %0
2746     "+r"(dst),   // %1
2747     "+r"(width)  // %2
2748   :
2749   : "memory", "cc"
2750     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2751   );
2752 }
2753 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
2754
2755 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
2756 // width in pixels
// AVX2 version: copies a row of Y bytes into the alpha channel of a row of
// ARGB pixels.  Processes 16 pixels per iteration (sub $0x10).
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per dword: blend mask selecting the existing B,G,R
    // bytes from memory (vpblendvb takes the memory byte where the mask
    // byte's MSB is set).
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    // Zero-extend 8 Y bytes to dwords, twice, then shift each Y into the
    // alpha (top) byte of its dword.
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    // Keep BGR from the destination, take the new alpha from ymm1/ymm2.
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
2784 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
2785
2786 #ifdef HAS_SETROW_X86
2787 void SetRow_X86(uint8* dst, uint8 v8, int width) {
2788   size_t width_tmp = (size_t)(width >> 2);
2789   const uint32 v32 = v8 * 0x01010101;  // Duplicate byte to all bytes.
2790   asm volatile (
2791     "rep stosl " MEMSTORESTRING(eax,0) "       \n"
2792     : "+D"(dst),       // %0
2793       "+c"(width_tmp)  // %1
2794     : "a"(v32)         // %2
2795     : "memory", "cc");
2796 }
2797
// Fills 'width' bytes at dst with the byte value v8 using byte string stores
// (rep stosb).  Intended for CPUs with fast rep-string support (per the
// "_ERMS" suffix used throughout this file).  Covers every byte, so no
// rounding of width is needed, unlike SetRow_X86.
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosb " MEMSTORESTRING(al,0) "        \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v8)          // %2
    : "memory", "cc");
}
2807
// Stores the 32-bit ARGB value v32 'width' times (one dword per pixel) into
// dst_argb using rep stosl.  Here width counts pixels, not bytes, so no
// shift is applied to the store count.
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
    : "+D"(dst_argb),  // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}
2817 #endif  // HAS_SETROW_X86
2818
2819 #ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y channel from a row of YUY2 (byte order Y0 U0 Y1 V0, so Y
// occupies the even bytes).  Produces 16 Y bytes (32 input bytes) per
// iteration.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in each word: keeps the even (Y) byte of each pair.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    // Pack the 16 Y words down to 16 bytes.
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
2844
// Extracts subsampled U and V planes from two rows of YUY2.  The current row
// is averaged with the row stride_yuy2 bytes away (vertical 2x subsample via
// pavgb), then the chroma bytes are deinterleaved into dst_u and dst_v.
// Produces 8 U and 8 V bytes per iteration (16 pixels of input).
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Average this row with the adjacent row.
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    // Drop the Y bytes (even positions), pack to U0 V0 U1 V1 ... bytes.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // Split U (even bytes) from V (odd bytes).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
2882
// Extracts U and V from a single row of YUY2 (no vertical averaging, i.e.
// 4:2:2 chroma).  Otherwise identical to YUY2ToUVRow_SSE2: 8 U and 8 V
// bytes per iteration from 16 pixels.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Drop the Y bytes, pack to U0 V0 U1 V1 ... bytes.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // Split U (even bytes) from V (odd bytes).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
2916
// Extracts the Y channel from a row of UYVY (byte order U0 Y0 V0 Y1, so Y
// occupies the odd bytes — hence the word shift instead of a mask).
// Produces 16 Y bytes per iteration.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Shift each word right 8 to move the odd (Y) byte into the low byte.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
2939
// Extracts subsampled U and V planes from two rows of UYVY (U0 Y0 V0 Y1).
// The two rows are averaged with pavgb, the chroma bytes (even positions)
// are kept, then U and V are deinterleaved.  Produces 8 U and 8 V bytes per
// iteration (16 pixels of input).
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Average this row with the adjacent row.
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    // Keep the even (chroma) bytes, pack to U0 V0 U1 V1 ... bytes.
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // Split U (even bytes) from V (odd bytes).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
2977
// Extracts U and V from a single row of UYVY (no vertical averaging, i.e.
// 4:2:2 chroma).  Same layout handling as UYVYToUVRow_SSE2: 8 U and 8 V
// bytes per iteration from 16 pixels.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Keep the even (chroma) bytes, pack to U0 V0 U1 V1 ... bytes.
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // Split U (even bytes) from V (odd bytes).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3011 #endif  // HAS_YUY2TOYROW_SSE2
3012
3013 #ifdef HAS_YUY2TOYROW_AVX2
// AVX2 version of YUY2ToYRow: extracts the even (Y) bytes from a row of
// YUY2.  Produces 32 Y bytes per iteration.  The vpermq after vpackuswb
// undoes the 128-bit-lane interleave that pack introduces.
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    // ymm5 = 0x00ff word mask.
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
3040
// AVX2 version of YUY2ToUVRow: averages two rows of YUY2, then splits the
// chroma into dst_u and dst_v.  Produces 16 U and 16 V bytes per iteration
// (32 pixels of input).
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // ymm5 = 0x00ff word mask.
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    // Average with the row stride_yuy2 bytes away.
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Drop Y bytes, pack to U V U V ... bytes; vpermq fixes lane order.
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // ymm1 = U (even bytes), ymm0 = V (odd bytes); pack each to 16 bytes.
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3079
// AVX2 version of YUY2ToUV422Row: single-row (4:2:2) chroma extraction from
// YUY2, no vertical averaging.  16 U and 16 V bytes per iteration.
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // ymm5 = 0x00ff word mask.
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Drop Y bytes, pack to U V U V ... bytes; vpermq fixes lane order.
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // Split U (even bytes) and V (odd bytes), pack each to 16 bytes.
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3116
// AVX2 version of UYVYToYRow: extracts the odd (Y) bytes from a row of UYVY.
// Produces 32 Y bytes per iteration.
// NOTE(review): xmm5 appears in the clobber list but is never written or
// read by this routine; harmless, but inconsistent with the SSE2 variant.
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Shift each word right 8 to select the odd (Y) byte.
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
// AVX2 version of UYVYToUVRow: averages two rows of UYVY, keeps the even
// (chroma) bytes, and splits them into dst_u and dst_v.  Produces 16 U and
// 16 V bytes per iteration (32 pixels of input).
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // ymm5 = 0x00ff word mask.
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    // Average with the row stride_uyvy bytes away.
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Keep chroma (even bytes), pack to U V U V ...; vpermq fixes lanes.
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // Split U (even bytes) and V (odd bytes), pack each to 16 bytes.
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3180
// AVX2 version of UYVYToUV422Row: single-row (4:2:2) chroma extraction from
// UYVY, no vertical averaging.  16 U and 16 V bytes per iteration.
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // ymm5 = 0x00ff word mask.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    // Make %2 the offset dst_v - dst_u so only one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Keep chroma (even bytes), pack to U V U V ...; vpermq fixes lanes.
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // Split U (even bytes) and V (odd bytes), pack each to 16 bytes.
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3217 #endif  // HAS_YUY2TOYROW_AVX2
3218
3219 #ifdef HAS_ARGBBLENDROW_SSE2
// Alpha-blends a row of src_argb0 over src_argb1 into dst_argb:
// per channel, dst = src0 + src1 * (256 - alpha0) / 256, with the result's
// alpha forced to 0xff.  A 4-pixel main loop handles most of the row, then
// a 1-pixel tail loop handles widths that are not a multiple of 4.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    // Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word (low-byte
    // mask), xmm5 = 0xff00 per word (high-byte mask), xmm4 = 0xff000000 per
    // dword (alpha-byte mask).
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "41:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // Invert the alpha byte (255 - a), then broadcast it to all four word
    // lanes of each pixel and add 1, giving the (256 - a) multiplier.
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    // Scale src1's low bytes (B, R) and high bytes (G, A) separately.
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    // Force result alpha to 0xff, then add the scaled background with
    // byte saturation.
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       41b                             \n"

  "49:                                         \n"
    // Restore the 0-3 remaining pixels (width was biased by -4 above).
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3303 #endif  // HAS_ARGBBLENDROW_SSE2
3304
3305 #ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha: pshufb with this table copies the
// (inverted) alpha byte of each pixel (offsets 3/7/11/15) into the low byte
// of every 16-bit lane; the 0x80 entries make pshufb write zero into the
// high bytes, yielding one alpha word per channel.
static uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
3311
// Blend 4 pixels at a time, with a 1 pixel tail loop.
3313 // Shuffle table for reversing the bytes.
3314
3315 // Same as SSE2, but replaces
3316 //    psrlw      xmm3, 8          // alpha
3317 //    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3318 //    pshuflw    xmm3, xmm3,0F5h
3319 // with..
3320 //    pshufb     xmm3, kShuffleAlpha // alpha
3321
// SSSE3 alpha blend: same math and loop structure as ARGBBlendRow_SSE2
// (dst = src0 + src1 * (256 - alpha0) / 256, alpha forced to 0xff), but
// the three-instruction alpha broadcast (psrlw/pshufhw/pshuflw) is replaced
// by a single pshufb through kShuffleAlpha.
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    // Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword.
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // Invert alpha, broadcast (255 - a) via pshufb, add 1 -> (256 - a).
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    // Force result alpha to 0xff, saturating-add the scaled background.
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Restore the 0-3 remaining pixels (width was biased by -4 above).
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3400 #endif  // HAS_ARGBBLENDROW_SSSE3
3401
3402 #ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time: multiplies each pixel's B, G and R by its
// alpha (approximately v * a / 255 via a 16-bit high multiply), leaving the
// alpha byte itself unchanged.
// NOTE(review): xmm3 is in the clobber list but never used here; harmless.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm4 = 0xff000000 per dword (alpha mask), xmm5 = 0x00ffffff per dword
    // (BGR mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Low 2 pixels: punpcklbw with itself makes each byte a word v*0x101;
    // pshufhw/pshuflw 0xff broadcasts the alpha word; pmulhuw keeps the
    // high 16 bits of (v*0x101)*(a*0x101), i.e. ~v*a.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High 2 pixels, same scheme.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Re-read the source to recover the original alpha bytes.
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // Combine attenuated BGR with the original alpha.
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3444 #endif  // HAS_ARGBATTENUATEROW_SSE2
3445
3446 #ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha for ARGBAttenuateRow_SSSE3.  pshufb with
// these replicates the alpha byte (offset 3 for the low pixel, 7 / 11 / 15
// for the others) into the byte pairs of the B, G and R word lanes, giving
// 0xAAAA per lane; the 128 (0x80) entries write zero into the alpha lane so
// the pmulhuw leaves it cleared (original alpha is OR'd back separately).
static uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
};
// Same table for the high two pixels of a 16-byte load.
static uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
};
3455 // Attenuate 4 pixels at a time.
3456 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3457   asm volatile (
3458     "pcmpeqb   %%xmm3,%%xmm3                   \n"
3459     "pslld     $0x18,%%xmm3                    \n"
3460     "movdqa    %3,%%xmm4                       \n"
3461     "movdqa    %4,%%xmm5                       \n"
3462
3463     // 4 pixel loop.
3464     LABELALIGN
3465   "1:                                          \n"
3466     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3467     "pshufb    %%xmm4,%%xmm0                   \n"
3468     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3469     "punpcklbw %%xmm1,%%xmm1                   \n"
3470     "pmulhuw   %%xmm1,%%xmm0                   \n"
3471     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3472     "pshufb    %%xmm5,%%xmm1                   \n"
3473     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3474     "punpckhbw %%xmm2,%%xmm2                   \n"
3475     "pmulhuw   %%xmm2,%%xmm1                   \n"
3476     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3477     "lea       " MEMLEA(0x10,0) ",%0           \n"
3478     "pand      %%xmm3,%%xmm2                   \n"
3479     "psrlw     $0x8,%%xmm0                     \n"
3480     "psrlw     $0x8,%%xmm1                     \n"
3481     "packuswb  %%xmm1,%%xmm0                   \n"
3482     "por       %%xmm2,%%xmm0                   \n"
3483     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3484     "lea       " MEMLEA(0x10,1) ",%1           \n"
3485     "sub       $0x4,%2                         \n"
3486     "jg        1b                              \n"
3487   : "+r"(src_argb),    // %0
3488     "+r"(dst_argb),    // %1
3489     "+r"(width)        // %2
3490   : "m"(kShuffleAlpha0),  // %3
3491     "m"(kShuffleAlpha1)  // %4
3492   : "memory", "cc"
3493     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3494   );
3495 }
3496 #endif  // HAS_ARGBATTENUATEROW_SSSE3
3497
3498 #ifdef HAS_ARGBATTENUATEROW_AVX2
3499 // Shuffle table duplicating alpha.
// Operates on the word-expanded pixels produced by vpunpck{l,h}bw, so alpha
// occupies word offsets 6..7 / 14..15; index 128 zeroes the alpha word so
// alpha itself is not attenuated.  Broadcast to both ymm lanes at runtime.
3500 static const uvec8 kShuffleAlpha_AVX2 = {
3501   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
3502 };
3503 // Attenuate 8 pixels at a time.
// AVX2 version of ARGBAttenuateRow: premultiplies B/G/R by alpha using the
// same word-duplicate + vpmulhuw approximation, preserving the alpha byte
// through the 0xFF000000 mask in ymm5.  dst is addressed relative to src
// (the leading "sub %0,%1" turns %1 into dst-src) so only one pointer is
// advanced per iteration.  width is assumed a multiple of 8 -- TODO confirm.
3504 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3505   asm volatile (
3506     "vbroadcastf128 %3,%%ymm4                  \n"
3507     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3508     "vpslld     $0x18,%%ymm5,%%ymm5            \n"
    // %1 now holds dst - src; stores below use (%0,%1) = dst.
3509     "sub        %0,%1                          \n"
3510
3511     // 8 pixel loop.
3512     LABELALIGN
3513   "1:                                          \n"
3514     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3515     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3516     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
3517     "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
3518     "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
3519     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3520     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3521     "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
3522     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
3523     "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
3524     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3525     "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
3526     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3527     "lea       " MEMLEA(0x20,0) ",%0           \n"
3528     "sub        $0x8,%2                        \n"
3529     "jg        1b                              \n"
3530     "vzeroupper                                \n"
3531   : "+r"(src_argb),    // %0
3532     "+r"(dst_argb),    // %1
3533     "+r"(width)        // %2
3534   : "m"(kShuffleAlpha_AVX2)  // %3
3535   : "memory", "cc"
3536     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3537   );
3538 }
3539 #endif  // HAS_ARGBATTENUATEROW_AVX2
3540
3541 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3542 // Unattenuate 4 pixels at a time.
// Undoes premultiplied alpha: for each pixel, the per-alpha reciprocal is
// looked up in fixed_invtbl8 (declared elsewhere; presumably a 256-entry
// fixed-point 1/alpha table -- verify against its definition) and applied to
// the word-duplicated B/G/R channels with pmulhuw.  'alpha' is a scratch
// integer register used as the table index for each pixel's alpha byte.
// width is assumed a positive multiple of 4 -- TODO confirm with callers.
3543 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3544                              int width) {
3545   uintptr_t alpha = 0;
3546   asm volatile (
3547     // 4 pixel loop.
3548     LABELALIGN
3549   "1:                                          \n"
3550     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    // Load alpha of pixel 0 (byte 3) and fetch its reciprocal.
3551     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3552     "punpcklbw %%xmm0,%%xmm0                   \n"
3553     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3554     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3555     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    // pshuflw $0x40 + movlhps spread each reciprocal across its pixel's words.
3556     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3557     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3558     "movlhps   %%xmm3,%%xmm2                   \n"
3559     "pmulhuw   %%xmm2,%%xmm0                   \n"
3560     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3561     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3562     "punpckhbw %%xmm1,%%xmm1                   \n"
3563     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3564     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3565     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
3566     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3567     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3568     "movlhps   %%xmm3,%%xmm2                   \n"
3569     "pmulhuw   %%xmm2,%%xmm1                   \n"
3570     "lea       " MEMLEA(0x10,0) ",%0           \n"
3571     "packuswb  %%xmm1,%%xmm0                   \n"
3572     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3573     "lea       " MEMLEA(0x10,1) ",%1           \n"
3574     "sub       $0x4,%2                         \n"
3575     "jg        1b                              \n"
3576   : "+r"(src_argb),    // %0
3577     "+r"(dst_argb),    // %1
3578     "+r"(width),       // %2
3579     "+r"(alpha)        // %3
3580   : "r"(fixed_invtbl8)  // %4
    // NACL_R14 is a macro that expands to the r14 clobber (with its own
    // trailing comma) under Native Client, or to nothing otherwise; that is
    // why no comma follows it here.  xmm4/xmm5 are listed but unused in this
    // routine -- a conservative over-declaration, harmless to keep.
3581   : "memory", "cc", NACL_R14
3582     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3583   );
3584 }
3585 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
3586
3587 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3588 // Shuffle table duplicating alpha.
// Applied after vpunpck{l,h}wd has doubled each 16-bit reciprocal; spreads
// one pixel's reciprocal word across its three color words while keeping the
// alpha word in place.
3589 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3590   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
3591 };
3592 // Unattenuate 8 pixels at a time.
// AVX2 version: gathers the eight per-alpha reciprocals from fixed_invtbl8
// with scalar movzb + vmovd loads (the "replace VPGATHER" sequence below --
// per the existing comment this stands in for a hardware gather), assembles
// them into ymm3, then multiplies the word-duplicated channels.  dst is
// addressed as (src + (dst-src)) after the leading "sub %0,%1".  'alpha' is
// a scratch index register.  width assumed a multiple of 8 -- TODO confirm.
3593 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3594                              int width) {
3595   uintptr_t alpha = 0;
3596   asm volatile (
3597     "sub        %0,%1                          \n"
3598     "vbroadcastf128 %5,%%ymm5                  \n"
3599
3600     // 8 pixel loop.
3601     LABELALIGN
3602   "1:                                          \n"
3603     // replace VPGATHER
3604     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3605     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3606     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3607     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3608     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3609     "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
3610     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3611     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3612     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3613     "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
3614     "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
3615     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3616     "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
3617     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3618     "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
3619     "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
3620     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3621     "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
3622     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3623     "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
3624     "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
3625     "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
    // ymm3 now holds all 8 gathered reciprocal entries.
3626     "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
3627     // end of VPGATHER
3628
3629     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3630     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3631     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
3632     "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
3633     "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
3634     "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
3635     "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
3636     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3637     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3638     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3639     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3640     "lea       " MEMLEA(0x20,0) ",%0           \n"
3641     "sub        $0x8,%2                        \n"
3642     "jg        1b                              \n"
3643     "vzeroupper                                \n"
3644   : "+r"(src_argb),    // %0
3645     "+r"(dst_argb),    // %1
3646     "+r"(width),       // %2
3647     "+r"(alpha)        // %3
3648   : "r"(fixed_invtbl8),  // %4
3649     "m"(kUnattenShuffleAlpha_AVX2)  // %5
    // NACL_R14 expands to the r14 clobber plus comma on NaCl, else nothing.
3650   : "memory", "cc", NACL_R14
3651     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3652   );
3653 }
3654 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
3655
3656 #ifdef HAS_ARGBGRAYROW_SSSE3
3657 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Computes a luma value per pixel with the kARGBToYJ coefficients
// (pmaddubsw + phaddw, rounded by kAddYJ64 -- a constant defined elsewhere
// in this file, presumably 64 = half of the >>7 divisor for rounding; verify
// against its definition), replicates it into B, G and R, and keeps the
// original alpha byte.  Note: 8 pixels are 32 bytes, despite the "(64
// bytes)" in the upstream comment above.  width assumed a multiple of 8.
3658 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3659   asm volatile (
3660     "movdqa    %3,%%xmm4                       \n"
3661     "movdqa    %4,%%xmm5                       \n"
3662
3663     // 8 pixel loop.
3664     LABELALIGN
3665   "1:                                          \n"
3666     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3667     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3668     "pmaddubsw %%xmm4,%%xmm0                   \n"
3669     "pmaddubsw %%xmm4,%%xmm1                   \n"
3670     "phaddw    %%xmm1,%%xmm0                   \n"
3671     "paddw     %%xmm5,%%xmm0                   \n"
3672     "psrlw     $0x7,%%xmm0                     \n"
    // xmm0 low 8 bytes = one gray value per pixel.
3673     "packuswb  %%xmm0,%%xmm0                   \n"
3674     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3675     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
3676     "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Extract the 8 alpha bytes (psrld $0x18 leaves alpha in the low byte).
3677     "psrld     $0x18,%%xmm2                    \n"
3678     "psrld     $0x18,%%xmm3                    \n"
3679     "packuswb  %%xmm3,%%xmm2                   \n"
3680     "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave gray,gray,gray,alpha back into 8 ARGB pixels.
3681     "movdqa    %%xmm0,%%xmm3                   \n"
3682     "punpcklbw %%xmm0,%%xmm0                   \n"
3683     "punpcklbw %%xmm2,%%xmm3                   \n"
3684     "movdqa    %%xmm0,%%xmm1                   \n"
3685     "punpcklwd %%xmm3,%%xmm0                   \n"
3686     "punpckhwd %%xmm3,%%xmm1                   \n"
3687     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3688     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
3689     "lea       " MEMLEA(0x20,1) ",%1           \n"
3690     "sub       $0x8,%2                         \n"
3691     "jg        1b                              \n"
3692   : "+r"(src_argb),   // %0
3693     "+r"(dst_argb),   // %1
3694     "+r"(width)       // %2
3695   : "m"(kARGBToYJ),   // %3
3696     "m"(kAddYJ64)     // %4
3697   : "memory", "cc"
3698     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3699   );
3700 }
3701 #endif  // HAS_ARGBGRAYROW_SSSE3
3702
3703 #ifdef HAS_ARGBSEPIAROW_SSSE3
3704 //    b = (r * 35 + g * 68 + b * 17) >> 7
3705 //    g = (r * 45 + g * 88 + b * 22) >> 7
3706 //    r = (r * 50 + g * 98 + b * 24) >> 7
3707 // Constant for ARGB color to sepia tone
// Coefficients are stored in memory byte order (B, G, R, A-unused), matching
// the formulas above; one vec8 per output channel, used with pmaddubsw.
3708 static vec8 kARGBToSepiaB = {
3709   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3710 };
3711
3712 static vec8 kARGBToSepiaG = {
3713   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3714 };
3715
3716 static vec8 kARGBToSepiaR = {
3717   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3718 };
3719
3720 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place transform: dst_argb is both input and output.  Each output
// channel is a pmaddubsw/phaddw dot product with one of the tables above,
// shifted down by 7 (no rounding term); alpha is carried through unchanged.
// width assumed a positive multiple of 8 -- TODO confirm with callers.
3721 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3722   asm volatile (
3723     "movdqa    %2,%%xmm2                       \n"
3724     "movdqa    %3,%%xmm3                       \n"
3725     "movdqa    %4,%%xmm4                       \n"
3726
3727     // 8 pixel loop.
3728     LABELALIGN
3729   "1:                                          \n"
    // Blue channel for all 8 pixels -> low 8 bytes of xmm0.
3730     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3731     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
3732     "pmaddubsw %%xmm2,%%xmm0                   \n"
3733     "pmaddubsw %%xmm2,%%xmm6                   \n"
3734     "phaddw    %%xmm6,%%xmm0                   \n"
3735     "psrlw     $0x7,%%xmm0                     \n"
3736     "packuswb  %%xmm0,%%xmm0                   \n"
    // Green channel, then interleave B,G pairs.
3737     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3738     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3739     "pmaddubsw %%xmm3,%%xmm5                   \n"
3740     "pmaddubsw %%xmm3,%%xmm1                   \n"
3741     "phaddw    %%xmm1,%%xmm5                   \n"
3742     "psrlw     $0x7,%%xmm5                     \n"
3743     "packuswb  %%xmm5,%%xmm5                   \n"
3744     "punpcklbw %%xmm5,%%xmm0                   \n"
    // Red channel.
3745     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3746     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3747     "pmaddubsw %%xmm4,%%xmm5                   \n"
3748     "pmaddubsw %%xmm4,%%xmm1                   \n"
3749     "phaddw    %%xmm1,%%xmm5                   \n"
3750     "psrlw     $0x7,%%xmm5                     \n"
3751     "packuswb  %%xmm5,%%xmm5                   \n"
    // Preserve original alpha bytes, interleave R,A, then B,G with R,A.
3752     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3753     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3754     "psrld     $0x18,%%xmm6                    \n"
3755     "psrld     $0x18,%%xmm1                    \n"
3756     "packuswb  %%xmm1,%%xmm6                   \n"
3757     "packuswb  %%xmm6,%%xmm6                   \n"
3758     "punpcklbw %%xmm6,%%xmm5                   \n"
3759     "movdqa    %%xmm0,%%xmm1                   \n"
3760     "punpcklwd %%xmm5,%%xmm0                   \n"
3761     "punpckhwd %%xmm5,%%xmm1                   \n"
3762     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
3763     "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
3764     "lea       " MEMLEA(0x20,0) ",%0           \n"
3765     "sub       $0x8,%1                         \n"
3766     "jg        1b                              \n"
3767   : "+r"(dst_argb),      // %0
3768     "+r"(width)          // %1
3769   : "m"(kARGBToSepiaB),  // %2
3770     "m"(kARGBToSepiaG),  // %3
3771     "m"(kARGBToSepiaR)   // %4
3772   : "memory", "cc"
3773     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3774   );
3775 }
3776 #endif  // HAS_ARGBSEPIAROW_SSSE3
3777
3778 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3779 // Transform 8 ARGB pixels (32 bytes) with color matrix.
3780 // Same as Sepia except matrix is provided.
// matrix_argb points to 16 signed bytes: 4 coefficient quadruples, one per
// output channel (broadcast via pshufd below).  Products are accumulated
// with pmaddubsw/phaddsw (signed saturation) and scaled with psraw $6, so
// coefficients are presumably in 1.6 fixed point -- verify against callers.
// Unlike Sepia, alpha is also produced by the matrix (row selected by the
// $0xff pshufd), not copied from the source.  width assumed multiple of 8.
3781 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
3782                               const int8* matrix_argb, int width) {
3783   asm volatile (
    // Broadcast each 4-byte coefficient row to all lanes: xmm2=B, xmm3=G,
    // xmm4=R, xmm5=A.
3784     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
3785     "pshufd    $0x00,%%xmm5,%%xmm2             \n"
3786     "pshufd    $0x55,%%xmm5,%%xmm3             \n"
3787     "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
3788     "pshufd    $0xff,%%xmm5,%%xmm5             \n"
3789
3790     // 8 pixel loop.
3791     LABELALIGN
3792   "1:                                          \n"
3793     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3794     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3795     "pmaddubsw %%xmm2,%%xmm0                   \n"
3796     "pmaddubsw %%xmm2,%%xmm7                   \n"
3797     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3798     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3799     "pmaddubsw %%xmm3,%%xmm6                   \n"
3800     "pmaddubsw %%xmm3,%%xmm1                   \n"
3801     "phaddsw   %%xmm7,%%xmm0                   \n"
3802     "phaddsw   %%xmm1,%%xmm6                   \n"
3803     "psraw     $0x6,%%xmm0                     \n"
3804     "psraw     $0x6,%%xmm6                     \n"
3805     "packuswb  %%xmm0,%%xmm0                   \n"
3806     "packuswb  %%xmm6,%%xmm6                   \n"
    // xmm0 = interleaved B,G bytes for 8 pixels.
3807     "punpcklbw %%xmm6,%%xmm0                   \n"
3808     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3809     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3810     "pmaddubsw %%xmm4,%%xmm1                   \n"
3811     "pmaddubsw %%xmm4,%%xmm7                   \n"
3812     "phaddsw   %%xmm7,%%xmm1                   \n"
3813     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3814     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3815     "pmaddubsw %%xmm5,%%xmm6                   \n"
3816     "pmaddubsw %%xmm5,%%xmm7                   \n"
3817     "phaddsw   %%xmm7,%%xmm6                   \n"
3818     "psraw     $0x6,%%xmm1                     \n"
3819     "psraw     $0x6,%%xmm6                     \n"
3820     "packuswb  %%xmm1,%%xmm1                   \n"
3821     "packuswb  %%xmm6,%%xmm6                   \n"
    // xmm1 = interleaved R,A bytes; merge to full BGRA pixels.
3822     "punpcklbw %%xmm6,%%xmm1                   \n"
3823     "movdqa    %%xmm0,%%xmm6                   \n"
3824     "punpcklwd %%xmm1,%%xmm0                   \n"
3825     "punpckhwd %%xmm1,%%xmm6                   \n"
3826     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3827     "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
3828     "lea       " MEMLEA(0x20,0) ",%0           \n"
3829     "lea       " MEMLEA(0x20,1) ",%1           \n"
3830     "sub       $0x8,%2                         \n"
3831     "jg        1b                              \n"
3832   : "+r"(src_argb),      // %0
3833     "+r"(dst_argb),      // %1
3834     "+r"(width)          // %2
3835   : "r"(matrix_argb)     // %3
3836   : "memory", "cc"
3837     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3838   );
3839 }
3840 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3841
3842 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3843 // Quantize 4 ARGB pixels (16 bytes).
// In-place posterize: each B/G/R byte becomes
//   ((c * scale) >> 16) * interval_size + interval_offset
// (scale applied via pmulhuw on zero-extended words, so scale is presumably
// a 16-bit fixed-point factor -- verify against callers).  Alpha is kept
// unchanged via the 0xFF000000 mask in xmm6.  width assumed multiple of 4.
3844 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3845                           int interval_offset, int width) {
3846   asm volatile (
    // Broadcast the three scalars to every word lane: xmm2=scale,
    // xmm3=interval_size, xmm4=interval_offset.
3847     "movd      %2,%%xmm2                       \n"
3848     "movd      %3,%%xmm3                       \n"
3849     "movd      %4,%%xmm4                       \n"
3850     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3851     "pshufd    $0x44,%%xmm2,%%xmm2             \n"
3852     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3853     "pshufd    $0x44,%%xmm3,%%xmm3             \n"
3854     "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
3855     "pshufd    $0x44,%%xmm4,%%xmm4             \n"
3856     "pxor      %%xmm5,%%xmm5                   \n"
3857     "pcmpeqb   %%xmm6,%%xmm6                   \n"
3858     "pslld     $0x18,%%xmm6                    \n"
3859
3860     // 4 pixel loop.
3861     LABELALIGN
3862   "1:                                          \n"
3863     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3864     "punpcklbw %%xmm5,%%xmm0                   \n"
3865     "pmulhuw   %%xmm2,%%xmm0                   \n"
3866     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3867     "punpckhbw %%xmm5,%%xmm1                   \n"
3868     "pmulhuw   %%xmm2,%%xmm1                   \n"
3869     "pmullw    %%xmm3,%%xmm0                   \n"
3870     "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
3871     "pmullw    %%xmm3,%%xmm1                   \n"
    // xmm7 keeps the original alpha bytes.
3872     "pand      %%xmm6,%%xmm7                   \n"
3873     "paddw     %%xmm4,%%xmm0                   \n"
3874     "paddw     %%xmm4,%%xmm1                   \n"
3875     "packuswb  %%xmm1,%%xmm0                   \n"
3876     "por       %%xmm7,%%xmm0                   \n"
3877     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
3878     "lea       " MEMLEA(0x10,0) ",%0           \n"
3879     "sub       $0x4,%1                         \n"
3880     "jg        1b                              \n"
3881   : "+r"(dst_argb),       // %0
3882     "+r"(width)           // %1
3883   : "r"(scale),           // %2
3884     "r"(interval_size),   // %3
3885     "r"(interval_offset)  // %4
3886   : "memory", "cc"
3887     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3888   );
3889 }
3890 #endif  // HAS_ARGBQUANTIZEROW_SSE2
3891
3892 #ifdef HAS_ARGBSHADEROW_SSE2
3893 // Shade 4 pixels at a time by specified value.
// 'value' is a packed 32-bit ARGB multiplier.  Each channel byte of every
// pixel is multiplied by the corresponding channel byte of 'value': both
// sides are byte-duplicated into words (c*257 form) and combined with
// pmulhuw then psrlw $8, approximating (c * v) / 255.  All four channels,
// including alpha, are shaded.  width assumed a positive multiple of 4.
3894 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3895                        uint32 value) {
3896   asm volatile (
    // Duplicate value's bytes into words and replicate across both pixels'
    // halves of xmm2.
3897     "movd      %3,%%xmm2                       \n"
3898     "punpcklbw %%xmm2,%%xmm2                   \n"
3899     "punpcklqdq %%xmm2,%%xmm2                  \n"
3900
3901     // 4 pixel loop.
3902     LABELALIGN
3903   "1:                                          \n"
3904     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3905     "lea       " MEMLEA(0x10,0) ",%0           \n"
3906     "movdqa    %%xmm0,%%xmm1                   \n"
3907     "punpcklbw %%xmm0,%%xmm0                   \n"
3908     "punpckhbw %%xmm1,%%xmm1                   \n"
3909     "pmulhuw   %%xmm2,%%xmm0                   \n"
3910     "pmulhuw   %%xmm2,%%xmm1                   \n"
3911     "psrlw     $0x8,%%xmm0                     \n"
3912     "psrlw     $0x8,%%xmm1                     \n"
3913     "packuswb  %%xmm1,%%xmm0                   \n"
3914     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3915     "lea       " MEMLEA(0x10,1) ",%1           \n"
3916     "sub       $0x4,%2                         \n"
3917     "jg        1b                              \n"
3918   : "+r"(src_argb),  // %0
3919     "+r"(dst_argb),  // %1
3920     "+r"(width)      // %2
3921   : "r"(value)       // %3
3922   : "memory", "cc"
3923     , "xmm0", "xmm1", "xmm2"
3924   );
3925 }
3926 #endif  // HAS_ARGBSHADEROW_SSE2
3927
3928 #ifdef HAS_ARGBMULTIPLYROW_SSE2
3929 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte: dst = (src0 * src1) >> 8, via src0 byte-duplicated into words
// (c*257) and src1 zero-extended, combined with pmulhuw -- an approximation
// of src0*src1/255.  All four channels are multiplied.  width assumed a
// positive multiple of 4 -- TODO confirm with callers.
3930 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3931                           uint8* dst_argb, int width) {
3932   asm volatile (
3933     "pxor      %%xmm5,%%xmm5                  \n"
3934
3935     // 4 pixel loop.
3936     LABELALIGN
3937   "1:                                          \n"
3938     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3939     "lea       " MEMLEA(0x10,0) ",%0           \n"
3940     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3941     "lea       " MEMLEA(0x10,1) ",%1           \n"
    // Register-to-register copies (movdqu works reg-reg as well as mem).
3942     "movdqu    %%xmm0,%%xmm1                   \n"
3943     "movdqu    %%xmm2,%%xmm3                   \n"
3944     "punpcklbw %%xmm0,%%xmm0                   \n"
3945     "punpckhbw %%xmm1,%%xmm1                   \n"
3946     "punpcklbw %%xmm5,%%xmm2                   \n"
3947     "punpckhbw %%xmm5,%%xmm3                   \n"
3948     "pmulhuw   %%xmm2,%%xmm0                   \n"
3949     "pmulhuw   %%xmm3,%%xmm1                   \n"
3950     "packuswb  %%xmm1,%%xmm0                   \n"
3951     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3952     "lea       " MEMLEA(0x10,2) ",%2           \n"
3953     "sub       $0x4,%3                         \n"
3954     "jg        1b                              \n"
3955   : "+r"(src_argb0),  // %0
3956     "+r"(src_argb1),  // %1
3957     "+r"(dst_argb),   // %2
3958     "+r"(width)       // %3
3959   :
3960   : "memory", "cc"
3961     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3962   );
3963 }
3964 #endif  // HAS_ARGBMULTIPLYROW_SSE2
3965
3966 #ifdef HAS_ARGBMULTIPLYROW_AVX2
3967 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 counterpart of ARGBMultiplyRow_SSE2: dst = (src0 * src1) >> 8 per
// byte, using byte-duplicated src0 words against zero-extended src1 words.
// width assumed a positive multiple of 8 -- TODO confirm with callers.
3968 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3969                           uint8* dst_argb, int width) {
3970   asm volatile (
3971     "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
3972
3973     // 8 pixel loop.
3974     LABELALIGN
3975   "1:                                          \n"
3976     "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
3977     "lea        " MEMLEA(0x20,0) ",%0          \n"
3978     "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
3979     "lea        " MEMLEA(0x20,1) ",%1          \n"
3980     "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
3981     "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
3982     "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
3983     "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
3984     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3985     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3986     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3987     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
3988     "lea       " MEMLEA(0x20,2) ",%2           \n"
3989     "sub        $0x8,%3                        \n"
3990     "jg        1b                              \n"
3991     "vzeroupper                                \n"
3992   : "+r"(src_argb0),  // %0
3993     "+r"(src_argb1),  // %1
3994     "+r"(dst_argb),   // %2
3995     "+r"(width)       // %3
3996   :
3997   : "memory", "cc"
    // Clobbers only declared when the compiler knows the ymm registers
    // (same guard as upstream; xmm names cover the ymm aliases).
3998 #if defined(__AVX2__)
3999     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4000 #endif
4001   );
4002 }
4003 #endif  // HAS_ARGBMULTIPLYROW_AVX2
4004
4005 #ifdef HAS_ARGBADDROW_SSE2
4006 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturating add (paddusb) of the two source rows into dst; all
// four channels including alpha.  width assumed a positive multiple of 4.
4007 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4008                      uint8* dst_argb, int width) {
4009   asm volatile (
4010     // 4 pixel loop.
4011     LABELALIGN
4012   "1:                                          \n"
4013     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4014     "lea       " MEMLEA(0x10,0) ",%0           \n"
4015     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4016     "lea       " MEMLEA(0x10,1) ",%1           \n"
4017     "paddusb   %%xmm1,%%xmm0                   \n"
4018     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4019     "lea       " MEMLEA(0x10,2) ",%2           \n"
4020     "sub       $0x4,%3                         \n"
4021     "jg        1b                              \n"
4022   : "+r"(src_argb0),  // %0
4023     "+r"(src_argb1),  // %1
4024     "+r"(dst_argb),   // %2
4025     "+r"(width)       // %3
4026   :
4027   : "memory", "cc"
4028     , "xmm0", "xmm1"
4029   );
4030 }
4031 #endif  // HAS_ARGBADDROW_SSE2
4032
4033 #ifdef HAS_ARGBADDROW_AVX2
4034 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
// (Loop steps 8 pixels / 32 bytes per iteration: sub $0x8 below; the
// upstream comment said 4.)  Per-byte saturating add of the two source
// rows; src_argb1 is used directly as a vpaddusb memory operand.  width
// assumed a positive multiple of 8.
4035 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4036                      uint8* dst_argb, int width) {
4037   asm volatile (
4038     // 8 pixel loop.
4039     LABELALIGN
4040   "1:                                          \n"
4041     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4042     "lea        " MEMLEA(0x20,0) ",%0          \n"
4043     "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4044     "lea        " MEMLEA(0x20,1) ",%1          \n"
4045     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4046     "lea        " MEMLEA(0x20,2) ",%2          \n"
4047     "sub        $0x8,%3                        \n"
4048     "jg        1b                              \n"
4049     "vzeroupper                                \n"
4050   : "+r"(src_argb0),  // %0
4051     "+r"(src_argb1),  // %1
4052     "+r"(dst_argb),   // %2
4053     "+r"(width)       // %3
4054   :
4055   : "memory", "cc"
4056     , "xmm0"
4057   );
4058 }
4059 #endif  // HAS_ARGBADDROW_AVX2
4060
4061 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4062 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per-byte saturating subtract (psubusb): dst = max(src0 - src1, 0) for all
// four channels.  width assumed a positive multiple of 4.
4063 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4064                           uint8* dst_argb, int width) {
4065   asm volatile (
4066     // 4 pixel loop.
4067     LABELALIGN
4068   "1:                                          \n"
4069     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4070     "lea       " MEMLEA(0x10,0) ",%0           \n"
4071     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4072     "lea       " MEMLEA(0x10,1) ",%1           \n"
4073     "psubusb   %%xmm1,%%xmm0                   \n"
4074     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4075     "lea       " MEMLEA(0x10,2) ",%2           \n"
4076     "sub       $0x4,%3                         \n"
4077     "jg        1b                              \n"
4078   : "+r"(src_argb0),  // %0
4079     "+r"(src_argb1),  // %1
4080     "+r"(dst_argb),   // %2
4081     "+r"(width)       // %3
4082   :
4083   : "memory", "cc"
4084     , "xmm0", "xmm1"
4085   );
4086 }
4087 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4088
4089 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4090 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Per-byte saturating subtract: dst = max(src0 - src1, 0); src_argb1 is
// used directly as a vpsubusb memory operand.  width assumed a positive
// multiple of 8.
4091 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4092                           uint8* dst_argb, int width) {
4093   asm volatile (
4094     // 8 pixel loop.
4095     LABELALIGN
4096   "1:                                          \n"
4097     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4098     "lea        " MEMLEA(0x20,0) ",%0          \n"
4099     "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4100     "lea        " MEMLEA(0x20,1) ",%1          \n"
4101     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4102     "lea        " MEMLEA(0x20,2) ",%2          \n"
4103     "sub        $0x8,%3                        \n"
4104     "jg        1b                              \n"
4105     "vzeroupper                                \n"
4106   : "+r"(src_argb0),  // %0
4107     "+r"(src_argb1),  // %1
4108     "+r"(dst_argb),   // %2
4109     "+r"(width)       // %3
4110   :
4111   : "memory", "cc"
4112     , "xmm0"
4113   );
4114 }
4115 #endif  // HAS_ARGBSUBTRACTROW_AVX2
4116
4117 #ifdef HAS_SOBELXROW_SSE2
4118 // SobelX as a matrix is
4119 // -1  0  1
4120 // -2  0  2
4121 // -1  0  1
// Computes |(y0[x]-y0[x+2]) + 2*(y1[x]-y1[x+2]) + (y2[x]-y2[x+2])| per
// pixel, saturated to 8 bits, for 8 pixels per iteration.  The three row
// pointers and the destination are rebased relative to src_y0 (the leading
// subs) so a single advancing register indexes all four.  The absolute
// value is formed as max(v, -v); packuswb saturates to [0,255].  width
// assumed a positive multiple of 8 -- TODO confirm with callers.
4122 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4123                     const uint8* src_y2, uint8* dst_sobelx, int width) {
4124   asm volatile (
    // %1..%3 become offsets from %0; addressing below is (%0,%n,1).
4125     "sub       %0,%1                           \n"
4126     "sub       %0,%2                           \n"
4127     "sub       %0,%3                           \n"
4128     "pxor      %%xmm5,%%xmm5                   \n"
4129
4130     // 8 pixel loop.
4131     LABELALIGN
4132   "1:                                          \n"
4133     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4134     "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
4135     "punpcklbw %%xmm5,%%xmm0                   \n"
4136     "punpcklbw %%xmm5,%%xmm1                   \n"
4137     "psubw     %%xmm1,%%xmm0                   \n"
4138     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4139     MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
4140     "punpcklbw %%xmm5,%%xmm1                   \n"
4141     "punpcklbw %%xmm5,%%xmm2                   \n"
4142     "psubw     %%xmm2,%%xmm1                   \n"
4143     MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
4144     MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
4145     "punpcklbw %%xmm5,%%xmm2                   \n"
4146     "punpcklbw %%xmm5,%%xmm3                   \n"
4147     "psubw     %%xmm3,%%xmm2                   \n"
4148     "paddw     %%xmm2,%%xmm0                   \n"
    // Middle row added twice = weight 2.
4149     "paddw     %%xmm1,%%xmm0                   \n"
4150     "paddw     %%xmm1,%%xmm0                   \n"
    // abs(v) = max(v, 0 - v).
4151     "pxor      %%xmm1,%%xmm1                   \n"
4152     "psubw     %%xmm0,%%xmm1                   \n"
4153     "pmaxsw    %%xmm1,%%xmm0                   \n"
4154     "packuswb  %%xmm0,%%xmm0                   \n"
4155     MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
4156     "lea       " MEMLEA(0x8,0) ",%0            \n"
4157     "sub       $0x8,%4                         \n"
4158     "jg        1b                              \n"
4159   : "+r"(src_y0),      // %0
4160     "+r"(src_y1),      // %1
4161     "+r"(src_y2),      // %2
4162     "+r"(dst_sobelx),  // %3
4163     "+r"(width)        // %4
4164   :
4165   : "memory", "cc", NACL_R14
4166     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4167   );
4168 }
4169 #endif  // HAS_SOBELXROW_SSE2
4170
4171 #ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Computes the absolute vertical Sobel gradient |Gy| for 'width' pixels from
// two source rows (top and bottom; the middle row has zero weight), writing
// saturated bytes to dst_sobely.  Processes 8 pixels per iteration.
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    // Turn %1 and %2 into offsets from %0 so one advancing index register
    // addresses both source rows and the destination.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // top row, column 0
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // col0: top - bottom
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"  // top row, column 1
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // col1: top - bottom
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"  // top row, column 2
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // col2: top - bottom
    "paddw     %%xmm2,%%xmm0                   \n"  // col0 + col2
    "paddw     %%xmm1,%%xmm0                   \n"  // + col1
    "paddw     %%xmm1,%%xmm0                   \n"  // + col1 again (center weight 2)
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"  // xmm1 = -sum
    "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs = max(sum, -sum)
    "packuswb  %%xmm0,%%xmm0                   \n"  // saturate words to bytes
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
4221 #endif  // HAS_SOBELYROW_SSE2
4222
4223 #ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// For each pixel, sobel = saturated(sobelx + sobely); the byte is replicated
// into B, G and R of the output ARGB pixel with alpha forced to 255.
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    // Make %1 an offset from %0 so one index register walks both sources.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // all ones
    "pslld     $0x18,%%xmm5                    \n"  // 0xff000000 per pixel (alpha mask)

    // 16 pixel loop (consumes 16 source bytes, writes 64 ARGB bytes).
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // sobel = satadd(x, y)
    // Replicate each sobel byte 4x (bytes -> words -> dwords) to build
    // gray ARGB pixels, then OR in the alpha mask.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
4272 #endif  // HAS_SOBELROW_SSE2
4273
4274 #ifdef HAS_SOBELTOPLANEROW_SSE2
4275 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
4276 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4277                           uint8* dst_y, int width) {
4278   asm volatile (
4279     "sub       %0,%1                           \n"
4280     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4281     "pslld     $0x18,%%xmm5                    \n"
4282
4283     // 8 pixel loop.
4284     LABELALIGN
4285   "1:                                          \n"
4286     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4287     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4288     "lea       " MEMLEA(0x10,0) ",%0           \n"
4289     "paddusb   %%xmm1,%%xmm0                   \n"
4290     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4291     "lea       " MEMLEA(0x10,2) ",%2           \n"
4292     "sub       $0x10,%3                        \n"
4293     "jg        1b                              \n"
4294   : "+r"(src_sobelx),  // %0
4295     "+r"(src_sobely),  // %1
4296     "+r"(dst_y),       // %2
4297     "+r"(width)        // %3
4298   :
4299   : "memory", "cc", NACL_R14
4300     "xmm0", "xmm1"
4301   );
4302 }
4303 #endif  // HAS_SOBELTOPLANEROW_SSE2
4304
4305 #ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Output per pixel: B = sobely, G = saturated(sobelx + sobely), R = sobelx,
// A = 255 (from the all-ones xmm5 interleaved into the high bytes).
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    // Make %1 an offset from %0 so one index register walks both sources.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff bytes -> alpha = 255

    // 16 pixel loop (consumes 16 source bytes, writes 64 ARGB bytes).
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // xmm0 = sobelx
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"  // xmm2 = sobel (satadd)
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"  // interleave sobelx with alpha
    "punpckhbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"  // interleave sobely with sobel
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"  // combine into B,G,R,A dwords
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4353 #endif  // HAS_SOBELXYROW_SSE2
4354
4355 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// 'row' holds 'width' pixels of 4 bytes each; cumsum/previous_cumsum hold
// 4 int32 per pixel.  cumsum[x] = previous_cumsum[x] + running row sum.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"  // running sum accumulator
    "pxor      %%xmm1,%%xmm1                   \n"  // zero for unpacking
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    // Only use the 4-pixel loop when cumsum is 16-byte aligned; otherwise
    // fall through to the 1-pixel loop.
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

  // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // 4 pixels = 16 bytes
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    // Expand 16 bytes to 4 registers of 4 dwords each (one per pixel).
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Accumulate each pixel into the running sum, add previous_cumsum,
    // and store; xmm0 carries the prefix sum across pixels.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // restore remainder count
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm2         \n"  // 1 pixel = 4 bytes
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"  // running sum += pixel
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm2                   \n"  // + previous row's cumsum
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
4433 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4434
4435 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Computes a box-filter average from an integral image (cumulative sums).
// For each output pixel, sums the box via topleft - topright - botleft +
// botright (using the two row pointers offset by 'width' entries) and
// divides by 'area'.  Small areas (<= 128) use a fixed-point reciprocal
// multiply (pmulhuw); larger areas use the float path.  'count' is the
// number of output pixels.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    "movd      %5,%%xmm5                       \n"  // xmm5 = area
    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
    "rcpss     %%xmm5,%%xmm4                   \n"  // xmm4 ~= 1 / area
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "cmpl      $0x80,%5                        \n"  // area > 128? use float path
    "ja        40f                             \n"

    // Build a 16-bit reciprocal scale: 65536 / area, rounded via the
    // (area + 65535) * (1/area) trick, for pmulhuw.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrld     $0x10,%%xmm6                    \n"  // 0xffff per dword
    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
    "addps     %%xmm6,%%xmm5                   \n"
    "mulps     %%xmm4,%%xmm5                   \n"
    "cvtps2dq  %%xmm5,%%xmm5                   \n"
    "packssdw  %%xmm5,%%xmm5                   \n"

  // 4 pixel small (area <= 128) loop.
    LABELALIGN
  "4:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm5,%%xmm0                   \n"  // sum * (65536/area) >> 16
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       4b                              \n"
    "jmp       49f                             \n"

  // 4 pixel (float divide) loop.
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // average = sum * (1/area)
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // restore remainder count
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"((intptr_t)(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
4564 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4565
4566 #ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv holds {x, y, du, dv} as floats: the starting source coordinate
// and the per-destination-pixel step.  For each output pixel the source
// offset is computed as x * 4 + y * stride via pmaddwd with a packed
// {4, stride} multiplier.  Processes 4 pixels per main-loop iteration.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;  // scratch register for the second pixel offset
  asm volatile (
    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = {x, y}
    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = {du, dv}
    // Pack multiplier {4, stride} into one dword: (stride << 16) | 4.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    "pshufd    $0x44,%%xmm7,%%xmm7             \n"  // duplicate {du, dv}
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast multiplier
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = coords pixels 0,1
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = {du, dv} * 2
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"  // xmm3 = coords pixels 2,3
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = {du, dv} * 4

  // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate offsets
    "movd      %%xmm0,%k5                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1                   \n"  // gather pixels 0,1
    "addps     %%xmm4,%%xmm2                   \n"  // advance coords by 4 steps
    "movq      %%xmm1," MEMACCESS(2) "         \n"
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0                   \n"  // gather pixels 2,3
    "addps     %%xmm4,%%xmm3                   \n"
    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%4                         \n"  // restore remainder count
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"  // advance coords by 1 step
    "movd      %%xmm0,%k1                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x04,2) ",%2           \n"
    "sub       $0x1,%4                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),    // %4
    "+r"(temp)   // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4650 #endif  // HAS_ARGBAFFINEROW_SSE2
4651
4652 #ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
// Blends src_ptr with src_ptr + src_stride using source_y_fraction/256 as
// the weight of the second row.  The fraction is halved (7-bit precision);
// 0, 25%, 50%, 75% and 100% get dedicated pavgb/copy fast paths, everything
// else uses pmaddubsw with packed {128-f, f} byte weights.
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"  // dst as offset from src
    "shr       %3                              \n"  // fraction to 0..128 range
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"

    // Build xmm5 = 16 x {128 - f, f} byte pairs for pmaddubsw.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"  // r0*(128-f) + r1*f
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // / 128
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"  // avg(avg(r0,r1),r1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
    "pavgb     %%xmm1,%%xmm0                   \n"  // avg(avg(r1,r0),r0)
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm5"
  );
}
4753 #endif  // HAS_INTERPOLATEROW_SSSE3
4754
4755 #ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
// AVX2 version of InterpolateRow: blends src_ptr with src_ptr + src_stride
// using source_y_fraction/256 as the weight of the second row, 32 pixels per
// iteration.  The 100%/copy case uses 'rep movsb', which is why dst/src/count
// are pinned to edi/esi/ecx ("+D"/"+S"/"+c") below.
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "shr       %3                              \n"  // fraction to 0..128 range
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "sub       %1,%0                           \n"  // dst as offset from src
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"

    // Build ymm5 = 32 x {128 - f, f} byte pairs for vpmaddubsw.
    "vmovd      %3,%%xmm0                      \n"
    "neg        %3                             \n"
    "add        $0x80,%3                       \n"
    "vmovd      %3,%%xmm5                      \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
    "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"  // broadcast to all lanes

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"  // r0*(128-f) + r1*f
    "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // / 128
    "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"  // avg(avg(r0,r1),r1)
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"  // avg(avg(r1,r0),r0)
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "rep movsb " MEMMOVESTRING(1,0) "          \n"  // uses esi/edi/ecx
    "jmp       999f                            \n"

  "99:                                         \n"
    "vzeroupper                                \n"  // avoid AVX->SSE penalty
  "999:                                        \n"
  : "+D"(dst_ptr),    // %0
    "+S"(src_ptr),    // %1
    "+c"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm5"
  );
}
4854 #endif  // HAS_INTERPOLATEROW_AVX2
4855
4856 #ifdef HAS_INTERPOLATEROW_SSE2
4857 // Bilinear filter 16x2 -> 16x1
// Vertically blends one row of bytes with the row src_stride bytes below it,
// writing the result to dst_ptr.  source_y_fraction is the weight of the
// second row in 1/256ths (0..255); it is halved to 0..128 and the common
// fractions 0, 1/4, 1/2 and 3/4 are dispatched to cheap pavgb-based paths.
// dst_width is in bytes and is processed 16 bytes per iteration (the caller
// is expected to supply a multiple of 16).
4858 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
4859                          ptrdiff_t src_stride, int dst_width,
4860                          int source_y_fraction) {
4861   asm volatile (
    // Keep %0 as (dst - src) so one pointer (%1) indexes both rows.
4862     "sub       %1,%0                           \n"
    // Halve fraction to 0..128 to fit the pmulhw fixed-point math below.
4863     "shr       %3                              \n"
    // fraction == 0   -> 100f: copy the source row unchanged.
4864     "cmp       $0x0,%3                         \n"
4865     "je        100f                            \n"
    // fraction == 32  -> 75f: 75% row0 + 25% row1.
4866     "cmp       $0x20,%3                        \n"
4867     "je        75f                             \n"
    // fraction == 64  -> 50f: straight average of both rows.
4868     "cmp       $0x40,%3                        \n"
4869     "je        50f                             \n"
    // fraction == 96  -> 25f: 25% row0 + 75% row1.
4870     "cmp       $0x60,%3                        \n"
4871     "je        25f                             \n"
4872
    // Build the broadcast 16-bit blend constant in xmm5 from the fraction
    // (interleaved bytes of (128 - f) and f, widened and splatted).
4873     "movd      %3,%%xmm0                       \n"
4874     "neg       %3                              \n"
4875     "add       $0x80,%3                        \n"
4876     "movd      %3,%%xmm5                       \n"
4877     "punpcklbw %%xmm0,%%xmm5                   \n"
4878     "punpcklwd %%xmm5,%%xmm5                   \n"
4879     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4880     "pxor      %%xmm4,%%xmm4                   \n"
4881
4882     // General purpose row blend.
4883     LABELALIGN
4884   "1:                                          \n"
4885     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4886     MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
    // Widen both rows to 16-bit, form 2*(row1 - row0), scale by the blend
    // constant via pmulhw, add back to row0, then repack to bytes.
4887     "movdqa    %%xmm0,%%xmm1                   \n"
4888     "movdqa    %%xmm2,%%xmm3                   \n"
4889     "punpcklbw %%xmm4,%%xmm2                   \n"
4890     "punpckhbw %%xmm4,%%xmm3                   \n"
4891     "punpcklbw %%xmm4,%%xmm0                   \n"
4892     "punpckhbw %%xmm4,%%xmm1                   \n"
4893     "psubw     %%xmm0,%%xmm2                   \n"
4894     "psubw     %%xmm1,%%xmm3                   \n"
4895     "paddw     %%xmm2,%%xmm2                   \n"
4896     "paddw     %%xmm3,%%xmm3                   \n"
4897     "pmulhw    %%xmm5,%%xmm2                   \n"
4898     "pmulhw    %%xmm5,%%xmm3                   \n"
4899     "paddw     %%xmm2,%%xmm0                   \n"
4900     "paddw     %%xmm3,%%xmm1                   \n"
4901     "packuswb  %%xmm1,%%xmm0                   \n"
4902     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4903     "lea       " MEMLEA(0x10,1) ",%1           \n"
4904     "sub       $0x10,%2                        \n"
4905     "jg        1b                              \n"
4906     "jmp       99f                             \n"
4907
4908     // Blend 25 / 75.
4909     LABELALIGN
4910   "25:                                         \n"
4911     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4912     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    // Two pavgb toward row1: avg(avg(row0,row1),row1) ~= 25% row0 + 75% row1.
4913     "pavgb     %%xmm1,%%xmm0                   \n"
4914     "pavgb     %%xmm1,%%xmm0                   \n"
4915     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4916     "lea       " MEMLEA(0x10,1) ",%1           \n"
4917     "sub       $0x10,%2                        \n"
4918     "jg        25b                             \n"
4919     "jmp       99f                             \n"
4920
4921     // Blend 50 / 50.
4922     LABELALIGN
4923   "50:                                         \n"
4924     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4925     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
4926     "pavgb     %%xmm1,%%xmm0                   \n"
4927     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4928     "lea       " MEMLEA(0x10,1) ",%1           \n"
4929     "sub       $0x10,%2                        \n"
4930     "jg        50b                             \n"
4931     "jmp       99f                             \n"
4932
4933     // Blend 75 / 25.
4934     LABELALIGN
4935   "75:                                         \n"
    // Same as 25f but with the row roles swapped (row0 loaded into xmm1).
4936     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4937     MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
4938     "pavgb     %%xmm1,%%xmm0                   \n"
4939     "pavgb     %%xmm1,%%xmm0                   \n"
4940     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4941     "lea       " MEMLEA(0x10,1) ",%1           \n"
4942     "sub       $0x10,%2                        \n"
4943     "jg        75b                             \n"
4944     "jmp       99f                             \n"
4945
4946     // Blend 100 / 0 - Copy row unchanged.
4947     LABELALIGN
4948   "100:                                        \n"
4949     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4950     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4951     "lea       " MEMLEA(0x10,1) ",%1           \n"
4952     "sub       $0x10,%2                        \n"
4953     "jg        100b                            \n"
4954
4955   "99:                                         \n"
4956   : "+r"(dst_ptr),    // %0
4957     "+r"(src_ptr),    // %1
4958     "+r"(dst_width),  // %2
4959     "+r"(source_y_fraction)  // %3
4960   : "r"((intptr_t)(src_stride))  // %4
    // NACL_R14 expands to an extra clobber for Native Client sandboxing.
4961   : "memory", "cc", NACL_R14
4962     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4963   );
4964 }
4965 #endif  // HAS_INTERPOLATEROW_SSE2
4966
4967 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each 4-byte pixel according to the 16-byte pshufb
// control mask pointed to by 'shuffler'.  Processes 8 pixels (32 bytes) per
// iteration; pix is the pixel count (expected to be a multiple of 8).
// Unaligned loads/stores are used, so src/dst need no alignment.
4969 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4970                           const uint8* shuffler, int pix) {
4971   asm volatile (
    // Load the shuffle mask once; it is reused for every pixel.
4972     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
4973     LABELALIGN
4974   "1:                                          \n"
4975     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4976     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4977     "lea       " MEMLEA(0x20,0) ",%0           \n"
4978     "pshufb    %%xmm5,%%xmm0                   \n"
4979     "pshufb    %%xmm5,%%xmm1                   \n"
4980     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4981     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
4982     "lea       " MEMLEA(0x20,1) ",%1           \n"
4983     "sub       $0x8,%2                         \n"
4984     "jg        1b                              \n"
4985   : "+r"(src_argb),  // %0
4986     "+r"(dst_argb),  // %1
4987     "+r"(pix)        // %2
4988   : "r"(shuffler)    // %3
4989   : "memory", "cc"
4990     , "xmm0", "xmm1", "xmm5"
4991   );
4992 }
4993 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
4994
4995 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4996 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version of ARGBShuffleRow: same contract as the SSSE3 variant but
// processes 16 pixels (64 bytes) per iteration.  The 16-byte shuffle mask is
// broadcast into both 128-bit lanes of ymm5 since vpshufb shuffles within
// each lane independently.
4997 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4998                          const uint8* shuffler, int pix) {
4999   asm volatile (
5000     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
5001     LABELALIGN
5002   "1:                                          \n"
5003     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
5004     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
5005     "lea       " MEMLEA(0x40,0) ",%0           \n"
5006     "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
5007     "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
5008     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
5009     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
5010     "lea       " MEMLEA(0x40,1) ",%1           \n"
5011     "sub       $0x10,%2                        \n"
5012     "jg        1b                              \n"
    // Avoid AVX->SSE transition penalties in subsequent SSE code.
5013     "vzeroupper                                \n"
5014   : "+r"(src_argb),  // %0
5015     "+r"(dst_argb),  // %1
5016     "+r"(pix)        // %2
5017   : "r"(shuffler)    // %3
5018   : "memory", "cc"
5019     , "xmm0", "xmm1", "xmm5"
5020   );
5021 }
5022 #endif  // HAS_ARGBSHUFFLEROW_AVX2
5023
5024 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5025 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback for ARGBShuffleRow (no pshufb available).  Reads the first
// 4 bytes of 'shuffler' and, if they match one of four common channel
// permutations, runs a fast vector path that widens pixels to 16-bit and
// permutes channels with pshuflw/pshufhw (4 pixels per iteration).  Any
// other mask falls back to a scalar per-byte table-lookup loop (label 1),
// 1 pixel per iteration.  The numeric labels name the permutation handled.
5026 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5027                          const uint8* shuffler, int pix) {
5028   uintptr_t pixel_temp = 0u;
5029   asm volatile (
5030     "pxor      %%xmm5,%%xmm5                   \n"
    // Load shuffler[0..3] as a little-endian dword and dispatch on it.
5031     "mov       " MEMACCESS(4) ",%k2            \n"
5032     "cmp       $0x3000102,%k2                  \n"
5033     "je        3012f                           \n"
5034     "cmp       $0x10203,%k2                    \n"
5035     "je        123f                            \n"
5036     "cmp       $0x30201,%k2                    \n"
5037     "je        321f                            \n"
5038     "cmp       $0x2010003,%k2                  \n"
5039     "je        2103f                           \n"
5040
    // Generic scalar path: each output byte is src pixel byte shuffler[i].
5041     LABELALIGN
5042   "1:                                          \n"
5043     "movzb     " MEMACCESS(4) ",%2             \n"
5044     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5045     "mov       %b2," MEMACCESS(1) "            \n"
5046     "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
5047     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5048     "mov       %b2," MEMACCESS2(0x1,1) "       \n"
5049     "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
5050     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5051     "mov       %b2," MEMACCESS2(0x2,1) "       \n"
5052     "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
5053     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5054     "mov       %b2," MEMACCESS2(0x3,1) "       \n"
5055     "lea       " MEMLEA(0x4,0) ",%0            \n"
5056     "lea       " MEMLEA(0x4,1) ",%1            \n"
5057     "sub       $0x1,%3                         \n"
5058     "jg        1b                              \n"
5059     "jmp       99f                             \n"
5060
    // Fast path: $0x1b selects words 3,2,1,0, i.e. reverses the channels.
5061     LABELALIGN
5062   "123:                                        \n"
5063     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5064     "lea       " MEMLEA(0x10,0) ",%0           \n"
5065     "movdqa    %%xmm0,%%xmm1                   \n"
5066     "punpcklbw %%xmm5,%%xmm0                   \n"
5067     "punpckhbw %%xmm5,%%xmm1                   \n"
5068     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
5069     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
5070     "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
5071     "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
5072     "packuswb  %%xmm1,%%xmm0                   \n"
5073     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5074     "lea       " MEMLEA(0x10,1) ",%1           \n"
5075     "sub       $0x4,%3                         \n"
5076     "jg        123b                            \n"
5077     "jmp       99f                             \n"
5078
    // Fast path: $0x39 selects words 1,2,3,0 (rotate channels).
5079     LABELALIGN
5080   "321:                                        \n"
5081     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5082     "lea       " MEMLEA(0x10,0) ",%0           \n"
5083     "movdqa    %%xmm0,%%xmm1                   \n"
5084     "punpcklbw %%xmm5,%%xmm0                   \n"
5085     "punpckhbw %%xmm5,%%xmm1                   \n"
5086     "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
5087     "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
5088     "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
5089     "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
5090     "packuswb  %%xmm1,%%xmm0                   \n"
5091     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5092     "lea       " MEMLEA(0x10,1) ",%1           \n"
5093     "sub       $0x4,%3                         \n"
5094     "jg        321b                            \n"
5095     "jmp       99f                             \n"
5096
    // Fast path: $0x93 selects words 3,0,1,2 (rotate the other way).
5097     LABELALIGN
5098   "2103:                                       \n"
5099     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5100     "lea       " MEMLEA(0x10,0) ",%0           \n"
5101     "movdqa    %%xmm0,%%xmm1                   \n"
5102     "punpcklbw %%xmm5,%%xmm0                   \n"
5103     "punpckhbw %%xmm5,%%xmm1                   \n"
5104     "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
5105     "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
5106     "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
5107     "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
5108     "packuswb  %%xmm1,%%xmm0                   \n"
5109     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5110     "lea       " MEMLEA(0x10,1) ",%1           \n"
5111     "sub       $0x4,%3                         \n"
5112     "jg        2103b                           \n"
5113     "jmp       99f                             \n"
5114
    // Fast path: $0xc6 selects words 2,1,0,3.
5115     LABELALIGN
5116   "3012:                                       \n"
5117     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5118     "lea       " MEMLEA(0x10,0) ",%0           \n"
5119     "movdqa    %%xmm0,%%xmm1                   \n"
5120     "punpcklbw %%xmm5,%%xmm0                   \n"
5121     "punpckhbw %%xmm5,%%xmm1                   \n"
5122     "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
5123     "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
5124     "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
5125     "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
5126     "packuswb  %%xmm1,%%xmm0                   \n"
5127     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5128     "lea       " MEMLEA(0x10,1) ",%1           \n"
5129     "sub       $0x4,%3                         \n"
5130     "jg        3012b                           \n"
5131
5132   "99:                                         \n"
5133   : "+r"(src_argb),    // %0
5134     "+r"(dst_argb),    // %1
5135     "+d"(pixel_temp),  // %2
5136     "+r"(pix)         // %3
5137   : "r"(shuffler)      // %4
5138   : "memory", "cc", NACL_R14
5139     "xmm0", "xmm1", "xmm5"
5140   );
5141 }
5142 #endif  // HAS_ARGBSHUFFLEROW_SSE2
5143
5144 #ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (full-res Y, half-res U and V) into interleaved YUY2
// (byte order Y0 U Y1 V per pixel pair).  Processes 16 Y samples (8 UV
// pairs, 32 output bytes) per iteration; width expected multiple of 16.
5145 void I422ToYUY2Row_SSE2(const uint8* src_y,
5146                         const uint8* src_u,
5147                         const uint8* src_v,
5148                         uint8* dst_frame, int width) {
5149  asm volatile (
    // %2 becomes (src_v - src_u) so one register indexes both chroma planes.
5150     "sub       %1,%2                             \n"
5151     LABELALIGN
5152   "1:                                            \n"
5153     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5154     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5155     "lea       " MEMLEA(0x8,1) ",%1              \n"
    // Interleave U and V bytes into UVUV... in xmm2.
5156     "punpcklbw %%xmm3,%%xmm2                     \n"
5157     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5158     "lea       " MEMLEA(0x10,0) ",%0             \n"
5159     "movdqa    %%xmm0,%%xmm1                     \n"
    // Interleave Y with UV: low/high halves give Y U Y V ... byte order.
5160     "punpcklbw %%xmm2,%%xmm0                     \n"
5161     "punpckhbw %%xmm2,%%xmm1                     \n"
5162     "movdqu    %%xmm0," MEMACCESS(3) "           \n"
5163     "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
5164     "lea       " MEMLEA(0x20,3) ",%3             \n"
5165     "sub       $0x10,%4                          \n"
5166     "jg         1b                               \n"
5167     : "+r"(src_y),  // %0
5168       "+r"(src_u),  // %1
5169       "+r"(src_v),  // %2
5170       "+r"(dst_frame),  // %3
5171       "+rm"(width)  // %4
5172     :
5173     : "memory", "cc", NACL_R14
5174     "xmm0", "xmm1", "xmm2", "xmm3"
5175   );
5176 }
5177 #endif  // HAS_I422TOYUY2ROW_SSE2
5178
5179 #ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (byte order U Y0 V Y1 per pixel
// pair) - same structure as I422ToYUY2Row_SSE2 with the unpack operand
// order swapped so chroma bytes come first.  16 Y samples per iteration.
5180 void I422ToUYVYRow_SSE2(const uint8* src_y,
5181                         const uint8* src_u,
5182                         const uint8* src_v,
5183                         uint8* dst_frame, int width) {
5184  asm volatile (
    // %2 becomes (src_v - src_u) so one register indexes both chroma planes.
5185     "sub        %1,%2                            \n"
5186     LABELALIGN
5187   "1:                                            \n"
5188     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5189     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5190     "lea       " MEMLEA(0x8,1) ",%1              \n"
    // Interleave U and V bytes into UVUV... in xmm2.
5191     "punpcklbw %%xmm3,%%xmm2                     \n"
5192     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5193     "movdqa    %%xmm2,%%xmm1                     \n"
5194     "lea       " MEMLEA(0x10,0) ",%0             \n"
    // Interleave UV with Y, chroma first: gives U Y V Y ... byte order.
5195     "punpcklbw %%xmm0,%%xmm1                     \n"
5196     "punpckhbw %%xmm0,%%xmm2                     \n"
5197     "movdqu    %%xmm1," MEMACCESS(3) "           \n"
5198     "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
5199     "lea       " MEMLEA(0x20,3) ",%3             \n"
5200     "sub       $0x10,%4                          \n"
5201     "jg         1b                               \n"
5202     : "+r"(src_y),  // %0
5203       "+r"(src_u),  // %1
5204       "+r"(src_v),  // %2
5205       "+r"(dst_frame),  // %3
5206       "+rm"(width)  // %4
5207     :
5208     : "memory", "cc", NACL_R14
5209     "xmm0", "xmm1", "xmm2", "xmm3"
5210   );
5211 }
5212 #endif  // HAS_I422TOUYVYROW_SSE2
5213
5214 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a per-channel cubic polynomial to each ARGB byte:
//   out = clamp(C0 + C1*x + C2*x^2 + C3*x^3)
// where poly points to 4 vectors of 4 floats (one coefficient per channel,
// C0 at poly+0x00, C1 at +0x10, C2 at +0x20, C3 at +0x30).  Processes
// 2 pixels (8 channels) per iteration; width expected multiple of 2.
5215 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5216                             uint8* dst_argb, const float* poly,
5217                             int width) {
5218   asm volatile (
5219     "pxor      %%xmm3,%%xmm3                   \n"
5220
5221     // 2 pixel loop.
5222     LABELALIGN
5223   "1:                                          \n"
    // Load 2 ARGB pixels, widen bytes -> dwords -> floats (one pixel per reg).
5224     "movq      " MEMACCESS(0) ",%%xmm0         \n"
5225     "lea       " MEMLEA(0x8,0) ",%0            \n"
5226     "punpcklbw %%xmm3,%%xmm0                   \n"
5227     "movdqa    %%xmm0,%%xmm4                   \n"
5228     "punpcklwd %%xmm3,%%xmm0                   \n"
5229     "punpckhwd %%xmm3,%%xmm4                   \n"
5230     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
5231     "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep x in xmm1/xmm5; accumulate C0 + C1*x in xmm0/xmm4.
5232     "movdqa    %%xmm0,%%xmm1                   \n"
5233     "movdqa    %%xmm4,%%xmm5                   \n"
5234     "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
5235     "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
5236     "addps     " MEMACCESS(3) ",%%xmm0         \n"
5237     "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // Form x^2 in xmm2/xmm6 and x^3 in xmm1/xmm5, scale by C2 and C3.
5238     "movdqa    %%xmm1,%%xmm2                   \n"
5239     "movdqa    %%xmm5,%%xmm6                   \n"
5240     "mulps     %%xmm1,%%xmm2                   \n"
5241     "mulps     %%xmm5,%%xmm6                   \n"
5242     "mulps     %%xmm2,%%xmm1                   \n"
5243     "mulps     %%xmm6,%%xmm5                   \n"
5244     "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
5245     "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
5246     "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
5247     "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
5248     "addps     %%xmm2,%%xmm0                   \n"
5249     "addps     %%xmm6,%%xmm4                   \n"
5250     "addps     %%xmm1,%%xmm0                   \n"
5251     "addps     %%xmm5,%%xmm4                   \n"
    // Truncate back to ints and saturate-pack down to bytes.
5252     "cvttps2dq %%xmm0,%%xmm0                   \n"
5253     "cvttps2dq %%xmm4,%%xmm4                   \n"
5254     "packuswb  %%xmm4,%%xmm0                   \n"
5255     "packuswb  %%xmm0,%%xmm0                   \n"
5256     "movq      %%xmm0," MEMACCESS(1) "         \n"
5257     "lea       " MEMLEA(0x8,1) ",%1            \n"
5258     "sub       $0x2,%2                         \n"
5259     "jg        1b                              \n"
5260   : "+r"(src_argb),  // %0
5261     "+r"(dst_argb),  // %1
5262     "+r"(width)      // %2
5263   : "r"(poly)        // %3
5264   : "memory", "cc"
5265     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5266   );
5267 }
5268 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5269
5270 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2+FMA3 version of ARGBPolynomialRow: same cubic-per-channel contract
// as the SSE2 variant, evaluated with fused multiply-adds.  Each 4-float
// coefficient vector is broadcast to both lanes so 2 pixels (8 channels)
// are handled per iteration.  Note: uses vfmadd*, so it requires FMA3 in
// addition to AVX2 (gated by the HAS_ARGBPOLYNOMIALROW_AVX2 macro).
5272 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5273                             uint8* dst_argb, const float* poly,
5274                             int width) {
5275   asm volatile (
5276     "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
5277     "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5278     "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5279     "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
5280
5281     // 2 pixel loop.
5282     LABELALIGN
5283   "1:                                          \n"
5284     "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
5285     "lea         " MEMLEA(0x8,0) ",%0          \n"
5286     "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
5287     "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
5288     "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
5289     "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
5290     "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
5291     "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    // Truncate to ints, pack dwords->words->bytes with saturation.
5292     "vcvttps2dq  %%ymm0,%%ymm0                 \n"
5293     "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
5294     "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5295     "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
5296     "vmovq       %%xmm0," MEMACCESS(1) "       \n"
5297     "lea         " MEMLEA(0x8,1) ",%1          \n"
5298     "sub         $0x2,%2                       \n"
5299     "jg          1b                            \n"
5300     "vzeroupper                                \n"
5301   : "+r"(src_argb),  // %0
5302     "+r"(dst_argb),  // %1
5303     "+r"(width)      // %2
5304   : "r"(poly)        // %3
5305   : "memory", "cc",
5306     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5307   );
5308 }
5308 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5309
5310 #ifdef HAS_ARGBCOLORTABLEROW_X86
5311 // Transform ARGB pixels with color table.
// In-place: replaces each of the 4 bytes of every pixel in dst_argb with
// table_argb[index * 4 + channel], where index is the byte's current value.
// The table is laid out as 256 BGRA entries (4 bytes per entry) so each
// channel has its own column.  Scalar, 1 pixel per iteration.
5312 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
5313                            int width) {
    // Constrained to rdx/edx ("+d") so %b1 (the low byte, dl) is usable.
5314   uintptr_t pixel_temp = 0u;
5315   asm volatile (
5316     // 1 pixel loop.
5317     LABELALIGN
5318   "1:                                          \n"
5319     "movzb     " MEMACCESS(0) ",%1             \n"
    // Advance first; subsequent accesses use negative offsets from %0.
5320     "lea       " MEMLEA(0x4,0) ",%0            \n"
5321     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5322     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
5323     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5324     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5325     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
5326     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5327     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5328     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
5329     "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
5330     MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
5331     "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
5332     "dec       %2                              \n"
5333     "jg        1b                              \n"
5334   : "+r"(dst_argb),   // %0
5335     "+d"(pixel_temp), // %1
5336     "+r"(width)       // %2
5337   : "r"(table_argb)   // %3
5338   : "memory", "cc");
5339 }
5340 #endif  // HAS_ARGBCOLORTABLEROW_X86
5341
5342 #ifdef HAS_RGBCOLORTABLEROW_X86
5343 // Transform RGB pixels with color table.
// Same as ARGBColorTableRow_X86 but only remaps the first 3 bytes of each
// 4-byte pixel, leaving the 4th (alpha) byte untouched.  In-place, scalar,
// 1 pixel per iteration.
5344 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
    // Constrained to rdx/edx ("+d") so %b1 (the low byte, dl) is usable.
5345   uintptr_t pixel_temp = 0u;
5346   asm volatile (
5347     // 1 pixel loop.
5348     LABELALIGN
5349   "1:                                          \n"
5350     "movzb     " MEMACCESS(0) ",%1             \n"
    // Advance first; subsequent accesses use negative offsets from %0.
5351     "lea       " MEMLEA(0x4,0) ",%0            \n"
5352     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5353     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
5354     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5355     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5356     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
5357     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5358     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5359     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
5360     "dec       %2                              \n"
5361     "jg        1b                              \n"
5362   : "+r"(dst_argb),   // %0
5363     "+d"(pixel_temp), // %1
5364     "+r"(width)       // %2
5365   : "r"(table_argb)   // %3
5366   : "memory", "cc");
5367 }
5368 #endif  // HAS_RGBCOLORTABLEROW_X86
5369
5370 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5371 // Tranform RGB pixels with luma table.
// Luma-dependent color table: for each pixel, a luma value is computed with
// pmaddubsw against the packed byte weights in lumacoeff, masked to a
// multiple of 256 (pand with 0xFF00 words), and added to 'luma' to select a
// 256-byte table row.  Each of the pixel's first 3 bytes (B, G, R) is then
// looked up through that row; the 4th (alpha) byte is copied unchanged.
// Processes 4 pixels per iteration (width expected multiple of 4).
// NOTE(review): table rows appear to be 256 bytes apart (offset is
// luma & 0xFF00) - confirm against the caller that builds 'luma'.
5372 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5373                                  int width,
5374                                  const uint8* luma, uint32 lumacoeff) {
    // pixel_temp in rdx/rax-class regs so %b0 (low byte) is addressable.
5375   uintptr_t pixel_temp = 0u;
5376   uintptr_t table_temp = 0u;
5377   asm volatile (
    // xmm3 = broadcast lumacoeff; xmm4 = 0xFF00 word mask; xmm5 = zero.
5378     "movd      %6,%%xmm3                       \n"
5379     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
5380     "pcmpeqb   %%xmm4,%%xmm4                   \n"
5381     "psllw     $0x8,%%xmm4                     \n"
5382     "pxor      %%xmm5,%%xmm5                   \n"
5383
5384     // 4 pixel loop.
5385     LABELALIGN
5386   "1:                                          \n"
    // Weighted sum of channel bytes -> per-pixel luma words; mask to the
    // high byte and widen to 32-bit table offsets (one dword per pixel).
5387     "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
5388     "pmaddubsw %%xmm3,%%xmm0                   \n"
5389     "phaddw    %%xmm0,%%xmm0                   \n"
5390     "pand      %%xmm4,%%xmm0                   \n"
5391     "punpcklwd %%xmm5,%%xmm0                   \n"
5392     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5393     "add       %5,%1                           \n"
    // Rotate xmm0 so the next pixel's offset moves into lane 0.
5394     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5395
    // Pixel 0: look up B, G, R through the selected row; copy alpha.
5396     "movzb     " MEMACCESS(2) ",%0             \n"
5397     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5398     "mov       %b0," MEMACCESS(3) "            \n"
5399     "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
5400     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5401     "mov       %b0," MEMACCESS2(0x1,3) "       \n"
5402     "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
5403     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5404     "mov       %b0," MEMACCESS2(0x2,3) "       \n"
5405     "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
5406     "mov       %b0," MEMACCESS2(0x3,3) "       \n"
5407
5408     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5409     "add       %5,%1                           \n"
5410     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5411
    // Pixel 1.
5412     "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
5413     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5414     "mov       %b0," MEMACCESS2(0x4,3) "       \n"
5415     "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
5416     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5417     "mov       %b0," MEMACCESS2(0x5,3) "       \n"
5418     "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
5419     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5420     "mov       %b0," MEMACCESS2(0x6,3) "       \n"
5421     "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
5422     "mov       %b0," MEMACCESS2(0x7,3) "       \n"
5423
5424     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5425     "add       %5,%1                           \n"
5426     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5427
    // Pixel 2.
5428     "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
5429     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5430     "mov       %b0," MEMACCESS2(0x8,3) "       \n"
5431     "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
5432     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5433     "mov       %b0," MEMACCESS2(0x9,3) "       \n"
5434     "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
5435     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5436     "mov       %b0," MEMACCESS2(0xa,3) "       \n"
5437     "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
5438     "mov       %b0," MEMACCESS2(0xb,3) "       \n"
5439
5440     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5441     "add       %5,%1                           \n"
5442
    // Pixel 3.
5443     "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
5444     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5445     "mov       %b0," MEMACCESS2(0xc,3) "       \n"
5446     "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
5447     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5448     "mov       %b0," MEMACCESS2(0xd,3) "       \n"
5449     "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
5450     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5451     "mov       %b0," MEMACCESS2(0xe,3) "       \n"
5452     "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
5453     "mov       %b0," MEMACCESS2(0xf,3) "       \n"
5454     "lea       " MEMLEA(0x10,2) ",%2           \n"
5455     "lea       " MEMLEA(0x10,3) ",%3           \n"
5456     "sub       $0x4,%4                         \n"
5457     "jg        1b                              \n"
5458   : "+d"(pixel_temp),  // %0
5459     "+a"(table_temp),  // %1
5460     "+r"(src_argb),    // %2
5461     "+r"(dst_argb),    // %3
5462     "+rm"(width)       // %4
5463   : "r"(luma),         // %5
5464     "rm"(lumacoeff)    // %6
5465   : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5466   );
5467 }
5468 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5469
5470 #endif  // defined(__x86_64__) || defined(__i386__)
5471
5472 #ifdef __cplusplus
5473 }  // extern "C"
5474 }  // namespace libyuv
5475 #endif