]> granicus.if.org Git - libvpx/blob - third_party/libyuv/source/row_posix.cc
1a6f7dc4dd006910125f33ad9bf1827fde92a704
[libvpx] / third_party / libyuv / source / row_posix.cc
1 // VERSION 2
2 /*
3  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
4  *
5  *  Use of this source code is governed by a BSD-style license
6  *  that can be found in the LICENSE file in the root of the source
7  *  tree. An additional intellectual property rights grant can be found
8  *  in the file PATENTS. All contributing project authors may
9  *  be found in the AUTHORS file in the root of the source tree.
10  */
11
12 #include "libyuv/row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23
24 // Constants for ARGB
25 static vec8 kARGBToY = {
26   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
27 };
28
29 // JPeg full range.
30 static vec8 kARGBToYJ = {
31   15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
32 };
33 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
34
35 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
36
37 static vec8 kARGBToU = {
38   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
39 };
40
41 static vec8 kARGBToUJ = {
42   127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
43 };
44
45 static vec8 kARGBToV = {
46   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
47 };
48
49 static vec8 kARGBToVJ = {
50   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
51 };
52
53 // Constants for BGRA
54 static vec8 kBGRAToY = {
55   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
56 };
57
58 static vec8 kBGRAToU = {
59   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
60 };
61
62 static vec8 kBGRAToV = {
63   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
64 };
65
66 // Constants for ABGR
67 static vec8 kABGRToY = {
68   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
69 };
70
71 static vec8 kABGRToU = {
72   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
73 };
74
75 static vec8 kABGRToV = {
76   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
77 };
78
79 // Constants for RGBA.
80 static vec8 kRGBAToY = {
81   0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
82 };
83
84 static vec8 kRGBAToU = {
85   0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
86 };
87
88 static vec8 kRGBAToV = {
89   0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
90 };
91
92 static uvec8 kAddY16 = {
93   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
94 };
95
96 // 7 bit fixed point 0.5.
97 static vec16 kAddYJ64 = {
98   64, 64, 64, 64, 64, 64, 64, 64
99 };
100
101 static uvec8 kAddUV128 = {
102   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
103   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
104 };
105
106 static uvec16 kAddUVJ128 = {
107   0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
108 };
109 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
110
111 #ifdef HAS_RGB24TOARGBROW_SSSE3
112
113 // Shuffle table for converting RGB24 to ARGB.
114 static uvec8 kShuffleMaskRGB24ToARGB = {
115   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
116 };
117
118 // Shuffle table for converting RAW to ARGB.
119 static uvec8 kShuffleMaskRAWToARGB = {
120   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
121 };
122
123 // Shuffle table for converting ARGB to RGB24.
124 static uvec8 kShuffleMaskARGBToRGB24 = {
125   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
126 };
127
128 // Shuffle table for converting ARGB to RAW.
129 static uvec8 kShuffleMaskARGBToRAW = {
130   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
131 };
132
133 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
134 static uvec8 kShuffleMaskARGBToRGB24_0 = {
135   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
136 };
137
138 // Shuffle table for converting ARGB to RAW.
139 static uvec8 kShuffleMaskARGBToRAW_0 = {
140   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
141 };
142 #endif  // HAS_RGB24TOARGBROW_SSSE3
143
144 #if defined(TESTING) && defined(__x86_64__)
145 void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
146   asm volatile (
147     ".p2align  5                               \n"
148     "mov       %%eax,%%eax                     \n"
149     "mov       %%ebx,%%ebx                     \n"
150     "mov       %%ecx,%%ecx                     \n"
151     "mov       %%edx,%%edx                     \n"
152     "mov       %%esi,%%esi                     \n"
153     "mov       %%edi,%%edi                     \n"
154     "mov       %%ebp,%%ebp                     \n"
155     "mov       %%esp,%%esp                     \n"
156     ".p2align  5                               \n"
157     "mov       %%r8d,%%r8d                     \n"
158     "mov       %%r9d,%%r9d                     \n"
159     "mov       %%r10d,%%r10d                   \n"
160     "mov       %%r11d,%%r11d                   \n"
161     "mov       %%r12d,%%r12d                   \n"
162     "mov       %%r13d,%%r13d                   \n"
163     "mov       %%r14d,%%r14d                   \n"
164     "mov       %%r15d,%%r15d                   \n"
165     ".p2align  5                               \n"
166     "lea       (%%rax),%%eax                   \n"
167     "lea       (%%rbx),%%ebx                   \n"
168     "lea       (%%rcx),%%ecx                   \n"
169     "lea       (%%rdx),%%edx                   \n"
170     "lea       (%%rsi),%%esi                   \n"
171     "lea       (%%rdi),%%edi                   \n"
172     "lea       (%%rbp),%%ebp                   \n"
173     "lea       (%%rsp),%%esp                   \n"
174     ".p2align  5                               \n"
175     "lea       (%%r8),%%r8d                    \n"
176     "lea       (%%r9),%%r9d                    \n"
177     "lea       (%%r10),%%r10d                  \n"
178     "lea       (%%r11),%%r11d                  \n"
179     "lea       (%%r12),%%r12d                  \n"
180     "lea       (%%r13),%%r13d                  \n"
181     "lea       (%%r14),%%r14d                  \n"
182     "lea       (%%r15),%%r15d                  \n"
183
184     ".p2align  5                               \n"
185     "lea       0x10(%%rax),%%eax               \n"
186     "lea       0x10(%%rbx),%%ebx               \n"
187     "lea       0x10(%%rcx),%%ecx               \n"
188     "lea       0x10(%%rdx),%%edx               \n"
189     "lea       0x10(%%rsi),%%esi               \n"
190     "lea       0x10(%%rdi),%%edi               \n"
191     "lea       0x10(%%rbp),%%ebp               \n"
192     "lea       0x10(%%rsp),%%esp               \n"
193     ".p2align  5                               \n"
194     "lea       0x10(%%r8),%%r8d                \n"
195     "lea       0x10(%%r9),%%r9d                \n"
196     "lea       0x10(%%r10),%%r10d              \n"
197     "lea       0x10(%%r11),%%r11d              \n"
198     "lea       0x10(%%r12),%%r12d              \n"
199     "lea       0x10(%%r13),%%r13d              \n"
200     "lea       0x10(%%r14),%%r14d              \n"
201     "lea       0x10(%%r15),%%r15d              \n"
202
203     ".p2align  5                               \n"
204     "add       0x10,%%eax                      \n"
205     "add       0x10,%%ebx                      \n"
206     "add       0x10,%%ecx                      \n"
207     "add       0x10,%%edx                      \n"
208     "add       0x10,%%esi                      \n"
209     "add       0x10,%%edi                      \n"
210     "add       0x10,%%ebp                      \n"
211     "add       0x10,%%esp                      \n"
212     ".p2align  5                               \n"
213     "add       0x10,%%r8d                      \n"
214     "add       0x10,%%r9d                      \n"
215     "add       0x10,%%r10d                     \n"
216     "add       0x10,%%r11d                     \n"
217     "add       0x10,%%r12d                     \n"
218     "add       0x10,%%r13d                     \n"
219     "add       0x10,%%r14d                     \n"
220     "add       0x10,%%r15d                     \n"
221
222     ".p2align  2                               \n"
223   "1:                                          \n"
224     "movq      " MEMACCESS(0) ",%%xmm0         \n"
225     "lea       " MEMLEA(0x8,0) ",%0            \n"
226     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
227     "lea       " MEMLEA(0x20,1) ",%1           \n"
228     "sub       $0x8,%2                         \n"
229     "jg        1b                              \n"
230   : "+r"(src_y),     // %0
231     "+r"(dst_argb),  // %1
232     "+r"(pix)        // %2
233   :
234   : "memory", "cc", "xmm0", "xmm1", "xmm5"
235   );
236 }
237 #endif  // TESTING
238
239 #ifdef HAS_I400TOARGBROW_SSE2
240 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
241   asm volatile (
242     "pcmpeqb   %%xmm5,%%xmm5                   \n"
243     "pslld     $0x18,%%xmm5                    \n"
244     LABELALIGN
245   "1:                                          \n"
246     "movq      " MEMACCESS(0) ",%%xmm0         \n"
247     "lea       " MEMLEA(0x8,0) ",%0            \n"
248     "punpcklbw %%xmm0,%%xmm0                   \n"
249     "movdqa    %%xmm0,%%xmm1                   \n"
250     "punpcklwd %%xmm0,%%xmm0                   \n"
251     "punpckhwd %%xmm1,%%xmm1                   \n"
252     "por       %%xmm5,%%xmm0                   \n"
253     "por       %%xmm5,%%xmm1                   \n"
254     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
255     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
256     "lea       " MEMLEA(0x20,1) ",%1           \n"
257     "sub       $0x8,%2                         \n"
258     "jg        1b                              \n"
259   : "+r"(src_y),     // %0
260     "+r"(dst_argb),  // %1
261     "+r"(pix)        // %2
262   :: "memory", "cc", "xmm0", "xmm1", "xmm5"
263   );
264 }
265 #endif  // HAS_I400TOARGBROW_SSE2
266
267 #ifdef HAS_RGB24TOARGBROW_SSSE3
268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
269   asm volatile (
270     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
271     "pslld     $0x18,%%xmm5                    \n"
272     "movdqa    %3,%%xmm4                       \n"
273     LABELALIGN
274   "1:                                          \n"
275     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
276     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
277     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
278     "lea       " MEMLEA(0x30,0) ",%0           \n"
279     "movdqa    %%xmm3,%%xmm2                   \n"
280     "palignr   $0x8,%%xmm1,%%xmm2              \n"
281     "pshufb    %%xmm4,%%xmm2                   \n"
282     "por       %%xmm5,%%xmm2                   \n"
283     "palignr   $0xc,%%xmm0,%%xmm1              \n"
284     "pshufb    %%xmm4,%%xmm0                   \n"
285     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
286     "por       %%xmm5,%%xmm0                   \n"
287     "pshufb    %%xmm4,%%xmm1                   \n"
288     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
289     "por       %%xmm5,%%xmm1                   \n"
290     "palignr   $0x4,%%xmm3,%%xmm3              \n"
291     "pshufb    %%xmm4,%%xmm3                   \n"
292     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
293     "por       %%xmm5,%%xmm3                   \n"
294     "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
295     "lea       " MEMLEA(0x40,1) ",%1           \n"
296     "sub       $0x10,%2                        \n"
297     "jg        1b                              \n"
298   : "+r"(src_rgb24),  // %0
299     "+r"(dst_argb),  // %1
300     "+r"(pix)        // %2
301   : "m"(kShuffleMaskRGB24ToARGB)  // %3
302   : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
303   );
304 }
305
306 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
307   asm volatile (
308     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
309     "pslld     $0x18,%%xmm5                    \n"
310     "movdqa    %3,%%xmm4                       \n"
311     LABELALIGN
312   "1:                                          \n"
313     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
314     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
315     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
316     "lea       " MEMLEA(0x30,0) ",%0           \n"
317     "movdqa    %%xmm3,%%xmm2                   \n"
318     "palignr   $0x8,%%xmm1,%%xmm2              \n"
319     "pshufb    %%xmm4,%%xmm2                   \n"
320     "por       %%xmm5,%%xmm2                   \n"
321     "palignr   $0xc,%%xmm0,%%xmm1              \n"
322     "pshufb    %%xmm4,%%xmm0                   \n"
323     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
324     "por       %%xmm5,%%xmm0                   \n"
325     "pshufb    %%xmm4,%%xmm1                   \n"
326     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
327     "por       %%xmm5,%%xmm1                   \n"
328     "palignr   $0x4,%%xmm3,%%xmm3              \n"
329     "pshufb    %%xmm4,%%xmm3                   \n"
330     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
331     "por       %%xmm5,%%xmm3                   \n"
332     "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
333     "lea       " MEMLEA(0x40,1) ",%1           \n"
334     "sub       $0x10,%2                        \n"
335     "jg        1b                              \n"
336   : "+r"(src_raw),   // %0
337     "+r"(dst_argb),  // %1
338     "+r"(pix)        // %2
339   : "m"(kShuffleMaskRAWToARGB)  // %3
340   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
341   );
342 }
343
344 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
345   asm volatile (
346     "mov       $0x1080108,%%eax                \n"
347     "movd      %%eax,%%xmm5                    \n"
348     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
349     "mov       $0x20802080,%%eax               \n"
350     "movd      %%eax,%%xmm6                    \n"
351     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
352     "pcmpeqb   %%xmm3,%%xmm3                   \n"
353     "psllw     $0xb,%%xmm3                     \n"
354     "pcmpeqb   %%xmm4,%%xmm4                   \n"
355     "psllw     $0xa,%%xmm4                     \n"
356     "psrlw     $0x5,%%xmm4                     \n"
357     "pcmpeqb   %%xmm7,%%xmm7                   \n"
358     "psllw     $0x8,%%xmm7                     \n"
359     "sub       %0,%1                           \n"
360     "sub       %0,%1                           \n"
361     LABELALIGN
362   "1:                                          \n"
363     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
364     "movdqa    %%xmm0,%%xmm1                   \n"
365     "movdqa    %%xmm0,%%xmm2                   \n"
366     "pand      %%xmm3,%%xmm1                   \n"
367     "psllw     $0xb,%%xmm2                     \n"
368     "pmulhuw   %%xmm5,%%xmm1                   \n"
369     "pmulhuw   %%xmm5,%%xmm2                   \n"
370     "psllw     $0x8,%%xmm1                     \n"
371     "por       %%xmm2,%%xmm1                   \n"
372     "pand      %%xmm4,%%xmm0                   \n"
373     "pmulhuw   %%xmm6,%%xmm0                   \n"
374     "por       %%xmm7,%%xmm0                   \n"
375     "movdqa    %%xmm1,%%xmm2                   \n"
376     "punpcklbw %%xmm0,%%xmm1                   \n"
377     "punpckhbw %%xmm0,%%xmm2                   \n"
378     MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
379     MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
380     "lea       " MEMLEA(0x10,0) ",%0           \n"
381     "sub       $0x8,%2                         \n"
382     "jg        1b                              \n"
383   : "+r"(src),  // %0
384     "+r"(dst),  // %1
385     "+r"(pix)   // %2
386   :
387   : "memory", "cc", "eax", NACL_R14
388     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
389   );
390 }
391
392 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
393   asm volatile (
394     "mov       $0x1080108,%%eax                \n"
395     "movd      %%eax,%%xmm5                    \n"
396     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
397     "mov       $0x42004200,%%eax               \n"
398     "movd      %%eax,%%xmm6                    \n"
399     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
400     "pcmpeqb   %%xmm3,%%xmm3                   \n"
401     "psllw     $0xb,%%xmm3                     \n"
402     "movdqa    %%xmm3,%%xmm4                   \n"
403     "psrlw     $0x6,%%xmm4                     \n"
404     "pcmpeqb   %%xmm7,%%xmm7                   \n"
405     "psllw     $0x8,%%xmm7                     \n"
406     "sub       %0,%1                           \n"
407     "sub       %0,%1                           \n"
408     LABELALIGN
409   "1:                                          \n"
410     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
411     "movdqa    %%xmm0,%%xmm1                   \n"
412     "movdqa    %%xmm0,%%xmm2                   \n"
413     "psllw     $0x1,%%xmm1                     \n"
414     "psllw     $0xb,%%xmm2                     \n"
415     "pand      %%xmm3,%%xmm1                   \n"
416     "pmulhuw   %%xmm5,%%xmm2                   \n"
417     "pmulhuw   %%xmm5,%%xmm1                   \n"
418     "psllw     $0x8,%%xmm1                     \n"
419     "por       %%xmm2,%%xmm1                   \n"
420     "movdqa    %%xmm0,%%xmm2                   \n"
421     "pand      %%xmm4,%%xmm0                   \n"
422     "psraw     $0x8,%%xmm2                     \n"
423     "pmulhuw   %%xmm6,%%xmm0                   \n"
424     "pand      %%xmm7,%%xmm2                   \n"
425     "por       %%xmm2,%%xmm0                   \n"
426     "movdqa    %%xmm1,%%xmm2                   \n"
427     "punpcklbw %%xmm0,%%xmm1                   \n"
428     "punpckhbw %%xmm0,%%xmm2                   \n"
429     MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
430     MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
431     "lea       " MEMLEA(0x10,0) ",%0           \n"
432     "sub       $0x8,%2                         \n"
433     "jg        1b                              \n"
434   : "+r"(src),  // %0
435     "+r"(dst),  // %1
436     "+r"(pix)   // %2
437   :
438   : "memory", "cc", "eax", NACL_R14
439     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
440   );
441 }
442
443 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
444   asm volatile (
445     "mov       $0xf0f0f0f,%%eax                \n"
446     "movd      %%eax,%%xmm4                    \n"
447     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
448     "movdqa    %%xmm4,%%xmm5                   \n"
449     "pslld     $0x4,%%xmm5                     \n"
450     "sub       %0,%1                           \n"
451     "sub       %0,%1                           \n"
452     LABELALIGN
453   "1:                                          \n"
454     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
455     "movdqa    %%xmm0,%%xmm2                   \n"
456     "pand      %%xmm4,%%xmm0                   \n"
457     "pand      %%xmm5,%%xmm2                   \n"
458     "movdqa    %%xmm0,%%xmm1                   \n"
459     "movdqa    %%xmm2,%%xmm3                   \n"
460     "psllw     $0x4,%%xmm1                     \n"
461     "psrlw     $0x4,%%xmm3                     \n"
462     "por       %%xmm1,%%xmm0                   \n"
463     "por       %%xmm3,%%xmm2                   \n"
464     "movdqa    %%xmm0,%%xmm1                   \n"
465     "punpcklbw %%xmm2,%%xmm0                   \n"
466     "punpckhbw %%xmm2,%%xmm1                   \n"
467     MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
468     MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
469     "lea       " MEMLEA(0x10,0) ",%0           \n"
470     "sub       $0x8,%2                         \n"
471     "jg        1b                              \n"
472   : "+r"(src),  // %0
473     "+r"(dst),  // %1
474     "+r"(pix)   // %2
475   :
476   : "memory", "cc", "eax", NACL_R14
477     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
478   );
479 }
480
481 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
482   asm volatile (
483     "movdqa    %3,%%xmm6                       \n"
484     LABELALIGN
485   "1:                                          \n"
486     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
487     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
488     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
489     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
490     "lea       " MEMLEA(0x40,0) ",%0           \n"
491     "pshufb    %%xmm6,%%xmm0                   \n"
492     "pshufb    %%xmm6,%%xmm1                   \n"
493     "pshufb    %%xmm6,%%xmm2                   \n"
494     "pshufb    %%xmm6,%%xmm3                   \n"
495     "movdqa    %%xmm1,%%xmm4                   \n"
496     "psrldq    $0x4,%%xmm1                     \n"
497     "pslldq    $0xc,%%xmm4                     \n"
498     "movdqa    %%xmm2,%%xmm5                   \n"
499     "por       %%xmm4,%%xmm0                   \n"
500     "pslldq    $0x8,%%xmm5                     \n"
501     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
502     "por       %%xmm5,%%xmm1                   \n"
503     "psrldq    $0x8,%%xmm2                     \n"
504     "pslldq    $0x4,%%xmm3                     \n"
505     "por       %%xmm3,%%xmm2                   \n"
506     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
507     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
508     "lea       " MEMLEA(0x30,1) ",%1           \n"
509     "sub       $0x10,%2                        \n"
510     "jg        1b                              \n"
511   : "+r"(src),  // %0
512     "+r"(dst),  // %1
513     "+r"(pix)   // %2
514   : "m"(kShuffleMaskARGBToRGB24)  // %3
515   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
516   );
517 }
518
519 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
520   asm volatile (
521     "movdqa    %3,%%xmm6                       \n"
522     LABELALIGN
523   "1:                                          \n"
524     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
525     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
526     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
527     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
528     "lea       " MEMLEA(0x40,0) ",%0           \n"
529     "pshufb    %%xmm6,%%xmm0                   \n"
530     "pshufb    %%xmm6,%%xmm1                   \n"
531     "pshufb    %%xmm6,%%xmm2                   \n"
532     "pshufb    %%xmm6,%%xmm3                   \n"
533     "movdqa    %%xmm1,%%xmm4                   \n"
534     "psrldq    $0x4,%%xmm1                     \n"
535     "pslldq    $0xc,%%xmm4                     \n"
536     "movdqa    %%xmm2,%%xmm5                   \n"
537     "por       %%xmm4,%%xmm0                   \n"
538     "pslldq    $0x8,%%xmm5                     \n"
539     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
540     "por       %%xmm5,%%xmm1                   \n"
541     "psrldq    $0x8,%%xmm2                     \n"
542     "pslldq    $0x4,%%xmm3                     \n"
543     "por       %%xmm3,%%xmm2                   \n"
544     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
545     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
546     "lea       " MEMLEA(0x30,1) ",%1           \n"
547     "sub       $0x10,%2                        \n"
548     "jg        1b                              \n"
549   : "+r"(src),  // %0
550     "+r"(dst),  // %1
551     "+r"(pix)   // %2
552   : "m"(kShuffleMaskARGBToRAW)  // %3
553   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
554   );
555 }
556
557 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
558   asm volatile (
559     "pcmpeqb   %%xmm3,%%xmm3                   \n"
560     "psrld     $0x1b,%%xmm3                    \n"
561     "pcmpeqb   %%xmm4,%%xmm4                   \n"
562     "psrld     $0x1a,%%xmm4                    \n"
563     "pslld     $0x5,%%xmm4                     \n"
564     "pcmpeqb   %%xmm5,%%xmm5                   \n"
565     "pslld     $0xb,%%xmm5                     \n"
566     LABELALIGN
567   "1:                                          \n"
568     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
569     "movdqa    %%xmm0,%%xmm1                   \n"
570     "movdqa    %%xmm0,%%xmm2                   \n"
571     "pslld     $0x8,%%xmm0                     \n"
572     "psrld     $0x3,%%xmm1                     \n"
573     "psrld     $0x5,%%xmm2                     \n"
574     "psrad     $0x10,%%xmm0                    \n"
575     "pand      %%xmm3,%%xmm1                   \n"
576     "pand      %%xmm4,%%xmm2                   \n"
577     "pand      %%xmm5,%%xmm0                   \n"
578     "por       %%xmm2,%%xmm1                   \n"
579     "por       %%xmm1,%%xmm0                   \n"
580     "packssdw  %%xmm0,%%xmm0                   \n"
581     "lea       " MEMLEA(0x10,0) ",%0           \n"
582     "movq      %%xmm0," MEMACCESS(1) "         \n"
583     "lea       " MEMLEA(0x8,1) ",%1            \n"
584     "sub       $0x4,%2                         \n"
585     "jg        1b                              \n"
586   : "+r"(src),  // %0
587     "+r"(dst),  // %1
588     "+r"(pix)   // %2
589   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
590   );
591 }
592
593 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
594   asm volatile (
595     "pcmpeqb   %%xmm4,%%xmm4                   \n"
596     "psrld     $0x1b,%%xmm4                    \n"
597     "movdqa    %%xmm4,%%xmm5                   \n"
598     "pslld     $0x5,%%xmm5                     \n"
599     "movdqa    %%xmm4,%%xmm6                   \n"
600     "pslld     $0xa,%%xmm6                     \n"
601     "pcmpeqb   %%xmm7,%%xmm7                   \n"
602     "pslld     $0xf,%%xmm7                     \n"
603     LABELALIGN
604   "1:                                          \n"
605     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
606     "movdqa    %%xmm0,%%xmm1                   \n"
607     "movdqa    %%xmm0,%%xmm2                   \n"
608     "movdqa    %%xmm0,%%xmm3                   \n"
609     "psrad     $0x10,%%xmm0                    \n"
610     "psrld     $0x3,%%xmm1                     \n"
611     "psrld     $0x6,%%xmm2                     \n"
612     "psrld     $0x9,%%xmm3                     \n"
613     "pand      %%xmm7,%%xmm0                   \n"
614     "pand      %%xmm4,%%xmm1                   \n"
615     "pand      %%xmm5,%%xmm2                   \n"
616     "pand      %%xmm6,%%xmm3                   \n"
617     "por       %%xmm1,%%xmm0                   \n"
618     "por       %%xmm3,%%xmm2                   \n"
619     "por       %%xmm2,%%xmm0                   \n"
620     "packssdw  %%xmm0,%%xmm0                   \n"
621     "lea       " MEMLEA(0x10,0) ",%0           \n"
622     "movq      %%xmm0," MEMACCESS(1) "         \n"
623     "lea       " MEMLEA(0x8,1) ",%1            \n"
624     "sub       $0x4,%2                         \n"
625     "jg        1b                              \n"
626   : "+r"(src),  // %0
627     "+r"(dst),  // %1
628     "+r"(pix)   // %2
629   :: "memory", "cc",
630     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
631   );
632 }
633
634 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
635   asm volatile (
636     "pcmpeqb   %%xmm4,%%xmm4                   \n"
637     "psllw     $0xc,%%xmm4                     \n"
638     "movdqa    %%xmm4,%%xmm3                   \n"
639     "psrlw     $0x8,%%xmm3                     \n"
640     LABELALIGN
641   "1:                                          \n"
642     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
643     "movdqa    %%xmm0,%%xmm1                   \n"
644     "pand      %%xmm3,%%xmm0                   \n"
645     "pand      %%xmm4,%%xmm1                   \n"
646     "psrlq     $0x4,%%xmm0                     \n"
647     "psrlq     $0x8,%%xmm1                     \n"
648     "por       %%xmm1,%%xmm0                   \n"
649     "packuswb  %%xmm0,%%xmm0                   \n"
650     "lea       " MEMLEA(0x10,0) ",%0           \n"
651     "movq      %%xmm0," MEMACCESS(1) "         \n"
652     "lea       " MEMLEA(0x8,1) ",%1            \n"
653     "sub       $0x4,%2                         \n"
654     "jg        1b                              \n"
655   : "+r"(src),  // %0
656     "+r"(dst),  // %1
657     "+r"(pix)   // %2
658   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
659   );
660 }
661 #endif  // HAS_RGB24TOARGBROW_SSSE3
662
663 #ifdef HAS_ARGBTOYROW_SSSE3
664 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
665 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
666   asm volatile (
667     "movdqa    %3,%%xmm4                       \n"
668     "movdqa    %4,%%xmm5                       \n"
669     LABELALIGN
670   "1:                                          \n"
671     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
672     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
673     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
674     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
675     "pmaddubsw %%xmm4,%%xmm0                   \n"
676     "pmaddubsw %%xmm4,%%xmm1                   \n"
677     "pmaddubsw %%xmm4,%%xmm2                   \n"
678     "pmaddubsw %%xmm4,%%xmm3                   \n"
679     "lea       " MEMLEA(0x40,0) ",%0           \n"
680     "phaddw    %%xmm1,%%xmm0                   \n"
681     "phaddw    %%xmm3,%%xmm2                   \n"
682     "psrlw     $0x7,%%xmm0                     \n"
683     "psrlw     $0x7,%%xmm2                     \n"
684     "packuswb  %%xmm2,%%xmm0                   \n"
685     "paddb     %%xmm5,%%xmm0                   \n"
686     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
687     "lea       " MEMLEA(0x10,1) ",%1           \n"
688     "sub       $0x10,%2                        \n"
689     "jg        1b                              \n"
690   : "+r"(src_argb),  // %0
691     "+r"(dst_y),     // %1
692     "+r"(pix)        // %2
693   : "m"(kARGBToY),   // %3
694     "m"(kAddY16)     // %4
695   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
696   );
697 }
698 #endif  // HAS_ARGBTOYROW_SSSE3
699
700 #ifdef HAS_ARGBTOYJROW_SSSE3
701 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
702 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
703 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
704   asm volatile (
705     "movdqa    %3,%%xmm4                       \n"
706     "movdqa    %4,%%xmm5                       \n"
707     LABELALIGN
708   "1:                                          \n"
709     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
710     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
711     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
712     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
713     "pmaddubsw %%xmm4,%%xmm0                   \n"
714     "pmaddubsw %%xmm4,%%xmm1                   \n"
715     "pmaddubsw %%xmm4,%%xmm2                   \n"
716     "pmaddubsw %%xmm4,%%xmm3                   \n"
717     "lea       " MEMLEA(0x40,0) ",%0           \n"
718     "phaddw    %%xmm1,%%xmm0                   \n"
719     "phaddw    %%xmm3,%%xmm2                   \n"
720     "paddw     %%xmm5,%%xmm0                   \n"
721     "paddw     %%xmm5,%%xmm2                   \n"
722     "psrlw     $0x7,%%xmm0                     \n"
723     "psrlw     $0x7,%%xmm2                     \n"
724     "packuswb  %%xmm2,%%xmm0                   \n"
725     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
726     "lea       " MEMLEA(0x10,1) ",%1           \n"
727     "sub       $0x10,%2                        \n"
728     "jg        1b                              \n"
729   : "+r"(src_argb),  // %0
730     "+r"(dst_y),     // %1
731     "+r"(pix)        // %2
732   : "m"(kARGBToYJ),  // %3
733     "m"(kAddYJ64)    // %4
734   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
735   );
736 }
737 #endif  // HAS_ARGBTOYJROW_SSSE3
738
739 #ifdef HAS_ARGBTOYROW_AVX2
740 // vpermd for vphaddw + vpackuswb vpermd.
741 static const lvec32 kPermdARGBToY_AVX = {
742   0, 4, 1, 5, 2, 6, 3, 7
743 };
744
745 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
746 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
747   asm volatile (
748     "vbroadcastf128 %3,%%ymm4                  \n"
749     "vbroadcastf128 %4,%%ymm5                  \n"
750     "vmovdqu    %5,%%ymm6                      \n"
751     LABELALIGN
752   "1:                                          \n"
753     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
754     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
755     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
756     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
757     "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
758     "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
759     "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
760     "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
761     "lea       " MEMLEA(0x80,0) ",%0           \n"
762     "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
763     "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
764     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
765     "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
766     "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
767     "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
768     "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
769     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
770     "lea       " MEMLEA(0x20,1) ",%1           \n"
771     "sub       $0x20,%2                        \n"
772     "jg        1b                              \n"
773     "vzeroupper                                \n"
774   : "+r"(src_argb),  // %0
775     "+r"(dst_y),     // %1
776     "+r"(pix)        // %2
777   : "m"(kARGBToY),   // %3
778     "m"(kAddY16),    // %4
779     "m"(kPermdARGBToY_AVX)  // %5
780   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
781   );
782 }
783 #endif  // HAS_ARGBTOYROW_AVX2
784
785 #ifdef HAS_ARGBTOYJROW_AVX2
786 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
787 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
788   asm volatile (
789     "vbroadcastf128 %3,%%ymm4                  \n"
790     "vbroadcastf128 %4,%%ymm5                  \n"
791     "vmovdqu    %5,%%ymm6                      \n"
792     LABELALIGN
793   "1:                                          \n"
794     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
795     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
796     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
797     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
798     "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
799     "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
800     "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
801     "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
802     "lea       " MEMLEA(0x80,0) ",%0           \n"
803     "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
804     "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
805     "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
806     "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
807     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
808     "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
809     "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
810     "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
811     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
812     "lea       " MEMLEA(0x20,1) ",%1           \n"
813     "sub       $0x20,%2                        \n"
814     "jg        1b                              \n"
815     "vzeroupper                                \n"
816   : "+r"(src_argb),  // %0
817     "+r"(dst_y),     // %1
818     "+r"(pix)        // %2
819   : "m"(kARGBToYJ),   // %3
820     "m"(kAddYJ64),    // %4
821     "m"(kPermdARGBToY_AVX)  // %5
822   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
823   );
824 }
825 #endif  // HAS_ARGBTOYJROW_AVX2
826
827 #ifdef HAS_ARGBTOUVROW_SSSE3
828 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
829                        uint8* dst_u, uint8* dst_v, int width) {
830   asm volatile (
831     "movdqa    %5,%%xmm3                       \n"
832     "movdqa    %6,%%xmm4                       \n"
833     "movdqa    %7,%%xmm5                       \n"
834     "sub       %1,%2                           \n"
835     LABELALIGN
836   "1:                                          \n"
837     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
838     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
839     "pavgb     %%xmm7,%%xmm0                   \n"
840     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
841     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
842     "pavgb     %%xmm7,%%xmm1                   \n"
843     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
844     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
845     "pavgb     %%xmm7,%%xmm2                   \n"
846     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
847     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
848     "pavgb     %%xmm7,%%xmm6                   \n"
849
850     "lea       " MEMLEA(0x40,0) ",%0           \n"
851     "movdqa    %%xmm0,%%xmm7                   \n"
852     "shufps    $0x88,%%xmm1,%%xmm0             \n"
853     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
854     "pavgb     %%xmm7,%%xmm0                   \n"
855     "movdqa    %%xmm2,%%xmm7                   \n"
856     "shufps    $0x88,%%xmm6,%%xmm2             \n"
857     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
858     "pavgb     %%xmm7,%%xmm2                   \n"
859     "movdqa    %%xmm0,%%xmm1                   \n"
860     "movdqa    %%xmm2,%%xmm6                   \n"
861     "pmaddubsw %%xmm4,%%xmm0                   \n"
862     "pmaddubsw %%xmm4,%%xmm2                   \n"
863     "pmaddubsw %%xmm3,%%xmm1                   \n"
864     "pmaddubsw %%xmm3,%%xmm6                   \n"
865     "phaddw    %%xmm2,%%xmm0                   \n"
866     "phaddw    %%xmm6,%%xmm1                   \n"
867     "psraw     $0x8,%%xmm0                     \n"
868     "psraw     $0x8,%%xmm1                     \n"
869     "packsswb  %%xmm1,%%xmm0                   \n"
870     "paddb     %%xmm5,%%xmm0                   \n"
871     "movlps    %%xmm0," MEMACCESS(1) "         \n"
872     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
873     "lea       " MEMLEA(0x8,1) ",%1            \n"
874     "sub       $0x10,%3                        \n"
875     "jg        1b                              \n"
876   : "+r"(src_argb0),       // %0
877     "+r"(dst_u),           // %1
878     "+r"(dst_v),           // %2
879     "+rm"(width)           // %3
880   : "r"((intptr_t)(src_stride_argb)), // %4
881     "m"(kARGBToV),  // %5
882     "m"(kARGBToU),  // %6
883     "m"(kAddUV128)  // %7
884   : "memory", "cc", NACL_R14
885     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
886   );
887 }
888 #endif  // HAS_ARGBTOUVROW_SSSE3
889
890 #ifdef HAS_ARGBTOUVROW_AVX2
891 // vpshufb for vphaddw + vpackuswb packed to shorts.
892 static const lvec8 kShufARGBToUV_AVX = {
893   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
894   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
895 };
896 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
897                       uint8* dst_u, uint8* dst_v, int width) {
898   asm volatile (
899     "vbroadcastf128 %5,%%ymm5                  \n"
900     "vbroadcastf128 %6,%%ymm6                  \n"
901     "vbroadcastf128 %7,%%ymm7                  \n"
902     "sub       %1,%2                           \n"
903     LABELALIGN
904   "1:                                          \n"
905     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
906     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
907     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
908     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
909     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
910     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
911     VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
912     VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
913     "lea       " MEMLEA(0x80,0) ",%0           \n"
914     "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
915     "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
916     "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
917     "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
918     "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
919     "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
920
921     "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
922     "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
923     "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
924     "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
925     "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
926     "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
927     "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
928     "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
929     "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
930     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
931     "vpshufb    %8,%%ymm0,%%ymm0               \n"
932     "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
933
934     "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
935     VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
936     "lea       " MEMLEA(0x10,1) ",%1           \n"
937     "sub       $0x20,%3                        \n"
938     "jg        1b                              \n"
939     "vzeroupper                                \n"
940   : "+r"(src_argb0),       // %0
941     "+r"(dst_u),           // %1
942     "+r"(dst_v),           // %2
943     "+rm"(width)           // %3
944   : "r"((intptr_t)(src_stride_argb)), // %4
945     "m"(kAddUV128),  // %5
946     "m"(kARGBToV),   // %6
947     "m"(kARGBToU),   // %7
948     "m"(kShufARGBToUV_AVX)  // %8
949   : "memory", "cc", NACL_R14
950     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
951   );
952 }
953 #endif  // HAS_ARGBTOUVROW_AVX2
954
955 #ifdef HAS_ARGBTOUVJROW_SSSE3
956 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
957 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
958                         uint8* dst_u, uint8* dst_v, int width) {
959   asm volatile (
960     "movdqa    %5,%%xmm3                       \n"
961     "movdqa    %6,%%xmm4                       \n"
962     "movdqa    %7,%%xmm5                       \n"
963     "sub       %1,%2                           \n"
964     LABELALIGN
965   "1:                                          \n"
966     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
967     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
968     "pavgb     %%xmm7,%%xmm0                   \n"
969     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
970     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
971     "pavgb     %%xmm7,%%xmm1                   \n"
972     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
973     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
974     "pavgb     %%xmm7,%%xmm2                   \n"
975     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
976     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
977     "pavgb     %%xmm7,%%xmm6                   \n"
978
979     "lea       " MEMLEA(0x40,0) ",%0           \n"
980     "movdqa    %%xmm0,%%xmm7                   \n"
981     "shufps    $0x88,%%xmm1,%%xmm0             \n"
982     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
983     "pavgb     %%xmm7,%%xmm0                   \n"
984     "movdqa    %%xmm2,%%xmm7                   \n"
985     "shufps    $0x88,%%xmm6,%%xmm2             \n"
986     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
987     "pavgb     %%xmm7,%%xmm2                   \n"
988     "movdqa    %%xmm0,%%xmm1                   \n"
989     "movdqa    %%xmm2,%%xmm6                   \n"
990     "pmaddubsw %%xmm4,%%xmm0                   \n"
991     "pmaddubsw %%xmm4,%%xmm2                   \n"
992     "pmaddubsw %%xmm3,%%xmm1                   \n"
993     "pmaddubsw %%xmm3,%%xmm6                   \n"
994     "phaddw    %%xmm2,%%xmm0                   \n"
995     "phaddw    %%xmm6,%%xmm1                   \n"
996     "paddw     %%xmm5,%%xmm0                   \n"
997     "paddw     %%xmm5,%%xmm1                   \n"
998     "psraw     $0x8,%%xmm0                     \n"
999     "psraw     $0x8,%%xmm1                     \n"
1000     "packsswb  %%xmm1,%%xmm0                   \n"
1001     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1002     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1003     "lea       " MEMLEA(0x8,1) ",%1            \n"
1004     "sub       $0x10,%3                        \n"
1005     "jg        1b                              \n"
1006   : "+r"(src_argb0),       // %0
1007     "+r"(dst_u),           // %1
1008     "+r"(dst_v),           // %2
1009     "+rm"(width)           // %3
1010   : "r"((intptr_t)(src_stride_argb)), // %4
1011     "m"(kARGBToVJ),  // %5
1012     "m"(kARGBToUJ),  // %6
1013     "m"(kAddUVJ128)  // %7
1014   : "memory", "cc", NACL_R14
1015     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1016   );
1017 }
1018 #endif  // HAS_ARGBTOUVJROW_SSSE3
1019
1020 #ifdef HAS_ARGBTOUV444ROW_SSSE3
1021 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1022                           int width) {
1023   asm volatile (
1024     "movdqa    %4,%%xmm3                       \n"
1025     "movdqa    %5,%%xmm4                       \n"
1026     "movdqa    %6,%%xmm5                       \n"
1027     "sub       %1,%2                           \n"
1028     LABELALIGN
1029   "1:                                          \n"
1030     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1031     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1032     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1033     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1034     "pmaddubsw %%xmm4,%%xmm0                   \n"
1035     "pmaddubsw %%xmm4,%%xmm1                   \n"
1036     "pmaddubsw %%xmm4,%%xmm2                   \n"
1037     "pmaddubsw %%xmm4,%%xmm6                   \n"
1038     "phaddw    %%xmm1,%%xmm0                   \n"
1039     "phaddw    %%xmm6,%%xmm2                   \n"
1040     "psraw     $0x8,%%xmm0                     \n"
1041     "psraw     $0x8,%%xmm2                     \n"
1042     "packsswb  %%xmm2,%%xmm0                   \n"
1043     "paddb     %%xmm5,%%xmm0                   \n"
1044     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1045     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1046     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1047     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1048     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1049     "pmaddubsw %%xmm3,%%xmm0                   \n"
1050     "pmaddubsw %%xmm3,%%xmm1                   \n"
1051     "pmaddubsw %%xmm3,%%xmm2                   \n"
1052     "pmaddubsw %%xmm3,%%xmm6                   \n"
1053     "phaddw    %%xmm1,%%xmm0                   \n"
1054     "phaddw    %%xmm6,%%xmm2                   \n"
1055     "psraw     $0x8,%%xmm0                     \n"
1056     "psraw     $0x8,%%xmm2                     \n"
1057     "packsswb  %%xmm2,%%xmm0                   \n"
1058     "paddb     %%xmm5,%%xmm0                   \n"
1059     "lea       " MEMLEA(0x40,0) ",%0           \n"
1060     MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
1061     "lea       " MEMLEA(0x10,1) ",%1           \n"
1062     "sub       $0x10,%3                        \n"
1063     "jg        1b                              \n"
1064   : "+r"(src_argb),        // %0
1065     "+r"(dst_u),           // %1
1066     "+r"(dst_v),           // %2
1067     "+rm"(width)           // %3
1068   : "m"(kARGBToV),  // %4
1069     "m"(kARGBToU),  // %5
1070     "m"(kAddUV128)  // %6
1071   : "memory", "cc", NACL_R14
1072     "xmm0", "xmm1", "xmm2", "xmm6"
1073   );
1074 }
1075 #endif  // HAS_ARGBTOUV444ROW_SSSE3
1076
1077 #ifdef HAS_ARGBTOUV422ROW_SSSE3
1078 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1079                           uint8* dst_u, uint8* dst_v, int width) {
1080   asm volatile (
1081     "movdqa    %4,%%xmm3                       \n"
1082     "movdqa    %5,%%xmm4                       \n"
1083     "movdqa    %6,%%xmm5                       \n"
1084     "sub       %1,%2                           \n"
1085     LABELALIGN
1086   "1:                                          \n"
1087     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1088     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1089     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1090     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1091     "lea       " MEMLEA(0x40,0) ",%0           \n"
1092     "movdqa    %%xmm0,%%xmm7                   \n"
1093     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1094     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1095     "pavgb     %%xmm7,%%xmm0                   \n"
1096     "movdqa    %%xmm2,%%xmm7                   \n"
1097     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1098     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1099     "pavgb     %%xmm7,%%xmm2                   \n"
1100     "movdqa    %%xmm0,%%xmm1                   \n"
1101     "movdqa    %%xmm2,%%xmm6                   \n"
1102     "pmaddubsw %%xmm4,%%xmm0                   \n"
1103     "pmaddubsw %%xmm4,%%xmm2                   \n"
1104     "pmaddubsw %%xmm3,%%xmm1                   \n"
1105     "pmaddubsw %%xmm3,%%xmm6                   \n"
1106     "phaddw    %%xmm2,%%xmm0                   \n"
1107     "phaddw    %%xmm6,%%xmm1                   \n"
1108     "psraw     $0x8,%%xmm0                     \n"
1109     "psraw     $0x8,%%xmm1                     \n"
1110     "packsswb  %%xmm1,%%xmm0                   \n"
1111     "paddb     %%xmm5,%%xmm0                   \n"
1112     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1113     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1114     "lea       " MEMLEA(0x8,1) ",%1            \n"
1115     "sub       $0x10,%3                        \n"
1116     "jg        1b                              \n"
1117   : "+r"(src_argb0),       // %0
1118     "+r"(dst_u),           // %1
1119     "+r"(dst_v),           // %2
1120     "+rm"(width)           // %3
1121   : "m"(kARGBToV),  // %4
1122     "m"(kARGBToU),  // %5
1123     "m"(kAddUV128)  // %6
1124   : "memory", "cc", NACL_R14
1125     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1126   );
1127 }
1128 #endif  // HAS_ARGBTOUV422ROW_SSSE3
1129
1130 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
1131   asm volatile (
1132     "movdqa    %4,%%xmm5                       \n"
1133     "movdqa    %3,%%xmm4                       \n"
1134     LABELALIGN
1135   "1:                                          \n"
1136     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1137     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1138     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1139     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1140     "pmaddubsw %%xmm4,%%xmm0                   \n"
1141     "pmaddubsw %%xmm4,%%xmm1                   \n"
1142     "pmaddubsw %%xmm4,%%xmm2                   \n"
1143     "pmaddubsw %%xmm4,%%xmm3                   \n"
1144     "lea       " MEMLEA(0x40,0) ",%0           \n"
1145     "phaddw    %%xmm1,%%xmm0                   \n"
1146     "phaddw    %%xmm3,%%xmm2                   \n"
1147     "psrlw     $0x7,%%xmm0                     \n"
1148     "psrlw     $0x7,%%xmm2                     \n"
1149     "packuswb  %%xmm2,%%xmm0                   \n"
1150     "paddb     %%xmm5,%%xmm0                   \n"
1151     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1152     "lea       " MEMLEA(0x10,1) ",%1           \n"
1153     "sub       $0x10,%2                        \n"
1154     "jg        1b                              \n"
1155   : "+r"(src_bgra),  // %0
1156     "+r"(dst_y),     // %1
1157     "+r"(pix)        // %2
1158   : "m"(kBGRAToY),   // %3
1159     "m"(kAddY16)     // %4
1160   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1161   );
1162 }
1163
1164 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1165                        uint8* dst_u, uint8* dst_v, int width) {
1166   asm volatile (
1167     "movdqa    %5,%%xmm3                       \n"
1168     "movdqa    %6,%%xmm4                       \n"
1169     "movdqa    %7,%%xmm5                       \n"
1170     "sub       %1,%2                           \n"
1171     LABELALIGN
1172   "1:                                          \n"
1173     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1174     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1175     "pavgb     %%xmm7,%%xmm0                   \n"
1176     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1177     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1178     "pavgb     %%xmm7,%%xmm1                   \n"
1179     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1180     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1181     "pavgb     %%xmm7,%%xmm2                   \n"
1182     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1183     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1184     "pavgb     %%xmm7,%%xmm6                   \n"
1185
1186     "lea       " MEMLEA(0x40,0) ",%0           \n"
1187     "movdqa    %%xmm0,%%xmm7                   \n"
1188     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1189     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1190     "pavgb     %%xmm7,%%xmm0                   \n"
1191     "movdqa    %%xmm2,%%xmm7                   \n"
1192     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1193     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1194     "pavgb     %%xmm7,%%xmm2                   \n"
1195     "movdqa    %%xmm0,%%xmm1                   \n"
1196     "movdqa    %%xmm2,%%xmm6                   \n"
1197     "pmaddubsw %%xmm4,%%xmm0                   \n"
1198     "pmaddubsw %%xmm4,%%xmm2                   \n"
1199     "pmaddubsw %%xmm3,%%xmm1                   \n"
1200     "pmaddubsw %%xmm3,%%xmm6                   \n"
1201     "phaddw    %%xmm2,%%xmm0                   \n"
1202     "phaddw    %%xmm6,%%xmm1                   \n"
1203     "psraw     $0x8,%%xmm0                     \n"
1204     "psraw     $0x8,%%xmm1                     \n"
1205     "packsswb  %%xmm1,%%xmm0                   \n"
1206     "paddb     %%xmm5,%%xmm0                   \n"
1207     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1208     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1209     "lea       " MEMLEA(0x8,1) ",%1            \n"
1210     "sub       $0x10,%3                        \n"
1211     "jg        1b                              \n"
1212   : "+r"(src_bgra0),       // %0
1213     "+r"(dst_u),           // %1
1214     "+r"(dst_v),           // %2
1215     "+rm"(width)           // %3
1216   : "r"((intptr_t)(src_stride_bgra)), // %4
1217     "m"(kBGRAToV),  // %5
1218     "m"(kBGRAToU),  // %6
1219     "m"(kAddUV128)  // %7
1220   : "memory", "cc", NACL_R14
1221     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1222   );
1223 }
1224
1225 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1226   asm volatile (
1227     "movdqa    %4,%%xmm5                       \n"
1228     "movdqa    %3,%%xmm4                       \n"
1229     LABELALIGN
1230   "1:                                          \n"
1231     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1232     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1233     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1234     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1235     "pmaddubsw %%xmm4,%%xmm0                   \n"
1236     "pmaddubsw %%xmm4,%%xmm1                   \n"
1237     "pmaddubsw %%xmm4,%%xmm2                   \n"
1238     "pmaddubsw %%xmm4,%%xmm3                   \n"
1239     "lea       " MEMLEA(0x40,0) ",%0           \n"
1240     "phaddw    %%xmm1,%%xmm0                   \n"
1241     "phaddw    %%xmm3,%%xmm2                   \n"
1242     "psrlw     $0x7,%%xmm0                     \n"
1243     "psrlw     $0x7,%%xmm2                     \n"
1244     "packuswb  %%xmm2,%%xmm0                   \n"
1245     "paddb     %%xmm5,%%xmm0                   \n"
1246     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1247     "lea       " MEMLEA(0x10,1) ",%1           \n"
1248     "sub       $0x10,%2                        \n"
1249     "jg        1b                              \n"
1250   : "+r"(src_abgr),  // %0
1251     "+r"(dst_y),     // %1
1252     "+r"(pix)        // %2
1253   : "m"(kABGRToY),   // %3
1254     "m"(kAddY16)     // %4
1255   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1256   );
1257 }
1258
1259 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1260   asm volatile (
1261     "movdqa    %4,%%xmm5                       \n"
1262     "movdqa    %3,%%xmm4                       \n"
1263     LABELALIGN
1264   "1:                                          \n"
1265     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1266     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1267     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1268     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1269     "pmaddubsw %%xmm4,%%xmm0                   \n"
1270     "pmaddubsw %%xmm4,%%xmm1                   \n"
1271     "pmaddubsw %%xmm4,%%xmm2                   \n"
1272     "pmaddubsw %%xmm4,%%xmm3                   \n"
1273     "lea       " MEMLEA(0x40,0) ",%0           \n"
1274     "phaddw    %%xmm1,%%xmm0                   \n"
1275     "phaddw    %%xmm3,%%xmm2                   \n"
1276     "psrlw     $0x7,%%xmm0                     \n"
1277     "psrlw     $0x7,%%xmm2                     \n"
1278     "packuswb  %%xmm2,%%xmm0                   \n"
1279     "paddb     %%xmm5,%%xmm0                   \n"
1280     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1281     "lea       " MEMLEA(0x10,1) ",%1           \n"
1282     "sub       $0x10,%2                        \n"
1283     "jg        1b                              \n"
1284   : "+r"(src_rgba),  // %0
1285     "+r"(dst_y),     // %1
1286     "+r"(pix)        // %2
1287   : "m"(kRGBAToY),   // %3
1288     "m"(kAddY16)     // %4
1289   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1290   );
1291 }
1292
1293 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1294                        uint8* dst_u, uint8* dst_v, int width) {
1295   asm volatile (
1296     "movdqa    %5,%%xmm3                       \n"
1297     "movdqa    %6,%%xmm4                       \n"
1298     "movdqa    %7,%%xmm5                       \n"
1299     "sub       %1,%2                           \n"
1300     LABELALIGN
1301   "1:                                          \n"
1302     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1303     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1304     "pavgb     %%xmm7,%%xmm0                   \n"
1305     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1306     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1307     "pavgb     %%xmm7,%%xmm1                   \n"
1308     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1309     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1310     "pavgb     %%xmm7,%%xmm2                   \n"
1311     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1312     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1313     "pavgb     %%xmm7,%%xmm6                   \n"
1314
1315     "lea       " MEMLEA(0x40,0) ",%0           \n"
1316     "movdqa    %%xmm0,%%xmm7                   \n"
1317     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1318     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1319     "pavgb     %%xmm7,%%xmm0                   \n"
1320     "movdqa    %%xmm2,%%xmm7                   \n"
1321     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1322     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1323     "pavgb     %%xmm7,%%xmm2                   \n"
1324     "movdqa    %%xmm0,%%xmm1                   \n"
1325     "movdqa    %%xmm2,%%xmm6                   \n"
1326     "pmaddubsw %%xmm4,%%xmm0                   \n"
1327     "pmaddubsw %%xmm4,%%xmm2                   \n"
1328     "pmaddubsw %%xmm3,%%xmm1                   \n"
1329     "pmaddubsw %%xmm3,%%xmm6                   \n"
1330     "phaddw    %%xmm2,%%xmm0                   \n"
1331     "phaddw    %%xmm6,%%xmm1                   \n"
1332     "psraw     $0x8,%%xmm0                     \n"
1333     "psraw     $0x8,%%xmm1                     \n"
1334     "packsswb  %%xmm1,%%xmm0                   \n"
1335     "paddb     %%xmm5,%%xmm0                   \n"
1336     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1337     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1338     "lea       " MEMLEA(0x8,1) ",%1            \n"
1339     "sub       $0x10,%3                        \n"
1340     "jg        1b                              \n"
1341   : "+r"(src_abgr0),       // %0
1342     "+r"(dst_u),           // %1
1343     "+r"(dst_v),           // %2
1344     "+rm"(width)           // %3
1345   : "r"((intptr_t)(src_stride_abgr)), // %4
1346     "m"(kABGRToV),  // %5
1347     "m"(kABGRToU),  // %6
1348     "m"(kAddUV128)  // %7
1349   : "memory", "cc", NACL_R14
1350     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1351   );
1352 }
1353
1354 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1355                        uint8* dst_u, uint8* dst_v, int width) {
1356   asm volatile (
1357     "movdqa    %5,%%xmm3                       \n"
1358     "movdqa    %6,%%xmm4                       \n"
1359     "movdqa    %7,%%xmm5                       \n"
1360     "sub       %1,%2                           \n"
1361     LABELALIGN
1362   "1:                                          \n"
1363     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1364     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1365     "pavgb     %%xmm7,%%xmm0                   \n"
1366     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1367     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1368     "pavgb     %%xmm7,%%xmm1                   \n"
1369     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1370     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1371     "pavgb     %%xmm7,%%xmm2                   \n"
1372     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1373     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1374     "pavgb     %%xmm7,%%xmm6                   \n"
1375
1376     "lea       " MEMLEA(0x40,0) ",%0           \n"
1377     "movdqa    %%xmm0,%%xmm7                   \n"
1378     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1379     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1380     "pavgb     %%xmm7,%%xmm0                   \n"
1381     "movdqa    %%xmm2,%%xmm7                   \n"
1382     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1383     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1384     "pavgb     %%xmm7,%%xmm2                   \n"
1385     "movdqa    %%xmm0,%%xmm1                   \n"
1386     "movdqa    %%xmm2,%%xmm6                   \n"
1387     "pmaddubsw %%xmm4,%%xmm0                   \n"
1388     "pmaddubsw %%xmm4,%%xmm2                   \n"
1389     "pmaddubsw %%xmm3,%%xmm1                   \n"
1390     "pmaddubsw %%xmm3,%%xmm6                   \n"
1391     "phaddw    %%xmm2,%%xmm0                   \n"
1392     "phaddw    %%xmm6,%%xmm1                   \n"
1393     "psraw     $0x8,%%xmm0                     \n"
1394     "psraw     $0x8,%%xmm1                     \n"
1395     "packsswb  %%xmm1,%%xmm0                   \n"
1396     "paddb     %%xmm5,%%xmm0                   \n"
1397     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1398     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1399     "lea       " MEMLEA(0x8,1) ",%1            \n"
1400     "sub       $0x10,%3                        \n"
1401     "jg        1b                              \n"
1402   : "+r"(src_rgba0),       // %0
1403     "+r"(dst_u),           // %1
1404     "+r"(dst_v),           // %2
1405     "+rm"(width)           // %3
1406   : "r"((intptr_t)(src_stride_rgba)), // %4
1407     "m"(kRGBAToV),  // %5
1408     "m"(kRGBAToU),  // %6
1409     "m"(kAddUV128)  // %7
1410   : "memory", "cc", NACL_R14
1411     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1412   );
1413 }
1414
1415 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1416
1417 // YUV to RGB conversion constants.
1418 // Y contribution to R,G,B.  Scale and bias.
1419 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
1420 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
1421
1422 // U and V contributions to R,G,B.
1423 #define UB -128 /* -min(128, round(2.018 * 64)) */
1424 #define UG 25 /* -round(-0.391 * 64) */
1425 #define VG 52 /* -round(-0.813 * 64) */
1426 #define VR -102 /* -round(1.596 * 64) */
1427
1428 // Bias values to subtract 16 from Y and 128 from U and V.
1429 #define BB (UB * 128            - YGB)
1430 #define BG (UG * 128 + VG * 128 - YGB)
1431 #define BR            (VR * 128 - YGB)
1432
1433 struct YuvConstants {
1434   lvec8 kUVToB;     // 0
1435   lvec8 kUVToG;     // 32
1436   lvec8 kUVToR;     // 64
1437   lvec16 kUVBiasB;  // 96
1438   lvec16 kUVBiasG;  // 128
1439   lvec16 kUVBiasR;  // 160
1440   lvec16 kYToRgb;   // 192
1441 };
1442
1443 // BT601 constants for YUV to RGB.
1444 static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
1445   { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
1446     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
1447   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
1448     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1449   { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
1450     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
1451   { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1452   { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1453   { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1454   { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1455 };
1456
1457 // BT601 constants for NV21 where chroma plane is VU instead of UV.
1458 static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
1459   { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
1460     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
1461   { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
1462     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1463   { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
1464     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
1465   { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1466   { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1467   { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1468   { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1469 };
1470
1471 // Read 8 UV from 411
1472 #define READYUV444                                                             \
1473     "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
1474     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
1475     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
1476     "punpcklbw  %%xmm1,%%xmm0                                   \n"
1477
1478 // Read 4 UV from 422, upsample to 8 UV
1479 #define READYUV422                                                             \
1480     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
1481     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
1482     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
1483     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
1484     "punpcklwd  %%xmm0,%%xmm0                                   \n"
1485
1486 // Read 2 UV from 411, upsample to 8 UV
1487 #define READYUV411                                                             \
1488     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
1489     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
1490     "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
1491     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
1492     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
1493     "punpckldq  %%xmm0,%%xmm0                                   \n"
1494
1495 // Read 4 UV from NV12, upsample to 8 UV
1496 #define READNV12                                                               \
1497     "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
1498     "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
1499     "punpcklwd  %%xmm0,%%xmm0                                   \n"
1500
1501 // Convert 8 pixels: 8 UV and 8 Y
1502 #define YUVTORGB(YuvConstants)                                                 \
1503     "movdqa     %%xmm0,%%xmm1                                   \n"            \
1504     "movdqa     %%xmm0,%%xmm2                                   \n"            \
1505     "movdqa     %%xmm0,%%xmm3                                   \n"            \
1506     "movdqa     " MEMACCESS2(96, [YuvConstants]) ",%%xmm0       \n"            \
1507     "pmaddubsw  " MEMACCESS([YuvConstants]) ",%%xmm1            \n"            \
1508     "psubw      %%xmm1,%%xmm0                                   \n"            \
1509     "movdqa     " MEMACCESS2(128, [YuvConstants]) ",%%xmm1      \n"            \
1510     "pmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%xmm2       \n"            \
1511     "psubw      %%xmm2,%%xmm1                                   \n"            \
1512     "movdqa     " MEMACCESS2(160, [YuvConstants]) ",%%xmm2      \n"            \
1513     "pmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%xmm3       \n"            \
1514     "psubw      %%xmm3,%%xmm2                                   \n"            \
1515     "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
1516     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
1517     "punpcklbw  %%xmm3,%%xmm3                                   \n"            \
1518     "pmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%xmm3      \n"            \
1519     "paddsw     %%xmm3,%%xmm0                                   \n"            \
1520     "paddsw     %%xmm3,%%xmm1                                   \n"            \
1521     "paddsw     %%xmm3,%%xmm2                                   \n"            \
1522     "psraw      $0x6,%%xmm0                                     \n"            \
1523     "psraw      $0x6,%%xmm1                                     \n"            \
1524     "psraw      $0x6,%%xmm2                                     \n"            \
1525     "packuswb   %%xmm0,%%xmm0                                   \n"            \
1526     "packuswb   %%xmm1,%%xmm1                                   \n"            \
1527     "packuswb   %%xmm2,%%xmm2                                   \n"
1528
1529 // Store 8 ARGB values. Assumes XMM5 is zero.
1530 #define STOREARGB                                                              \
1531     "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
1532     "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
1533     "movdqa     %%xmm0,%%xmm1                                    \n"           \
1534     "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
1535     "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
1536     "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
1537     "movdqu     %%xmm1," MEMACCESS2(0x10,[dst_argb]) "           \n"           \
1538     "lea        " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]          \n"
1539
1540 // Store 8 BGRA values. Assumes XMM5 is zero.
1541 #define STOREBGRA                                                              \
1542     "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
1543     "punpcklbw %%xmm0,%%xmm1                                     \n"           \
1544     "punpcklbw %%xmm2,%%xmm5                                     \n"           \
1545     "movdqa    %%xmm5,%%xmm0                                     \n"           \
1546     "punpcklwd %%xmm1,%%xmm5                                     \n"           \
1547     "punpckhwd %%xmm1,%%xmm0                                     \n"           \
1548     "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \
1549     "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "            \n"           \
1550     "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra]           \n"
1551
1552 // Store 8 ABGR values. Assumes XMM5 is zero.
1553 #define STOREABGR                                                              \
1554     "punpcklbw %%xmm1,%%xmm2                                     \n"           \
1555     "punpcklbw %%xmm5,%%xmm0                                     \n"           \
1556     "movdqa    %%xmm2,%%xmm1                                     \n"           \
1557     "punpcklwd %%xmm0,%%xmm2                                     \n"           \
1558     "punpckhwd %%xmm0,%%xmm1                                     \n"           \
1559     "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \
1560     "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "            \n"           \
1561     "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr]           \n"
1562
1563 // Store 8 RGBA values. Assumes XMM5 is zero.
1564 #define STORERGBA                                                              \
1565     "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
1566     "punpcklbw %%xmm2,%%xmm1                                     \n"           \
1567     "punpcklbw %%xmm0,%%xmm5                                     \n"           \
1568     "movdqa    %%xmm5,%%xmm0                                     \n"           \
1569     "punpcklwd %%xmm1,%%xmm5                                     \n"           \
1570     "punpckhwd %%xmm1,%%xmm0                                     \n"           \
1571     "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
1572     "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "            \n"           \
1573     "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba]           \n"
1574
1575 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1576                                 const uint8* u_buf,
1577                                 const uint8* v_buf,
1578                                 uint8* dst_argb,
1579                                 int width) {
1580   asm volatile (
1581     "sub       %[u_buf],%[v_buf]               \n"
1582     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1583     LABELALIGN
1584   "1:                                          \n"
1585     READYUV444
1586     YUVTORGB(kYuvConstants)
1587     STOREARGB
1588     "sub       $0x8,%[width]                   \n"
1589     "jg        1b                              \n"
1590   : [y_buf]"+r"(y_buf),    // %[y_buf]
1591     [u_buf]"+r"(u_buf),    // %[u_buf]
1592     [v_buf]"+r"(v_buf),    // %[v_buf]
1593     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1594     [width]"+rm"(width)    // %[width]
1595   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1596   : "memory", "cc", NACL_R14
1597     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1598   );
1599 }
1600
1601 // TODO(fbarchard): Consider putting masks into constants.
1602 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1603                                  const uint8* u_buf,
1604                                  const uint8* v_buf,
1605                                  uint8* dst_rgb24,
1606                                  int width) {
1607   asm volatile (
1608     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1609     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
1610     "sub       %[u_buf],%[v_buf]               \n"
1611     LABELALIGN
1612   "1:                                          \n"
1613     READYUV422
1614     YUVTORGB(kYuvConstants)
1615     "punpcklbw %%xmm1,%%xmm0                   \n"
1616     "punpcklbw %%xmm2,%%xmm2                   \n"
1617     "movdqa    %%xmm0,%%xmm1                   \n"
1618     "punpcklwd %%xmm2,%%xmm0                   \n"
1619     "punpckhwd %%xmm2,%%xmm1                   \n"
1620     "pshufb    %%xmm5,%%xmm0                   \n"
1621     "pshufb    %%xmm6,%%xmm1                   \n"
1622     "palignr   $0xc,%%xmm0,%%xmm1              \n"
1623     "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1624     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1625     "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1626     "subl      $0x8,%[width]                   \n"
1627     "jg        1b                              \n"
1628   : [y_buf]"+r"(y_buf),    // %[y_buf]
1629     [u_buf]"+r"(u_buf),    // %[u_buf]
1630     [v_buf]"+r"(v_buf),    // %[v_buf]
1631     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
1632 // TODO(fbarchard): Make width a register for 32 bit.
1633 #if defined(__i386__) && defined(__pic__)
1634     [width]"+m"(width)     // %[width]
1635 #else
1636     [width]"+rm"(width)    // %[width]
1637 #endif
1638   : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1639     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1640     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1641   : "memory", "cc", NACL_R14
1642     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
1643   );
1644 }
1645
1646 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1647                                const uint8* u_buf,
1648                                const uint8* v_buf,
1649                                uint8* dst_raw,
1650                                int width) {
1651   asm volatile (
1652     "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1653     "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
1654     "sub       %[u_buf],%[v_buf]               \n"
1655     LABELALIGN
1656   "1:                                          \n"
1657     READYUV422
1658     YUVTORGB(kYuvConstants)
1659     "punpcklbw %%xmm1,%%xmm0                   \n"
1660     "punpcklbw %%xmm2,%%xmm2                   \n"
1661     "movdqa    %%xmm0,%%xmm1                   \n"
1662     "punpcklwd %%xmm2,%%xmm0                   \n"
1663     "punpckhwd %%xmm2,%%xmm1                   \n"
1664     "pshufb    %%xmm5,%%xmm0                   \n"
1665     "pshufb    %%xmm6,%%xmm1                   \n"
1666     "palignr   $0xc,%%xmm0,%%xmm1              \n"
1667     "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
1668     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
1669     "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
1670     "subl      $0x8,%[width]                   \n"
1671     "jg        1b                              \n"
1672   : [y_buf]"+r"(y_buf),    // %[y_buf]
1673     [u_buf]"+r"(u_buf),    // %[u_buf]
1674     [v_buf]"+r"(v_buf),    // %[v_buf]
1675     [dst_raw]"+r"(dst_raw),  // %[dst_raw]
1676 // TODO(fbarchard): Make width a register for 32 bit.
1677 #if defined(__i386__) && defined(__pic__)
1678     [width]"+m"(width)    // %[width]
1679 #else
1680     [width]"+rm"(width)    // %[width]
1681 #endif
1682   : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1683     [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1684     [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
1685   : "memory", "cc", NACL_R14
1686     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
1687   );
1688 }
1689
1690 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1691                                 const uint8* u_buf,
1692                                 const uint8* v_buf,
1693                                 uint8* dst_argb,
1694                                 int width) {
1695   asm volatile (
1696     "sub       %[u_buf],%[v_buf]               \n"
1697     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1698     LABELALIGN
1699   "1:                                          \n"
1700     READYUV422
1701     YUVTORGB(kYuvConstants)
1702     STOREARGB
1703     "sub       $0x8,%[width]                   \n"
1704     "jg        1b                              \n"
1705   : [y_buf]"+r"(y_buf),    // %[y_buf]
1706     [u_buf]"+r"(u_buf),    // %[u_buf]
1707     [v_buf]"+r"(v_buf),    // %[v_buf]
1708     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1709     [width]"+rm"(width)    // %[width]
1710   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1711   : "memory", "cc", NACL_R14
1712     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1713   );
1714 }
1715
1716 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1717                                 const uint8* u_buf,
1718                                 const uint8* v_buf,
1719                                 uint8* dst_argb,
1720                                 int width) {
1721   asm volatile (
1722     "sub       %[u_buf],%[v_buf]               \n"
1723     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1724     LABELALIGN
1725   "1:                                          \n"
1726     READYUV411
1727     YUVTORGB(kYuvConstants)
1728     STOREARGB
1729     "sub       $0x8,%[width]                   \n"
1730     "jg        1b                              \n"
1731   : [y_buf]"+r"(y_buf),    // %[y_buf]
1732     [u_buf]"+r"(u_buf),    // %[u_buf]
1733     [v_buf]"+r"(v_buf),    // %[v_buf]
1734     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1735     [width]"+rm"(width)    // %[width]
1736   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1737   : "memory", "cc", NACL_R14
1738     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1739   );
1740 }
1741
1742 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1743                                 const uint8* uv_buf,
1744                                 uint8* dst_argb,
1745                                 int width) {
1746   asm volatile (
1747     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1748     LABELALIGN
1749   "1:                                          \n"
1750     READNV12
1751     YUVTORGB(kYuvConstants)
1752     STOREARGB
1753     "sub       $0x8,%[width]                   \n"
1754     "jg        1b                              \n"
1755   : [y_buf]"+r"(y_buf),    // %[y_buf]
1756     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1757     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1758     [width]"+rm"(width)    // %[width]
1759   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1760   // Does not use r14.
1761   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1762   );
1763 }
1764
1765 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1766                                 const uint8* uv_buf,
1767                                 uint8* dst_argb,
1768                                 int width) {
1769   asm volatile (
1770     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1771     LABELALIGN
1772   "1:                                          \n"
1773     READNV12
1774     YUVTORGB(kYuvConstants)
1775     STOREARGB
1776     "sub       $0x8,%[width]                   \n"
1777     "jg        1b                              \n"
1778   : [y_buf]"+r"(y_buf),    // %[y_buf]
1779     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1780     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1781     [width]"+rm"(width)    // %[width]
1782   : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
1783   // Does not use r14.
1784   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1785   );
1786 }
1787
1788 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1789                                 const uint8* u_buf,
1790                                 const uint8* v_buf,
1791                                 uint8* dst_bgra,
1792                                 int width) {
1793   asm volatile (
1794     "sub       %[u_buf],%[v_buf]               \n"
1795     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1796     LABELALIGN
1797   "1:                                          \n"
1798     READYUV422
1799     YUVTORGB(kYuvConstants)
1800     STOREBGRA
1801     "sub       $0x8,%[width]                   \n"
1802     "jg        1b                              \n"
1803   : [y_buf]"+r"(y_buf),    // %[y_buf]
1804     [u_buf]"+r"(u_buf),    // %[u_buf]
1805     [v_buf]"+r"(v_buf),    // %[v_buf]
1806     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
1807     [width]"+rm"(width)    // %[width]
1808   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1809   : "memory", "cc", NACL_R14
1810     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1811   );
1812 }
1813
1814 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
1815                                 const uint8* u_buf,
1816                                 const uint8* v_buf,
1817                                 uint8* dst_abgr,
1818                                 int width) {
1819   asm volatile (
1820     "sub       %[u_buf],%[v_buf]               \n"
1821     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1822     LABELALIGN
1823   "1:                                          \n"
1824     READYUV422
1825     YUVTORGB(kYuvConstants)
1826     STOREABGR
1827     "sub       $0x8,%[width]                   \n"
1828     "jg        1b                              \n"
1829   : [y_buf]"+r"(y_buf),    // %[y_buf]
1830     [u_buf]"+r"(u_buf),    // %[u_buf]
1831     [v_buf]"+r"(v_buf),    // %[v_buf]
1832     [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
1833     [width]"+rm"(width)    // %[width]
1834   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1835   : "memory", "cc", NACL_R14
1836     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1837   );
1838 }
1839
1840 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1841                                 const uint8* u_buf,
1842                                 const uint8* v_buf,
1843                                 uint8* dst_rgba,
1844                                 int width) {
1845   asm volatile (
1846     "sub       %[u_buf],%[v_buf]               \n"
1847     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1848     LABELALIGN
1849   "1:                                          \n"
1850     READYUV422
1851     YUVTORGB(kYuvConstants)
1852     STORERGBA
1853     "sub       $0x8,%[width]                   \n"
1854     "jg        1b                              \n"
1855   : [y_buf]"+r"(y_buf),    // %[y_buf]
1856     [u_buf]"+r"(u_buf),    // %[u_buf]
1857     [v_buf]"+r"(v_buf),    // %[v_buf]
1858     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
1859     [width]"+rm"(width)    // %[width]
1860   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1861   : "memory", "cc", NACL_R14
1862     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1863   );
1864 }
1865
1866 #endif  // HAS_I422TOARGBROW_SSSE3
1867
1868 // Read 8 UV from 422, upsample to 16 UV.
1869 #define READYUV422_AVX2                                                        \
1870     "vmovq       " MEMACCESS([u_buf]) ",%%xmm0                      \n"        \
1871     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
1872     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
1873     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1874     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1875     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"
1876
1877 // Convert 16 pixels: 16 UV and 16 Y.
1878 #define YUVTORGB_AVX2(YuvConstants)                                            \
1879     "vpmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2   \n"        \
1880     "vpmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1   \n"        \
1881     "vpmaddubsw  " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0        \n"        \
1882     "vmovdqu     " MEMACCESS2(160, [YuvConstants]) ",%%ymm3         \n"        \
1883     "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
1884     "vmovdqu     " MEMACCESS2(128, [YuvConstants]) ",%%ymm2         \n"        \
1885     "vpsubw      %%ymm1,%%ymm2,%%ymm1                               \n"        \
1886     "vmovdqu     " MEMACCESS2(96, [YuvConstants]) ",%%ymm1          \n"        \
1887     "vpsubw      %%ymm0,%%ymm1,%%ymm0                               \n"        \
1888     "vmovdqu     " MEMACCESS([y_buf]) ",%%xmm3                      \n"        \
1889     "lea         " MEMLEA(0x10, [y_buf]) ",%[y_buf]                 \n"        \
1890     "vpermq      $0xd8,%%ymm3,%%ymm3                                \n"        \
1891     "vpunpcklbw  %%ymm3,%%ymm3,%%ymm3                               \n"        \
1892     "vpmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3  \n"        \
1893     "vpaddsw     %%ymm3,%%ymm0,%%ymm0           \n"                            \
1894     "vpaddsw     %%ymm3,%%ymm1,%%ymm1           \n"                            \
1895     "vpaddsw     %%ymm3,%%ymm2,%%ymm2           \n"                            \
1896     "vpsraw      $0x6,%%ymm0,%%ymm0             \n"                            \
1897     "vpsraw      $0x6,%%ymm1,%%ymm1             \n"                            \
1898     "vpsraw      $0x6,%%ymm2,%%ymm2             \n"                            \
1899     "vpackuswb   %%ymm0,%%ymm0,%%ymm0           \n"                            \
1900     "vpackuswb   %%ymm1,%%ymm1,%%ymm1           \n"                            \
1901     "vpackuswb   %%ymm2,%%ymm2,%%ymm2           \n"
1902
1903 #if defined(HAS_I422TOBGRAROW_AVX2)
1904 // 16 pixels
1905 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
1906 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
1907                                const uint8* u_buf,
1908                                const uint8* v_buf,
1909                                uint8* dst_bgra,
1910                                int width) {
1911   asm volatile (
1912     "sub       %[u_buf],%[v_buf]               \n"
1913     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
1914     LABELALIGN
1915   "1:                                          \n"
1916     READYUV422_AVX2
1917     YUVTORGB_AVX2(kYuvConstants)
1918
1919     // Step 3: Weave into BGRA
1920     "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB
1921     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
1922     "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR
1923     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
1924     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels
1925     "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels
1926
1927     "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"
1928     "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
1929     "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
1930     "sub       $0x10,%[width]                  \n"
1931     "jg        1b                              \n"
1932     "vzeroupper                                \n"
1933   : [y_buf]"+r"(y_buf),    // %[y_buf]
1934     [u_buf]"+r"(u_buf),    // %[u_buf]
1935     [v_buf]"+r"(v_buf),    // %[v_buf]
1936     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
1937     [width]"+rm"(width)    // %[width]
1938   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
1939   : "memory", "cc", NACL_R14
1940     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1941   );
1942 }
1943 #endif  // HAS_I422TOBGRAROW_AVX2
1944
1945 #if defined(HAS_I422TOARGBROW_AVX2)
1946 // 16 pixels
1947 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
1948 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
1949                                const uint8* u_buf,
1950                                const uint8* v_buf,
1951                                uint8* dst_argb,
1952                                int width) {
1953   asm volatile (
1954     "sub       %[u_buf],%[v_buf]               \n"
1955     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
1956     LABELALIGN
1957   "1:                                          \n"
1958     READYUV422_AVX2
1959     YUVTORGB_AVX2(kYuvConstants)
1960
1961     // Step 3: Weave into ARGB
1962     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
1963     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
1964     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
1965     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
1966     "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
1967     "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
1968
1969     "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
1970     "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
1971     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
1972     "sub       $0x10,%[width]                  \n"
1973     "jg        1b                              \n"
1974     "vzeroupper                                \n"
1975   : [y_buf]"+r"(y_buf),    // %[y_buf]
1976     [u_buf]"+r"(u_buf),    // %[u_buf]
1977     [v_buf]"+r"(v_buf),    // %[v_buf]
1978     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1979     [width]"+rm"(width)    // %[width]
1980   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
1981   : "memory", "cc", NACL_R14
1982     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1983   );
1984 }
1985 #endif  // HAS_I422TOARGBROW_AVX2
1986
1987 #if defined(HAS_I422TOABGRROW_AVX2)
1988 // 16 pixels
1989 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
1990 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
1991                                const uint8* u_buf,
1992                                const uint8* v_buf,
1993                                uint8* dst_argb,
1994                                int width) {
1995   asm volatile (
1996     "sub       %[u_buf],%[v_buf]               \n"
1997     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
1998     LABELALIGN
1999   "1:                                          \n"
2000     READYUV422_AVX2
2001     YUVTORGB_AVX2(kYuvConstants)
2002
2003     // Step 3: Weave into ABGR
2004     "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG
2005     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2006     "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA
2007     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2008     "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels
2009     "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels
2010     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2011     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2012     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2013     "sub       $0x10,%[width]                  \n"
2014     "jg        1b                              \n"
2015     "vzeroupper                                \n"
2016   : [y_buf]"+r"(y_buf),    // %[y_buf]
2017     [u_buf]"+r"(u_buf),    // %[u_buf]
2018     [v_buf]"+r"(v_buf),    // %[v_buf]
2019     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2020     [width]"+rm"(width)    // %[width]
2021   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2022   : "memory", "cc", NACL_R14
2023     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2024   );
2025 }
2026 #endif  // HAS_I422TOABGRROW_AVX2
2027
2028 #if defined(HAS_I422TORGBAROW_AVX2)
2029 // 16 pixels
2030 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2031 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2032                                const uint8* u_buf,
2033                                const uint8* v_buf,
2034                                uint8* dst_argb,
2035                                int width) {
2036   asm volatile (
2037     "sub       %[u_buf],%[v_buf]               \n"
2038     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2039     LABELALIGN
2040   "1:                                          \n"
2041     READYUV422_AVX2
2042     YUVTORGB_AVX2(kYuvConstants)
2043
2044     // Step 3: Weave into RGBA
2045     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
2046     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2047     "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
2048     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2049     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
2050     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
2051     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2052     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2053     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2054     "sub       $0x10,%[width]                  \n"
2055     "jg        1b                              \n"
2056     "vzeroupper                                \n"
2057   : [y_buf]"+r"(y_buf),    // %[y_buf]
2058     [u_buf]"+r"(u_buf),    // %[u_buf]
2059     [v_buf]"+r"(v_buf),    // %[v_buf]
2060     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2061     [width]"+rm"(width)    // %[width]
2062   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2063   : "memory", "cc", NACL_R14
2064     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2065   );
2066 }
2067 #endif  // HAS_I422TORGBAROW_AVX2
2068
2069 #ifdef HAS_YTOARGBROW_SSE2
2070 void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2071   asm volatile (
2072     "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
2073     "movd      %%eax,%%xmm2                    \n"
2074     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
2075     "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
2076     "movd      %%eax,%%xmm3                    \n"
2077     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
2078     "pcmpeqb   %%xmm4,%%xmm4                   \n"
2079     "pslld     $0x18,%%xmm4                    \n"
2080     LABELALIGN
2081   "1:                                          \n"
2082     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2083     "movq      " MEMACCESS(0) ",%%xmm0         \n"
2084     "lea       " MEMLEA(0x8,0) ",%0            \n"
2085     "punpcklbw %%xmm0,%%xmm0                   \n"
2086     "pmulhuw   %%xmm2,%%xmm0                   \n"
2087     "psubusw   %%xmm3,%%xmm0                   \n"
2088     "psrlw     $6, %%xmm0                      \n"
2089     "packuswb  %%xmm0,%%xmm0                   \n"
2090
2091     // Step 2: Weave into ARGB
2092     "punpcklbw %%xmm0,%%xmm0                   \n"
2093     "movdqa    %%xmm0,%%xmm1                   \n"
2094     "punpcklwd %%xmm0,%%xmm0                   \n"
2095     "punpckhwd %%xmm1,%%xmm1                   \n"
2096     "por       %%xmm4,%%xmm0                   \n"
2097     "por       %%xmm4,%%xmm1                   \n"
2098     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2099     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2100     "lea       " MEMLEA(0x20,1) ",%1           \n"
2101
2102     "sub       $0x8,%2                         \n"
2103     "jg        1b                              \n"
2104   : "+r"(y_buf),     // %0
2105     "+r"(dst_argb),  // %1
2106     "+rm"(width)     // %2
2107   :
2108   : "memory", "cc", "eax"
2109     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2110   );
2111 }
2112 #endif  // HAS_YTOARGBROW_SSE2
2113
2114 #ifdef HAS_YTOARGBROW_AVX2
2115 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2116 // note: vpunpcklbw mutates and vpackuswb unmutates.
2117 void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2118   asm volatile (
2119     "mov        $0x4a354a35,%%eax              \n" // 0488 = 1160 = 1.164 * 16
2120     "vmovd      %%eax,%%xmm2                   \n"
2121     "vbroadcastss %%xmm2,%%ymm2                \n"
2122     "mov        $0x4880488,%%eax               \n" // 4a35 = 18997 = 1.164
2123     "vmovd      %%eax,%%xmm3                   \n"
2124     "vbroadcastss %%xmm3,%%ymm3                \n"
2125     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
2126     "vpslld     $0x18,%%ymm4,%%ymm4            \n"
2127
2128     LABELALIGN
2129   "1:                                          \n"
2130     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2131     "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
2132     "lea        " MEMLEA(0x10,0) ",%0          \n"
2133     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2134     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
2135     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
2136     "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
2137     "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
2138     "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
2139     "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
2140     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2141     "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
2142     "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
2143     "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
2144     "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
2145     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2146     "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
2147     "lea       " MEMLEA(0x40,1) ",%1           \n"
2148     "sub        $0x10,%2                       \n"
2149     "jg        1b                              \n"
2150     "vzeroupper                                \n"
2151   : "+r"(y_buf),     // %0
2152     "+r"(dst_argb),  // %1
2153     "+rm"(width)     // %2
2154   :
2155   : "memory", "cc", "eax"
2156     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2157   );
2158 }
2159 #endif  // HAS_YTOARGBROW_AVX2
2160
2161 #ifdef HAS_MIRRORROW_SSSE3
2162 // Shuffle table for reversing the bytes.
2163 static uvec8 kShuffleMirror = {
2164   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2165 };
2166
2167 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2168   intptr_t temp_width = (intptr_t)(width);
2169   asm volatile (
2170     "movdqa    %3,%%xmm5                       \n"
2171     LABELALIGN
2172   "1:                                          \n"
2173     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2174     "pshufb    %%xmm5,%%xmm0                   \n"
2175     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2176     "lea       " MEMLEA(0x10,1) ",%1           \n"
2177     "sub       $0x10,%2                        \n"
2178     "jg        1b                              \n"
2179   : "+r"(src),  // %0
2180     "+r"(dst),  // %1
2181     "+r"(temp_width)  // %2
2182   : "m"(kShuffleMirror) // %3
2183   : "memory", "cc", NACL_R14
2184     "xmm0", "xmm5"
2185   );
2186 }
2187 #endif  // HAS_MIRRORROW_SSSE3
2188
2189 #ifdef HAS_MIRRORROW_AVX2
2190 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2191   intptr_t temp_width = (intptr_t)(width);
2192   asm volatile (
2193     "vbroadcastf128 %3,%%ymm5                  \n"
2194     LABELALIGN
2195   "1:                                          \n"
2196     MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
2197     "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
2198     "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
2199     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2200     "lea       " MEMLEA(0x20,1) ",%1           \n"
2201     "sub       $0x20,%2                        \n"
2202     "jg        1b                              \n"
2203     "vzeroupper                                \n"
2204   : "+r"(src),  // %0
2205     "+r"(dst),  // %1
2206     "+r"(temp_width)  // %2
2207   : "m"(kShuffleMirror) // %3
2208   : "memory", "cc", NACL_R14
2209     "xmm0", "xmm5"
2210   );
2211 }
2212 #endif  // HAS_MIRRORROW_AVX2
2213
2214 #ifdef HAS_MIRRORROW_SSE2
2215 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2216   intptr_t temp_width = (intptr_t)(width);
2217   asm volatile (
2218     LABELALIGN
2219   "1:                                          \n"
2220     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2221     "movdqa    %%xmm0,%%xmm1                   \n"
2222     "psllw     $0x8,%%xmm0                     \n"
2223     "psrlw     $0x8,%%xmm1                     \n"
2224     "por       %%xmm1,%%xmm0                   \n"
2225     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
2226     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
2227     "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
2228     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2229     "lea       " MEMLEA(0x10,1)",%1            \n"
2230     "sub       $0x10,%2                        \n"
2231     "jg        1b                              \n"
2232   : "+r"(src),  // %0
2233     "+r"(dst),  // %1
2234     "+r"(temp_width)  // %2
2235   :
2236   : "memory", "cc", NACL_R14
2237     "xmm0", "xmm1"
2238   );
2239 }
2240 #endif  // HAS_MIRRORROW_SSE2
2241
2242 #ifdef HAS_MIRRORROW_UV_SSSE3
2243 // Shuffle table for reversing the bytes of UV channels.
2244 static uvec8 kShuffleMirrorUV = {
2245   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2246 };
2247 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2248                        int width) {
2249   intptr_t temp_width = (intptr_t)(width);
2250   asm volatile (
2251     "movdqa    %4,%%xmm1                       \n"
2252     "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
2253     "sub       %1,%2                           \n"
2254     LABELALIGN
2255   "1:                                          \n"
2256     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2257     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2258     "pshufb    %%xmm1,%%xmm0                   \n"
2259     "movlpd    %%xmm0," MEMACCESS(1) "         \n"
2260     MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
2261     "lea       " MEMLEA(0x8,1) ",%1            \n"
2262     "sub       $8,%3                           \n"
2263     "jg        1b                              \n"
2264   : "+r"(src),      // %0
2265     "+r"(dst_u),    // %1
2266     "+r"(dst_v),    // %2
2267     "+r"(temp_width)  // %3
2268   : "m"(kShuffleMirrorUV)  // %4
2269   : "memory", "cc", NACL_R14
2270     "xmm0", "xmm1"
2271   );
2272 }
2273 #endif  // HAS_MIRRORROW_UV_SSSE3
2274
2275 #ifdef HAS_ARGBMIRRORROW_SSE2
2276
2277 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2278   intptr_t temp_width = (intptr_t)(width);
2279   asm volatile (
2280     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
2281     LABELALIGN
2282   "1:                                          \n"
2283     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2284     "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
2285     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2286     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2287     "lea       " MEMLEA(0x10,1) ",%1           \n"
2288     "sub       $0x4,%2                         \n"
2289     "jg        1b                              \n"
2290   : "+r"(src),  // %0
2291     "+r"(dst),  // %1
2292     "+r"(temp_width)  // %2
2293   :
2294   : "memory", "cc"
2295     , "xmm0"
2296   );
2297 }
2298 #endif  // HAS_ARGBMIRRORROW_SSE2
2299
2300 #ifdef HAS_ARGBMIRRORROW_AVX2
2301 // Shuffle table for reversing the bytes.
2302 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2303   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2304 };
2305 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2306   intptr_t temp_width = (intptr_t)(width);
2307   asm volatile (
2308     "vmovdqu    %3,%%ymm5                      \n"
2309     LABELALIGN
2310   "1:                                          \n"
2311     VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2312     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2313     "lea        " MEMLEA(0x20,1) ",%1          \n"
2314     "sub        $0x8,%2                        \n"
2315     "jg         1b                             \n"
2316     "vzeroupper                                \n"
2317   : "+r"(src),  // %0
2318     "+r"(dst),  // %1
2319     "+r"(temp_width)  // %2
2320   : "m"(kARGBShuffleMirror_AVX2) // %3
2321   : "memory", "cc", NACL_R14
2322     "xmm0", "xmm5"
2323   );
2324 }
2325 #endif  // HAS_ARGBMIRRORROW_AVX2
2326
2327 #ifdef HAS_SPLITUVROW_AVX2
2328 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2329   asm volatile (
2330     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
2331     "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
2332     "sub        %1,%2                            \n"
2333     LABELALIGN
2334   "1:                                            \n"
2335     "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
2336     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
2337     "lea        " MEMLEA(0x40,0) ",%0            \n"
2338     "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
2339     "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
2340     "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
2341     "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
2342     "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
2343     "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
2344     "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
2345     "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
2346     "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
2347     MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
2348     "lea        " MEMLEA(0x20,1) ",%1            \n"
2349     "sub        $0x20,%3                         \n"
2350     "jg         1b                               \n"
2351     "vzeroupper                                  \n"
2352   : "+r"(src_uv),     // %0
2353     "+r"(dst_u),      // %1
2354     "+r"(dst_v),      // %2
2355     "+r"(pix)         // %3
2356   :
2357   : "memory", "cc", NACL_R14
2358     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2359   );
2360 }
2361 #endif  // HAS_SPLITUVROW_AVX2
2362
2363 #ifdef HAS_SPLITUVROW_SSE2
2364 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2365   asm volatile (
2366     "pcmpeqb    %%xmm5,%%xmm5                    \n"
2367     "psrlw      $0x8,%%xmm5                      \n"
2368     "sub        %1,%2                            \n"
2369     LABELALIGN
2370   "1:                                            \n"
2371     "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
2372     "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
2373     "lea        " MEMLEA(0x20,0) ",%0            \n"
2374     "movdqa     %%xmm0,%%xmm2                    \n"
2375     "movdqa     %%xmm1,%%xmm3                    \n"
2376     "pand       %%xmm5,%%xmm0                    \n"
2377     "pand       %%xmm5,%%xmm1                    \n"
2378     "packuswb   %%xmm1,%%xmm0                    \n"
2379     "psrlw      $0x8,%%xmm2                      \n"
2380     "psrlw      $0x8,%%xmm3                      \n"
2381     "packuswb   %%xmm3,%%xmm2                    \n"
2382     "movdqu     %%xmm0," MEMACCESS(1) "          \n"
2383     MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
2384     "lea        " MEMLEA(0x10,1) ",%1            \n"
2385     "sub        $0x10,%3                         \n"
2386     "jg         1b                               \n"
2387   : "+r"(src_uv),     // %0
2388     "+r"(dst_u),      // %1
2389     "+r"(dst_v),      // %2
2390     "+r"(pix)         // %3
2391   :
2392   : "memory", "cc", NACL_R14
2393     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2394   );
2395 }
2396 #endif  // HAS_SPLITUVROW_SSE2
2397
2398 #ifdef HAS_MERGEUVROW_AVX2
2399 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2400                      int width) {
2401   asm volatile (
2402     "sub       %0,%1                             \n"
2403     LABELALIGN
2404   "1:                                            \n"
2405     "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
2406     MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
2407     "lea       " MEMLEA(0x20,0) ",%0             \n"
2408     "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
2409     "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
2410     "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
2411     "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2412     "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2413     "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2414     "lea       " MEMLEA(0x40,2) ",%2             \n"
2415     "sub       $0x20,%3                          \n"
2416     "jg        1b                                \n"
2417     "vzeroupper                                  \n"
2418   : "+r"(src_u),     // %0
2419     "+r"(src_v),     // %1
2420     "+r"(dst_uv),    // %2
2421     "+r"(width)      // %3
2422   :
2423   : "memory", "cc", NACL_R14
2424     "xmm0", "xmm1", "xmm2"
2425   );
2426 }
2427 #endif  // HAS_MERGEUVROW_AVX2
2428
2429 #ifdef HAS_MERGEUVROW_SSE2
2430 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2431                      int width) {
2432   asm volatile (
2433     "sub       %0,%1                             \n"
2434     LABELALIGN
2435   "1:                                            \n"
2436     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
2437     MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
2438     "lea       " MEMLEA(0x10,0) ",%0             \n"
2439     "movdqa    %%xmm0,%%xmm2                     \n"
2440     "punpcklbw %%xmm1,%%xmm0                     \n"
2441     "punpckhbw %%xmm1,%%xmm2                     \n"
2442     "movdqu    %%xmm0," MEMACCESS(2) "           \n"
2443     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
2444     "lea       " MEMLEA(0x20,2) ",%2             \n"
2445     "sub       $0x10,%3                          \n"
2446     "jg        1b                                \n"
2447   : "+r"(src_u),     // %0
2448     "+r"(src_v),     // %1
2449     "+r"(dst_uv),    // %2
2450     "+r"(width)      // %3
2451   :
2452   : "memory", "cc", NACL_R14
2453     "xmm0", "xmm1", "xmm2"
2454   );
2455 }
2456 #endif  // HAS_MERGEUVROW_SSE2
2457
2458 #ifdef HAS_COPYROW_SSE2
2459 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2460   asm volatile (
2461     LABELALIGN
2462   "1:                                          \n"
2463     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2464     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2465     "lea       " MEMLEA(0x20,0) ",%0           \n"
2466     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2467     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2468     "lea       " MEMLEA(0x20,1) ",%1           \n"
2469     "sub       $0x20,%2                        \n"
2470     "jg        1b                              \n"
2471   : "+r"(src),   // %0
2472     "+r"(dst),   // %1
2473     "+r"(count)  // %2
2474   :
2475   : "memory", "cc"
2476     , "xmm0", "xmm1"
2477   );
2478 }
2479 #endif  // HAS_COPYROW_SSE2
2480
2481 #ifdef HAS_COPYROW_AVX
2482 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2483   asm volatile (
2484     LABELALIGN
2485   "1:                                          \n"
2486     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2487     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2488     "lea       " MEMLEA(0x40,0) ",%0           \n"
2489     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
2490     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
2491     "lea       " MEMLEA(0x40,1) ",%1           \n"
2492     "sub       $0x40,%2                        \n"
2493     "jg        1b                              \n"
2494   : "+r"(src),   // %0
2495     "+r"(dst),   // %1
2496     "+r"(count)  // %2
2497   :
2498   : "memory", "cc"
2499     , "xmm0", "xmm1"
2500   );
2501 }
2502 #endif  // HAS_COPYROW_AVX
2503
2504 #ifdef HAS_COPYROW_ERMS
2505 // Multiple of 1.
2506 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2507   size_t width_tmp = (size_t)(width);
2508   asm volatile (
2509     "rep movsb " MEMMOVESTRING(0,1) "          \n"
2510   : "+S"(src),  // %0
2511     "+D"(dst),  // %1
2512     "+c"(width_tmp) // %2
2513   :
2514   : "memory", "cc"
2515   );
2516 }
2517 #endif  // HAS_COPYROW_ERMS
2518
2519 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2520 // width in pixels
2521 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2522   asm volatile (
2523     "pcmpeqb   %%xmm0,%%xmm0                   \n"
2524     "pslld     $0x18,%%xmm0                    \n"
2525     "pcmpeqb   %%xmm1,%%xmm1                   \n"
2526     "psrld     $0x8,%%xmm1                     \n"
2527     LABELALIGN
2528   "1:                                          \n"
2529     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
2530     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
2531     "lea       " MEMLEA(0x20,0) ",%0           \n"
2532     "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
2533     "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
2534     "pand      %%xmm0,%%xmm2                   \n"
2535     "pand      %%xmm0,%%xmm3                   \n"
2536     "pand      %%xmm1,%%xmm4                   \n"
2537     "pand      %%xmm1,%%xmm5                   \n"
2538     "por       %%xmm4,%%xmm2                   \n"
2539     "por       %%xmm5,%%xmm3                   \n"
2540     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2541     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2542     "lea       " MEMLEA(0x20,1) ",%1           \n"
2543     "sub       $0x8,%2                         \n"
2544     "jg        1b                              \n"
2545   : "+r"(src),   // %0
2546     "+r"(dst),   // %1
2547     "+r"(width)  // %2
2548   :
2549   : "memory", "cc"
2550     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2551   );
2552 }
2553 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
2554
2555 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2556 // width in pixels
2557 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2558   asm volatile (
2559     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
2560     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
2561     LABELALIGN
2562   "1:                                          \n"
2563     "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
2564     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
2565     "lea       " MEMLEA(0x40,0) ",%0           \n"
2566     "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
2567     "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
2568     "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
2569     "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
2570     "lea       " MEMLEA(0x40,1) ",%1           \n"
2571     "sub       $0x10,%2                        \n"
2572     "jg        1b                              \n"
2573     "vzeroupper                                \n"
2574   : "+r"(src),   // %0
2575     "+r"(dst),   // %1
2576     "+r"(width)  // %2
2577   :
2578   : "memory", "cc"
2579     , "xmm0", "xmm1", "xmm2"
2580   );
2581 }
2582 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
2583
2584 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2585 // width in pixels
2586 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2587   asm volatile (
2588     "pcmpeqb   %%xmm0,%%xmm0                   \n"
2589     "pslld     $0x18,%%xmm0                    \n"
2590     "pcmpeqb   %%xmm1,%%xmm1                   \n"
2591     "psrld     $0x8,%%xmm1                     \n"
2592     LABELALIGN
2593   "1:                                          \n"
2594     "movq      " MEMACCESS(0) ",%%xmm2         \n"
2595     "lea       " MEMLEA(0x8,0) ",%0            \n"
2596     "punpcklbw %%xmm2,%%xmm2                   \n"
2597     "punpckhwd %%xmm2,%%xmm3                   \n"
2598     "punpcklwd %%xmm2,%%xmm2                   \n"
2599     "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
2600     "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
2601     "pand      %%xmm0,%%xmm2                   \n"
2602     "pand      %%xmm0,%%xmm3                   \n"
2603     "pand      %%xmm1,%%xmm4                   \n"
2604     "pand      %%xmm1,%%xmm5                   \n"
2605     "por       %%xmm4,%%xmm2                   \n"
2606     "por       %%xmm5,%%xmm3                   \n"
2607     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2608     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2609     "lea       " MEMLEA(0x20,1) ",%1           \n"
2610     "sub       $0x8,%2                         \n"
2611     "jg        1b                              \n"
2612   : "+r"(src),   // %0
2613     "+r"(dst),   // %1
2614     "+r"(width)  // %2
2615   :
2616   : "memory", "cc"
2617     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2618   );
2619 }
2620 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
2621
2622 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
2623 // width in pixels
2624 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2625   asm volatile (
2626     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
2627     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
2628     LABELALIGN
2629   "1:                                          \n"
2630     "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
2631     "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
2632     "lea       " MEMLEA(0x10,0) ",%0           \n"
2633     "vpslld    $0x18,%%ymm1,%%ymm1             \n"
2634     "vpslld    $0x18,%%ymm2,%%ymm2             \n"
2635     "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
2636     "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
2637     "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
2638     "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
2639     "lea       " MEMLEA(0x40,1) ",%1           \n"
2640     "sub       $0x10,%2                        \n"
2641     "jg        1b                              \n"
2642     "vzeroupper                                \n"
2643   : "+r"(src),   // %0
2644     "+r"(dst),   // %1
2645     "+r"(width)  // %2
2646   :
2647   : "memory", "cc"
2648     , "xmm0", "xmm1", "xmm2"
2649   );
2650 }
2651 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
2652
2653 #ifdef HAS_SETROW_X86
2654 void SetRow_X86(uint8* dst, uint8 v8, int width) {
2655   size_t width_tmp = (size_t)(width >> 2);
2656   const uint32 v32 = v8 * 0x01010101;  // Duplicate byte to all bytes.
2657   asm volatile (
2658     "rep stosl " MEMSTORESTRING(eax,0) "       \n"
2659     : "+D"(dst),       // %0
2660       "+c"(width_tmp)  // %1
2661     : "a"(v32)         // %2
2662     : "memory", "cc");
2663 }
2664
2665 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
2666   size_t width_tmp = (size_t)(width);
2667   asm volatile (
2668     "rep stosb " MEMSTORESTRING(al,0) "        \n"
2669     : "+D"(dst),       // %0
2670       "+c"(width_tmp)  // %1
2671     : "a"(v8)          // %2
2672     : "memory", "cc");
2673 }
2674
2675 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
2676   size_t width_tmp = (size_t)(width);
2677   asm volatile (
2678     "rep stosl " MEMSTORESTRING(eax,0) "       \n"
2679     : "+D"(dst_argb),  // %0
2680       "+c"(width_tmp)  // %1
2681     : "a"(v32)         // %2
2682     : "memory", "cc");
2683 }
2684 #endif  // HAS_SETROW_X86
2685
2686 #ifdef HAS_YUY2TOYROW_SSE2
2687 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2688   asm volatile (
2689     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2690     "psrlw     $0x8,%%xmm5                     \n"
2691     LABELALIGN
2692   "1:                                          \n"
2693     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2694     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2695     "lea       " MEMLEA(0x20,0) ",%0           \n"
2696     "pand      %%xmm5,%%xmm0                   \n"
2697     "pand      %%xmm5,%%xmm1                   \n"
2698     "packuswb  %%xmm1,%%xmm0                   \n"
2699     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2700     "lea       " MEMLEA(0x10,1) ",%1           \n"
2701     "sub       $0x10,%2                        \n"
2702     "jg        1b                              \n"
2703   : "+r"(src_yuy2),  // %0
2704     "+r"(dst_y),     // %1
2705     "+r"(pix)        // %2
2706   :
2707   : "memory", "cc"
2708     , "xmm0", "xmm1", "xmm5"
2709   );
2710 }
2711
2712 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2713                       uint8* dst_u, uint8* dst_v, int pix) {
2714   asm volatile (
2715     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2716     "psrlw     $0x8,%%xmm5                     \n"
2717     "sub       %1,%2                           \n"
2718     LABELALIGN
2719   "1:                                          \n"
2720     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2721     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2722     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
2723     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
2724     "lea       " MEMLEA(0x20,0) ",%0           \n"
2725     "pavgb     %%xmm2,%%xmm0                   \n"
2726     "pavgb     %%xmm3,%%xmm1                   \n"
2727     "psrlw     $0x8,%%xmm0                     \n"
2728     "psrlw     $0x8,%%xmm1                     \n"
2729     "packuswb  %%xmm1,%%xmm0                   \n"
2730     "movdqa    %%xmm0,%%xmm1                   \n"
2731     "pand      %%xmm5,%%xmm0                   \n"
2732     "packuswb  %%xmm0,%%xmm0                   \n"
2733     "psrlw     $0x8,%%xmm1                     \n"
2734     "packuswb  %%xmm1,%%xmm1                   \n"
2735     "movq      %%xmm0," MEMACCESS(1) "         \n"
2736     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
2737     "lea       " MEMLEA(0x8,1) ",%1            \n"
2738     "sub       $0x10,%3                        \n"
2739     "jg        1b                              \n"
2740   : "+r"(src_yuy2),    // %0
2741     "+r"(dst_u),       // %1
2742     "+r"(dst_v),       // %2
2743     "+r"(pix)          // %3
2744   : "r"((intptr_t)(stride_yuy2))  // %4
2745   : "memory", "cc", NACL_R14
2746     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2747   );
2748 }
2749
2750 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2751                          uint8* dst_u, uint8* dst_v, int pix) {
2752   asm volatile (
2753     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2754     "psrlw     $0x8,%%xmm5                     \n"
2755     "sub       %1,%2                           \n"
2756     LABELALIGN
2757   "1:                                          \n"
2758     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2759     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2760     "lea       " MEMLEA(0x20,0) ",%0           \n"
2761     "psrlw     $0x8,%%xmm0                     \n"
2762     "psrlw     $0x8,%%xmm1                     \n"
2763     "packuswb  %%xmm1,%%xmm0                   \n"
2764     "movdqa    %%xmm0,%%xmm1                   \n"
2765     "pand      %%xmm5,%%xmm0                   \n"
2766     "packuswb  %%xmm0,%%xmm0                   \n"
2767     "psrlw     $0x8,%%xmm1                     \n"
2768     "packuswb  %%xmm1,%%xmm1                   \n"
2769     "movq      %%xmm0," MEMACCESS(1) "         \n"
2770     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
2771     "lea       " MEMLEA(0x8,1) ",%1            \n"
2772     "sub       $0x10,%3                        \n"
2773     "jg        1b                              \n"
2774   : "+r"(src_yuy2),    // %0
2775     "+r"(dst_u),       // %1
2776     "+r"(dst_v),       // %2
2777     "+r"(pix)          // %3
2778   :
2779   : "memory", "cc", NACL_R14
2780     "xmm0", "xmm1", "xmm5"
2781   );
2782 }
2783
2784 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2785   asm volatile (
2786     LABELALIGN
2787   "1:                                          \n"
2788     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2789     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2790     "lea       " MEMLEA(0x20,0) ",%0           \n"
2791     "psrlw     $0x8,%%xmm0                     \n"
2792     "psrlw     $0x8,%%xmm1                     \n"
2793     "packuswb  %%xmm1,%%xmm0                   \n"
2794     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2795     "lea       " MEMLEA(0x10,1) ",%1           \n"
2796     "sub       $0x10,%2                        \n"
2797     "jg        1b                              \n"
2798   : "+r"(src_uyvy),  // %0
2799     "+r"(dst_y),     // %1
2800     "+r"(pix)        // %2
2801   :
2802   : "memory", "cc"
2803     , "xmm0", "xmm1"
2804   );
2805 }
2806
2807 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2808                       uint8* dst_u, uint8* dst_v, int pix) {
2809   asm volatile (
2810     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2811     "psrlw     $0x8,%%xmm5                     \n"
2812     "sub       %1,%2                           \n"
2813     LABELALIGN
2814   "1:                                          \n"
2815     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2816     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2817     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
2818     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
2819     "lea       " MEMLEA(0x20,0) ",%0           \n"
2820     "pavgb     %%xmm2,%%xmm0                   \n"
2821     "pavgb     %%xmm3,%%xmm1                   \n"
2822     "pand      %%xmm5,%%xmm0                   \n"
2823     "pand      %%xmm5,%%xmm1                   \n"
2824     "packuswb  %%xmm1,%%xmm0                   \n"
2825     "movdqa    %%xmm0,%%xmm1                   \n"
2826     "pand      %%xmm5,%%xmm0                   \n"
2827     "packuswb  %%xmm0,%%xmm0                   \n"
2828     "psrlw     $0x8,%%xmm1                     \n"
2829     "packuswb  %%xmm1,%%xmm1                   \n"
2830     "movq      %%xmm0," MEMACCESS(1) "         \n"
2831     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
2832     "lea       " MEMLEA(0x8,1) ",%1            \n"
2833     "sub       $0x10,%3                        \n"
2834     "jg        1b                              \n"
2835   : "+r"(src_uyvy),    // %0
2836     "+r"(dst_u),       // %1
2837     "+r"(dst_v),       // %2
2838     "+r"(pix)          // %3
2839   : "r"((intptr_t)(stride_uyvy))  // %4
2840   : "memory", "cc", NACL_R14
2841     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2842   );
2843 }
2844
2845 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2846                          uint8* dst_u, uint8* dst_v, int pix) {
2847   asm volatile (
2848     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2849     "psrlw     $0x8,%%xmm5                     \n"
2850     "sub       %1,%2                           \n"
2851     LABELALIGN
2852   "1:                                          \n"
2853     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2854     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2855     "lea       " MEMLEA(0x20,0) ",%0           \n"
2856     "pand      %%xmm5,%%xmm0                   \n"
2857     "pand      %%xmm5,%%xmm1                   \n"
2858     "packuswb  %%xmm1,%%xmm0                   \n"
2859     "movdqa    %%xmm0,%%xmm1                   \n"
2860     "pand      %%xmm5,%%xmm0                   \n"
2861     "packuswb  %%xmm0,%%xmm0                   \n"
2862     "psrlw     $0x8,%%xmm1                     \n"
2863     "packuswb  %%xmm1,%%xmm1                   \n"
2864     "movq      %%xmm0," MEMACCESS(1) "         \n"
2865     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
2866     "lea       " MEMLEA(0x8,1) ",%1            \n"
2867     "sub       $0x10,%3                        \n"
2868     "jg        1b                              \n"
2869   : "+r"(src_uyvy),    // %0
2870     "+r"(dst_u),       // %1
2871     "+r"(dst_v),       // %2
2872     "+r"(pix)          // %3
2873   :
2874   : "memory", "cc", NACL_R14
2875     "xmm0", "xmm1", "xmm5"
2876   );
2877 }
2878 #endif  // HAS_YUY2TOYROW_SSE2
2879
2880 #ifdef HAS_YUY2TOYROW_AVX2
2881 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2882   asm volatile (
2883     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2884     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
2885     LABELALIGN
2886   "1:                                          \n"
2887     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2888     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2889     "lea       " MEMLEA(0x40,0) ",%0           \n"
2890     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
2891     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
2892     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
2893     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
2894     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
2895     "lea      " MEMLEA(0x20,1) ",%1            \n"
2896     "sub       $0x20,%2                        \n"
2897     "jg        1b                              \n"
2898     "vzeroupper                                \n"
2899   : "+r"(src_yuy2),  // %0
2900     "+r"(dst_y),     // %1
2901     "+r"(pix)        // %2
2902   :
2903   : "memory", "cc"
2904     , "xmm0", "xmm1", "xmm5"
2905   );
2906 }
2907
2908 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
2909                       uint8* dst_u, uint8* dst_v, int pix) {
2910   asm volatile (
2911     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2912     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
2913     "sub       %1,%2                           \n"
2914     LABELALIGN
2915   "1:                                          \n"
2916     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2917     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2918     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
2919     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
2920     "lea       " MEMLEA(0x40,0) ",%0           \n"
2921     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
2922     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
2923     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
2924     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
2925     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
2926     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
2927     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
2928     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
2929     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
2930     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
2931     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
2932     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
2933     "lea      " MEMLEA(0x10,1) ",%1            \n"
2934     "sub       $0x20,%3                        \n"
2935     "jg        1b                              \n"
2936     "vzeroupper                                \n"
2937   : "+r"(src_yuy2),    // %0
2938     "+r"(dst_u),       // %1
2939     "+r"(dst_v),       // %2
2940     "+r"(pix)          // %3
2941   : "r"((intptr_t)(stride_yuy2))  // %4
2942   : "memory", "cc", NACL_R14
2943     "xmm0", "xmm1", "xmm5"
2944   );
2945 }
2946
2947 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
2948                          uint8* dst_u, uint8* dst_v, int pix) {
2949   asm volatile (
2950     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2951     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
2952     "sub       %1,%2                           \n"
2953     LABELALIGN
2954   "1:                                          \n"
2955     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2956     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2957     "lea       " MEMLEA(0x40,0) ",%0           \n"
2958     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
2959     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
2960     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
2961     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
2962     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
2963     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
2964     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
2965     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
2966     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
2967     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
2968     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
2969     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
2970     "lea      " MEMLEA(0x10,1) ",%1            \n"
2971     "sub       $0x20,%3                        \n"
2972     "jg        1b                              \n"
2973     "vzeroupper                                \n"
2974   : "+r"(src_yuy2),    // %0
2975     "+r"(dst_u),       // %1
2976     "+r"(dst_v),       // %2
2977     "+r"(pix)          // %3
2978   :
2979   : "memory", "cc", NACL_R14
2980     "xmm0", "xmm1", "xmm5"
2981   );
2982 }
2983
2984 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2985   asm volatile (
2986     LABELALIGN
2987   "1:                                          \n"
2988     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2989     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2990     "lea       " MEMLEA(0x40,0) ",%0           \n"
2991     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
2992     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
2993     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
2994     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
2995     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
2996     "lea      " MEMLEA(0x20,1) ",%1            \n"
2997     "sub       $0x20,%2                        \n"
2998     "jg        1b                              \n"
2999     "vzeroupper                                \n"
3000   : "+r"(src_uyvy),  // %0
3001     "+r"(dst_y),     // %1
3002     "+r"(pix)        // %2
3003   :
3004   : "memory", "cc"
3005     , "xmm0", "xmm1", "xmm5"
3006   );
3007 }
3008 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3009                       uint8* dst_u, uint8* dst_v, int pix) {
3010   asm volatile (
3011     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3012     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
3013     "sub       %1,%2                           \n"
3014
3015     LABELALIGN
3016   "1:                                          \n"
3017     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3018     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3019     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3020     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3021     "lea       " MEMLEA(0x40,0) ",%0           \n"
3022     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3023     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3024     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3025     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3026     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3027     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3028     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3029     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3030     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3031     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3032     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3033     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3034     "lea      " MEMLEA(0x10,1) ",%1            \n"
3035     "sub       $0x20,%3                        \n"
3036     "jg        1b                              \n"
3037     "vzeroupper                                \n"
3038   : "+r"(src_uyvy),    // %0
3039     "+r"(dst_u),       // %1
3040     "+r"(dst_v),       // %2
3041     "+r"(pix)          // %3
3042   : "r"((intptr_t)(stride_uyvy))  // %4
3043   : "memory", "cc", NACL_R14
3044     "xmm0", "xmm1", "xmm5"
3045   );
3046 }
3047
3048 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3049                          uint8* dst_u, uint8* dst_v, int pix) {
3050   asm volatile (
3051     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3052     "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
3053     "sub       %1,%2                           \n"
3054     LABELALIGN
3055   "1:                                          \n"
3056     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3057     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3058     "lea       " MEMLEA(0x40,0) ",%0           \n"
3059     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3060     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3061     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3062     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3063     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3064     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3065     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3066     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3067     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3068     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3069     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3070     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3071     "lea      " MEMLEA(0x10,1) ",%1            \n"
3072     "sub       $0x20,%3                        \n"
3073     "jg        1b                              \n"
3074     "vzeroupper                                \n"
3075   : "+r"(src_uyvy),    // %0
3076     "+r"(dst_u),       // %1
3077     "+r"(dst_v),       // %2
3078     "+r"(pix)          // %3
3079   :
3080   : "memory", "cc", NACL_R14
3081     "xmm0", "xmm1", "xmm5"
3082   );
3083 }
3084 #endif  // HAS_YUY2TOYROW_AVX2
3085
3086 #ifdef HAS_ARGBBLENDROW_SSE2
3087 // Blend 8 pixels at a time.
3088 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3089                        uint8* dst_argb, int width) {
3090   asm volatile (
3091     "pcmpeqb   %%xmm7,%%xmm7                   \n"
3092     "psrlw     $0xf,%%xmm7                     \n"
3093     "pcmpeqb   %%xmm6,%%xmm6                   \n"
3094     "psrlw     $0x8,%%xmm6                     \n"
3095     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3096     "psllw     $0x8,%%xmm5                     \n"
3097     "pcmpeqb   %%xmm4,%%xmm4                   \n"
3098     "pslld     $0x18,%%xmm4                    \n"
3099     "sub       $0x1,%3                         \n"
3100     "je        91f                             \n"
3101     "jl        99f                             \n"
3102
3103     // 1 pixel loop until destination pointer is aligned.
3104   "10:                                         \n"
3105     "test      $0xf,%2                         \n"
3106     "je        19f                             \n"
3107     "movd      " MEMACCESS(0) ",%%xmm3         \n"
3108     "lea       " MEMLEA(0x4,0) ",%0            \n"
3109     "movdqa    %%xmm3,%%xmm0                   \n"
3110     "pxor      %%xmm4,%%xmm3                   \n"
3111     "movd      " MEMACCESS(1) ",%%xmm2         \n"
3112     "psrlw     $0x8,%%xmm3                     \n"
3113     "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
3114     "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
3115     "pand      %%xmm6,%%xmm2                   \n"
3116     "paddw     %%xmm7,%%xmm3                   \n"
3117     "pmullw    %%xmm3,%%xmm2                   \n"
3118     "movd      " MEMACCESS(1) ",%%xmm1         \n"
3119     "lea       " MEMLEA(0x4,1) ",%1            \n"
3120     "psrlw     $0x8,%%xmm1                     \n"
3121     "por       %%xmm4,%%xmm0                   \n"
3122     "pmullw    %%xmm3,%%xmm1                   \n"
3123     "psrlw     $0x8,%%xmm2                     \n"
3124     "paddusb   %%xmm2,%%xmm0                   \n"
3125     "pand      %%xmm5,%%xmm1                   \n"
3126     "paddusb   %%xmm1,%%xmm0                   \n"
3127     "movd      %%xmm0," MEMACCESS(2) "         \n"
3128     "lea       " MEMLEA(0x4,2) ",%2            \n"
3129     "sub       $0x1,%3                         \n"
3130     "jge       10b                             \n"
3131
3132   "19:                                         \n"
3133     "add       $1-4,%3                         \n"
3134     "jl        49f                             \n"
3135
3136     // 4 pixel loop.
3137     LABELALIGN
3138   "41:                                         \n"
3139     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
3140     "lea       " MEMLEA(0x10,0) ",%0           \n"
3141     "movdqa    %%xmm3,%%xmm0                   \n"
3142     "pxor      %%xmm4,%%xmm3                   \n"
3143     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3144     "psrlw     $0x8,%%xmm3                     \n"
3145     "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
3146     "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
3147     "pand      %%xmm6,%%xmm2                   \n"
3148     "paddw     %%xmm7,%%xmm3                   \n"
3149     "pmullw    %%xmm3,%%xmm2                   \n"
3150     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
3151     "lea       " MEMLEA(0x10,1) ",%1           \n"
3152     "psrlw     $0x8,%%xmm1                     \n"
3153     "por       %%xmm4,%%xmm0                   \n"
3154     "pmullw    %%xmm3,%%xmm1                   \n"
3155     "psrlw     $0x8,%%xmm2                     \n"
3156     "paddusb   %%xmm2,%%xmm0                   \n"
3157     "pand      %%xmm5,%%xmm1                   \n"
3158     "paddusb   %%xmm1,%%xmm0                   \n"
3159     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3160     "lea       " MEMLEA(0x10,2) ",%2           \n"
3161     "sub       $0x4,%3                         \n"
3162     "jge       41b                             \n"
3163
3164   "49:                                         \n"
3165     "add       $0x3,%3                         \n"
3166     "jl        99f                             \n"
3167
3168     // 1 pixel loop.
3169   "91:                                         \n"
3170     "movd      " MEMACCESS(0) ",%%xmm3         \n"
3171     "lea       " MEMLEA(0x4,0) ",%0            \n"
3172     "movdqa    %%xmm3,%%xmm0                   \n"
3173     "pxor      %%xmm4,%%xmm3                   \n"
3174     "movd      " MEMACCESS(1) ",%%xmm2         \n"
3175     "psrlw     $0x8,%%xmm3                     \n"
3176     "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
3177     "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
3178     "pand      %%xmm6,%%xmm2                   \n"
3179     "paddw     %%xmm7,%%xmm3                   \n"
3180     "pmullw    %%xmm3,%%xmm2                   \n"
3181     "movd      " MEMACCESS(1) ",%%xmm1         \n"
3182     "lea       " MEMLEA(0x4,1) ",%1            \n"
3183     "psrlw     $0x8,%%xmm1                     \n"
3184     "por       %%xmm4,%%xmm0                   \n"
3185     "pmullw    %%xmm3,%%xmm1                   \n"
3186     "psrlw     $0x8,%%xmm2                     \n"
3187     "paddusb   %%xmm2,%%xmm0                   \n"
3188     "pand      %%xmm5,%%xmm1                   \n"
3189     "paddusb   %%xmm1,%%xmm0                   \n"
3190     "movd      %%xmm0," MEMACCESS(2) "         \n"
3191     "lea       " MEMLEA(0x4,2) ",%2            \n"
3192     "sub       $0x1,%3                         \n"
3193     "jge       91b                             \n"
3194   "99:                                         \n"
3195   : "+r"(src_argb0),    // %0
3196     "+r"(src_argb1),    // %1
3197     "+r"(dst_argb),     // %2
3198     "+r"(width)         // %3
3199   :
3200   : "memory", "cc"
3201     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3202   );
3203 }
3204 #endif  // HAS_ARGBBLENDROW_SSE2
3205
3206 #ifdef HAS_ARGBBLENDROW_SSSE3
3207 // Shuffle table for isolating alpha.
3208 static uvec8 kShuffleAlpha = {
3209   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3210   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3211 };
3212
3213 // Blend 8 pixels at a time
3214 // Shuffle table for reversing the bytes.
3215
3216 // Same as SSE2, but replaces
3217 //    psrlw      xmm3, 8          // alpha
3218 //    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3219 //    pshuflw    xmm3, xmm3,0F5h
3220 // with..
3221 //    pshufb     xmm3, kShuffleAlpha // alpha
3222
3223 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3224                         uint8* dst_argb, int width) {
3225   asm volatile (
3226     "pcmpeqb   %%xmm7,%%xmm7                   \n"
3227     "psrlw     $0xf,%%xmm7                     \n"
3228     "pcmpeqb   %%xmm6,%%xmm6                   \n"
3229     "psrlw     $0x8,%%xmm6                     \n"
3230     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3231     "psllw     $0x8,%%xmm5                     \n"
3232     "pcmpeqb   %%xmm4,%%xmm4                   \n"
3233     "pslld     $0x18,%%xmm4                    \n"
3234     "sub       $0x1,%3                         \n"
3235     "je        91f                             \n"
3236     "jl        99f                             \n"
3237
3238     // 1 pixel loop until destination pointer is aligned.
3239   "10:                                         \n"
3240     "test      $0xf,%2                         \n"
3241     "je        19f                             \n"
3242     "movd      " MEMACCESS(0) ",%%xmm3         \n"
3243     "lea       " MEMLEA(0x4,0) ",%0            \n"
3244     "movdqa    %%xmm3,%%xmm0                   \n"
3245     "pxor      %%xmm4,%%xmm3                   \n"
3246     "movd      " MEMACCESS(1) ",%%xmm2         \n"
3247     "pshufb    %4,%%xmm3                       \n"
3248     "pand      %%xmm6,%%xmm2                   \n"
3249     "paddw     %%xmm7,%%xmm3                   \n"
3250     "pmullw    %%xmm3,%%xmm2                   \n"
3251     "movd      " MEMACCESS(1) ",%%xmm1         \n"
3252     "lea       " MEMLEA(0x4,1) ",%1            \n"
3253     "psrlw     $0x8,%%xmm1                     \n"
3254     "por       %%xmm4,%%xmm0                   \n"
3255     "pmullw    %%xmm3,%%xmm1                   \n"
3256     "psrlw     $0x8,%%xmm2                     \n"
3257     "paddusb   %%xmm2,%%xmm0                   \n"
3258     "pand      %%xmm5,%%xmm1                   \n"
3259     "paddusb   %%xmm1,%%xmm0                   \n"
3260     "movd      %%xmm0," MEMACCESS(2) "         \n"
3261     "lea       " MEMLEA(0x4,2) ",%2            \n"
3262     "sub       $0x1,%3                         \n"
3263     "jge       10b                             \n"
3264
3265   "19:                                         \n"
3266     "add       $1-4,%3                         \n"
3267     "jl        49f                             \n"
3268
3269     // 4 pixel loop.
3270     LABELALIGN
3271   "40:                                         \n"
3272     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
3273     "lea       " MEMLEA(0x10,0) ",%0           \n"
3274     "movdqa    %%xmm3,%%xmm0                   \n"
3275     "pxor      %%xmm4,%%xmm3                   \n"
3276     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3277     "pshufb    %4,%%xmm3                       \n"
3278     "pand      %%xmm6,%%xmm2                   \n"
3279     "paddw     %%xmm7,%%xmm3                   \n"
3280     "pmullw    %%xmm3,%%xmm2                   \n"
3281     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
3282     "lea       " MEMLEA(0x10,1) ",%1           \n"
3283     "psrlw     $0x8,%%xmm1                     \n"
3284     "por       %%xmm4,%%xmm0                   \n"
3285     "pmullw    %%xmm3,%%xmm1                   \n"
3286     "psrlw     $0x8,%%xmm2                     \n"
3287     "paddusb   %%xmm2,%%xmm0                   \n"
3288     "pand      %%xmm5,%%xmm1                   \n"
3289     "paddusb   %%xmm1,%%xmm0                   \n"
3290     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3291     "lea       " MEMLEA(0x10,2) ",%2           \n"
3292     "sub       $0x4,%3                         \n"
3293     "jge       40b                             \n"
3294
3295   "49:                                         \n"
3296     "add       $0x3,%3                         \n"
3297     "jl        99f                             \n"
3298
3299     // 1 pixel loop.
3300   "91:                                         \n"
3301     "movd      " MEMACCESS(0) ",%%xmm3         \n"
3302     "lea       " MEMLEA(0x4,0) ",%0            \n"
3303     "movdqa    %%xmm3,%%xmm0                   \n"
3304     "pxor      %%xmm4,%%xmm3                   \n"
3305     "movd      " MEMACCESS(1) ",%%xmm2         \n"
3306     "pshufb    %4,%%xmm3                       \n"
3307     "pand      %%xmm6,%%xmm2                   \n"
3308     "paddw     %%xmm7,%%xmm3                   \n"
3309     "pmullw    %%xmm3,%%xmm2                   \n"
3310     "movd      " MEMACCESS(1) ",%%xmm1         \n"
3311     "lea       " MEMLEA(0x4,1) ",%1            \n"
3312     "psrlw     $0x8,%%xmm1                     \n"
3313     "por       %%xmm4,%%xmm0                   \n"
3314     "pmullw    %%xmm3,%%xmm1                   \n"
3315     "psrlw     $0x8,%%xmm2                     \n"
3316     "paddusb   %%xmm2,%%xmm0                   \n"
3317     "pand      %%xmm5,%%xmm1                   \n"
3318     "paddusb   %%xmm1,%%xmm0                   \n"
3319     "movd      %%xmm0," MEMACCESS(2) "         \n"
3320     "lea       " MEMLEA(0x4,2) ",%2            \n"
3321     "sub       $0x1,%3                         \n"
3322     "jge       91b                             \n"
3323   "99:                                         \n"
3324   : "+r"(src_argb0),    // %0
3325     "+r"(src_argb1),    // %1
3326     "+r"(dst_argb),     // %2
3327     "+r"(width)         // %3
3328   : "m"(kShuffleAlpha)  // %4
3329   : "memory", "cc"
3330     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3331   );
3332 }
3333 #endif  // HAS_ARGBBLENDROW_SSSE3
3334
3335 #ifdef HAS_ARGBATTENUATEROW_SSE2
3336 // Attenuate 4 pixels at a time.
3337 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3338   asm volatile (
3339     "pcmpeqb   %%xmm4,%%xmm4                   \n"
3340     "pslld     $0x18,%%xmm4                    \n"
3341     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3342     "psrld     $0x8,%%xmm5                     \n"
3343
3344     // 4 pixel loop.
3345     LABELALIGN
3346   "1:                                          \n"
3347     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3348     "punpcklbw %%xmm0,%%xmm0                   \n"
3349     "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
3350     "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
3351     "pmulhuw   %%xmm2,%%xmm0                   \n"
3352     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3353     "punpckhbw %%xmm1,%%xmm1                   \n"
3354     "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
3355     "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
3356     "pmulhuw   %%xmm2,%%xmm1                   \n"
3357     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3358     "lea       " MEMLEA(0x10,0) ",%0           \n"
3359     "psrlw     $0x8,%%xmm0                     \n"
3360     "pand      %%xmm4,%%xmm2                   \n"
3361     "psrlw     $0x8,%%xmm1                     \n"
3362     "packuswb  %%xmm1,%%xmm0                   \n"
3363     "pand      %%xmm5,%%xmm0                   \n"
3364     "por       %%xmm2,%%xmm0                   \n"
3365     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3366     "lea       " MEMLEA(0x10,1) ",%1           \n"
3367     "sub       $0x4,%2                         \n"
3368     "jg        1b                              \n"
3369   : "+r"(src_argb),    // %0
3370     "+r"(dst_argb),    // %1
3371     "+r"(width)        // %2
3372   :
3373   : "memory", "cc"
3374     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3375   );
3376 }
3377 #endif  // HAS_ARGBATTENUATEROW_SSE2
3378
3379 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3380 // Shuffle table duplicating alpha
3381 static uvec8 kShuffleAlpha0 = {
3382   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3383 };
3384 static uvec8 kShuffleAlpha1 = {
3385   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3386   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
3387 };
3388 // Attenuate 4 pixels at a time.
3389 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3390   asm volatile (
3391     "pcmpeqb   %%xmm3,%%xmm3                   \n"
3392     "pslld     $0x18,%%xmm3                    \n"
3393     "movdqa    %3,%%xmm4                       \n"
3394     "movdqa    %4,%%xmm5                       \n"
3395
3396     // 4 pixel loop.
3397     LABELALIGN
3398   "1:                                          \n"
3399     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3400     "pshufb    %%xmm4,%%xmm0                   \n"
3401     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3402     "punpcklbw %%xmm1,%%xmm1                   \n"
3403     "pmulhuw   %%xmm1,%%xmm0                   \n"
3404     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3405     "pshufb    %%xmm5,%%xmm1                   \n"
3406     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3407     "punpckhbw %%xmm2,%%xmm2                   \n"
3408     "pmulhuw   %%xmm2,%%xmm1                   \n"
3409     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3410     "lea       " MEMLEA(0x10,0) ",%0           \n"
3411     "pand      %%xmm3,%%xmm2                   \n"
3412     "psrlw     $0x8,%%xmm0                     \n"
3413     "psrlw     $0x8,%%xmm1                     \n"
3414     "packuswb  %%xmm1,%%xmm0                   \n"
3415     "por       %%xmm2,%%xmm0                   \n"
3416     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3417     "lea       " MEMLEA(0x10,1) ",%1           \n"
3418     "sub       $0x4,%2                         \n"
3419     "jg        1b                              \n"
3420   : "+r"(src_argb),    // %0
3421     "+r"(dst_argb),    // %1
3422     "+r"(width)        // %2
3423   : "m"(kShuffleAlpha0),  // %3
3424     "m"(kShuffleAlpha1)  // %4
3425   : "memory", "cc"
3426     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3427   );
3428 }
3429 #endif  // HAS_ARGBATTENUATEROW_SSSE3
3430
3431 #ifdef HAS_ARGBATTENUATEROW_AVX2
3432 // Shuffle table duplicating alpha.
3433 static const uvec8 kShuffleAlpha_AVX2 = {
3434   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
3435 };
3436 // Attenuate 8 pixels at a time.
3437 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3438   asm volatile (
3439     "vbroadcastf128 %3,%%ymm4                  \n"
3440     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3441     "vpslld     $0x18,%%ymm5,%%ymm5            \n"
3442     "sub        %0,%1                          \n"
3443
3444     // 8 pixel loop.
3445     LABELALIGN
3446   "1:                                          \n"
3447     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3448     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3449     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
3450     "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
3451     "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
3452     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3453     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3454     "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
3455     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
3456     "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
3457     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3458     "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
3459     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3460     "lea       " MEMLEA(0x20,0) ",%0           \n"
3461     "sub        $0x8,%2                        \n"
3462     "jg        1b                              \n"
3463     "vzeroupper                                \n"
3464   : "+r"(src_argb),    // %0
3465     "+r"(dst_argb),    // %1
3466     "+r"(width)        // %2
3467   : "m"(kShuffleAlpha_AVX2)  // %3
3468   : "memory", "cc"
3469     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3470   );
3471 }
3472 #endif  // HAS_ARGBATTENUATEROW_AVX2
3473
3474 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3475 // Unattenuate 4 pixels at a time.
3476 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3477                              int width) {
3478   uintptr_t alpha = 0;
3479   asm volatile (
3480     // 4 pixel loop.
3481     LABELALIGN
3482   "1:                                          \n"
3483     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3484     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3485     "punpcklbw %%xmm0,%%xmm0                   \n"
3486     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3487     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3488     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
3489     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3490     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3491     "movlhps   %%xmm3,%%xmm2                   \n"
3492     "pmulhuw   %%xmm2,%%xmm0                   \n"
3493     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3494     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3495     "punpckhbw %%xmm1,%%xmm1                   \n"
3496     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3497     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3498     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
3499     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3500     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3501     "movlhps   %%xmm3,%%xmm2                   \n"
3502     "pmulhuw   %%xmm2,%%xmm1                   \n"
3503     "lea       " MEMLEA(0x10,0) ",%0           \n"
3504     "packuswb  %%xmm1,%%xmm0                   \n"
3505     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3506     "lea       " MEMLEA(0x10,1) ",%1           \n"
3507     "sub       $0x4,%2                         \n"
3508     "jg        1b                              \n"
3509   : "+r"(src_argb),    // %0
3510     "+r"(dst_argb),    // %1
3511     "+r"(width),       // %2
3512     "+r"(alpha)        // %3
3513   : "r"(fixed_invtbl8)  // %4
3514   : "memory", "cc", NACL_R14
3515     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3516   );
3517 }
3518 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
3519
3520 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3521 // Shuffle table duplicating alpha.
3522 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3523   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
3524 };
3525 // Unattenuate 8 pixels at a time.
3526 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3527                              int width) {
3528   uintptr_t alpha = 0;
3529   asm volatile (
3530     "sub        %0,%1                          \n"
3531     "vbroadcastf128 %5,%%ymm5                  \n"
3532
3533     // 8 pixel loop.
3534     LABELALIGN
3535   "1:                                          \n"
3536     // replace VPGATHER
3537     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3538     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3539     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3540     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3541     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3542     "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
3543     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3544     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3545     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3546     "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
3547     "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
3548     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3549     "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
3550     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3551     "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
3552     "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
3553     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3554     "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
3555     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3556     "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
3557     "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
3558     "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
3559     "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
3560     // end of VPGATHER
3561
3562     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3563     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3564     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
3565     "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
3566     "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
3567     "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
3568     "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
3569     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3570     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3571     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3572     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3573     "lea       " MEMLEA(0x20,0) ",%0           \n"
3574     "sub        $0x8,%2                        \n"
3575     "jg        1b                              \n"
3576     "vzeroupper                                \n"
3577   : "+r"(src_argb),    // %0
3578     "+r"(dst_argb),    // %1
3579     "+r"(width),       // %2
3580     "+r"(alpha)        // %3
3581   : "r"(fixed_invtbl8),  // %4
3582     "m"(kUnattenShuffleAlpha_AVX2)  // %5
3583   : "memory", "cc", NACL_R14
3584     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3585   );
3586 }
3587 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
3588
3589 #ifdef HAS_ARGBGRAYROW_SSSE3
3590 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
3591 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3592   asm volatile (
3593     "movdqa    %3,%%xmm4                       \n"
3594     "movdqa    %4,%%xmm5                       \n"
3595
3596     // 8 pixel loop.
3597     LABELALIGN
3598   "1:                                          \n"
3599     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3600     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3601     "pmaddubsw %%xmm4,%%xmm0                   \n"
3602     "pmaddubsw %%xmm4,%%xmm1                   \n"
3603     "phaddw    %%xmm1,%%xmm0                   \n"
3604     "paddw     %%xmm5,%%xmm0                   \n"
3605     "psrlw     $0x7,%%xmm0                     \n"
3606     "packuswb  %%xmm0,%%xmm0                   \n"
3607     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3608     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
3609     "lea       " MEMLEA(0x20,0) ",%0           \n"
3610     "psrld     $0x18,%%xmm2                    \n"
3611     "psrld     $0x18,%%xmm3                    \n"
3612     "packuswb  %%xmm3,%%xmm2                   \n"
3613     "packuswb  %%xmm2,%%xmm2                   \n"
3614     "movdqa    %%xmm0,%%xmm3                   \n"
3615     "punpcklbw %%xmm0,%%xmm0                   \n"
3616     "punpcklbw %%xmm2,%%xmm3                   \n"
3617     "movdqa    %%xmm0,%%xmm1                   \n"
3618     "punpcklwd %%xmm3,%%xmm0                   \n"
3619     "punpckhwd %%xmm3,%%xmm1                   \n"
3620     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3621     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
3622     "lea       " MEMLEA(0x20,1) ",%1           \n"
3623     "sub       $0x8,%2                         \n"
3624     "jg        1b                              \n"
3625   : "+r"(src_argb),   // %0
3626     "+r"(dst_argb),   // %1
3627     "+r"(width)       // %2
3628   : "m"(kARGBToYJ),   // %3
3629     "m"(kAddYJ64)     // %4
3630   : "memory", "cc"
3631     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3632   );
3633 }
3634 #endif  // HAS_ARGBGRAYROW_SSSE3
3635
3636 #ifdef HAS_ARGBSEPIAROW_SSSE3
3637 //    b = (r * 35 + g * 68 + b * 17) >> 7
3638 //    g = (r * 45 + g * 88 + b * 22) >> 7
3639 //    r = (r * 50 + g * 98 + b * 24) >> 7
3640 // Constant for ARGB color to sepia tone
3641 static vec8 kARGBToSepiaB = {
3642   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3643 };
3644
3645 static vec8 kARGBToSepiaG = {
3646   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3647 };
3648
3649 static vec8 kARGBToSepiaR = {
3650   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3651 };
3652
3653 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3654 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3655   asm volatile (
3656     "movdqa    %2,%%xmm2                       \n"
3657     "movdqa    %3,%%xmm3                       \n"
3658     "movdqa    %4,%%xmm4                       \n"
3659
3660     // 8 pixel loop.
3661     LABELALIGN
3662   "1:                                          \n"
3663     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3664     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
3665     "pmaddubsw %%xmm2,%%xmm0                   \n"
3666     "pmaddubsw %%xmm2,%%xmm6                   \n"
3667     "phaddw    %%xmm6,%%xmm0                   \n"
3668     "psrlw     $0x7,%%xmm0                     \n"
3669     "packuswb  %%xmm0,%%xmm0                   \n"
3670     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3671     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3672     "pmaddubsw %%xmm3,%%xmm5                   \n"
3673     "pmaddubsw %%xmm3,%%xmm1                   \n"
3674     "phaddw    %%xmm1,%%xmm5                   \n"
3675     "psrlw     $0x7,%%xmm5                     \n"
3676     "packuswb  %%xmm5,%%xmm5                   \n"
3677     "punpcklbw %%xmm5,%%xmm0                   \n"
3678     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3679     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3680     "pmaddubsw %%xmm4,%%xmm5                   \n"
3681     "pmaddubsw %%xmm4,%%xmm1                   \n"
3682     "phaddw    %%xmm1,%%xmm5                   \n"
3683     "psrlw     $0x7,%%xmm5                     \n"
3684     "packuswb  %%xmm5,%%xmm5                   \n"
3685     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3686     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3687     "psrld     $0x18,%%xmm6                    \n"
3688     "psrld     $0x18,%%xmm1                    \n"
3689     "packuswb  %%xmm1,%%xmm6                   \n"
3690     "packuswb  %%xmm6,%%xmm6                   \n"
3691     "punpcklbw %%xmm6,%%xmm5                   \n"
3692     "movdqa    %%xmm0,%%xmm1                   \n"
3693     "punpcklwd %%xmm5,%%xmm0                   \n"
3694     "punpckhwd %%xmm5,%%xmm1                   \n"
3695     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
3696     "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
3697     "lea       " MEMLEA(0x20,0) ",%0           \n"
3698     "sub       $0x8,%1                         \n"
3699     "jg        1b                              \n"
3700   : "+r"(dst_argb),      // %0
3701     "+r"(width)          // %1
3702   : "m"(kARGBToSepiaB),  // %2
3703     "m"(kARGBToSepiaG),  // %3
3704     "m"(kARGBToSepiaR)   // %4
3705   : "memory", "cc"
3706     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3707   );
3708 }
3709 #endif  // HAS_ARGBSEPIAROW_SSSE3
3710
3711 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3712 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
3713 // Same as Sepia except matrix is provided.
3714 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
3715                               const int8* matrix_argb, int width) {
3716   asm volatile (
3717     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
3718     "pshufd    $0x00,%%xmm5,%%xmm2             \n"
3719     "pshufd    $0x55,%%xmm5,%%xmm3             \n"
3720     "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
3721     "pshufd    $0xff,%%xmm5,%%xmm5             \n"
3722
3723     // 8 pixel loop.
3724     LABELALIGN
3725   "1:                                          \n"
3726     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3727     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3728     "pmaddubsw %%xmm2,%%xmm0                   \n"
3729     "pmaddubsw %%xmm2,%%xmm7                   \n"
3730     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3731     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3732     "pmaddubsw %%xmm3,%%xmm6                   \n"
3733     "pmaddubsw %%xmm3,%%xmm1                   \n"
3734     "phaddsw   %%xmm7,%%xmm0                   \n"
3735     "phaddsw   %%xmm1,%%xmm6                   \n"
3736     "psraw     $0x6,%%xmm0                     \n"
3737     "psraw     $0x6,%%xmm6                     \n"
3738     "packuswb  %%xmm0,%%xmm0                   \n"
3739     "packuswb  %%xmm6,%%xmm6                   \n"
3740     "punpcklbw %%xmm6,%%xmm0                   \n"
3741     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3742     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3743     "pmaddubsw %%xmm4,%%xmm1                   \n"
3744     "pmaddubsw %%xmm4,%%xmm7                   \n"
3745     "phaddsw   %%xmm7,%%xmm1                   \n"
3746     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3747     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
3748     "pmaddubsw %%xmm5,%%xmm6                   \n"
3749     "pmaddubsw %%xmm5,%%xmm7                   \n"
3750     "phaddsw   %%xmm7,%%xmm6                   \n"
3751     "psraw     $0x6,%%xmm1                     \n"
3752     "psraw     $0x6,%%xmm6                     \n"
3753     "packuswb  %%xmm1,%%xmm1                   \n"
3754     "packuswb  %%xmm6,%%xmm6                   \n"
3755     "punpcklbw %%xmm6,%%xmm1                   \n"
3756     "movdqa    %%xmm0,%%xmm6                   \n"
3757     "punpcklwd %%xmm1,%%xmm0                   \n"
3758     "punpckhwd %%xmm1,%%xmm6                   \n"
3759     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3760     "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
3761     "lea       " MEMLEA(0x20,0) ",%0           \n"
3762     "lea       " MEMLEA(0x20,1) ",%1           \n"
3763     "sub       $0x8,%2                         \n"
3764     "jg        1b                              \n"
3765   : "+r"(src_argb),      // %0
3766     "+r"(dst_argb),      // %1
3767     "+r"(width)          // %2
3768   : "r"(matrix_argb)     // %3
3769   : "memory", "cc"
3770     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3771   );
3772 }
3773 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3774
3775 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3776 // Quantize 4 ARGB pixels (16 bytes).
3777 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3778                           int interval_offset, int width) {
3779   asm volatile (
3780     "movd      %2,%%xmm2                       \n"
3781     "movd      %3,%%xmm3                       \n"
3782     "movd      %4,%%xmm4                       \n"
3783     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3784     "pshufd    $0x44,%%xmm2,%%xmm2             \n"
3785     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3786     "pshufd    $0x44,%%xmm3,%%xmm3             \n"
3787     "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
3788     "pshufd    $0x44,%%xmm4,%%xmm4             \n"
3789     "pxor      %%xmm5,%%xmm5                   \n"
3790     "pcmpeqb   %%xmm6,%%xmm6                   \n"
3791     "pslld     $0x18,%%xmm6                    \n"
3792
3793     // 4 pixel loop.
3794     LABELALIGN
3795   "1:                                          \n"
3796     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3797     "punpcklbw %%xmm5,%%xmm0                   \n"
3798     "pmulhuw   %%xmm2,%%xmm0                   \n"
3799     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3800     "punpckhbw %%xmm5,%%xmm1                   \n"
3801     "pmulhuw   %%xmm2,%%xmm1                   \n"
3802     "pmullw    %%xmm3,%%xmm0                   \n"
3803     "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
3804     "pmullw    %%xmm3,%%xmm1                   \n"
3805     "pand      %%xmm6,%%xmm7                   \n"
3806     "paddw     %%xmm4,%%xmm0                   \n"
3807     "paddw     %%xmm4,%%xmm1                   \n"
3808     "packuswb  %%xmm1,%%xmm0                   \n"
3809     "por       %%xmm7,%%xmm0                   \n"
3810     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
3811     "lea       " MEMLEA(0x10,0) ",%0           \n"
3812     "sub       $0x4,%1                         \n"
3813     "jg        1b                              \n"
3814   : "+r"(dst_argb),       // %0
3815     "+r"(width)           // %1
3816   : "r"(scale),           // %2
3817     "r"(interval_size),   // %3
3818     "r"(interval_offset)  // %4
3819   : "memory", "cc"
3820     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3821   );
3822 }
3823 #endif  // HAS_ARGBQUANTIZEROW_SSE2
3824
3825 #ifdef HAS_ARGBSHADEROW_SSE2
3826 // Shade 4 pixels at a time by specified value.
3827 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3828                        uint32 value) {
3829   asm volatile (
3830     "movd      %3,%%xmm2                       \n"
3831     "punpcklbw %%xmm2,%%xmm2                   \n"
3832     "punpcklqdq %%xmm2,%%xmm2                  \n"
3833
3834     // 4 pixel loop.
3835     LABELALIGN
3836   "1:                                          \n"
3837     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3838     "lea       " MEMLEA(0x10,0) ",%0           \n"
3839     "movdqa    %%xmm0,%%xmm1                   \n"
3840     "punpcklbw %%xmm0,%%xmm0                   \n"
3841     "punpckhbw %%xmm1,%%xmm1                   \n"
3842     "pmulhuw   %%xmm2,%%xmm0                   \n"
3843     "pmulhuw   %%xmm2,%%xmm1                   \n"
3844     "psrlw     $0x8,%%xmm0                     \n"
3845     "psrlw     $0x8,%%xmm1                     \n"
3846     "packuswb  %%xmm1,%%xmm0                   \n"
3847     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3848     "lea       " MEMLEA(0x10,1) ",%1           \n"
3849     "sub       $0x4,%2                         \n"
3850     "jg        1b                              \n"
3851   : "+r"(src_argb),  // %0
3852     "+r"(dst_argb),  // %1
3853     "+r"(width)      // %2
3854   : "r"(value)       // %3
3855   : "memory", "cc"
3856     , "xmm0", "xmm1", "xmm2"
3857   );
3858 }
3859 #endif  // HAS_ARGBSHADEROW_SSE2
3860
3861 #ifdef HAS_ARGBMULTIPLYROW_SSE2
3862 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
3863 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3864                           uint8* dst_argb, int width) {
3865   asm volatile (
3866     "pxor      %%xmm5,%%xmm5                  \n"
3867
3868     // 4 pixel loop.
3869     LABELALIGN
3870   "1:                                          \n"
3871     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3872     "lea       " MEMLEA(0x10,0) ",%0           \n"
3873     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3874     "lea       " MEMLEA(0x10,1) ",%1           \n"
3875     "movdqu    %%xmm0,%%xmm1                   \n"
3876     "movdqu    %%xmm2,%%xmm3                   \n"
3877     "punpcklbw %%xmm0,%%xmm0                   \n"
3878     "punpckhbw %%xmm1,%%xmm1                   \n"
3879     "punpcklbw %%xmm5,%%xmm2                   \n"
3880     "punpckhbw %%xmm5,%%xmm3                   \n"
3881     "pmulhuw   %%xmm2,%%xmm0                   \n"
3882     "pmulhuw   %%xmm3,%%xmm1                   \n"
3883     "packuswb  %%xmm1,%%xmm0                   \n"
3884     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3885     "lea       " MEMLEA(0x10,2) ",%2           \n"
3886     "sub       $0x4,%3                         \n"
3887     "jg        1b                              \n"
3888   : "+r"(src_argb0),  // %0
3889     "+r"(src_argb1),  // %1
3890     "+r"(dst_argb),   // %2
3891     "+r"(width)       // %3
3892   :
3893   : "memory", "cc"
3894     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3895   );
3896 }
3897 #endif  // HAS_ARGBMULTIPLYROW_SSE2
3898
3899 #ifdef HAS_ARGBMULTIPLYROW_AVX2
3900 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
3901 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3902                           uint8* dst_argb, int width) {
3903   asm volatile (
3904     "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
3905
3906     // 4 pixel loop.
3907     LABELALIGN
3908   "1:                                          \n"
3909     "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
3910     "lea        " MEMLEA(0x20,0) ",%0          \n"
3911     "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
3912     "lea        " MEMLEA(0x20,1) ",%1          \n"
3913     "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
3914     "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
3915     "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
3916     "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
3917     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3918     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3919     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3920     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
3921     "lea       " MEMLEA(0x20,2) ",%2           \n"
3922     "sub        $0x8,%3                        \n"
3923     "jg        1b                              \n"
3924     "vzeroupper                                \n"
3925   : "+r"(src_argb0),  // %0
3926     "+r"(src_argb1),  // %1
3927     "+r"(dst_argb),   // %2
3928     "+r"(width)       // %3
3929   :
3930   : "memory", "cc"
3931 #if defined(__AVX2__)
3932     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3933 #endif
3934   );
3935 }
3936 #endif  // HAS_ARGBMULTIPLYROW_AVX2
3937
3938 #ifdef HAS_ARGBADDROW_SSE2
3939 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
3940 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3941                      uint8* dst_argb, int width) {
3942   asm volatile (
3943     // 4 pixel loop.
3944     LABELALIGN
3945   "1:                                          \n"
3946     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3947     "lea       " MEMLEA(0x10,0) ",%0           \n"
3948     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
3949     "lea       " MEMLEA(0x10,1) ",%1           \n"
3950     "paddusb   %%xmm1,%%xmm0                   \n"
3951     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3952     "lea       " MEMLEA(0x10,2) ",%2           \n"
3953     "sub       $0x4,%3                         \n"
3954     "jg        1b                              \n"
3955   : "+r"(src_argb0),  // %0
3956     "+r"(src_argb1),  // %1
3957     "+r"(dst_argb),   // %2
3958     "+r"(width)       // %3
3959   :
3960   : "memory", "cc"
3961     , "xmm0", "xmm1"
3962   );
3963 }
3964 #endif  // HAS_ARGBADDROW_SSE2
3965
3966 #ifdef HAS_ARGBADDROW_AVX2
3967 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
3968 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3969                      uint8* dst_argb, int width) {
3970   asm volatile (
3971     // 4 pixel loop.
3972     LABELALIGN
3973   "1:                                          \n"
3974     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
3975     "lea        " MEMLEA(0x20,0) ",%0          \n"
3976     "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
3977     "lea        " MEMLEA(0x20,1) ",%1          \n"
3978     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
3979     "lea        " MEMLEA(0x20,2) ",%2          \n"
3980     "sub        $0x8,%3                        \n"
3981     "jg        1b                              \n"
3982     "vzeroupper                                \n"
3983   : "+r"(src_argb0),  // %0
3984     "+r"(src_argb1),  // %1
3985     "+r"(dst_argb),   // %2
3986     "+r"(width)       // %3
3987   :
3988   : "memory", "cc"
3989     , "xmm0"
3990   );
3991 }
3992 #endif  // HAS_ARGBADDROW_AVX2
3993
3994 #ifdef HAS_ARGBSUBTRACTROW_SSE2
3995 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
3996 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3997                           uint8* dst_argb, int width) {
3998   asm volatile (
3999     // 4 pixel loop.
4000     LABELALIGN
4001   "1:                                          \n"
4002     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4003     "lea       " MEMLEA(0x10,0) ",%0           \n"
4004     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4005     "lea       " MEMLEA(0x10,1) ",%1           \n"
4006     "psubusb   %%xmm1,%%xmm0                   \n"
4007     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4008     "lea       " MEMLEA(0x10,2) ",%2           \n"
4009     "sub       $0x4,%3                         \n"
4010     "jg        1b                              \n"
4011   : "+r"(src_argb0),  // %0
4012     "+r"(src_argb1),  // %1
4013     "+r"(dst_argb),   // %2
4014     "+r"(width)       // %3
4015   :
4016   : "memory", "cc"
4017     , "xmm0", "xmm1"
4018   );
4019 }
4020 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4021
4022 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4023 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
4024 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4025                           uint8* dst_argb, int width) {
4026   asm volatile (
4027     // 4 pixel loop.
4028     LABELALIGN
4029   "1:                                          \n"
4030     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4031     "lea        " MEMLEA(0x20,0) ",%0          \n"
4032     "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4033     "lea        " MEMLEA(0x20,1) ",%1          \n"
4034     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4035     "lea        " MEMLEA(0x20,2) ",%2          \n"
4036     "sub        $0x8,%3                        \n"
4037     "jg        1b                              \n"
4038     "vzeroupper                                \n"
4039   : "+r"(src_argb0),  // %0
4040     "+r"(src_argb1),  // %1
4041     "+r"(dst_argb),   // %2
4042     "+r"(width)       // %3
4043   :
4044   : "memory", "cc"
4045     , "xmm0"
4046   );
4047 }
4048 #endif  // HAS_ARGBSUBTRACTROW_AVX2
4049
4050 #ifdef HAS_SOBELXROW_SSE2
4051 // SobelX as a matrix is
4052 // -1  0  1
4053 // -2  0  2
4054 // -1  0  1
4055 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4056                     const uint8* src_y2, uint8* dst_sobelx, int width) {
4057   asm volatile (
4058     "sub       %0,%1                           \n"
4059     "sub       %0,%2                           \n"
4060     "sub       %0,%3                           \n"
4061     "pxor      %%xmm5,%%xmm5                   \n"
4062
4063     // 8 pixel loop.
4064     LABELALIGN
4065   "1:                                          \n"
4066     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4067     "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
4068     "punpcklbw %%xmm5,%%xmm0                   \n"
4069     "punpcklbw %%xmm5,%%xmm1                   \n"
4070     "psubw     %%xmm1,%%xmm0                   \n"
4071     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4072     MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
4073     "punpcklbw %%xmm5,%%xmm1                   \n"
4074     "punpcklbw %%xmm5,%%xmm2                   \n"
4075     "psubw     %%xmm2,%%xmm1                   \n"
4076     MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
4077     MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
4078     "punpcklbw %%xmm5,%%xmm2                   \n"
4079     "punpcklbw %%xmm5,%%xmm3                   \n"
4080     "psubw     %%xmm3,%%xmm2                   \n"
4081     "paddw     %%xmm2,%%xmm0                   \n"
4082     "paddw     %%xmm1,%%xmm0                   \n"
4083     "paddw     %%xmm1,%%xmm0                   \n"
4084     "pxor      %%xmm1,%%xmm1                   \n"
4085     "psubw     %%xmm0,%%xmm1                   \n"
4086     "pmaxsw    %%xmm1,%%xmm0                   \n"
4087     "packuswb  %%xmm0,%%xmm0                   \n"
4088     MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
4089     "lea       " MEMLEA(0x8,0) ",%0            \n"
4090     "sub       $0x8,%4                         \n"
4091     "jg        1b                              \n"
4092   : "+r"(src_y0),      // %0
4093     "+r"(src_y1),      // %1
4094     "+r"(src_y2),      // %2
4095     "+r"(dst_sobelx),  // %3
4096     "+r"(width)        // %4
4097   :
4098   : "memory", "cc", NACL_R14
4099     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4100   );
4101 }
4102 #endif  // HAS_SOBELXROW_SSE2
4103
4104 #ifdef HAS_SOBELYROW_SSE2
4105 // SobelY as a matrix is
4106 // -1 -2 -1
4107 //  0  0  0
4108 //  1  2  1
4109 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4110                     uint8* dst_sobely, int width) {
4111   asm volatile (
4112     "sub       %0,%1                           \n"
4113     "sub       %0,%2                           \n"
4114     "pxor      %%xmm5,%%xmm5                   \n"
4115
4116     // 8 pixel loop.
4117     LABELALIGN
4118   "1:                                          \n"
4119     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4120     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4121     "punpcklbw %%xmm5,%%xmm0                   \n"
4122     "punpcklbw %%xmm5,%%xmm1                   \n"
4123     "psubw     %%xmm1,%%xmm0                   \n"
4124     "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
4125     MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
4126     "punpcklbw %%xmm5,%%xmm1                   \n"
4127     "punpcklbw %%xmm5,%%xmm2                   \n"
4128     "psubw     %%xmm2,%%xmm1                   \n"
4129     "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
4130     MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
4131     "punpcklbw %%xmm5,%%xmm2                   \n"
4132     "punpcklbw %%xmm5,%%xmm3                   \n"
4133     "psubw     %%xmm3,%%xmm2                   \n"
4134     "paddw     %%xmm2,%%xmm0                   \n"
4135     "paddw     %%xmm1,%%xmm0                   \n"
4136     "paddw     %%xmm1,%%xmm0                   \n"
4137     "pxor      %%xmm1,%%xmm1                   \n"
4138     "psubw     %%xmm0,%%xmm1                   \n"
4139     "pmaxsw    %%xmm1,%%xmm0                   \n"
4140     "packuswb  %%xmm0,%%xmm0                   \n"
4141     MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
4142     "lea       " MEMLEA(0x8,0) ",%0            \n"
4143     "sub       $0x8,%3                         \n"
4144     "jg        1b                              \n"
4145   : "+r"(src_y0),      // %0
4146     "+r"(src_y1),      // %1
4147     "+r"(dst_sobely),  // %2
4148     "+r"(width)        // %3
4149   :
4150   : "memory", "cc", NACL_R14
4151     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4152   );
4153 }
4154 #endif  // HAS_SOBELYROW_SSE2
4155
4156 #ifdef HAS_SOBELROW_SSE2
4157 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4158 // A = 255
4159 // R = Sobel
4160 // G = Sobel
4161 // B = Sobel
4162 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4163                    uint8* dst_argb, int width) {
4164   asm volatile (
4165     "sub       %0,%1                           \n"
4166     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4167     "pslld     $0x18,%%xmm5                    \n"
4168
4169     // 8 pixel loop.
4170     LABELALIGN
4171   "1:                                          \n"
4172     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4173     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4174     "lea       " MEMLEA(0x10,0) ",%0           \n"
4175     "paddusb   %%xmm1,%%xmm0                   \n"
4176     "movdqa    %%xmm0,%%xmm2                   \n"
4177     "punpcklbw %%xmm0,%%xmm2                   \n"
4178     "punpckhbw %%xmm0,%%xmm0                   \n"
4179     "movdqa    %%xmm2,%%xmm1                   \n"
4180     "punpcklwd %%xmm2,%%xmm1                   \n"
4181     "punpckhwd %%xmm2,%%xmm2                   \n"
4182     "por       %%xmm5,%%xmm1                   \n"
4183     "por       %%xmm5,%%xmm2                   \n"
4184     "movdqa    %%xmm0,%%xmm3                   \n"
4185     "punpcklwd %%xmm0,%%xmm3                   \n"
4186     "punpckhwd %%xmm0,%%xmm0                   \n"
4187     "por       %%xmm5,%%xmm3                   \n"
4188     "por       %%xmm5,%%xmm0                   \n"
4189     "movdqu    %%xmm1," MEMACCESS(2) "         \n"
4190     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
4191     "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
4192     "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
4193     "lea       " MEMLEA(0x40,2) ",%2           \n"
4194     "sub       $0x10,%3                        \n"
4195     "jg        1b                              \n"
4196   : "+r"(src_sobelx),  // %0
4197     "+r"(src_sobely),  // %1
4198     "+r"(dst_argb),    // %2
4199     "+r"(width)        // %3
4200   :
4201   : "memory", "cc", NACL_R14
4202     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4203   );
4204 }
4205 #endif  // HAS_SOBELROW_SSE2
4206
4207 #ifdef HAS_SOBELTOPLANEROW_SSE2
4208 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
4209 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4210                           uint8* dst_y, int width) {
4211   asm volatile (
4212     "sub       %0,%1                           \n"
4213     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4214     "pslld     $0x18,%%xmm5                    \n"
4215
4216     // 8 pixel loop.
4217     LABELALIGN
4218   "1:                                          \n"
4219     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4220     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4221     "lea       " MEMLEA(0x10,0) ",%0           \n"
4222     "paddusb   %%xmm1,%%xmm0                   \n"
4223     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4224     "lea       " MEMLEA(0x10,2) ",%2           \n"
4225     "sub       $0x10,%3                        \n"
4226     "jg        1b                              \n"
4227   : "+r"(src_sobelx),  // %0
4228     "+r"(src_sobely),  // %1
4229     "+r"(dst_y),       // %2
4230     "+r"(width)        // %3
4231   :
4232   : "memory", "cc", NACL_R14
4233     "xmm0", "xmm1"
4234   );
4235 }
4236 #endif  // HAS_SOBELTOPLANEROW_SSE2
4237
4238 #ifdef HAS_SOBELXYROW_SSE2
4239 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4240 // A = 255
4241 // R = Sobel X
4242 // G = Sobel
4243 // B = Sobel Y
4244 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4245                      uint8* dst_argb, int width) {
4246   asm volatile (
4247     "sub       %0,%1                           \n"
4248     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4249
4250     // 8 pixel loop.
4251     LABELALIGN
4252   "1:                                          \n"
4253     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4254     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4255     "lea       " MEMLEA(0x10,0) ",%0           \n"
4256     "movdqa    %%xmm0,%%xmm2                   \n"
4257     "paddusb   %%xmm1,%%xmm2                   \n"
4258     "movdqa    %%xmm0,%%xmm3                   \n"
4259     "punpcklbw %%xmm5,%%xmm3                   \n"
4260     "punpckhbw %%xmm5,%%xmm0                   \n"
4261     "movdqa    %%xmm1,%%xmm4                   \n"
4262     "punpcklbw %%xmm2,%%xmm4                   \n"
4263     "punpckhbw %%xmm2,%%xmm1                   \n"
4264     "movdqa    %%xmm4,%%xmm6                   \n"
4265     "punpcklwd %%xmm3,%%xmm6                   \n"
4266     "punpckhwd %%xmm3,%%xmm4                   \n"
4267     "movdqa    %%xmm1,%%xmm7                   \n"
4268     "punpcklwd %%xmm0,%%xmm7                   \n"
4269     "punpckhwd %%xmm0,%%xmm1                   \n"
4270     "movdqu    %%xmm6," MEMACCESS(2) "         \n"
4271     "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
4272     "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
4273     "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
4274     "lea       " MEMLEA(0x40,2) ",%2           \n"
4275     "sub       $0x10,%3                        \n"
4276     "jg        1b                              \n"
4277   : "+r"(src_sobelx),  // %0
4278     "+r"(src_sobely),  // %1
4279     "+r"(dst_argb),    // %2
4280     "+r"(width)        // %3
4281   :
4282   : "memory", "cc", NACL_R14
4283     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4284   );
4285 }
4286 #endif  // HAS_SOBELXYROW_SSE2
4287
4288 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4289 // Creates a table of cumulative sums where each value is a sum of all values
4290 // above and to the left of the value, inclusive of the value.
4291 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
4292                                   const int32* previous_cumsum, int width) {
4293   asm volatile (
4294     "pxor      %%xmm0,%%xmm0                   \n"
4295     "pxor      %%xmm1,%%xmm1                   \n"
4296     "sub       $0x4,%3                         \n"
4297     "jl        49f                             \n"
4298     "test      $0xf,%1                         \n"
4299     "jne       49f                             \n"
4300
4301   // 4 pixel loop                              \n"
4302     LABELALIGN
4303   "40:                                         \n"
4304     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
4305     "lea       " MEMLEA(0x10,0) ",%0           \n"
4306     "movdqa    %%xmm2,%%xmm4                   \n"
4307     "punpcklbw %%xmm1,%%xmm2                   \n"
4308     "movdqa    %%xmm2,%%xmm3                   \n"
4309     "punpcklwd %%xmm1,%%xmm2                   \n"
4310     "punpckhwd %%xmm1,%%xmm3                   \n"
4311     "punpckhbw %%xmm1,%%xmm4                   \n"
4312     "movdqa    %%xmm4,%%xmm5                   \n"
4313     "punpcklwd %%xmm1,%%xmm4                   \n"
4314     "punpckhwd %%xmm1,%%xmm5                   \n"
4315     "paddd     %%xmm2,%%xmm0                   \n"
4316     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4317     "paddd     %%xmm0,%%xmm2                   \n"
4318     "paddd     %%xmm3,%%xmm0                   \n"
4319     "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
4320     "paddd     %%xmm0,%%xmm3                   \n"
4321     "paddd     %%xmm4,%%xmm0                   \n"
4322     "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
4323     "paddd     %%xmm0,%%xmm4                   \n"
4324     "paddd     %%xmm5,%%xmm0                   \n"
4325     "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
4326     "lea       " MEMLEA(0x40,2) ",%2           \n"
4327     "paddd     %%xmm0,%%xmm5                   \n"
4328     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4329     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
4330     "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
4331     "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
4332     "lea       " MEMLEA(0x40,1) ",%1           \n"
4333     "sub       $0x4,%3                         \n"
4334     "jge       40b                             \n"
4335
4336   "49:                                         \n"
4337     "add       $0x3,%3                         \n"
4338     "jl        19f                             \n"
4339
4340   // 1 pixel loop                              \n"
4341     LABELALIGN
4342   "10:                                         \n"
4343     "movd      " MEMACCESS(0) ",%%xmm2         \n"
4344     "lea       " MEMLEA(0x4,0) ",%0            \n"
4345     "punpcklbw %%xmm1,%%xmm2                   \n"
4346     "punpcklwd %%xmm1,%%xmm2                   \n"
4347     "paddd     %%xmm2,%%xmm0                   \n"
4348     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4349     "lea       " MEMLEA(0x10,2) ",%2           \n"
4350     "paddd     %%xmm0,%%xmm2                   \n"
4351     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4352     "lea       " MEMLEA(0x10,1) ",%1           \n"
4353     "sub       $0x1,%3                         \n"
4354     "jge       10b                             \n"
4355
4356   "19:                                         \n"
4357   : "+r"(row),  // %0
4358     "+r"(cumsum),  // %1
4359     "+r"(previous_cumsum),  // %2
4360     "+r"(width)  // %3
4361   :
4362   : "memory", "cc"
4363     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4364   );
4365 }
4366 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4367
4368 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4369 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4370                                     int width, int area, uint8* dst,
4371                                     int count) {
4372   asm volatile (
4373     "movd      %5,%%xmm5                       \n"
4374     "cvtdq2ps  %%xmm5,%%xmm5                   \n"
4375     "rcpss     %%xmm5,%%xmm4                   \n"
4376     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
4377     "sub       $0x4,%3                         \n"
4378     "jl        49f                             \n"
4379     "cmpl      $0x80,%5                        \n"
4380     "ja        40f                             \n"
4381
4382     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4383     "pcmpeqb   %%xmm6,%%xmm6                   \n"
4384     "psrld     $0x10,%%xmm6                    \n"
4385     "cvtdq2ps  %%xmm6,%%xmm6                   \n"
4386     "addps     %%xmm6,%%xmm5                   \n"
4387     "mulps     %%xmm4,%%xmm5                   \n"
4388     "cvtps2dq  %%xmm5,%%xmm5                   \n"
4389     "packssdw  %%xmm5,%%xmm5                   \n"
4390
4391   // 4 pixel small loop                        \n"
4392     LABELALIGN
4393   "4:                                         \n"
4394     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4395     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4396     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4397     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4398     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4399     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4400     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4401     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4402     "lea       " MEMLEA(0x40,0) ",%0           \n"
4403     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4404     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4405     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4406     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4407     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4408     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4409     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4410     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4411     "lea       " MEMLEA(0x40,1) ",%1           \n"
4412     "packssdw  %%xmm1,%%xmm0                   \n"
4413     "packssdw  %%xmm3,%%xmm2                   \n"
4414     "pmulhuw   %%xmm5,%%xmm0                   \n"
4415     "pmulhuw   %%xmm5,%%xmm2                   \n"
4416     "packuswb  %%xmm2,%%xmm0                   \n"
4417     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4418     "lea       " MEMLEA(0x10,2) ",%2           \n"
4419     "sub       $0x4,%3                         \n"
4420     "jge       4b                              \n"
4421     "jmp       49f                             \n"
4422
4423   // 4 pixel loop                              \n"
4424     LABELALIGN
4425   "40:                                         \n"
4426     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4427     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4428     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4429     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4430     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4431     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4432     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4433     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4434     "lea       " MEMLEA(0x40,0) ",%0           \n"
4435     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4436     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4437     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4438     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4439     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4440     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4441     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4442     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4443     "lea       " MEMLEA(0x40,1) ",%1           \n"
4444     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4445     "cvtdq2ps  %%xmm1,%%xmm1                   \n"
4446     "mulps     %%xmm4,%%xmm0                   \n"
4447     "mulps     %%xmm4,%%xmm1                   \n"
4448     "cvtdq2ps  %%xmm2,%%xmm2                   \n"
4449     "cvtdq2ps  %%xmm3,%%xmm3                   \n"
4450     "mulps     %%xmm4,%%xmm2                   \n"
4451     "mulps     %%xmm4,%%xmm3                   \n"
4452     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4453     "cvtps2dq  %%xmm1,%%xmm1                   \n"
4454     "cvtps2dq  %%xmm2,%%xmm2                   \n"
4455     "cvtps2dq  %%xmm3,%%xmm3                   \n"
4456     "packssdw  %%xmm1,%%xmm0                   \n"
4457     "packssdw  %%xmm3,%%xmm2                   \n"
4458     "packuswb  %%xmm2,%%xmm0                   \n"
4459     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4460     "lea       " MEMLEA(0x10,2) ",%2           \n"
4461     "sub       $0x4,%3                         \n"
4462     "jge       40b                             \n"
4463
4464   "49:                                         \n"
4465     "add       $0x3,%3                         \n"
4466     "jl        19f                             \n"
4467
4468   // 1 pixel loop                              \n"
4469     LABELALIGN
4470   "10:                                         \n"
4471     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4472     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4473     "lea       " MEMLEA(0x10,0) ",%0           \n"
4474     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4475     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4476     "lea       " MEMLEA(0x10,1) ",%1           \n"
4477     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4478     "mulps     %%xmm4,%%xmm0                   \n"
4479     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4480     "packssdw  %%xmm0,%%xmm0                   \n"
4481     "packuswb  %%xmm0,%%xmm0                   \n"
4482     "movd      %%xmm0," MEMACCESS(2) "         \n"
4483     "lea       " MEMLEA(0x4,2) ",%2            \n"
4484     "sub       $0x1,%3                         \n"
4485     "jge       10b                             \n"
4486   "19:                                         \n"
4487   : "+r"(topleft),  // %0
4488     "+r"(botleft),  // %1
4489     "+r"(dst),      // %2
4490     "+rm"(count)    // %3
4491   : "r"((intptr_t)(width)),  // %4
4492     "rm"(area)     // %5
4493   : "memory", "cc", NACL_R14
4494     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4495   );
4496 }
4497 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4498
4499 #ifdef HAS_ARGBAFFINEROW_SSE2
4500 // Copy ARGB pixels from source image with slope to a row of destination.
4501 LIBYUV_API
4502 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4503                         uint8* dst_argb, const float* src_dudv, int width) {
4504   intptr_t src_argb_stride_temp = src_argb_stride;
4505   intptr_t temp = 0;
4506   asm volatile (
4507     "movq      " MEMACCESS(3) ",%%xmm2         \n"
4508     "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
4509     "shl       $0x10,%1                        \n"
4510     "add       $0x4,%1                         \n"
4511     "movd      %1,%%xmm5                       \n"
4512     "sub       $0x4,%4                         \n"
4513     "jl        49f                             \n"
4514
4515     "pshufd    $0x44,%%xmm7,%%xmm7             \n"
4516     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4517     "movdqa    %%xmm2,%%xmm0                   \n"
4518     "addps     %%xmm7,%%xmm0                   \n"
4519     "movlhps   %%xmm0,%%xmm2                   \n"
4520     "movdqa    %%xmm7,%%xmm4                   \n"
4521     "addps     %%xmm4,%%xmm4                   \n"
4522     "movdqa    %%xmm2,%%xmm3                   \n"
4523     "addps     %%xmm4,%%xmm3                   \n"
4524     "addps     %%xmm4,%%xmm4                   \n"
4525
4526   // 4 pixel loop                              \n"
4527     LABELALIGN
4528   "40:                                         \n"
4529     "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
4530     "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
4531     "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
4532     "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
4533     "movd      %%xmm0,%k1                      \n"
4534     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4535     "movd      %%xmm0,%k5                      \n"
4536     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4537     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
4538     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4539     "punpckldq %%xmm6,%%xmm1                   \n"
4540     "addps     %%xmm4,%%xmm2                   \n"
4541     "movq      %%xmm1," MEMACCESS(2) "         \n"
4542     "movd      %%xmm0,%k1                      \n"
4543     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4544     "movd      %%xmm0,%k5                      \n"
4545     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4546     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4547     "punpckldq %%xmm6,%%xmm0                   \n"
4548     "addps     %%xmm4,%%xmm3                   \n"
4549     "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
4550     "lea       " MEMLEA(0x10,2) ",%2           \n"
4551     "sub       $0x4,%4                         \n"
4552     "jge       40b                             \n"
4553
4554   "49:                                         \n"
4555     "add       $0x3,%4                         \n"
4556     "jl        19f                             \n"
4557
4558   // 1 pixel loop                              \n"
4559     LABELALIGN
4560   "10:                                         \n"
4561     "cvttps2dq %%xmm2,%%xmm0                   \n"
4562     "packssdw  %%xmm0,%%xmm0                   \n"
4563     "pmaddwd   %%xmm5,%%xmm0                   \n"
4564     "addps     %%xmm7,%%xmm2                   \n"
4565     "movd      %%xmm0,%k1                      \n"
4566     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4567     "movd      %%xmm0," MEMACCESS(2) "         \n"
4568     "lea       " MEMLEA(0x04,2) ",%2           \n"
4569     "sub       $0x1,%4                         \n"
4570     "jge       10b                             \n"
4571   "19:                                         \n"
4572   : "+r"(src_argb),  // %0
4573     "+r"(src_argb_stride_temp),  // %1
4574     "+r"(dst_argb),  // %2
4575     "+r"(src_dudv),  // %3
4576     "+rm"(width),    // %4
4577     "+r"(temp)   // %5
4578   :
4579   : "memory", "cc", NACL_R14
4580     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4581   );
4582 }
4583 #endif  // HAS_ARGBAFFINEROW_SSE2
4584
4585 #ifdef HAS_INTERPOLATEROW_SSSE3
4586 // Bilinear filter 16x2 -> 16x1
4587 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4588                           ptrdiff_t src_stride, int dst_width,
4589                           int source_y_fraction) {
4590   asm volatile (
4591     "sub       %1,%0                           \n"
4592     "shr       %3                              \n"
4593     "cmp       $0x0,%3                         \n"
4594     "je        100f                            \n"
4595     "cmp       $0x20,%3                        \n"
4596     "je        75f                             \n"
4597     "cmp       $0x40,%3                        \n"
4598     "je        50f                             \n"
4599     "cmp       $0x60,%3                        \n"
4600     "je        25f                             \n"
4601
4602     "movd      %3,%%xmm0                       \n"
4603     "neg       %3                              \n"
4604     "add       $0x80,%3                        \n"
4605     "movd      %3,%%xmm5                       \n"
4606     "punpcklbw %%xmm0,%%xmm5                   \n"
4607     "punpcklwd %%xmm5,%%xmm5                   \n"
4608     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4609
4610     // General purpose row blend.
4611     LABELALIGN
4612   "1:                                          \n"
4613     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4614     MEMOPREG(movdqu,0x00,1,4,1,xmm2)
4615     "movdqa    %%xmm0,%%xmm1                   \n"
4616     "punpcklbw %%xmm2,%%xmm0                   \n"
4617     "punpckhbw %%xmm2,%%xmm1                   \n"
4618     "pmaddubsw %%xmm5,%%xmm0                   \n"
4619     "pmaddubsw %%xmm5,%%xmm1                   \n"
4620     "psrlw     $0x7,%%xmm0                     \n"
4621     "psrlw     $0x7,%%xmm1                     \n"
4622     "packuswb  %%xmm1,%%xmm0                   \n"
4623     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4624     "lea       " MEMLEA(0x10,1) ",%1           \n"
4625     "sub       $0x10,%2                        \n"
4626     "jg        1b                              \n"
4627     "jmp       99f                             \n"
4628
4629     // Blend 25 / 75.
4630     LABELALIGN
4631   "25:                                         \n"
4632     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4633     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4634     "pavgb     %%xmm1,%%xmm0                   \n"
4635     "pavgb     %%xmm1,%%xmm0                   \n"
4636     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4637     "lea       " MEMLEA(0x10,1) ",%1           \n"
4638     "sub       $0x10,%2                        \n"
4639     "jg        25b                             \n"
4640     "jmp       99f                             \n"
4641
4642     // Blend 50 / 50.
4643     LABELALIGN
4644   "50:                                         \n"
4645     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4646     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4647     "pavgb     %%xmm1,%%xmm0                   \n"
4648     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4649     "lea       " MEMLEA(0x10,1) ",%1           \n"
4650     "sub       $0x10,%2                        \n"
4651     "jg        50b                             \n"
4652     "jmp       99f                             \n"
4653
4654     // Blend 75 / 25.
4655     LABELALIGN
4656   "75:                                         \n"
4657     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4658     MEMOPREG(movdqu,0x00,1,4,1,xmm0)
4659     "pavgb     %%xmm1,%%xmm0                   \n"
4660     "pavgb     %%xmm1,%%xmm0                   \n"
4661     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4662     "lea       " MEMLEA(0x10,1) ",%1           \n"
4663     "sub       $0x10,%2                        \n"
4664     "jg        75b                             \n"
4665     "jmp       99f                             \n"
4666
4667     // Blend 100 / 0 - Copy row unchanged.
4668     LABELALIGN
4669   "100:                                        \n"
4670     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4671     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4672     "lea       " MEMLEA(0x10,1) ",%1           \n"
4673     "sub       $0x10,%2                        \n"
4674     "jg        100b                            \n"
4675
4676   "99:                                         \n"
4677   : "+r"(dst_ptr),    // %0
4678     "+r"(src_ptr),    // %1
4679     "+r"(dst_width),  // %2
4680     "+r"(source_y_fraction)  // %3
4681   : "r"((intptr_t)(src_stride))  // %4
4682   : "memory", "cc", NACL_R14
4683     "xmm0", "xmm1", "xmm2", "xmm5"
4684   );
4685 }
4686 #endif  // HAS_INTERPOLATEROW_SSSE3
4687
4688 #ifdef HAS_INTERPOLATEROW_AVX2
4689 // Bilinear filter 32x2 -> 32x1
4690 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
4691                          ptrdiff_t src_stride, int dst_width,
4692                          int source_y_fraction) {
4693   asm volatile (
4694     "shr       %3                              \n"
4695     "cmp       $0x0,%3                         \n"
4696     "je        100f                            \n"
4697     "sub       %1,%0                           \n"
4698     "cmp       $0x20,%3                        \n"
4699     "je        75f                             \n"
4700     "cmp       $0x40,%3                        \n"
4701     "je        50f                             \n"
4702     "cmp       $0x60,%3                        \n"
4703     "je        25f                             \n"
4704
4705     "vmovd      %3,%%xmm0                      \n"
4706     "neg        %3                             \n"
4707     "add        $0x80,%3                       \n"
4708     "vmovd      %3,%%xmm5                      \n"
4709     "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
4710     "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
4711     "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
4712     "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"
4713
4714     // General purpose row blend.
4715     LABELALIGN
4716   "1:                                          \n"
4717     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4718     MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
4719     "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
4720     "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
4721     "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"
4722     "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
4723     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
4724     "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
4725     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
4726     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4727     "lea       " MEMLEA(0x20,1) ",%1           \n"
4728     "sub       $0x20,%2                        \n"
4729     "jg        1b                              \n"
4730     "jmp       99f                             \n"
4731
4732     // Blend 25 / 75.
4733     LABELALIGN
4734   "25:                                         \n"
4735     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4736     MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
4737     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4738     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4739     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4740     "lea       " MEMLEA(0x20,1) ",%1           \n"
4741     "sub       $0x20,%2                        \n"
4742     "jg        25b                             \n"
4743     "jmp       99f                             \n"
4744
4745     // Blend 50 / 50.
4746     LABELALIGN
4747   "50:                                         \n"
4748     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4749     VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
4750     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4751     "lea       " MEMLEA(0x20,1) ",%1           \n"
4752     "sub       $0x20,%2                        \n"
4753     "jg        50b                             \n"
4754     "jmp       99f                             \n"
4755
4756     // Blend 75 / 25.
4757     LABELALIGN
4758   "75:                                         \n"
4759     "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"
4760     MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
4761     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4762     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4763     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4764     "lea       " MEMLEA(0x20,1) ",%1           \n"
4765     "sub       $0x20,%2                        \n"
4766     "jg        75b                             \n"
4767     "jmp       99f                             \n"
4768
4769     // Blend 100 / 0 - Copy row unchanged.
4770     LABELALIGN
4771   "100:                                        \n"
4772     "rep movsb " MEMMOVESTRING(1,0) "          \n"
4773     "jmp       999f                            \n"
4774
4775   "99:                                         \n"
4776     "vzeroupper                                \n"
4777   "999:                                        \n"
4778   : "+D"(dst_ptr),    // %0
4779     "+S"(src_ptr),    // %1
4780     "+c"(dst_width),  // %2
4781     "+r"(source_y_fraction)  // %3
4782   : "r"((intptr_t)(src_stride))  // %4
4783   : "memory", "cc", NACL_R14
4784     "xmm0", "xmm1", "xmm2", "xmm5"
4785   );
4786 }
4787 #endif  // HAS_INTERPOLATEROW_AVX2
4788
4789 #ifdef HAS_INTERPOLATEROW_SSE2
4790 // Bilinear filter 16x2 -> 16x1
4791 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
4792                          ptrdiff_t src_stride, int dst_width,
4793                          int source_y_fraction) {
4794   asm volatile (
4795     "sub       %1,%0                           \n"
4796     "shr       %3                              \n"
4797     "cmp       $0x0,%3                         \n"
4798     "je        100f                            \n"
4799     "cmp       $0x20,%3                        \n"
4800     "je        75f                             \n"
4801     "cmp       $0x40,%3                        \n"
4802     "je        50f                             \n"
4803     "cmp       $0x60,%3                        \n"
4804     "je        25f                             \n"
4805
4806     "movd      %3,%%xmm0                       \n"
4807     "neg       %3                              \n"
4808     "add       $0x80,%3                        \n"
4809     "movd      %3,%%xmm5                       \n"
4810     "punpcklbw %%xmm0,%%xmm5                   \n"
4811     "punpcklwd %%xmm5,%%xmm5                   \n"
4812     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4813     "pxor      %%xmm4,%%xmm4                   \n"
4814
4815     // General purpose row blend.
4816     LABELALIGN
4817   "1:                                          \n"
4818     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4819     MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
4820     "movdqa    %%xmm0,%%xmm1                   \n"
4821     "movdqa    %%xmm2,%%xmm3                   \n"
4822     "punpcklbw %%xmm4,%%xmm2                   \n"
4823     "punpckhbw %%xmm4,%%xmm3                   \n"
4824     "punpcklbw %%xmm4,%%xmm0                   \n"
4825     "punpckhbw %%xmm4,%%xmm1                   \n"
4826     "psubw     %%xmm0,%%xmm2                   \n"
4827     "psubw     %%xmm1,%%xmm3                   \n"
4828     "paddw     %%xmm2,%%xmm2                   \n"
4829     "paddw     %%xmm3,%%xmm3                   \n"
4830     "pmulhw    %%xmm5,%%xmm2                   \n"
4831     "pmulhw    %%xmm5,%%xmm3                   \n"
4832     "paddw     %%xmm2,%%xmm0                   \n"
4833     "paddw     %%xmm3,%%xmm1                   \n"
4834     "packuswb  %%xmm1,%%xmm0                   \n"
4835     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4836     "lea       " MEMLEA(0x10,1) ",%1           \n"
4837     "sub       $0x10,%2                        \n"
4838     "jg        1b                              \n"
4839     "jmp       99f                             \n"
4840
4841     // Blend 25 / 75.
4842     LABELALIGN
4843   "25:                                         \n"
4844     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4845     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
4846     "pavgb     %%xmm1,%%xmm0                   \n"
4847     "pavgb     %%xmm1,%%xmm0                   \n"
4848     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4849     "lea       " MEMLEA(0x10,1) ",%1           \n"
4850     "sub       $0x10,%2                        \n"
4851     "jg        25b                             \n"
4852     "jmp       99f                             \n"
4853
4854     // Blend 50 / 50.
4855     LABELALIGN
4856   "50:                                         \n"
4857     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4858     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
4859     "pavgb     %%xmm1,%%xmm0                   \n"
4860     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4861     "lea       " MEMLEA(0x10,1) ",%1           \n"
4862     "sub       $0x10,%2                        \n"
4863     "jg        50b                             \n"
4864     "jmp       99f                             \n"
4865
4866     // Blend 75 / 25.
4867     LABELALIGN
4868   "75:                                         \n"
4869     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4870     MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
4871     "pavgb     %%xmm1,%%xmm0                   \n"
4872     "pavgb     %%xmm1,%%xmm0                   \n"
4873     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4874     "lea       " MEMLEA(0x10,1) ",%1           \n"
4875     "sub       $0x10,%2                        \n"
4876     "jg        75b                             \n"
4877     "jmp       99f                             \n"
4878
4879     // Blend 100 / 0 - Copy row unchanged.
4880     LABELALIGN
4881   "100:                                        \n"
4882     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4883     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4884     "lea       " MEMLEA(0x10,1) ",%1           \n"
4885     "sub       $0x10,%2                        \n"
4886     "jg        100b                            \n"
4887
4888   "99:                                         \n"
4889   : "+r"(dst_ptr),    // %0
4890     "+r"(src_ptr),    // %1
4891     "+r"(dst_width),  // %2
4892     "+r"(source_y_fraction)  // %3
4893   : "r"((intptr_t)(src_stride))  // %4
4894   : "memory", "cc", NACL_R14
4895     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4896   );
4897 }
4898 #endif  // HAS_INTERPOLATEROW_SSE2
4899
4900 #ifdef HAS_ARGBTOBAYERGGROW_SSE2
4901 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
4902                            uint32 selector, int pix) {
4903   asm volatile (
4904     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4905     "psrld     $0x18,%%xmm5                    \n"
4906     LABELALIGN
4907   "1:                                          \n"
4908     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4909     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4910     "lea       " MEMLEA(0x20,0) ",%0           \n"
4911     "psrld     $0x8,%%xmm0                     \n"
4912     "psrld     $0x8,%%xmm1                     \n"
4913     "pand      %%xmm5,%%xmm0                   \n"
4914     "pand      %%xmm5,%%xmm1                   \n"
4915     "packssdw  %%xmm1,%%xmm0                   \n"
4916     "packuswb  %%xmm1,%%xmm0                   \n"
4917     "movq      %%xmm0," MEMACCESS(1) "         \n"
4918     "lea       " MEMLEA(0x8,1) ",%1            \n"
4919     "sub       $0x8,%2                         \n"
4920     "jg        1b                              \n"
4921   : "+r"(src_argb),  // %0
4922     "+r"(dst_bayer), // %1
4923     "+r"(pix)        // %2
4924   :
4925   : "memory", "cc"
4926     , "xmm0", "xmm1", "xmm5"
4927   );
4928 }
4929 #endif  // HAS_ARGBTOBAYERGGROW_SSE2
4930
4931 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4932 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4933 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4934                           const uint8* shuffler, int pix) {
4935   asm volatile (
4936     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
4937     LABELALIGN
4938   "1:                                          \n"
4939     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4940     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4941     "lea       " MEMLEA(0x20,0) ",%0           \n"
4942     "pshufb    %%xmm5,%%xmm0                   \n"
4943     "pshufb    %%xmm5,%%xmm1                   \n"
4944     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4945     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
4946     "lea       " MEMLEA(0x20,1) ",%1           \n"
4947     "sub       $0x8,%2                         \n"
4948     "jg        1b                              \n"
4949   : "+r"(src_argb),  // %0
4950     "+r"(dst_argb),  // %1
4951     "+r"(pix)        // %2
4952   : "r"(shuffler)    // %3
4953   : "memory", "cc"
4954     , "xmm0", "xmm1", "xmm5"
4955   );
4956 }
4957 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
4958
4959 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4960 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4961 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4962                          const uint8* shuffler, int pix) {
4963   asm volatile (
4964     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
4965     LABELALIGN
4966   "1:                                          \n"
4967     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
4968     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
4969     "lea       " MEMLEA(0x40,0) ",%0           \n"
4970     "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
4971     "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
4972     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
4973     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
4974     "lea       " MEMLEA(0x40,1) ",%1           \n"
4975     "sub       $0x10,%2                        \n"
4976     "jg        1b                              \n"
4977     "vzeroupper                                \n"
4978   : "+r"(src_argb),  // %0
4979     "+r"(dst_argb),  // %1
4980     "+r"(pix)        // %2
4981   : "r"(shuffler)    // %3
4982   : "memory", "cc"
4983     , "xmm0", "xmm1", "xmm5"
4984   );
4985 }
4986 #endif  // HAS_ARGBSHUFFLEROW_AVX2
4987
4988 #ifdef HAS_ARGBSHUFFLEROW_SSE2
4989 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4990 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4991                          const uint8* shuffler, int pix) {
4992   uintptr_t pixel_temp = 0u;
4993   asm volatile (
4994     "pxor      %%xmm5,%%xmm5                   \n"
4995     "mov       " MEMACCESS(4) ",%k2            \n"
4996     "cmp       $0x3000102,%k2                  \n"
4997     "je        3012f                           \n"
4998     "cmp       $0x10203,%k2                    \n"
4999     "je        123f                            \n"
5000     "cmp       $0x30201,%k2                    \n"
5001     "je        321f                            \n"
5002     "cmp       $0x2010003,%k2                  \n"
5003     "je        2103f                           \n"
5004
5005     LABELALIGN
5006   "1:                                          \n"
5007     "movzb     " MEMACCESS(4) ",%2             \n"
5008     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5009     "mov       %b2," MEMACCESS(1) "            \n"
5010     "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
5011     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5012     "mov       %b2," MEMACCESS2(0x1,1) "       \n"
5013     "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
5014     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5015     "mov       %b2," MEMACCESS2(0x2,1) "       \n"
5016     "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
5017     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5018     "mov       %b2," MEMACCESS2(0x3,1) "       \n"
5019     "lea       " MEMLEA(0x4,0) ",%0            \n"
5020     "lea       " MEMLEA(0x4,1) ",%1            \n"
5021     "sub       $0x1,%3                         \n"
5022     "jg        1b                              \n"
5023     "jmp       99f                             \n"
5024
5025     LABELALIGN
5026   "123:                                        \n"
5027     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5028     "lea       " MEMLEA(0x10,0) ",%0           \n"
5029     "movdqa    %%xmm0,%%xmm1                   \n"
5030     "punpcklbw %%xmm5,%%xmm0                   \n"
5031     "punpckhbw %%xmm5,%%xmm1                   \n"
5032     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
5033     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
5034     "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
5035     "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
5036     "packuswb  %%xmm1,%%xmm0                   \n"
5037     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5038     "lea       " MEMLEA(0x10,1) ",%1           \n"
5039     "sub       $0x4,%3                         \n"
5040     "jg        123b                            \n"
5041     "jmp       99f                             \n"
5042
5043     LABELALIGN
5044   "321:                                        \n"
5045     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5046     "lea       " MEMLEA(0x10,0) ",%0           \n"
5047     "movdqa    %%xmm0,%%xmm1                   \n"
5048     "punpcklbw %%xmm5,%%xmm0                   \n"
5049     "punpckhbw %%xmm5,%%xmm1                   \n"
5050     "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
5051     "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
5052     "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
5053     "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
5054     "packuswb  %%xmm1,%%xmm0                   \n"
5055     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5056     "lea       " MEMLEA(0x10,1) ",%1           \n"
5057     "sub       $0x4,%3                         \n"
5058     "jg        321b                            \n"
5059     "jmp       99f                             \n"
5060
5061     LABELALIGN
5062   "2103:                                       \n"
5063     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5064     "lea       " MEMLEA(0x10,0) ",%0           \n"
5065     "movdqa    %%xmm0,%%xmm1                   \n"
5066     "punpcklbw %%xmm5,%%xmm0                   \n"
5067     "punpckhbw %%xmm5,%%xmm1                   \n"
5068     "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
5069     "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
5070     "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
5071     "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
5072     "packuswb  %%xmm1,%%xmm0                   \n"
5073     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5074     "lea       " MEMLEA(0x10,1) ",%1           \n"
5075     "sub       $0x4,%3                         \n"
5076     "jg        2103b                           \n"
5077     "jmp       99f                             \n"
5078
5079     LABELALIGN
5080   "3012:                                       \n"
5081     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5082     "lea       " MEMLEA(0x10,0) ",%0           \n"
5083     "movdqa    %%xmm0,%%xmm1                   \n"
5084     "punpcklbw %%xmm5,%%xmm0                   \n"
5085     "punpckhbw %%xmm5,%%xmm1                   \n"
5086     "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
5087     "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
5088     "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
5089     "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
5090     "packuswb  %%xmm1,%%xmm0                   \n"
5091     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5092     "lea       " MEMLEA(0x10,1) ",%1           \n"
5093     "sub       $0x4,%3                         \n"
5094     "jg        3012b                           \n"
5095
5096   "99:                                         \n"
5097   : "+r"(src_argb),    // %0
5098     "+r"(dst_argb),    // %1
5099     "+d"(pixel_temp),  // %2
5100     "+r"(pix)         // %3
5101   : "r"(shuffler)      // %4
5102   : "memory", "cc", NACL_R14
5103     "xmm0", "xmm1", "xmm5"
5104   );
5105 }
5106 #endif  // HAS_ARGBSHUFFLEROW_SSE2
5107
5108 #ifdef HAS_I422TOYUY2ROW_SSE2
5109 void I422ToYUY2Row_SSE2(const uint8* src_y,
5110                         const uint8* src_u,
5111                         const uint8* src_v,
5112                         uint8* dst_frame, int width) {
5113  asm volatile (
5114     "sub       %1,%2                             \n"
5115     LABELALIGN
5116   "1:                                            \n"
5117     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5118     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5119     "lea       " MEMLEA(0x8,1) ",%1              \n"
5120     "punpcklbw %%xmm3,%%xmm2                     \n"
5121     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5122     "lea       " MEMLEA(0x10,0) ",%0             \n"
5123     "movdqa    %%xmm0,%%xmm1                     \n"
5124     "punpcklbw %%xmm2,%%xmm0                     \n"
5125     "punpckhbw %%xmm2,%%xmm1                     \n"
5126     "movdqu    %%xmm0," MEMACCESS(3) "           \n"
5127     "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
5128     "lea       " MEMLEA(0x20,3) ",%3             \n"
5129     "sub       $0x10,%4                          \n"
5130     "jg         1b                               \n"
5131     : "+r"(src_y),  // %0
5132       "+r"(src_u),  // %1
5133       "+r"(src_v),  // %2
5134       "+r"(dst_frame),  // %3
5135       "+rm"(width)  // %4
5136     :
5137     : "memory", "cc", NACL_R14
5138     "xmm0", "xmm1", "xmm2", "xmm3"
5139   );
5140 }
5141 #endif  // HAS_I422TOYUY2ROW_SSE2
5142
5143 #ifdef HAS_I422TOUYVYROW_SSE2
5144 void I422ToUYVYRow_SSE2(const uint8* src_y,
5145                         const uint8* src_u,
5146                         const uint8* src_v,
5147                         uint8* dst_frame, int width) {
5148  asm volatile (
5149     "sub        %1,%2                            \n"
5150     LABELALIGN
5151   "1:                                            \n"
5152     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5153     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5154     "lea       " MEMLEA(0x8,1) ",%1              \n"
5155     "punpcklbw %%xmm3,%%xmm2                     \n"
5156     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5157     "movdqa    %%xmm2,%%xmm1                     \n"
5158     "lea       " MEMLEA(0x10,0) ",%0             \n"
5159     "punpcklbw %%xmm0,%%xmm1                     \n"
5160     "punpckhbw %%xmm0,%%xmm2                     \n"
5161     "movdqu    %%xmm1," MEMACCESS(3) "           \n"
5162     "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
5163     "lea       " MEMLEA(0x20,3) ",%3             \n"
5164     "sub       $0x10,%4                          \n"
5165     "jg         1b                               \n"
5166     : "+r"(src_y),  // %0
5167       "+r"(src_u),  // %1
5168       "+r"(src_v),  // %2
5169       "+r"(dst_frame),  // %3
5170       "+rm"(width)  // %4
5171     :
5172     : "memory", "cc", NACL_R14
5173     "xmm0", "xmm1", "xmm2", "xmm3"
5174   );
5175 }
5176 #endif  // HAS_I422TOUYVYROW_SSE2
5177
5178 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
5179 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5180                             uint8* dst_argb, const float* poly,
5181                             int width) {
5182   asm volatile (
5183     "pxor      %%xmm3,%%xmm3                   \n"
5184
5185     // 2 pixel loop.
5186     LABELALIGN
5187   "1:                                          \n"
5188     "movq      " MEMACCESS(0) ",%%xmm0         \n"
5189     "lea       " MEMLEA(0x8,0) ",%0            \n"
5190     "punpcklbw %%xmm3,%%xmm0                   \n"
5191     "movdqa    %%xmm0,%%xmm4                   \n"
5192     "punpcklwd %%xmm3,%%xmm0                   \n"
5193     "punpckhwd %%xmm3,%%xmm4                   \n"
5194     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
5195     "cvtdq2ps  %%xmm4,%%xmm4                   \n"
5196     "movdqa    %%xmm0,%%xmm1                   \n"
5197     "movdqa    %%xmm4,%%xmm5                   \n"
5198     "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
5199     "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
5200     "addps     " MEMACCESS(3) ",%%xmm0         \n"
5201     "addps     " MEMACCESS(3) ",%%xmm4         \n"
5202     "movdqa    %%xmm1,%%xmm2                   \n"
5203     "movdqa    %%xmm5,%%xmm6                   \n"
5204     "mulps     %%xmm1,%%xmm2                   \n"
5205     "mulps     %%xmm5,%%xmm6                   \n"
5206     "mulps     %%xmm2,%%xmm1                   \n"
5207     "mulps     %%xmm6,%%xmm5                   \n"
5208     "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
5209     "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
5210     "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
5211     "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
5212     "addps     %%xmm2,%%xmm0                   \n"
5213     "addps     %%xmm6,%%xmm4                   \n"
5214     "addps     %%xmm1,%%xmm0                   \n"
5215     "addps     %%xmm5,%%xmm4                   \n"
5216     "cvttps2dq %%xmm0,%%xmm0                   \n"
5217     "cvttps2dq %%xmm4,%%xmm4                   \n"
5218     "packuswb  %%xmm4,%%xmm0                   \n"
5219     "packuswb  %%xmm0,%%xmm0                   \n"
5220     "movq      %%xmm0," MEMACCESS(1) "         \n"
5221     "lea       " MEMLEA(0x8,1) ",%1            \n"
5222     "sub       $0x2,%2                         \n"
5223     "jg        1b                              \n"
5224   : "+r"(src_argb),  // %0
5225     "+r"(dst_argb),  // %1
5226     "+r"(width)      // %2
5227   : "r"(poly)        // %3
5228   : "memory", "cc"
5229     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5230   );
5231 }
5232 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5233
5234 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
5235 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5236                             uint8* dst_argb, const float* poly,
5237                             int width) {
5238   asm volatile (
5239     "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
5240     "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5241     "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5242     "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
5243
5244     // 2 pixel loop.
5245     LABELALIGN
5246   "1:                                          \n"
5247     "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
5248     "lea         " MEMLEA(0x8,0) ",%0          \n"
5249     "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
5250     "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
5251     "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
5252     "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
5253     "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
5254     "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
5255     "vcvttps2dq  %%ymm0,%%ymm0                 \n"
5256     "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
5257     "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5258     "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
5259     "vmovq       %%xmm0," MEMACCESS(1) "       \n"
5260     "lea         " MEMLEA(0x8,1) ",%1          \n"
5261     "sub         $0x2,%2                       \n"
5262     "jg          1b                            \n"
5263     "vzeroupper                                \n"
5264   : "+r"(src_argb),  // %0
5265     "+r"(dst_argb),  // %1
5266     "+r"(width)      // %2
5267   : "r"(poly)        // %3
5268   : "memory", "cc",
5269     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5270   );
5271 }
5272 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5273
5274 #ifdef HAS_ARGBCOLORTABLEROW_X86
5275 // Tranform ARGB pixels with color table.
5276 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
5277                            int width) {
5278   uintptr_t pixel_temp = 0u;
5279   asm volatile (
5280     // 1 pixel loop.
5281     LABELALIGN
5282   "1:                                          \n"
5283     "movzb     " MEMACCESS(0) ",%1             \n"
5284     "lea       " MEMLEA(0x4,0) ",%0            \n"
5285     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5286     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
5287     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5288     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5289     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
5290     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5291     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5292     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
5293     "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
5294     MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
5295     "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
5296     "dec       %2                              \n"
5297     "jg        1b                              \n"
5298   : "+r"(dst_argb),   // %0
5299     "+d"(pixel_temp), // %1
5300     "+r"(width)       // %2
5301   : "r"(table_argb)   // %3
5302   : "memory", "cc");
5303 }
5304 #endif  // HAS_ARGBCOLORTABLEROW_X86
5305
5306 #ifdef HAS_RGBCOLORTABLEROW_X86
5307 // Tranform RGB pixels with color table.
5308 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
5309   uintptr_t pixel_temp = 0u;
5310   asm volatile (
5311     // 1 pixel loop.
5312     LABELALIGN
5313   "1:                                          \n"
5314     "movzb     " MEMACCESS(0) ",%1             \n"
5315     "lea       " MEMLEA(0x4,0) ",%0            \n"
5316     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5317     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
5318     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5319     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5320     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
5321     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5322     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5323     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
5324     "dec       %2                              \n"
5325     "jg        1b                              \n"
5326   : "+r"(dst_argb),   // %0
5327     "+d"(pixel_temp), // %1
5328     "+r"(width)       // %2
5329   : "r"(table_argb)   // %3
5330   : "memory", "cc");
5331 }
5332 #endif  // HAS_RGBCOLORTABLEROW_X86
5333
5334 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5335 // Tranform RGB pixels with luma table.
5336 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5337                                  int width,
5338                                  const uint8* luma, uint32 lumacoeff) {
5339   uintptr_t pixel_temp = 0u;
5340   uintptr_t table_temp = 0u;
5341   asm volatile (
5342     "movd      %6,%%xmm3                       \n"
5343     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
5344     "pcmpeqb   %%xmm4,%%xmm4                   \n"
5345     "psllw     $0x8,%%xmm4                     \n"
5346     "pxor      %%xmm5,%%xmm5                   \n"
5347
5348     // 4 pixel loop.
5349     LABELALIGN
5350   "1:                                          \n"
5351     "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
5352     "pmaddubsw %%xmm3,%%xmm0                   \n"
5353     "phaddw    %%xmm0,%%xmm0                   \n"
5354     "pand      %%xmm4,%%xmm0                   \n"
5355     "punpcklwd %%xmm5,%%xmm0                   \n"
5356     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5357     "add       %5,%1                           \n"
5358     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5359
5360     "movzb     " MEMACCESS(2) ",%0             \n"
5361     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5362     "mov       %b0," MEMACCESS(3) "            \n"
5363     "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
5364     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5365     "mov       %b0," MEMACCESS2(0x1,3) "       \n"
5366     "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
5367     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5368     "mov       %b0," MEMACCESS2(0x2,3) "       \n"
5369     "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
5370     "mov       %b0," MEMACCESS2(0x3,3) "       \n"
5371
5372     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5373     "add       %5,%1                           \n"
5374     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5375
5376     "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
5377     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5378     "mov       %b0," MEMACCESS2(0x4,3) "       \n"
5379     "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
5380     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5381     "mov       %b0," MEMACCESS2(0x5,3) "       \n"
5382     "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
5383     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5384     "mov       %b0," MEMACCESS2(0x6,3) "       \n"
5385     "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
5386     "mov       %b0," MEMACCESS2(0x7,3) "       \n"
5387
5388     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5389     "add       %5,%1                           \n"
5390     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5391
5392     "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
5393     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5394     "mov       %b0," MEMACCESS2(0x8,3) "       \n"
5395     "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
5396     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5397     "mov       %b0," MEMACCESS2(0x9,3) "       \n"
5398     "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
5399     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5400     "mov       %b0," MEMACCESS2(0xa,3) "       \n"
5401     "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
5402     "mov       %b0," MEMACCESS2(0xb,3) "       \n"
5403
5404     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5405     "add       %5,%1                           \n"
5406
5407     "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
5408     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5409     "mov       %b0," MEMACCESS2(0xc,3) "       \n"
5410     "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
5411     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5412     "mov       %b0," MEMACCESS2(0xd,3) "       \n"
5413     "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
5414     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5415     "mov       %b0," MEMACCESS2(0xe,3) "       \n"
5416     "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
5417     "mov       %b0," MEMACCESS2(0xf,3) "       \n"
5418     "lea       " MEMLEA(0x10,2) ",%2           \n"
5419     "lea       " MEMLEA(0x10,3) ",%3           \n"
5420     "sub       $0x4,%4                         \n"
5421     "jg        1b                              \n"
5422   : "+d"(pixel_temp),  // %0
5423     "+a"(table_temp),  // %1
5424     "+r"(src_argb),    // %2
5425     "+r"(dst_argb),    // %3
5426     "+rm"(width)       // %4
5427   : "r"(luma),         // %5
5428     "rm"(lumacoeff)    // %6
5429   : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5430   );
5431 }
5432 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5433
5434 #endif  // defined(__x86_64__) || defined(__i386__)
5435
5436 #ifdef __cplusplus
5437 }  // extern "C"
5438 }  // namespace libyuv
5439 #endif