// third_party/libyuv/source/scale_gcc.cc
1 /*
2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
20
// Shuffle/weight tables for the SSSE3 scaling kernels below.
// In the pshufb masks, 128 has the high bit set, which makes pshufb write
// a zero byte to that output position.

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10 (pmaddubsw weights for 3/4 filter)
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added to each word sum before the >>2 in the 3/4 box
// filters (round-to-nearest).  NOTE: the old comment here ("Coefficients
// for source bytes 21 to 31") was a copy-paste error.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

// ScaleRowDown38: pick bytes 0,3,6,8,11,14 into output bytes 0..5.
static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// ScaleRowDown38: pick bytes 0,3,6,8,11,14 into output bytes 6..11.
static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
94
95 // GCC versions of row functions are verbatim conversions from Visual C.
96 // Generated using gcc disassembly on Visual C object file:
97 // objdump -D yuvscaler.obj >yuvscaler.txt
98
// 2:1 horizontal downscale by point sampling: psrlw $8 keeps the high byte
// of each 16-bit word (the odd-indexed source byte) and packuswb repacks.
// Consumes 32 source bytes / produces 16 destination bytes per iteration;
// assumes dst_width is a multiple of 16 — TODO confirm caller contract.
// src_stride is unused (no vertical filtering in this variant).
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes as words
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
120
121 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
122                               uint8* dst_ptr, int dst_width) {
123   asm volatile (
124     "pcmpeqb   %%xmm5,%%xmm5                   \n"
125     "psrlw     $0x8,%%xmm5                     \n"
126
127     LABELALIGN
128   "1:                                          \n"
129     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
130     "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
131     "lea       " MEMLEA(0x20,0) ",%0           \n"
132     "movdqa    %%xmm0,%%xmm2                   \n"
133     "psrlw     $0x8,%%xmm0                     \n"
134     "movdqa    %%xmm1,%%xmm3                   \n"
135     "psrlw     $0x8,%%xmm1                     \n"
136     "pand      %%xmm5,%%xmm2                   \n"
137     "pand      %%xmm5,%%xmm3                   \n"
138     "pavgw     %%xmm2,%%xmm0                   \n"
139     "pavgw     %%xmm3,%%xmm1                   \n"
140     "packuswb  %%xmm1,%%xmm0                   \n"
141     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
142     "lea       " MEMLEA(0x10,1) ",%1           \n"
143     "sub       $0x10,%2                        \n"
144     "jg        1b                              \n"
145   : "+r"(src_ptr),    // %0
146     "+r"(dst_ptr),    // %1
147     "+r"(dst_width)   // %2
148   :: "memory", "cc", "xmm0", "xmm1", "xmm5"
149   );
150 }
151
// 2:1 downscale with a 2x2 box filter: first averages two rows with pavgb
// (src_ptr and src_ptr + src_stride), then averages horizontal byte pairs
// exactly as in ScaleRowDown2Linear_SSE2.  Note pavgb/pavgw round up
// (+1 before the shift), so the 2x2 box has a slight upward bias.
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0x00ff word mask
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"  // odd bytes as words
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"  // even bytes as words
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"  // horizontal average
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
188
// 4:1 horizontal downscale by point sampling.  xmm5 is built as the
// per-dword mask 0x00ff0000 (psrld $24 then pslld $16), selecting byte 2
// of every 4 source bytes; two packs then compact 32 source bytes into
// 8 destination bytes per iteration.  src_stride is unused.
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"  // 0x000000ff per dword
    "pslld     $0x10,%%xmm5                    \n"  // 0x00ff0000 per dword

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"  // isolate byte 2 of each dword
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
216
// 4:1 downscale with a 4x4 box filter.  Four source rows (at offsets 0,
// stride, 2*stride and 3*stride — the last precomputed into stridex3) are
// combined with a pavgb tree, then two rounds of word averaging reduce
// 4 horizontal bytes to 1.  Consumes 32 source bytes / produces 8
// destination bytes per iteration.  The repeated pavgb rounding is an
// approximation of a true 16-tap mean (each pavg rounds up).
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;  // holds src_stride * 3 for the 4th row
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0x00ff word mask
    "psrlw     $0x8,%%xmm7                     \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"  // %3 = stride * 3

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pavgb     %%xmm2,%%xmm0                   \n"  // rows 0+1
    "pavgb     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqu,0x00,0,3,1,xmm4)           //  movdqu  (%0,%3,1),%%xmm4
    MEMOPREG(movdqu,0x10,0,3,1,xmm5)           //  movdqu  0x10(%0,%3,1),%%xmm5
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm4,%%xmm2                   \n"  // rows 2+3
    "pavgb     %%xmm2,%%xmm0                   \n"  // all four rows
    "pavgb     %%xmm5,%%xmm3                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"  // first 2:1 horizontal pass
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pand      %%xmm7,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"  // second 2:1 horizontal pass
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(stridex3)     // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
  );
}
269
270 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
271                           uint8* dst_ptr, int dst_width) {
272   asm volatile (
273     "movdqa    %0,%%xmm3                       \n"
274     "movdqa    %1,%%xmm4                       \n"
275     "movdqa    %2,%%xmm5                       \n"
276   :
277   : "m"(kShuf0),  // %0
278     "m"(kShuf1),  // %1
279     "m"(kShuf2)   // %2
280   );
281   asm volatile (
282     LABELALIGN
283   "1:                                          \n"
284     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
285     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
286     "lea       " MEMLEA(0x20,0) ",%0           \n"
287     "movdqa    %%xmm2,%%xmm1                   \n"
288     "palignr   $0x8,%%xmm0,%%xmm1              \n"
289     "pshufb    %%xmm3,%%xmm0                   \n"
290     "pshufb    %%xmm4,%%xmm1                   \n"
291     "pshufb    %%xmm5,%%xmm2                   \n"
292     "movq      %%xmm0," MEMACCESS(1) "         \n"
293     "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
294     "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
295     "lea       " MEMLEA(0x18,1) ",%1           \n"
296     "sub       $0x18,%2                        \n"
297     "jg        1b                              \n"
298   : "+r"(src_ptr),   // %0
299     "+r"(dst_ptr),   // %1
300     "+r"(dst_width)  // %2
301   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
302   );
303 }
304
// 3/4 downscale with vertical filtering: the two source rows are averaged
// 1:1 with pavgb, then each group of 4 horizontal pixels is reduced to 3
// via pshufb pairing + pmaddubsw weights (kMadd01/kMadd11/kMadd21),
// rounded with kRound34 and >>2.  32 source bytes -> 24 dst bytes/iter.
//
// NOTE(review): the constants are loaded into xmm2..xmm5/xmm0/xmm1 by two
// preceding asm statements with no clobber lists; correctness relies on
// the compiler not touching xmm registers between asm statements, which
// GCC does not guarantee.  Consider merging into a single asm statement.
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"  // 1:1 vertical average
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"  // + kRound34
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 from memory
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
369
// 3/4 downscale with vertical filtering weighted 3:1 toward the first row:
// two chained pavgb ops give (3*row0 + row1 + rounding)/4 approximately,
// then the same horizontal 3/4 reduction as ScaleRowDown34_1_Box_SSSE3.
// 32 source bytes -> 24 destination bytes per iteration.
//
// NOTE(review): same split-asm hazard as ScaleRowDown34_1_Box_SSSE3 — the
// constant loads into xmm registers happen in separate asm statements with
// no clobber lists; GCC does not guarantee register contents survive
// between asm statements.
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"  // (r0+r1)/2
    "pavgb     %%xmm7,%%xmm6                   \n"  // (r0 + (r0+r1)/2)/2 ~ 3:1
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"  // + kRound34
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 from memory
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
    : "+r"(src_ptr),   // %0
      "+r"(dst_ptr),   // %1
      "+r"(dst_width)  // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "m"(kMadd21)     // %4
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
438
// 3/8 horizontal downscale by point sampling: two pshufb masks each pick
// 6 bytes (indices 0,3,6,8,11,14) from a 16-byte half and paddusb merges
// them into 12 contiguous output bytes.  32 source bytes -> 12 destination
// bytes per iteration.  src_stride is unused.
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // kShuf38a
    "movdqa    %4,%%xmm5                       \n"  // kShuf38b

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"  // bytes 0..5 of output
    "pshufb    %%xmm5,%%xmm1                   \n"  // bytes 6..11 of output
    "paddusb   %%xmm1,%%xmm0                   \n"  // merge (other lanes are 0)
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // write 8 bytes
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"  // write remaining 4 bytes
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
467
468 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
469                                 ptrdiff_t src_stride,
470                                 uint8* dst_ptr, int dst_width) {
471   asm volatile (
472     "movdqa    %0,%%xmm2                       \n"
473     "movdqa    %1,%%xmm3                       \n"
474     "movdqa    %2,%%xmm4                       \n"
475     "movdqa    %3,%%xmm5                       \n"
476   :
477   : "m"(kShufAb0),   // %0
478     "m"(kShufAb1),   // %1
479     "m"(kShufAb2),   // %2
480     "m"(kScaleAb2)   // %3
481   );
482   asm volatile (
483     LABELALIGN
484   "1:                                          \n"
485     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
486     MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
487     "lea       " MEMLEA(0x10,0) ",%0           \n"
488     "pavgb     %%xmm1,%%xmm0                   \n"
489     "movdqa    %%xmm0,%%xmm1                   \n"
490     "pshufb    %%xmm2,%%xmm1                   \n"
491     "movdqa    %%xmm0,%%xmm6                   \n"
492     "pshufb    %%xmm3,%%xmm6                   \n"
493     "paddusw   %%xmm6,%%xmm1                   \n"
494     "pshufb    %%xmm4,%%xmm0                   \n"
495     "paddusw   %%xmm0,%%xmm1                   \n"
496     "pmulhuw   %%xmm5,%%xmm1                   \n"
497     "packuswb  %%xmm1,%%xmm1                   \n"
498     "movd      %%xmm1," MEMACCESS(1) "         \n"
499     "psrlq     $0x10,%%xmm1                    \n"
500     "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
501     "lea       " MEMLEA(0x6,1) ",%1            \n"
502     "sub       $0x6,%2                         \n"
503     "jg        1b                              \n"
504   : "+r"(src_ptr),     // %0
505     "+r"(dst_ptr),     // %1
506     "+r"(dst_width)    // %2
507   : "r"((intptr_t)(src_stride))  // %3
508   : "memory", "cc", NACL_R14
509     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
510   );
511 }
512
513 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
514                                 ptrdiff_t src_stride,
515                                 uint8* dst_ptr, int dst_width) {
516   asm volatile (
517     "movdqa    %0,%%xmm2                       \n"
518     "movdqa    %1,%%xmm3                       \n"
519     "movdqa    %2,%%xmm4                       \n"
520     "pxor      %%xmm5,%%xmm5                   \n"
521   :
522   : "m"(kShufAc),    // %0
523     "m"(kShufAc3),   // %1
524     "m"(kScaleAc33)  // %2
525   );
526   asm volatile (
527     LABELALIGN
528   "1:                                          \n"
529     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
530     MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
531     "movhlps   %%xmm0,%%xmm1                   \n"
532     "movhlps   %%xmm6,%%xmm7                   \n"
533     "punpcklbw %%xmm5,%%xmm0                   \n"
534     "punpcklbw %%xmm5,%%xmm1                   \n"
535     "punpcklbw %%xmm5,%%xmm6                   \n"
536     "punpcklbw %%xmm5,%%xmm7                   \n"
537     "paddusw   %%xmm6,%%xmm0                   \n"
538     "paddusw   %%xmm7,%%xmm1                   \n"
539     MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
540     "lea       " MEMLEA(0x10,0) ",%0           \n"
541     "movhlps   %%xmm6,%%xmm7                   \n"
542     "punpcklbw %%xmm5,%%xmm6                   \n"
543     "punpcklbw %%xmm5,%%xmm7                   \n"
544     "paddusw   %%xmm6,%%xmm0                   \n"
545     "paddusw   %%xmm7,%%xmm1                   \n"
546     "movdqa    %%xmm0,%%xmm6                   \n"
547     "psrldq    $0x2,%%xmm0                     \n"
548     "paddusw   %%xmm0,%%xmm6                   \n"
549     "psrldq    $0x2,%%xmm0                     \n"
550     "paddusw   %%xmm0,%%xmm6                   \n"
551     "pshufb    %%xmm2,%%xmm6                   \n"
552     "movdqa    %%xmm1,%%xmm7                   \n"
553     "psrldq    $0x2,%%xmm1                     \n"
554     "paddusw   %%xmm1,%%xmm7                   \n"
555     "psrldq    $0x2,%%xmm1                     \n"
556     "paddusw   %%xmm1,%%xmm7                   \n"
557     "pshufb    %%xmm3,%%xmm7                   \n"
558     "paddusw   %%xmm7,%%xmm6                   \n"
559     "pmulhuw   %%xmm4,%%xmm6                   \n"
560     "packuswb  %%xmm6,%%xmm6                   \n"
561     "movd      %%xmm6," MEMACCESS(1) "         \n"
562     "psrlq     $0x10,%%xmm6                    \n"
563     "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
564     "lea       " MEMLEA(0x6,1) ",%1            \n"
565     "sub       $0x6,%2                         \n"
566     "jg        1b                              \n"
567   : "+r"(src_ptr),    // %0
568     "+r"(dst_ptr),    // %1
569     "+r"(dst_width)   // %2
570   : "r"((intptr_t)(src_stride))   // %3
571   : "memory", "cc", NACL_R14
572     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
573   );
574 }
575
// Reads 16xN bytes and produces 16 shorts at a time.
// For each 16-byte column strip, sums src_height rows into 16 uint16
// accumulators and stores them to dst_ptr.  The outer loop advances 16
// source bytes / 32 destination bytes until src_width is consumed.
// Note: paddusw saturates at 65535, so tall columns of bright pixels
// clamp rather than wrap.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;    // per-strip row counter (reloaded from src_height)
  intptr_t tmp_src = 0;  // per-strip row pointer (reloaded from src_ptr)
  asm volatile (
    "mov       %0,%3                           \n"  // row pointer
    "mov       %5,%2                           \n"  // height
    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
    "pxor      %%xmm1,%%xmm1                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"  // zero for unpacks

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(3) ",%%xmm2         \n"
    "add       %6,%3                           \n"  // next row
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"  // low 8 bytes -> words
    "punpckhbw %%xmm4,%%xmm3                   \n"  // high 8 bytes -> words
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "sub       $0x1,%2                         \n"
    "jg        1b                              \n"

    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "mov       %0,%3                           \n"  // row pointer
    "mov       %5,%2                           \n"  // height
    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
620
// Bilinear column filtering. SSSE3 version.
// x and dx are 16.16 fixed-point: the integer part (bits 16..31, extracted
// with pextrw $1/$3) indexes the source pixel pair, and bits 9..15 give a
// 7-bit blend fraction.  Processes 2 output pixels per loop iteration and
// handles a trailing odd pixel in the "29" tail.  The fraction and its
// complement are formed with pshufb(kShuffleFractions pattern built in
// %k2) + pxor against 0x7f7f, then pmaddubsw blends the pixel pair.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;  // x0/x1: source indices; temp_pixel: in eax
  asm volatile (
    "movd      %6,%%xmm2                       \n"  // xmm2 = x (and x+dx after setup)
    "movd      %7,%%xmm3                       \n"  // xmm3 = dx
    "movl      $0x04040000,%k2                 \n"  // pshufb pattern for fractions
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // xmm6 = 0x007f per word
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = x >> 16
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"  // fewer than 2 pixels
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x+dx}
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = {2*dx, 2*dx}
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = (x+dx) >> 16

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"  // advance x by 2*dx
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"  // load pixel pair at x0
    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"  // load pixel pair at x1
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"  // {f, 128-f} style weights
    "pmaddubsw %%xmm1,%%xmm0                   \n"  // blend both pixel pairs
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 output pixels
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "sub       $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"  // tail: 0 or 1 pixel left
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"  // store final single pixel
  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+a"(temp_pixel),  // %2
    "+r"(x0),          // %3
    "+r"(x1),          // %4
    "+rm"(dst_width)   // %5
  : "rm"(x),           // %6
    "rm"(dx)           // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
692
// 1:2 horizontal upscale by pixel duplication: reads 16 source bytes and
// writes 32 destination bytes per iteration (each byte doubled via
// punpcklbw/punpckhbw with itself).  Uses unaligned movdqu, so no
// alignment requirement — the old comment claiming 4->8 pixels and 16-byte
// alignment was stale.  x and dx are unused (pure 2x duplication).
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate low 8 bytes
    "punpckhbw %%xmm1,%%xmm1                   \n"  // duplicate high 8 bytes
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x20,%2                         \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
717
// ARGB 2x horizontal downsample, point sampling: reads 8 ARGB pixels
// (32 bytes) and writes the 4 odd-indexed pixels (shufps $0xdd keeps
// dwords 1,3 of each source register).  src_stride is unused because this
// variant does no vertical filtering.
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4-7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // keep odd pixels 1,3,5,7
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"  // 4 dst pixels per loop
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
738
739 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
740                                   ptrdiff_t src_stride,
741                                   uint8* dst_argb, int dst_width) {
742   asm volatile (
743     LABELALIGN
744   "1:                                          \n"
745     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
746     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
747     "lea       " MEMLEA(0x20,0) ",%0           \n"
748     "movdqa    %%xmm0,%%xmm2                   \n"
749     "shufps    $0x88,%%xmm1,%%xmm0             \n"
750     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
751     "pavgb     %%xmm2,%%xmm0                   \n"
752     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
753     "lea       " MEMLEA(0x10,1) ",%1           \n"
754     "sub       $0x4,%2                         \n"
755     "jg        1b                              \n"
756   : "+r"(src_argb),  // %0
757     "+r"(dst_argb),  // %1
758     "+r"(dst_width)  // %2
759   :: "memory", "cc", "xmm0", "xmm1"
760   );
761 }
762
// ARGB 2x box filter: averages 2x2 blocks.  Reads 8 ARGB pixels from two
// rows (src and src + src_stride), averages vertically with pavgb, then
// averages even/odd pixel pairs horizontally, writing 4 pixels.
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, pixels 0-3
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // row 0, pixels 4-7
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
792
// Reads 4 pixels at a time, sampling every src_stepx'th ARGB pixel.
// Stores use movdqu, so no alignment requirement (the previous comment's
// 16-byte-alignment claim was stale).  src_stride is unused.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // %1 = stepx * 4 bytes
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // %4 = %1 * 3 = stepx * 12
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"  // sample 0
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // advance src by stepx * 16
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"  // pack 4 samples
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(src_stepx_x4),  // %1
    "+r"(dst_argb),      // %2
    "+r"(dst_width),     // %3
    "+r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
825
// Blends four 2x2 pixel blocks to 4x1: samples every src_stepx'th pixel
// from two rows (src and src + src_stride) and box-averages each 2x2.
// Stores use movdqu, so no alignment requirement (the previous comment's
// 16-byte-alignment claim was stale).
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // %1 = stepx * 4 bytes
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // %4 = stepx * 12 bytes
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"  // %5 = src + src_stride

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // row 0 samples
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"  // row 1 samples
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
871
// Point-samples ARGB pixels at 16.16 fixed-point positions x, x+dx, x+2dx...
// xmm2 holds 4 consecutive x values, xmm3 holds the per-iteration step 4*dx.
// pextrw on odd word indices extracts the integer part (x >> 16) into the
// pinned index registers ("+a"/"+d" below), which the movd gathers use.
// Main loop emits 4 pixels; 2- and 1-pixel tails follow at 49:/29:.
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movd      %5,%%xmm2                       \n"  // x
    "movd      %6,%%xmm3                       \n"  // dx
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // broadcast x
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"  // build {0,dx,0,dx}-style offsets
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // dx *= 2
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    "paddd     %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x+dx, x+2dx, x+3dx}
    "paddd     %%xmm3,%%xmm3                   \n"  // dx *= 2 again
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // xmm3 = 4*dx in all lanes
    "pextrw    $0x1,%%xmm2,%k0                 \n"  // k0 = x0 >> 16
    "pextrw    $0x3,%%xmm2,%k1                 \n"  // k1 = x1 >> 16
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"  // dst_width < 0: nothing to do
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"  // 4 pixels per iteration
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"  // advance all x by 4*dx
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"  // pack 4 gathered pixels
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"  // 2-pixel tail
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"  // 1-pixel tail
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "+a"(x0),          // %0
    "+d"(x1),          // %1
    "+r"(dst_argb),    // %2
    "+r"(src_argb),    // %3
    "+r"(dst_width)    // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
938
// Reads 4 ARGB pixels, duplicates each and writes 8 pixels (2x horizontal
// ARGB upscale).  Uses unaligned movdqu, so no alignment requirement (the
// previous comment's 16-byte-alignment claim was stale).
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  // x and dx are unused: a fixed 2x duplication needs no fixed-point step.
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // 4 source pixels
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"  // duplicate pixels 0,1
    "punpckhdq %%xmm1,%%xmm1                   \n"  // duplicate pixels 2,3
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // 8 dst pixels written
    "jg        1b                              \n"

  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
964
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw.
// Interleaves the two source pixels channel-by-channel (B0 B1 G0 G1 ...)
// so pmaddubsw blends matching channels.  Used by ScaleARGBFilterCols_SSSE3.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
// (byte 0 -> low 8 lanes, byte 4 -> high 8 lanes).
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
975
976 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
977 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
978                                int dst_width, int x, int dx) {
979   intptr_t x0 = 0, x1 = 0;
980   asm volatile (
981     "movdqa    %0,%%xmm4                       \n"
982     "movdqa    %1,%%xmm5                       \n"
983   :
984   : "m"(kShuffleColARGB),  // %0
985     "m"(kShuffleFractions)  // %1
986   );
987
988   asm volatile (
989     "movd      %5,%%xmm2                       \n"
990     "movd      %6,%%xmm3                       \n"
991     "pcmpeqb   %%xmm6,%%xmm6                   \n"
992     "psrlw     $0x9,%%xmm6                     \n"
993     "pextrw    $0x1,%%xmm2,%k3                 \n"
994     "sub       $0x2,%2                         \n"
995     "jl        29f                             \n"
996     "movdqa    %%xmm2,%%xmm0                   \n"
997     "paddd     %%xmm3,%%xmm0                   \n"
998     "punpckldq %%xmm0,%%xmm2                   \n"
999     "punpckldq %%xmm3,%%xmm3                   \n"
1000     "paddd     %%xmm3,%%xmm3                   \n"
1001     "pextrw    $0x3,%%xmm2,%k4                 \n"
1002
1003     LABELALIGN
1004   "2:                                          \n"
1005     "movdqa    %%xmm2,%%xmm1                   \n"
1006     "paddd     %%xmm3,%%xmm2                   \n"
1007     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
1008     "psrlw     $0x9,%%xmm1                     \n"
1009     MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
1010     "pshufb    %%xmm5,%%xmm1                   \n"
1011     "pshufb    %%xmm4,%%xmm0                   \n"
1012     "pxor      %%xmm6,%%xmm1                   \n"
1013     "pmaddubsw %%xmm1,%%xmm0                   \n"
1014     "psrlw     $0x7,%%xmm0                     \n"
1015     "pextrw    $0x1,%%xmm2,%k3                 \n"
1016     "pextrw    $0x3,%%xmm2,%k4                 \n"
1017     "packuswb  %%xmm0,%%xmm0                   \n"
1018     "movq      %%xmm0," MEMACCESS(0) "         \n"
1019     "lea       " MEMLEA(0x8,0) ",%0            \n"
1020     "sub       $0x2,%2                         \n"
1021     "jge       2b                              \n"
1022
1023     LABELALIGN
1024   "29:                                         \n"
1025     "add       $0x1,%2                         \n"
1026     "jl        99f                             \n"
1027     "psrlw     $0x9,%%xmm2                     \n"
1028     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
1029     "pshufb    %%xmm5,%%xmm2                   \n"
1030     "pshufb    %%xmm4,%%xmm0                   \n"
1031     "pxor      %%xmm6,%%xmm2                   \n"
1032     "pmaddubsw %%xmm2,%%xmm0                   \n"
1033     "psrlw     $0x7,%%xmm0                     \n"
1034     "packuswb  %%xmm0,%%xmm0                   \n"
1035     "movd      %%xmm0," MEMACCESS(0) "         \n"
1036
1037     LABELALIGN
1038   "99:                                         \n"
1039   : "+r"(dst_argb),    // %0
1040     "+r"(src_argb),    // %1
1041     "+rm"(dst_width),  // %2
1042     "+r"(x0),          // %3
1043     "+r"(x1)           // %4
1044   : "rm"(x),           // %5
1045     "rm"(dx)           // %6
1046   : "memory", "cc", NACL_R14
1047     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1048   );
1049 }
1050
// Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div using a 64-bit dividend in edx:eax.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"  // sign-extend eax into edx
    "shld      $0x10,%%eax,%%edx               \n"  // edx:eax = num << 16 (high half)
    "shl       $0x10,%%eax                     \n"  // (low half)
    "idiv      %1                              \n"  // eax = (num << 16) / div
    "mov       %0, %%eax                       \n"  // NOTE(review): %0 is pinned to eax ("+a"), so this is a no-op
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}
1065
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// Builds the 64-bit dividend (num << 16) - 0x10001 in edx:eax, then
// divides by (div - 1).
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"  // sign-extend eax into edx
    "shld      $0x10,%%eax,%%edx               \n"  // edx:eax = num << 16 (high half)
    "shl       $0x10,%%eax                     \n"  // (low half)
    "sub       $0x10001,%%eax                  \n"  // 64-bit subtract of 0x10001:
    "sbb       $0x0,%%edx                      \n"  //   borrow into edx
    "sub       $0x1,%1                         \n"  // div - 1
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"  // NOTE(review): %0 is pinned to eax ("+a"), so this is a no-op
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}
1083
1084 #endif  // defined(__x86_64__) || defined(__i386__)
1085
1086 #ifdef __cplusplus
1087 }  // extern "C"
1088 }  // namespace libyuv
1089 #endif