2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "libyuv/row.h"
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// pshufb shuffle masks and pmaddubsw coefficient tables for the 3/4 and 3/8
// scale-down kernels below.  A value of 128 in a shuffle mask zeroes that
// output byte (pshufb sign-bit behavior).
// NOTE(review): this dump elides the kShuf0/kShuf1/kShuf2 declaration lines;
// only their initializers (original lines 23/27/31) are visible here.
21 // Offsets for source bytes 0 to 9
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
33 // Offsets for source bytes 0 to 10
34 static uvec8 kShuf01 =
35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
38 static uvec8 kShuf11 =
39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
42 static uvec8 kShuf21 =
43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
45 // Coefficients for source bytes 0 to 10
46 static uvec8 kMadd01 =
47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
49 // Coefficients for source bytes 10 to 21
50 static uvec8 kMadd11 =
51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
53 // Coefficients for source bytes 21 to 31
54 static uvec8 kMadd21 =
55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
57 // Rounding constant: added to the pmaddubsw sums before the >>2 in the
57 // 3/4 box filters (the original comment here was a copy-paste duplicate).
58 static vec16 kRound34 =
59 { 2, 2, 2, 2, 2, 2, 2, 2 };
61 static uvec8 kShuf38a =
62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
64 static uvec8 kShuf38b =
65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
67 // Arrange words 0,3,6 into 0,1,2
68 static uvec8 kShufAc =
69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
71 // Arrange words 0,3,6 into 3,4,5
72 static uvec8 kShufAc3 =
73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
75 // Scaling values for boxes of 3x3 and 2x3
76 static uvec16 kScaleAc33 =
77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
79 // Arrange first value for pixels 0,1,2,3,4,5
80 static uvec8 kShufAb0 =
81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
83 // Arrange second value for pixels 0,1,2,3,4,5
84 static uvec8 kShufAb1 =
85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
87 // Arrange third value for pixels 0,1,2,3,4,5
88 static uvec8 kShufAb2 =
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
91 // Scaling values for boxes of 3x2 and 2x2
92 static uvec16 kScaleAb2 =
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
95 // GCC versions of row functions are verbatim conversions from Visual C.
96 // Generated using gcc disassembly on Visual C object file:
97 // objdump -D yuvscaler.obj >yuvscaler.txt
// Point-sample 2:1 horizontal scale down: keeps one byte of each source pair
// (via psrlw $8 + packuswb) — reads 32 src bytes, writes 16 dst bytes per
// iteration.  src_stride is unused (no vertical filtering in this variant).
// NOTE(review): this dump elides the asm volatile(, LABELALIGN "1:", loop
// counter decrement/branch and closing lines (original 101-103, 112-113, 118+).
99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
100 uint8* dst_ptr, int dst_width) {
104 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
105 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
106 "lea " MEMLEA(0x20,0) ",%0 \n"
107 "psrlw $0x8,%%xmm0 \n"
108 "psrlw $0x8,%%xmm1 \n"
109 "packuswb %%xmm1,%%xmm0 \n"
110 "movdqu %%xmm0," MEMACCESS(1) " \n"
111 "lea " MEMLEA(0x10,1) ",%1 \n"
114 : "+r"(src_ptr), // %0
116 "+r"(dst_width) // %2
117 :: "memory", "cc", "xmm0", "xmm1"
// 2:1 horizontal scale down with linear (horizontal-pair average) filtering.
// xmm5 is a 0x00FF word mask; even/odd bytes are separated and averaged with
// pavgw.  src_stride is unused (single-row filter).
// NOTE(review): xmm2/xmm3 are written here but absent from the clobber list —
// verify against upstream; loop-control and asm-wrapper lines are elided.
121 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
122 uint8* dst_ptr, int dst_width) {
124 "pcmpeqb %%xmm5,%%xmm5 \n"
125 "psrlw $0x8,%%xmm5 \n"
129 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
130 "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
131 "lea " MEMLEA(0x20,0) ",%0 \n"
132 "movdqa %%xmm0,%%xmm2 \n"
133 "psrlw $0x8,%%xmm0 \n"
134 "movdqa %%xmm1,%%xmm3 \n"
135 "psrlw $0x8,%%xmm1 \n"
136 "pand %%xmm5,%%xmm2 \n"
137 "pand %%xmm5,%%xmm3 \n"
138 "pavgw %%xmm2,%%xmm0 \n"
139 "pavgw %%xmm3,%%xmm1 \n"
140 "packuswb %%xmm1,%%xmm0 \n"
141 "movdqu %%xmm0," MEMACCESS(1) " \n"
142 "lea " MEMLEA(0x10,1) ",%1 \n"
145 : "+r"(src_ptr), // %0
147 "+r"(dst_width) // %2
148 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
// 2x2 box-filter scale down: averages two rows (pavgb with the row at
// src_ptr + src_stride) then averages horizontal pairs, writing 16 bytes per
// 32 source bytes.
// NOTE(review): asm wrapper, loop label and branch lines are elided in this dump.
152 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
153 uint8* dst_ptr, int dst_width) {
155 "pcmpeqb %%xmm5,%%xmm5 \n"
156 "psrlw $0x8,%%xmm5 \n"
160 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
161 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
162 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
163 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
164 "lea " MEMLEA(0x20,0) ",%0 \n"
165 "pavgb %%xmm2,%%xmm0 \n"
166 "pavgb %%xmm3,%%xmm1 \n"
167 "movdqa %%xmm0,%%xmm2 \n"
168 "psrlw $0x8,%%xmm0 \n"
169 "movdqa %%xmm1,%%xmm3 \n"
170 "psrlw $0x8,%%xmm1 \n"
171 "pand %%xmm5,%%xmm2 \n"
172 "pand %%xmm5,%%xmm3 \n"
173 "pavgw %%xmm2,%%xmm0 \n"
174 "pavgw %%xmm3,%%xmm1 \n"
175 "packuswb %%xmm1,%%xmm0 \n"
176 "movdqu %%xmm0," MEMACCESS(1) " \n"
177 "lea " MEMLEA(0x10,1) ",%1 \n"
180 : "+r"(src_ptr), // %0
182 "+r"(dst_width) // %2
183 : "r"((intptr_t)(src_stride)) // %3
184 : "memory", "cc", NACL_R14
185 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Point-sample 4:1 horizontal scale down: xmm5 masks one byte per dword
// (0x00FF0000), two packuswb passes compress 32 src bytes to 8 dst bytes.
// src_stride is unused.
// NOTE(review): asm wrapper and loop-control lines are elided in this dump.
189 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
190 uint8* dst_ptr, int dst_width) {
192 "pcmpeqb %%xmm5,%%xmm5 \n"
193 "psrld $0x18,%%xmm5 \n"
194 "pslld $0x10,%%xmm5 \n"
198 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
199 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
200 "lea " MEMLEA(0x20,0) ",%0 \n"
201 "pand %%xmm5,%%xmm0 \n"
202 "pand %%xmm5,%%xmm1 \n"
203 "packuswb %%xmm1,%%xmm0 \n"
204 "psrlw $0x8,%%xmm0 \n"
205 "packuswb %%xmm0,%%xmm0 \n"
206 "movq %%xmm0," MEMACCESS(1) " \n"
207 "lea " MEMLEA(0x8,1) ",%1 \n"
210 : "+r"(src_ptr), // %0
212 "+r"(dst_width) // %2
213 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
// 4x4 box-filter scale down: averages four rows (rows at 0, stride, 2*stride
// and stridex3 = 3*stride, computed by the lea into %3), then averages
// horizontal quads via two psrlw/pavgw rounds.  32 src bytes -> 8 dst bytes.
// NOTE(review): asm wrapper, loop label and branch lines are elided in this dump.
217 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
218 uint8* dst_ptr, int dst_width) {
219 intptr_t stridex3 = 0;
221 "pcmpeqb %%xmm7,%%xmm7 \n"
222 "psrlw $0x8,%%xmm7 \n"
223 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
227 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
228 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
229 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
230 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
231 "pavgb %%xmm2,%%xmm0 \n"
232 "pavgb %%xmm3,%%xmm1 \n"
233 MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
234 MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
235 MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
236 MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
237 "lea " MEMLEA(0x20,0) ",%0 \n"
238 "pavgb %%xmm4,%%xmm2 \n"
239 "pavgb %%xmm2,%%xmm0 \n"
240 "pavgb %%xmm5,%%xmm3 \n"
241 "pavgb %%xmm3,%%xmm1 \n"
242 "movdqa %%xmm0,%%xmm2 \n"
243 "psrlw $0x8,%%xmm0 \n"
244 "movdqa %%xmm1,%%xmm3 \n"
245 "psrlw $0x8,%%xmm1 \n"
246 "pand %%xmm7,%%xmm2 \n"
247 "pand %%xmm7,%%xmm3 \n"
248 "pavgw %%xmm2,%%xmm0 \n"
249 "pavgw %%xmm3,%%xmm1 \n"
250 "packuswb %%xmm1,%%xmm0 \n"
251 "movdqa %%xmm0,%%xmm2 \n"
252 "psrlw $0x8,%%xmm0 \n"
253 "pand %%xmm7,%%xmm2 \n"
254 "pavgw %%xmm2,%%xmm0 \n"
255 "packuswb %%xmm0,%%xmm0 \n"
256 "movq %%xmm0," MEMACCESS(1) " \n"
257 "lea " MEMLEA(0x8,1) ",%1 \n"
260 : "+r"(src_ptr), // %0
262 "+r"(dst_width), // %2
264 : "r"((intptr_t)(src_stride)) // %4
265 : "memory", "cc", NACL_R14
266 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
// Point-sample 3/4 scale down: pshufb with the kShuf0/1/2 tables picks 24 of
// every 32 source bytes.  src_stride is unused.
// NOTE(review): the movdqa %0-%2 operands load the kShuf0/kShuf1/kShuf2
// tables; the operand list naming them is elided in this dump, as are the asm
// wrapper and loop-control lines.
270 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
271 uint8* dst_ptr, int dst_width) {
273 "movdqa %0,%%xmm3 \n"
274 "movdqa %1,%%xmm4 \n"
275 "movdqa %2,%%xmm5 \n"
284 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
285 "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
286 "lea " MEMLEA(0x20,0) ",%0 \n"
287 "movdqa %%xmm2,%%xmm1 \n"
288 "palignr $0x8,%%xmm0,%%xmm1 \n"
289 "pshufb %%xmm3,%%xmm0 \n"
290 "pshufb %%xmm4,%%xmm1 \n"
291 "pshufb %%xmm5,%%xmm2 \n"
292 "movq %%xmm0," MEMACCESS(1) " \n"
293 "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
294 "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
295 "lea " MEMLEA(0x18,1) ",%1 \n"
298 : "+r"(src_ptr), // %0
300 "+r"(dst_width) // %2
301 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// 3/4 scale down with box filtering, equal-weight two-row blend: pavgb with
// the row at src_stride, then pshufb/pmaddubsw with the kShuf*/kMadd* tables,
// rounded by kRound34 (paddsw + psrlw $2).  32 src bytes -> 24 dst bytes.
// NOTE(review): two constant-loading asm statements precede the main loop;
// their full operand lists (kShuf01/11/21, kMadd01/11, kRound34) are partly
// elided here, as are loop labels and branches.
305 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
306 ptrdiff_t src_stride,
307 uint8* dst_ptr, int dst_width) {
309 "movdqa %0,%%xmm2 \n" // kShuf01
310 "movdqa %1,%%xmm3 \n" // kShuf11
311 "movdqa %2,%%xmm4 \n" // kShuf21
313 : "m"(kShuf01), // %0
318 "movdqa %0,%%xmm5 \n" // kMadd01
319 "movdqa %1,%%xmm0 \n" // kMadd11
320 "movdqa %2,%%xmm1 \n" // kRound34
322 : "m"(kMadd01), // %0
329 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
330 MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
331 "pavgb %%xmm7,%%xmm6 \n"
332 "pshufb %%xmm2,%%xmm6 \n"
333 "pmaddubsw %%xmm5,%%xmm6 \n"
334 "paddsw %%xmm1,%%xmm6 \n"
335 "psrlw $0x2,%%xmm6 \n"
336 "packuswb %%xmm6,%%xmm6 \n"
337 "movq %%xmm6," MEMACCESS(1) " \n"
338 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
339 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
340 "pavgb %%xmm7,%%xmm6 \n"
341 "pshufb %%xmm3,%%xmm6 \n"
342 "pmaddubsw %%xmm0,%%xmm6 \n"
343 "paddsw %%xmm1,%%xmm6 \n"
344 "psrlw $0x2,%%xmm6 \n"
345 "packuswb %%xmm6,%%xmm6 \n"
346 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
347 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
348 MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
349 "lea " MEMLEA(0x20,0) ",%0 \n"
350 "pavgb %%xmm7,%%xmm6 \n"
351 "pshufb %%xmm4,%%xmm6 \n"
352 "pmaddubsw %4,%%xmm6 \n"
353 "paddsw %%xmm1,%%xmm6 \n"
354 "psrlw $0x2,%%xmm6 \n"
355 "packuswb %%xmm6,%%xmm6 \n"
356 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
357 "lea " MEMLEA(0x18,1) ",%1 \n"
360 : "+r"(src_ptr), // %0
362 "+r"(dst_width) // %2
363 : "r"((intptr_t)(src_stride)), // %3
365 : "memory", "cc", NACL_R14
366 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// 3/4 scale down with box filtering, unequal two-row blend: the double pavgb
// (xmm6->xmm7 then xmm7->xmm6) weights the rows 3:1 instead of 1:1 as in the
// _1_Box variant; otherwise the same pshufb/pmaddubsw/kRound34 pipeline.
// NOTE(review): constant-load operand lists, loop labels and branches are
// elided in this dump.
370 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
371 ptrdiff_t src_stride,
372 uint8* dst_ptr, int dst_width) {
374 "movdqa %0,%%xmm2 \n" // kShuf01
375 "movdqa %1,%%xmm3 \n" // kShuf11
376 "movdqa %2,%%xmm4 \n" // kShuf21
378 : "m"(kShuf01), // %0
383 "movdqa %0,%%xmm5 \n" // kMadd01
384 "movdqa %1,%%xmm0 \n" // kMadd11
385 "movdqa %2,%%xmm1 \n" // kRound34
387 : "m"(kMadd01), // %0
395 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
396 MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
397 "pavgb %%xmm6,%%xmm7 \n"
398 "pavgb %%xmm7,%%xmm6 \n"
399 "pshufb %%xmm2,%%xmm6 \n"
400 "pmaddubsw %%xmm5,%%xmm6 \n"
401 "paddsw %%xmm1,%%xmm6 \n"
402 "psrlw $0x2,%%xmm6 \n"
403 "packuswb %%xmm6,%%xmm6 \n"
404 "movq %%xmm6," MEMACCESS(1) " \n"
405 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
406 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
407 "pavgb %%xmm6,%%xmm7 \n"
408 "pavgb %%xmm7,%%xmm6 \n"
409 "pshufb %%xmm3,%%xmm6 \n"
410 "pmaddubsw %%xmm0,%%xmm6 \n"
411 "paddsw %%xmm1,%%xmm6 \n"
412 "psrlw $0x2,%%xmm6 \n"
413 "packuswb %%xmm6,%%xmm6 \n"
414 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
415 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
416 MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
417 "lea " MEMLEA(0x20,0) ",%0 \n"
418 "pavgb %%xmm6,%%xmm7 \n"
419 "pavgb %%xmm7,%%xmm6 \n"
420 "pshufb %%xmm4,%%xmm6 \n"
421 "pmaddubsw %4,%%xmm6 \n"
422 "paddsw %%xmm1,%%xmm6 \n"
423 "psrlw $0x2,%%xmm6 \n"
424 "packuswb %%xmm6,%%xmm6 \n"
425 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
426 "lea " MEMLEA(0x18,1) ",%1 \n"
429 : "+r"(src_ptr), // %0
431 "+r"(dst_width) // %2
432 : "r"((intptr_t)(src_stride)), // %3
434 : "memory", "cc", NACL_R14
435 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Point-sample 3/8 scale down: kShuf38a/kShuf38b gather 12 of every 32 source
// bytes; paddusb merges the two shuffled halves.  src_stride is unused.
// NOTE(review): operand %4 presumably names kShuf38b (line elided); asm
// wrapper and loop-control lines are also elided in this dump.
439 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
440 uint8* dst_ptr, int dst_width) {
442 "movdqa %3,%%xmm4 \n"
443 "movdqa %4,%%xmm5 \n"
447 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
448 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
449 "lea " MEMLEA(0x20,0) ",%0 \n"
450 "pshufb %%xmm4,%%xmm0 \n"
451 "pshufb %%xmm5,%%xmm1 \n"
452 "paddusb %%xmm1,%%xmm0 \n"
453 "movq %%xmm0," MEMACCESS(1) " \n"
454 "movhlps %%xmm0,%%xmm1 \n"
455 "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
456 "lea " MEMLEA(0xc,1) ",%1 \n"
459 : "+r"(src_ptr), // %0
461 "+r"(dst_width) // %2
462 : "m"(kShuf38a), // %3
464 : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
// 3/8 scale down box-filtering 3x2 / 2x2 blocks: averages two rows (pavgb),
// gathers three column values per output pixel via kShufAb0/1/2, sums them
// with paddusw and normalizes with pmulhuw by kScaleAb2.  Writes 6 dst bytes
// per 16 src bytes.
// NOTE(review): the constant-load operand list (kShufAb1/kShufAb2/kScaleAb2)
// and loop-control lines are elided in this dump.
468 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
469 ptrdiff_t src_stride,
470 uint8* dst_ptr, int dst_width) {
472 "movdqa %0,%%xmm2 \n"
473 "movdqa %1,%%xmm3 \n"
474 "movdqa %2,%%xmm4 \n"
475 "movdqa %3,%%xmm5 \n"
477 : "m"(kShufAb0), // %0
485 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
486 MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
487 "lea " MEMLEA(0x10,0) ",%0 \n"
488 "pavgb %%xmm1,%%xmm0 \n"
489 "movdqa %%xmm0,%%xmm1 \n"
490 "pshufb %%xmm2,%%xmm1 \n"
491 "movdqa %%xmm0,%%xmm6 \n"
492 "pshufb %%xmm3,%%xmm6 \n"
493 "paddusw %%xmm6,%%xmm1 \n"
494 "pshufb %%xmm4,%%xmm0 \n"
495 "paddusw %%xmm0,%%xmm1 \n"
496 "pmulhuw %%xmm5,%%xmm1 \n"
497 "packuswb %%xmm1,%%xmm1 \n"
498 "movd %%xmm1," MEMACCESS(1) " \n"
499 "psrlq $0x10,%%xmm1 \n"
500 "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
501 "lea " MEMLEA(0x6,1) ",%1 \n"
504 : "+r"(src_ptr), // %0
506 "+r"(dst_width) // %2
507 : "r"((intptr_t)(src_stride)) // %3
508 : "memory", "cc", NACL_R14
509 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
// 3/8 scale down box-filtering 3x3 / 2x3 blocks: widens three rows to words
// (punpcklbw with zeroed xmm5), accumulates them, sums triples of adjacent
// words via psrldq/paddusw, repacks with kShufAc/kShufAc3, and normalizes
// with pmulhuw by kScaleAc33.  Writes 6 dst bytes per 16 src bytes.
// NOTE(review): loop label/branch lines are elided in this dump.
513 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
514 ptrdiff_t src_stride,
515 uint8* dst_ptr, int dst_width) {
517 "movdqa %0,%%xmm2 \n"
518 "movdqa %1,%%xmm3 \n"
519 "movdqa %2,%%xmm4 \n"
520 "pxor %%xmm5,%%xmm5 \n"
522 : "m"(kShufAc), // %0
524 "m"(kScaleAc33) // %2
529 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
530 MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
531 "movhlps %%xmm0,%%xmm1 \n"
532 "movhlps %%xmm6,%%xmm7 \n"
533 "punpcklbw %%xmm5,%%xmm0 \n"
534 "punpcklbw %%xmm5,%%xmm1 \n"
535 "punpcklbw %%xmm5,%%xmm6 \n"
536 "punpcklbw %%xmm5,%%xmm7 \n"
537 "paddusw %%xmm6,%%xmm0 \n"
538 "paddusw %%xmm7,%%xmm1 \n"
539 MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
540 "lea " MEMLEA(0x10,0) ",%0 \n"
541 "movhlps %%xmm6,%%xmm7 \n"
542 "punpcklbw %%xmm5,%%xmm6 \n"
543 "punpcklbw %%xmm5,%%xmm7 \n"
544 "paddusw %%xmm6,%%xmm0 \n"
545 "paddusw %%xmm7,%%xmm1 \n"
546 "movdqa %%xmm0,%%xmm6 \n"
547 "psrldq $0x2,%%xmm0 \n"
548 "paddusw %%xmm0,%%xmm6 \n"
549 "psrldq $0x2,%%xmm0 \n"
550 "paddusw %%xmm0,%%xmm6 \n"
551 "pshufb %%xmm2,%%xmm6 \n"
552 "movdqa %%xmm1,%%xmm7 \n"
553 "psrldq $0x2,%%xmm1 \n"
554 "paddusw %%xmm1,%%xmm7 \n"
555 "psrldq $0x2,%%xmm1 \n"
556 "paddusw %%xmm1,%%xmm7 \n"
557 "pshufb %%xmm3,%%xmm7 \n"
558 "paddusw %%xmm7,%%xmm6 \n"
559 "pmulhuw %%xmm4,%%xmm6 \n"
560 "packuswb %%xmm6,%%xmm6 \n"
561 "movd %%xmm6," MEMACCESS(1) " \n"
562 "psrlq $0x10,%%xmm6 \n"
563 "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
564 "lea " MEMLEA(0x6,1) ",%1 \n"
567 : "+r"(src_ptr), // %0
569 "+r"(dst_width) // %2
570 : "r"((intptr_t)(src_stride)) // %3
571 : "memory", "cc", NACL_R14
572 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
576 // Reads 16xN bytes and produces 16 shorts at a time.
// Vertically sums src_height rows of 16 bytes each into 16 uint16 accumulators
// per column group, advancing 16 source bytes / 32 dst bytes per outer step.
// NOTE(review): the tmp_height declaration, inner/outer loop labels, the
// row-pointer advance by src_stride (%6) and branches are elided in this dump.
577 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
578 uint16* dst_ptr, int src_width, int src_height) {
580 intptr_t tmp_src = 0;
582 "mov %0,%3 \n" // row pointer
583 "mov %5,%2 \n" // height
584 "pxor %%xmm0,%%xmm0 \n" // clear accumulators
585 "pxor %%xmm1,%%xmm1 \n"
586 "pxor %%xmm4,%%xmm4 \n"
590 "movdqu " MEMACCESS(3) ",%%xmm2 \n"
592 "movdqa %%xmm2,%%xmm3 \n"
593 "punpcklbw %%xmm4,%%xmm2 \n"
594 "punpckhbw %%xmm4,%%xmm3 \n"
595 "paddusw %%xmm2,%%xmm0 \n"
596 "paddusw %%xmm3,%%xmm1 \n"
600 "movdqu %%xmm0," MEMACCESS(1) " \n"
601 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
602 "lea " MEMLEA(0x20,1) ",%1 \n"
603 "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
604 "mov %0,%3 \n" // row pointer
605 "mov %5,%2 \n" // height
606 "pxor %%xmm0,%%xmm0 \n" // clear accumulators
607 "pxor %%xmm1,%%xmm1 \n"
610 : "+r"(src_ptr), // %0
612 "+r"(tmp_height), // %2
614 "+r"(src_width), // %4
615 "+rm"(src_height) // %5
616 : "rm"((intptr_t)(src_stride)) // %6
617 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
621 // Bilinear column filtering. SSSE3 version.
// Horizontally samples src_ptr at 16.16 fixed-point positions x, x+dx, ...:
// xmm2 carries the per-lane x accumulators, xmm3 the dx step; each iteration
// fetches two source byte-pairs (movzwl), blends them with pmaddubsw using the
// 7-bit fraction (psrlw $9, xor against xmm6), and stores 2 output bytes.
// A scalar tail handles an odd final pixel.
// NOTE(review): the initial x/dx broadcast setup, loop labels, width
// decrement/branches and the x/x0/x1 operand lines are elided in this dump.
622 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
623 int dst_width, int x, int dx) {
624 intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
628 "movl $0x04040000,%k2 \n"
630 "pcmpeqb %%xmm6,%%xmm6 \n"
631 "psrlw $0x9,%%xmm6 \n"
632 "pextrw $0x1,%%xmm2,%k3 \n"
635 "movdqa %%xmm2,%%xmm0 \n"
636 "paddd %%xmm3,%%xmm0 \n"
637 "punpckldq %%xmm0,%%xmm2 \n"
638 "punpckldq %%xmm3,%%xmm3 \n"
639 "paddd %%xmm3,%%xmm3 \n"
640 "pextrw $0x3,%%xmm2,%k4 \n"
644 "movdqa %%xmm2,%%xmm1 \n"
645 "paddd %%xmm3,%%xmm2 \n"
646 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
648 "psrlw $0x9,%%xmm1 \n"
649 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
651 "pshufb %%xmm5,%%xmm1 \n"
652 "punpcklwd %%xmm4,%%xmm0 \n"
653 "pxor %%xmm6,%%xmm1 \n"
654 "pmaddubsw %%xmm1,%%xmm0 \n"
655 "pextrw $0x1,%%xmm2,%k3 \n"
656 "pextrw $0x3,%%xmm2,%k4 \n"
657 "psrlw $0x7,%%xmm0 \n"
658 "packuswb %%xmm0,%%xmm0 \n"
660 "mov %w2," MEMACCESS(0) " \n"
661 "lea " MEMLEA(0x2,0) ",%0 \n"
669 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
671 "psrlw $0x9,%%xmm2 \n"
672 "pshufb %%xmm5,%%xmm2 \n"
673 "pxor %%xmm6,%%xmm2 \n"
674 "pmaddubsw %%xmm2,%%xmm0 \n"
675 "psrlw $0x7,%%xmm0 \n"
676 "packuswb %%xmm0,%%xmm0 \n"
678 "mov %b2," MEMACCESS(0) " \n"
680 : "+r"(dst_ptr), // %0
682 "+a"(temp_pixel), // %2
685 "+rm"(dst_width) // %5
688 : "memory", "cc", NACL_R14
689 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
693 // Reads 4 pixels, duplicates them and writes 8 pixels.
694 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// 2x horizontal upsample by byte duplication (punpcklbw/punpckhbw of a
// register with itself).  x and dx are ignored in this fixed-ratio variant.
// NOTE(review): asm wrapper and loop-control lines are elided in this dump.
695 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
696 int dst_width, int x, int dx) {
700 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
701 "lea " MEMLEA(0x10,1) ",%1 \n"
702 "movdqa %%xmm0,%%xmm1 \n"
703 "punpcklbw %%xmm0,%%xmm0 \n"
704 "punpckhbw %%xmm1,%%xmm1 \n"
705 "movdqu %%xmm0," MEMACCESS(0) " \n"
706 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
707 "lea " MEMLEA(0x20,0) ",%0 \n"
711 : "+r"(dst_ptr), // %0
713 "+r"(dst_width) // %2
714 :: "memory", "cc", "xmm0", "xmm1"
// ARGB point-sample 2:1 scale down: shufps $0xdd keeps the odd 4-byte pixels
// of each 8-pixel group.  src_stride is unused.
// NOTE(review): asm wrapper and loop-control lines are elided in this dump.
718 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
719 ptrdiff_t src_stride,
720 uint8* dst_argb, int dst_width) {
724 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
725 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
726 "lea " MEMLEA(0x20,0) ",%0 \n"
727 "shufps $0xdd,%%xmm1,%%xmm0 \n"
728 "movdqu %%xmm0," MEMACCESS(1) " \n"
729 "lea " MEMLEA(0x10,1) ",%1 \n"
732 : "+r"(src_argb), // %0
733 "+r"(dst_argb), // %1
734 "+r"(dst_width) // %2
735 :: "memory", "cc", "xmm0", "xmm1"
// ARGB 2:1 scale down averaging horizontal pixel pairs: shufps separates
// even (0x88) and odd (0xdd) pixels, pavgb blends them.  src_stride unused.
// NOTE(review): xmm2 is written here but absent from the clobber list —
// verify against upstream; asm wrapper and loop-control lines are elided.
739 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
740 ptrdiff_t src_stride,
741 uint8* dst_argb, int dst_width) {
745 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
746 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
747 "lea " MEMLEA(0x20,0) ",%0 \n"
748 "movdqa %%xmm0,%%xmm2 \n"
749 "shufps $0x88,%%xmm1,%%xmm0 \n"
750 "shufps $0xdd,%%xmm1,%%xmm2 \n"
751 "pavgb %%xmm2,%%xmm0 \n"
752 "movdqu %%xmm0," MEMACCESS(1) " \n"
753 "lea " MEMLEA(0x10,1) ",%1 \n"
756 : "+r"(src_argb), // %0
757 "+r"(dst_argb), // %1
758 "+r"(dst_width) // %2
759 :: "memory", "cc", "xmm0", "xmm1"
// ARGB 2x2 box-filter scale down: pavgb with the row at src_stride, then the
// even/odd shufps split and a second pavgb for the horizontal average.
// NOTE(review): asm wrapper and loop-control lines are elided in this dump.
763 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
764 ptrdiff_t src_stride,
765 uint8* dst_argb, int dst_width) {
769 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
770 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
771 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
772 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
773 "lea " MEMLEA(0x20,0) ",%0 \n"
774 "pavgb %%xmm2,%%xmm0 \n"
775 "pavgb %%xmm3,%%xmm1 \n"
776 "movdqa %%xmm0,%%xmm2 \n"
777 "shufps $0x88,%%xmm1,%%xmm0 \n"
778 "shufps $0xdd,%%xmm1,%%xmm2 \n"
779 "pavgb %%xmm2,%%xmm0 \n"
780 "movdqu %%xmm0," MEMACCESS(1) " \n"
781 "lea " MEMLEA(0x10,1) ",%1 \n"
784 : "+r"(src_argb), // %0
785 "+r"(dst_argb), // %1
786 "+r"(dst_width) // %2
787 : "r"((intptr_t)(src_stride)) // %3
788 : "memory", "cc", NACL_R14
789 "xmm0", "xmm1", "xmm2", "xmm3"
793 // Reads 4 pixels at a time.
794 // Alignment requirement: dst_argb 16 byte aligned.
// Point-samples every src_stepx-th ARGB pixel: %1 holds src_stepx*4 bytes and
// %4 holds src_stepx*12, so four movd loads gather pixels 0, step, 2*step and
// 3*step, merged with punpckldq/punpcklqdq into one 16-byte store.
// NOTE(review): asm wrapper and loop-control lines are elided in this dump.
795 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
796 int src_stepx, uint8* dst_argb, int dst_width) {
797 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
798 intptr_t src_stepx_x12 = 0;
800 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
801 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
804 "movd " MEMACCESS(0) ",%%xmm0 \n"
805 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
806 "punpckldq %%xmm1,%%xmm0 \n"
807 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
808 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
809 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
810 "punpckldq %%xmm3,%%xmm2 \n"
811 "punpcklqdq %%xmm2,%%xmm0 \n"
812 "movdqu %%xmm0," MEMACCESS(2) " \n"
813 "lea " MEMLEA(0x10,2) ",%2 \n"
816 : "+r"(src_argb), // %0
817 "+r"(src_stepx_x4), // %1
818 "+r"(dst_argb), // %2
819 "+r"(dst_width), // %3
820 "+r"(src_stepx_x12) // %4
821 :: "memory", "cc", NACL_R14
822 "xmm0", "xmm1", "xmm2", "xmm3"
826 // Blends four 2x2 to 4x1.
827 // Alignment requirement: dst_argb 16 byte aligned.
// Stepped ARGB 2x2 box filter: gathers four stepped pixels from the current
// row (%0) and from row1 (%5 = src_argb + src_stride), averages the two rows
// with pavgb, then averages horizontal pairs via the shufps split.
// NOTE(review): asm wrapper, loop-control and the "+r"(row1) operand lines
// are elided in this dump.
828 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
829 ptrdiff_t src_stride, int src_stepx,
830 uint8* dst_argb, int dst_width) {
831 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
832 intptr_t src_stepx_x12 = 0;
833 intptr_t row1 = (intptr_t)(src_stride);
835 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
836 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
837 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
841 "movq " MEMACCESS(0) ",%%xmm0 \n"
842 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
843 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
844 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
845 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
846 "movq " MEMACCESS(5) ",%%xmm2 \n"
847 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
848 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
849 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
850 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
851 "pavgb %%xmm2,%%xmm0 \n"
852 "pavgb %%xmm3,%%xmm1 \n"
853 "movdqa %%xmm0,%%xmm2 \n"
854 "shufps $0x88,%%xmm1,%%xmm0 \n"
855 "shufps $0xdd,%%xmm1,%%xmm2 \n"
856 "pavgb %%xmm2,%%xmm0 \n"
857 "movdqu %%xmm0," MEMACCESS(2) " \n"
858 "lea " MEMLEA(0x10,2) ",%2 \n"
861 : "+r"(src_argb), // %0
862 "+r"(src_stepx_x4), // %1
863 "+r"(dst_argb), // %2
864 "+rm"(dst_width), // %3
865 "+r"(src_stepx_x12), // %4
867 :: "memory", "cc", NACL_R14
868 "xmm0", "xmm1", "xmm2", "xmm3"
// Point-samples ARGB pixels at 16.16 fixed-point positions x, x+dx, ...:
// xmm2 holds four per-lane x accumulators (built by the pshufd/paddd setup),
// xmm3 the 4*dx step; pextrw extracts the integer pixel indices used by the
// movd gathers.  Main loop writes 4 pixels, with 2- and 1-pixel tails.
// NOTE(review): initial x/dx loads, loop labels/branches and the x0/x1
// operand lines are elided in this dump.
872 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
873 int dst_width, int x, int dx) {
874 intptr_t x0 = 0, x1 = 0;
878 "pshufd $0x0,%%xmm2,%%xmm2 \n"
879 "pshufd $0x11,%%xmm3,%%xmm0 \n"
880 "paddd %%xmm0,%%xmm2 \n"
881 "paddd %%xmm3,%%xmm3 \n"
882 "pshufd $0x5,%%xmm3,%%xmm0 \n"
883 "paddd %%xmm0,%%xmm2 \n"
884 "paddd %%xmm3,%%xmm3 \n"
885 "pshufd $0x0,%%xmm3,%%xmm3 \n"
886 "pextrw $0x1,%%xmm2,%k0 \n"
887 "pextrw $0x3,%%xmm2,%k1 \n"
895 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
896 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
897 "pextrw $0x5,%%xmm2,%k0 \n"
898 "pextrw $0x7,%%xmm2,%k1 \n"
899 "paddd %%xmm3,%%xmm2 \n"
900 "punpckldq %%xmm1,%%xmm0 \n"
901 MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
902 MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
903 "pextrw $0x1,%%xmm2,%k0 \n"
904 "pextrw $0x3,%%xmm2,%k1 \n"
905 "punpckldq %%xmm4,%%xmm1 \n"
906 "punpcklqdq %%xmm1,%%xmm0 \n"
907 "movdqu %%xmm0," MEMACCESS(2) " \n"
908 "lea " MEMLEA(0x10,2) ",%2 \n"
915 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
916 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
917 "pextrw $0x5,%%xmm2,%k0 \n"
918 "punpckldq %%xmm1,%%xmm0 \n"
919 "movq %%xmm0," MEMACCESS(2) " \n"
920 "lea " MEMLEA(0x8,2) ",%2 \n"
924 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
925 "movd %%xmm0," MEMACCESS(2) " \n"
929 "+r"(dst_argb), // %2
930 "+r"(src_argb), // %3
931 "+r"(dst_width) // %4
934 : "memory", "cc", NACL_R14
935 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
939 // Reads 4 pixels, duplicates them and writes 8 pixels.
940 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// 2x horizontal ARGB upsample by dword (pixel) duplication; x and dx are
// ignored in this fixed-ratio variant.
// NOTE(review): asm wrapper, loop-control and trailing clobber lines are
// elided in this dump (the clobber list is cut off after NACL_R14).
941 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
942 int dst_width, int x, int dx) {
946 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
947 "lea " MEMLEA(0x10,1) ",%1 \n"
948 "movdqa %%xmm0,%%xmm1 \n"
949 "punpckldq %%xmm0,%%xmm0 \n"
950 "punpckhdq %%xmm1,%%xmm1 \n"
951 "movdqu %%xmm0," MEMACCESS(0) " \n"
952 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
953 "lea " MEMLEA(0x20,0) ",%0 \n"
957 : "+r"(dst_argb), // %0
958 "+r"(src_argb), // %1
959 "+r"(dst_width) // %2
960 :: "memory", "cc", NACL_R14
// pshufb tables used by ScaleARGBFilterCols_SSSE3 below.
// NOTE(review): the closing "};" lines of both initializers are elided in
// this dump (original lines 969 and 974).
965 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
966 static uvec8 kShuffleColARGB = {
967 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
968 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
971 // Shuffle table for duplicating 2 fractions into 8 bytes each
972 static uvec8 kShuffleFractions = {
973 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
976 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// Horizontal bilinear ARGB filter at 16.16 fixed-point positions: per pair of
// output pixels, movq/movhps gather two adjacent-source-pixel pairs, the
// 7-bit fraction (psrlw $9) is broadcast via kShuffleFractions, complemented
// (pxor xmm6 = 0x7f mask) and applied with pmaddubsw after kShuffleColARGB
// interleaving.  A 1-pixel tail follows the main 2-pixel loop.
// NOTE(review): x/dx setup, loop labels/branches and the x/x0/x1 operand
// lines are elided in this dump.
977 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
978 int dst_width, int x, int dx) {
979 intptr_t x0 = 0, x1 = 0;
981 "movdqa %0,%%xmm4 \n"
982 "movdqa %1,%%xmm5 \n"
984 : "m"(kShuffleColARGB), // %0
985 "m"(kShuffleFractions) // %1
991 "pcmpeqb %%xmm6,%%xmm6 \n"
992 "psrlw $0x9,%%xmm6 \n"
993 "pextrw $0x1,%%xmm2,%k3 \n"
996 "movdqa %%xmm2,%%xmm0 \n"
997 "paddd %%xmm3,%%xmm0 \n"
998 "punpckldq %%xmm0,%%xmm2 \n"
999 "punpckldq %%xmm3,%%xmm3 \n"
1000 "paddd %%xmm3,%%xmm3 \n"
1001 "pextrw $0x3,%%xmm2,%k4 \n"
1005 "movdqa %%xmm2,%%xmm1 \n"
1006 "paddd %%xmm3,%%xmm2 \n"
1007 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1008 "psrlw $0x9,%%xmm1 \n"
1009 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
1010 "pshufb %%xmm5,%%xmm1 \n"
1011 "pshufb %%xmm4,%%xmm0 \n"
1012 "pxor %%xmm6,%%xmm1 \n"
1013 "pmaddubsw %%xmm1,%%xmm0 \n"
1014 "psrlw $0x7,%%xmm0 \n"
1015 "pextrw $0x1,%%xmm2,%k3 \n"
1016 "pextrw $0x3,%%xmm2,%k4 \n"
1017 "packuswb %%xmm0,%%xmm0 \n"
1018 "movq %%xmm0," MEMACCESS(0) " \n"
1019 "lea " MEMLEA(0x8,0) ",%0 \n"
1027 "psrlw $0x9,%%xmm2 \n"
1028 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1029 "pshufb %%xmm5,%%xmm2 \n"
1030 "pshufb %%xmm4,%%xmm0 \n"
1031 "pxor %%xmm6,%%xmm2 \n"
1032 "pmaddubsw %%xmm2,%%xmm0 \n"
1033 "psrlw $0x7,%%xmm0 \n"
1034 "packuswb %%xmm0,%%xmm0 \n"
1035 "movd %%xmm0," MEMACCESS(0) " \n"
1039 : "+r"(dst_argb), // %0
1040 "+r"(src_argb), // %1
1041 "+rm"(dst_width), // %2
1046 : "memory", "cc", NACL_R14
1047 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1051 // Divide num by div and return as 16.16 fixed point result.
// shld/shl form the 48-bit value num<<16 in edx:eax ahead of the divide.
// NOTE(review): the idiv instruction and the input/output operand lines
// binding num/div to eax are elided in this dump.
1052 int FixedDiv_X86(int num, int div) {
1055 "shld $0x10,%%eax,%%edx \n"
1056 "shl $0x10,%%eax \n"
1061 : "memory", "cc", "edx"
1066 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// The sub $0x10001 subtracts 1<<16 and 1 in one step, turning num<<16 into
// (num-1)<<16 biased for the (div-1) divide.
// NOTE(review): the divisor adjustment, idiv and operand-binding lines are
// elided in this dump.
1067 int FixedDiv1_X86(int num, int div) {
1070 "shld $0x10,%%eax,%%edx \n"
1071 "shl $0x10,%%eax \n"
1072 "sub $0x10001,%%eax \n"
1079 : "memory", "cc", "edx"
1084 #endif // defined(__x86_64__) || defined(__i386__)
1088 } // namespace libyuv