]> granicus.if.org Git - libvpx/blob - third_party/libyuv/source/rotate_neon.cc
update libyuv to r1456
[libvpx] / third_party / libyuv / source / rotate_neon.cc
1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "libyuv/row.h"
12 #include "libyuv/rotate_row.h"
13
14 #include "libyuv/basic_types.h"
15
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
22     !defined(__aarch64__)
23
24 static uvec8 kVTbl4x4Transpose =
25   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
26
27 void TransposeWx8_NEON(const uint8* src, int src_stride,
28                        uint8* dst, int dst_stride,
29                        int width) {
30   const uint8* src_temp = NULL;
31   asm volatile (
32     // loops are on blocks of 8. loop will stop when
33     // counter gets to or below 0. starting the counter
34     // at w-8 allow for this
35     "sub         %5, #8                        \n"
36
37     // handle 8x8 blocks. this should be the majority of the plane
38     ".p2align  2                               \n"
39     "1:                                        \n"
40       "mov         %0, %1                      \n"
41
42       MEMACCESS(0)
43       "vld1.8      {d0}, [%0], %2              \n"
44       MEMACCESS(0)
45       "vld1.8      {d1}, [%0], %2              \n"
46       MEMACCESS(0)
47       "vld1.8      {d2}, [%0], %2              \n"
48       MEMACCESS(0)
49       "vld1.8      {d3}, [%0], %2              \n"
50       MEMACCESS(0)
51       "vld1.8      {d4}, [%0], %2              \n"
52       MEMACCESS(0)
53       "vld1.8      {d5}, [%0], %2              \n"
54       MEMACCESS(0)
55       "vld1.8      {d6}, [%0], %2              \n"
56       MEMACCESS(0)
57       "vld1.8      {d7}, [%0]                  \n"
58
59       "vtrn.8      d1, d0                      \n"
60       "vtrn.8      d3, d2                      \n"
61       "vtrn.8      d5, d4                      \n"
62       "vtrn.8      d7, d6                      \n"
63
64       "vtrn.16     d1, d3                      \n"
65       "vtrn.16     d0, d2                      \n"
66       "vtrn.16     d5, d7                      \n"
67       "vtrn.16     d4, d6                      \n"
68
69       "vtrn.32     d1, d5                      \n"
70       "vtrn.32     d0, d4                      \n"
71       "vtrn.32     d3, d7                      \n"
72       "vtrn.32     d2, d6                      \n"
73
74       "vrev16.8    q0, q0                      \n"
75       "vrev16.8    q1, q1                      \n"
76       "vrev16.8    q2, q2                      \n"
77       "vrev16.8    q3, q3                      \n"
78
79       "mov         %0, %3                      \n"
80
81     MEMACCESS(0)
82       "vst1.8      {d1}, [%0], %4              \n"
83     MEMACCESS(0)
84       "vst1.8      {d0}, [%0], %4              \n"
85     MEMACCESS(0)
86       "vst1.8      {d3}, [%0], %4              \n"
87     MEMACCESS(0)
88       "vst1.8      {d2}, [%0], %4              \n"
89     MEMACCESS(0)
90       "vst1.8      {d5}, [%0], %4              \n"
91     MEMACCESS(0)
92       "vst1.8      {d4}, [%0], %4              \n"
93     MEMACCESS(0)
94       "vst1.8      {d7}, [%0], %4              \n"
95     MEMACCESS(0)
96       "vst1.8      {d6}, [%0]                  \n"
97
98       "add         %1, #8                      \n"  // src += 8
99       "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
100       "subs        %5,  #8                     \n"  // w   -= 8
101       "bge         1b                          \n"
102
103     // add 8 back to counter. if the result is 0 there are
104     // no residuals.
105     "adds        %5, #8                        \n"
106     "beq         4f                            \n"
107
108     // some residual, so between 1 and 7 lines left to transpose
109     "cmp         %5, #2                        \n"
110     "blt         3f                            \n"
111
112     "cmp         %5, #4                        \n"
113     "blt         2f                            \n"
114
115     // 4x8 block
116     "mov         %0, %1                        \n"
117     MEMACCESS(0)
118     "vld1.32     {d0[0]}, [%0], %2             \n"
119     MEMACCESS(0)
120     "vld1.32     {d0[1]}, [%0], %2             \n"
121     MEMACCESS(0)
122     "vld1.32     {d1[0]}, [%0], %2             \n"
123     MEMACCESS(0)
124     "vld1.32     {d1[1]}, [%0], %2             \n"
125     MEMACCESS(0)
126     "vld1.32     {d2[0]}, [%0], %2             \n"
127     MEMACCESS(0)
128     "vld1.32     {d2[1]}, [%0], %2             \n"
129     MEMACCESS(0)
130     "vld1.32     {d3[0]}, [%0], %2             \n"
131     MEMACCESS(0)
132     "vld1.32     {d3[1]}, [%0]                 \n"
133
134     "mov         %0, %3                        \n"
135
136     MEMACCESS(6)
137     "vld1.8      {q3}, [%6]                    \n"
138
139     "vtbl.8      d4, {d0, d1}, d6              \n"
140     "vtbl.8      d5, {d0, d1}, d7              \n"
141     "vtbl.8      d0, {d2, d3}, d6              \n"
142     "vtbl.8      d1, {d2, d3}, d7              \n"
143
144     // TODO(frkoenig): Rework shuffle above to
145     // write out with 4 instead of 8 writes.
146     MEMACCESS(0)
147     "vst1.32     {d4[0]}, [%0], %4             \n"
148     MEMACCESS(0)
149     "vst1.32     {d4[1]}, [%0], %4             \n"
150     MEMACCESS(0)
151     "vst1.32     {d5[0]}, [%0], %4             \n"
152     MEMACCESS(0)
153     "vst1.32     {d5[1]}, [%0]                 \n"
154
155     "add         %0, %3, #4                    \n"
156     MEMACCESS(0)
157     "vst1.32     {d0[0]}, [%0], %4             \n"
158     MEMACCESS(0)
159     "vst1.32     {d0[1]}, [%0], %4             \n"
160     MEMACCESS(0)
161     "vst1.32     {d1[0]}, [%0], %4             \n"
162     MEMACCESS(0)
163     "vst1.32     {d1[1]}, [%0]                 \n"
164
165     "add         %1, #4                        \n"  // src += 4
166     "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
167     "subs        %5,  #4                       \n"  // w   -= 4
168     "beq         4f                            \n"
169
170     // some residual, check to see if it includes a 2x8 block,
171     // or less
172     "cmp         %5, #2                        \n"
173     "blt         3f                            \n"
174
175     // 2x8 block
176     "2:                                        \n"
177     "mov         %0, %1                        \n"
178     MEMACCESS(0)
179     "vld1.16     {d0[0]}, [%0], %2             \n"
180     MEMACCESS(0)
181     "vld1.16     {d1[0]}, [%0], %2             \n"
182     MEMACCESS(0)
183     "vld1.16     {d0[1]}, [%0], %2             \n"
184     MEMACCESS(0)
185     "vld1.16     {d1[1]}, [%0], %2             \n"
186     MEMACCESS(0)
187     "vld1.16     {d0[2]}, [%0], %2             \n"
188     MEMACCESS(0)
189     "vld1.16     {d1[2]}, [%0], %2             \n"
190     MEMACCESS(0)
191     "vld1.16     {d0[3]}, [%0], %2             \n"
192     MEMACCESS(0)
193     "vld1.16     {d1[3]}, [%0]                 \n"
194
195     "vtrn.8      d0, d1                        \n"
196
197     "mov         %0, %3                        \n"
198
199     MEMACCESS(0)
200     "vst1.64     {d0}, [%0], %4                \n"
201     MEMACCESS(0)
202     "vst1.64     {d1}, [%0]                    \n"
203
204     "add         %1, #2                        \n"  // src += 2
205     "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
206     "subs        %5,  #2                       \n"  // w   -= 2
207     "beq         4f                            \n"
208
209     // 1x8 block
210     "3:                                        \n"
211     MEMACCESS(1)
212     "vld1.8      {d0[0]}, [%1], %2             \n"
213     MEMACCESS(1)
214     "vld1.8      {d0[1]}, [%1], %2             \n"
215     MEMACCESS(1)
216     "vld1.8      {d0[2]}, [%1], %2             \n"
217     MEMACCESS(1)
218     "vld1.8      {d0[3]}, [%1], %2             \n"
219     MEMACCESS(1)
220     "vld1.8      {d0[4]}, [%1], %2             \n"
221     MEMACCESS(1)
222     "vld1.8      {d0[5]}, [%1], %2             \n"
223     MEMACCESS(1)
224     "vld1.8      {d0[6]}, [%1], %2             \n"
225     MEMACCESS(1)
226     "vld1.8      {d0[7]}, [%1]                 \n"
227
228     MEMACCESS(3)
229     "vst1.64     {d0}, [%3]                    \n"
230
231     "4:                                        \n"
232
233     : "+r"(src_temp),          // %0
234       "+r"(src),               // %1
235       "+r"(src_stride),        // %2
236       "+r"(dst),               // %3
237       "+r"(dst_stride),        // %4
238       "+r"(width)              // %5
239     : "r"(&kVTbl4x4Transpose)  // %6
240     : "memory", "cc", "q0", "q1", "q2", "q3"
241   );
242 }
243
244 static uvec8 kVTbl4x4TransposeDi =
245   { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
246
247 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
248                          uint8* dst_a, int dst_stride_a,
249                          uint8* dst_b, int dst_stride_b,
250                          int width) {
251   const uint8* src_temp = NULL;
252   asm volatile (
253     // loops are on blocks of 8. loop will stop when
254     // counter gets to or below 0. starting the counter
255     // at w-8 allow for this
256     "sub         %7, #8                        \n"
257
258     // handle 8x8 blocks. this should be the majority of the plane
259     ".p2align  2                               \n"
260     "1:                                        \n"
261       "mov         %0, %1                      \n"
262
263       MEMACCESS(0)
264       "vld2.8      {d0,  d1},  [%0], %2        \n"
265       MEMACCESS(0)
266       "vld2.8      {d2,  d3},  [%0], %2        \n"
267       MEMACCESS(0)
268       "vld2.8      {d4,  d5},  [%0], %2        \n"
269       MEMACCESS(0)
270       "vld2.8      {d6,  d7},  [%0], %2        \n"
271       MEMACCESS(0)
272       "vld2.8      {d16, d17}, [%0], %2        \n"
273       MEMACCESS(0)
274       "vld2.8      {d18, d19}, [%0], %2        \n"
275       MEMACCESS(0)
276       "vld2.8      {d20, d21}, [%0], %2        \n"
277       MEMACCESS(0)
278       "vld2.8      {d22, d23}, [%0]            \n"
279
280       "vtrn.8      q1, q0                      \n"
281       "vtrn.8      q3, q2                      \n"
282       "vtrn.8      q9, q8                      \n"
283       "vtrn.8      q11, q10                    \n"
284
285       "vtrn.16     q1, q3                      \n"
286       "vtrn.16     q0, q2                      \n"
287       "vtrn.16     q9, q11                     \n"
288       "vtrn.16     q8, q10                     \n"
289
290       "vtrn.32     q1, q9                      \n"
291       "vtrn.32     q0, q8                      \n"
292       "vtrn.32     q3, q11                     \n"
293       "vtrn.32     q2, q10                     \n"
294
295       "vrev16.8    q0, q0                      \n"
296       "vrev16.8    q1, q1                      \n"
297       "vrev16.8    q2, q2                      \n"
298       "vrev16.8    q3, q3                      \n"
299       "vrev16.8    q8, q8                      \n"
300       "vrev16.8    q9, q9                      \n"
301       "vrev16.8    q10, q10                    \n"
302       "vrev16.8    q11, q11                    \n"
303
304       "mov         %0, %3                      \n"
305
306     MEMACCESS(0)
307       "vst1.8      {d2},  [%0], %4             \n"
308     MEMACCESS(0)
309       "vst1.8      {d0},  [%0], %4             \n"
310     MEMACCESS(0)
311       "vst1.8      {d6},  [%0], %4             \n"
312     MEMACCESS(0)
313       "vst1.8      {d4},  [%0], %4             \n"
314     MEMACCESS(0)
315       "vst1.8      {d18}, [%0], %4             \n"
316     MEMACCESS(0)
317       "vst1.8      {d16}, [%0], %4             \n"
318     MEMACCESS(0)
319       "vst1.8      {d22}, [%0], %4             \n"
320     MEMACCESS(0)
321       "vst1.8      {d20}, [%0]                 \n"
322
323       "mov         %0, %5                      \n"
324
325     MEMACCESS(0)
326       "vst1.8      {d3},  [%0], %6             \n"
327     MEMACCESS(0)
328       "vst1.8      {d1},  [%0], %6             \n"
329     MEMACCESS(0)
330       "vst1.8      {d7},  [%0], %6             \n"
331     MEMACCESS(0)
332       "vst1.8      {d5},  [%0], %6             \n"
333     MEMACCESS(0)
334       "vst1.8      {d19}, [%0], %6             \n"
335     MEMACCESS(0)
336       "vst1.8      {d17}, [%0], %6             \n"
337     MEMACCESS(0)
338       "vst1.8      {d23}, [%0], %6             \n"
339     MEMACCESS(0)
340       "vst1.8      {d21}, [%0]                 \n"
341
342       "add         %1, #8*2                    \n"  // src   += 8*2
343       "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
344       "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
345       "subs        %7,  #8                     \n"  // w     -= 8
346       "bge         1b                          \n"
347
348     // add 8 back to counter. if the result is 0 there are
349     // no residuals.
350     "adds        %7, #8                        \n"
351     "beq         4f                            \n"
352
353     // some residual, so between 1 and 7 lines left to transpose
354     "cmp         %7, #2                        \n"
355     "blt         3f                            \n"
356
357     "cmp         %7, #4                        \n"
358     "blt         2f                            \n"
359
360     // TODO(frkoenig): Clean this up
361     // 4x8 block
362     "mov         %0, %1                        \n"
363     MEMACCESS(0)
364     "vld1.64     {d0}, [%0], %2                \n"
365     MEMACCESS(0)
366     "vld1.64     {d1}, [%0], %2                \n"
367     MEMACCESS(0)
368     "vld1.64     {d2}, [%0], %2                \n"
369     MEMACCESS(0)
370     "vld1.64     {d3}, [%0], %2                \n"
371     MEMACCESS(0)
372     "vld1.64     {d4}, [%0], %2                \n"
373     MEMACCESS(0)
374     "vld1.64     {d5}, [%0], %2                \n"
375     MEMACCESS(0)
376     "vld1.64     {d6}, [%0], %2                \n"
377     MEMACCESS(0)
378     "vld1.64     {d7}, [%0]                    \n"
379
380     MEMACCESS(8)
381     "vld1.8      {q15}, [%8]                   \n"
382
383     "vtrn.8      q0, q1                        \n"
384     "vtrn.8      q2, q3                        \n"
385
386     "vtbl.8      d16, {d0, d1}, d30            \n"
387     "vtbl.8      d17, {d0, d1}, d31            \n"
388     "vtbl.8      d18, {d2, d3}, d30            \n"
389     "vtbl.8      d19, {d2, d3}, d31            \n"
390     "vtbl.8      d20, {d4, d5}, d30            \n"
391     "vtbl.8      d21, {d4, d5}, d31            \n"
392     "vtbl.8      d22, {d6, d7}, d30            \n"
393     "vtbl.8      d23, {d6, d7}, d31            \n"
394
395     "mov         %0, %3                        \n"
396
397     MEMACCESS(0)
398     "vst1.32     {d16[0]},  [%0], %4           \n"
399     MEMACCESS(0)
400     "vst1.32     {d16[1]},  [%0], %4           \n"
401     MEMACCESS(0)
402     "vst1.32     {d17[0]},  [%0], %4           \n"
403     MEMACCESS(0)
404     "vst1.32     {d17[1]},  [%0], %4           \n"
405
406     "add         %0, %3, #4                    \n"
407     MEMACCESS(0)
408     "vst1.32     {d20[0]}, [%0], %4            \n"
409     MEMACCESS(0)
410     "vst1.32     {d20[1]}, [%0], %4            \n"
411     MEMACCESS(0)
412     "vst1.32     {d21[0]}, [%0], %4            \n"
413     MEMACCESS(0)
414     "vst1.32     {d21[1]}, [%0]                \n"
415
416     "mov         %0, %5                        \n"
417
418     MEMACCESS(0)
419     "vst1.32     {d18[0]}, [%0], %6            \n"
420     MEMACCESS(0)
421     "vst1.32     {d18[1]}, [%0], %6            \n"
422     MEMACCESS(0)
423     "vst1.32     {d19[0]}, [%0], %6            \n"
424     MEMACCESS(0)
425     "vst1.32     {d19[1]}, [%0], %6            \n"
426
427     "add         %0, %5, #4                    \n"
428     MEMACCESS(0)
429     "vst1.32     {d22[0]},  [%0], %6           \n"
430     MEMACCESS(0)
431     "vst1.32     {d22[1]},  [%0], %6           \n"
432     MEMACCESS(0)
433     "vst1.32     {d23[0]},  [%0], %6           \n"
434     MEMACCESS(0)
435     "vst1.32     {d23[1]},  [%0]               \n"
436
437     "add         %1, #4*2                      \n"  // src   += 4 * 2
438     "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
439     "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
440     "subs        %7,  #4                       \n"  // w     -= 4
441     "beq         4f                            \n"
442
443     // some residual, check to see if it includes a 2x8 block,
444     // or less
445     "cmp         %7, #2                        \n"
446     "blt         3f                            \n"
447
448     // 2x8 block
449     "2:                                        \n"
450     "mov         %0, %1                        \n"
451     MEMACCESS(0)
452     "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
453     MEMACCESS(0)
454     "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
455     MEMACCESS(0)
456     "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
457     MEMACCESS(0)
458     "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
459     MEMACCESS(0)
460     "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
461     MEMACCESS(0)
462     "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
463     MEMACCESS(0)
464     "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
465     MEMACCESS(0)
466     "vld2.16     {d1[3], d3[3]}, [%0]          \n"
467
468     "vtrn.8      d0, d1                        \n"
469     "vtrn.8      d2, d3                        \n"
470
471     "mov         %0, %3                        \n"
472
473     MEMACCESS(0)
474     "vst1.64     {d0}, [%0], %4                \n"
475     MEMACCESS(0)
476     "vst1.64     {d2}, [%0]                    \n"
477
478     "mov         %0, %5                        \n"
479
480     MEMACCESS(0)
481     "vst1.64     {d1}, [%0], %6                \n"
482     MEMACCESS(0)
483     "vst1.64     {d3}, [%0]                    \n"
484
485     "add         %1, #2*2                      \n"  // src   += 2 * 2
486     "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
487     "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
488     "subs        %7,  #2                       \n"  // w     -= 2
489     "beq         4f                            \n"
490
491     // 1x8 block
492     "3:                                        \n"
493     MEMACCESS(1)
494     "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
495     MEMACCESS(1)
496     "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
497     MEMACCESS(1)
498     "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
499     MEMACCESS(1)
500     "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
501     MEMACCESS(1)
502     "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
503     MEMACCESS(1)
504     "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
505     MEMACCESS(1)
506     "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
507     MEMACCESS(1)
508     "vld2.8      {d0[7], d1[7]}, [%1]          \n"
509
510     MEMACCESS(3)
511     "vst1.64     {d0}, [%3]                    \n"
512     MEMACCESS(5)
513     "vst1.64     {d1}, [%5]                    \n"
514
515     "4:                                        \n"
516
517     : "+r"(src_temp),            // %0
518       "+r"(src),                 // %1
519       "+r"(src_stride),          // %2
520       "+r"(dst_a),               // %3
521       "+r"(dst_stride_a),        // %4
522       "+r"(dst_b),               // %5
523       "+r"(dst_stride_b),        // %6
524       "+r"(width)                // %7
525     : "r"(&kVTbl4x4TransposeDi)  // %8
526     : "memory", "cc",
527       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
528   );
529 }
530 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
531
532 #ifdef __cplusplus
533 }  // extern "C"
534 }  // namespace libyuv
535 #endif