/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <string.h>
#include "./vpx_config.h"
#include "vpx_ports/mem.h"

static const int8_t vp8_sub_pel_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
  { 0, -6, 123, 12, -1, 0, 0, 0 },  /* just as per alpha -0.5 bicubic */
  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};
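
// The taps of each filter sum to 128, so filtered results are normalized by
// the rounding right shift of 7 (vqrshrun_n_s16(..., 7)) when narrowing back
// to 8 bits.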

// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
// Elements 1 and 4 are either 0 or negative. The code accounts for this with
// multiply/accumulates which either add or subtract as needed. The other
// functions will be updated to use this table later.
// It is also expanded to 8 elements to allow loading into 64 bit neon
// registers.
static const uint8_t abs_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },   { 0, 6, 123, 12, 1, 0, 0, 0 },
  { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
  { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
  { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
};
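
// Load 8 bytes and shift them into the upper half of the register so the
// four pixels of interest land in lanes 4-7, where 'vext' can combine them
// with the following row.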
static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
}

static INLINE void store4x4(unsigned char *dst, int dst_stride,
                            const uint8x8_t a0, const uint8x8_t a1) {
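  // Store the rows directly with 32 bit lane stores when both the destination
  // and its stride are 4 byte aligned.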
  if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
    dst += dst_stride;
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
    dst += dst_stride;
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
    dst += dst_stride;
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
  } else {
    // Store to the aligned local buffer and memcpy instead of vget_lane_u8
    // which is really slow.
    uint32_t output_buffer[4];
    vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
    vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
    vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
    vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);

    memcpy(dst, output_buffer, 4);
    dst += dst_stride;
    memcpy(dst, output_buffer + 1, 4);
    dst += dst_stride;
    memcpy(dst, output_buffer + 2, 4);
    dst += dst_stride;
    memcpy(dst, output_buffer + 3, 4);
  }
}
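
// Zip the low words of each half of 'a' and 'b' so that four pixels from each
// of two rows sit in a single doubleword, then widen multiply-accumulate both
// rows against one filter tap.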
static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
                                       vreinterpret_u32_u8(vget_high_u8(a)));
  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
                                       vreinterpret_u32_u8(vget_high_u8(b)));
  *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
}

static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
                                       vreinterpret_u32_u8(vget_high_u8(a)));
  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
                                       vreinterpret_u32_u8(vget_high_u8(b)));
  *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
}

static INLINE void yonly4x4(const unsigned char *src, int src_stride,
                            int filter_offset, unsigned char *dst,
                            int dst_stride) {
  uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  uint16x8_t c0, c1, c2, c3;
  int16x8_t d0, d1;
  uint8x8_t e0, e1;

  const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
  const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
  const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
  const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
  const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
  const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
  const uint8x8_t filter5 = vdup_lane_u8(filter, 5);

  src -= src_stride * 2;
  // Shift the even rows to allow using 'vext' to combine the vectors. armv8
  // has vcopy_lane which would be interesting. This started as just a
  // horrible workaround for clang adding alignment hints to 32bit loads:
  // https://llvm.org/bugs/show_bug.cgi?id=24421
  // But it turns out it is almost identical to casting the loads.
  a0 = load_and_shift(src);
  src += src_stride;
  a1 = vld1_u8(src);
  src += src_stride;
  a2 = load_and_shift(src);
  src += src_stride;
  a3 = vld1_u8(src);
  src += src_stride;
  a4 = load_and_shift(src);
  src += src_stride;
  a5 = vld1_u8(src);
  src += src_stride;
  a6 = load_and_shift(src);
  src += src_stride;
  a7 = vld1_u8(src);
  src += src_stride;
  a8 = vld1_u8(src);

  // Combine the rows so we can operate on 8 at a time.
  b0 = vext_u8(a0, a1, 4);
  b2 = vext_u8(a2, a3, 4);
  b4 = vext_u8(a4, a5, 4);
  b6 = vext_u8(a6, a7, 4);
  b8 = a8;

  // To keep with the 8-at-a-time theme, combine *alternate* rows. This
  // allows combining the odd rows with the even.
  b1 = vext_u8(b0, b2, 4);
  b3 = vext_u8(b2, b4, 4);
  b5 = vext_u8(b4, b6, 4);
  b7 = vext_u8(b6, b8, 4);

  // Multiply and expand to 16 bits.
  c0 = vmull_u8(b0, filter0);
  c1 = vmull_u8(b2, filter0);
  c2 = vmull_u8(b5, filter5);
  c3 = vmull_u8(b7, filter5);

  // Multiply, subtract and accumulate for filters 1 and 4 (the negative
  // ones).
  c0 = vmlsl_u8(c0, b4, filter4);
  c1 = vmlsl_u8(c1, b6, filter4);
  c2 = vmlsl_u8(c2, b1, filter1);
  c3 = vmlsl_u8(c3, b3, filter1);

  // Add more positive ones. vmlal should really return a signed type.
  // It's doing signed math internally, as evidenced by the fact we can do
  // subtractions followed by more additions. Ideally we could use
  // vqmlal/sl but that instruction doesn't exist. Might be able to
  // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
  c0 = vmlal_u8(c0, b2, filter2);
  c1 = vmlal_u8(c1, b4, filter2);
  c2 = vmlal_u8(c2, b3, filter3);
  c3 = vmlal_u8(c3, b5, filter3);

  // Use signed saturation math because vmlsl may have left some negative
  // numbers in there.
  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));

  // Use signed again because numbers like -200 need to be saturated to 0.
  e0 = vqrshrun_n_s16(d0, 7);
  e1 = vqrshrun_n_s16(d1, 7);

  store4x4(dst, dst_stride, e0, e1);
}
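
// 4x4 sixtap prediction. xoffset selects the horizontal (first pass) filter
// and yoffset the vertical (second pass) filter; an offset of 0 skips that
// pass entirely.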
void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
  uint8x16_t s0, s1, s2, s3, s4;
  uint64x2_t s01, s23;
  // Variables to hold src[] elements for the given filter[]
  uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
  uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
  uint8x16_t s01_f0, s23_f0;
  uint64x2_t s01_f3, s23_f3;
  uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
  // Accumulator variables.
  uint16x8_t d0123, d4567, d89;
  uint16x8_t d0123_a, d4567_a, d89_a;
  int16x8_t e0123, e4567, e89;
  // Second pass intermediates.
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  uint16x8_t c0, c1, c2, c3;
  int16x8_t d0, d1;
  uint8x8_t e0, e1;
  uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;

  if (xoffset == 0) {  // Second pass only.
    yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
    return;
  }

  if (yoffset == 0) {  // First pass only.
    src_ptr -= 2;
  } else {  // Add context for the second pass. 2 extra lines on top.
    src_ptr -= 2 + (src_pixels_per_line * 2);
  }

  filter = vld1_u8(abs_filters[xoffset]);
  filter0 = vdup_lane_u8(filter, 0);
  filter1 = vdup_lane_u8(filter, 1);
  filter2 = vdup_lane_u8(filter, 2);
  filter3 = vdup_lane_u8(filter, 3);
  filter4 = vdup_lane_u8(filter, 4);
  filter5 = vdup_lane_u8(filter, 5);

  // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
  // garbage. So much effort for that last single bit.
  // The low values of each pair are for filter0.
  s0 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s1 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s2 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s3 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;

  // Shift to extract the values for filter[5].
  // If src[] starts at 0, this puts 3 4 5 6 7 8 9 10 in s0_f5.
  // Can't use vshr.u64 because it crosses the double word boundary.
  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);

  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));

  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);

  // Keep original src data as 64 bits to simplify shifting and extracting.
  s01 = vreinterpretq_u64_u8(s01_f0);
  s23 = vreinterpretq_u64_u8(s23_f0);

  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);

  // Shift over one to use -1, 0, 1, 2 for filter1
  // -1 0 1 2 * filter1
  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
                        &d0123, &d4567);

  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
                        &d0123, &d4567);

  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
                        &d0123, &d4567);

  s01_f3 = vshrq_n_u64(s01, 24);
  s23_f3 = vshrq_n_u64(s23, 24);
  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  // Accumulate into different registers so it can use saturated addition.
  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);

  e0123 =
      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  e4567 =
      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));

  b0 = vqrshrun_n_s16(e0123, 7);
  b2 = vqrshrun_n_s16(e4567, 7);

  if (yoffset == 0) {  // firstpass_filter4x4_only
    store4x4(dst_ptr, dst_pitch, b0, b2);
    return;
  }

  // Load additional context when doing both filters.
  s0 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s1 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s2 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s3 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s4 = vld1q_u8(src_ptr);

  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
  s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);

  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));

  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  // But this time instead of 16 pixels to filter, there are 20. So an extra
  // run with a doubleword register.
  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
  d89 = vmull_u8(s4_f5, filter5);

  // Save a copy as u64 for shifting.
  s01 = vreinterpretq_u64_u8(s01_f0);
  s23 = vreinterpretq_u64_u8(s23_f0);

  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
  d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);

  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
                        &d0123, &d4567);
  s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
  d89 = vmlsl_u8(d89, s4_f1, filter1);

  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
                        &d0123, &d4567);
  s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
  d89 = vmlsl_u8(d89, s4_f4, filter4);

  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
                        &d0123, &d4567);
  s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
  d89 = vmlal_u8(d89, s4_f2, filter2);

  s01_f3 = vshrq_n_u64(s01, 24);
  s23_f3 = vshrq_n_u64(s23, 24);
  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
  d89_a = vmull_u8(s4_f3, filter3);

  e0123 =
      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  e4567 =
      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
  e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));

  b4 = vqrshrun_n_s16(e0123, 7);
  b6 = vqrshrun_n_s16(e4567, 7);
  b8 = vqrshrun_n_s16(e89, 7);
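
  // Second pass: apply the vertical filter to the nine intermediate rows
  // held in b0-b8.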
  filter = vld1_u8(abs_filters[yoffset]);
  filter0 = vdup_lane_u8(filter, 0);
  filter1 = vdup_lane_u8(filter, 1);
  filter2 = vdup_lane_u8(filter, 2);
  filter3 = vdup_lane_u8(filter, 3);
  filter4 = vdup_lane_u8(filter, 4);
  filter5 = vdup_lane_u8(filter, 5);

  b1 = vext_u8(b0, b2, 4);
  b3 = vext_u8(b2, b4, 4);
  b5 = vext_u8(b4, b6, 4);
  b7 = vext_u8(b6, b8, 4);

  c0 = vmull_u8(b0, filter0);
  c1 = vmull_u8(b2, filter0);
  c2 = vmull_u8(b5, filter5);
  c3 = vmull_u8(b7, filter5);

  c0 = vmlsl_u8(c0, b4, filter4);
  c1 = vmlsl_u8(c1, b6, filter4);
  c2 = vmlsl_u8(c2, b1, filter1);
  c3 = vmlsl_u8(c3, b3, filter1);

  c0 = vmlal_u8(c0, b2, filter2);
  c1 = vmlal_u8(c1, b4, filter2);
  c2 = vmlal_u8(c2, b3, filter3);
  c3 = vmlal_u8(c3, b5, filter3);

  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));

  e0 = vqrshrun_n_s16(d0, 7);
  e1 = vqrshrun_n_s16(d1, 7);

  store4x4(dst_ptr, dst_pitch, e0, e1);
}

void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
  unsigned char *src;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
  uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;

  if (xoffset == 0) {  // secondpass_filter8x4_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    src = src_ptr - src_pixels_per_line * 2;
    d22u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d23u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d24u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d25u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d26u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d27u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d28u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d29u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d30u8 = vld1_u8(src);
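
    // The negative taps (1 and 4) are subtracted from the q3-q6 accumulators
    // while tap 3 is accumulated separately in q7-q10, so the final combine
    // can use saturating signed adds.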
    q3u16 = vmull_u8(d22u8, d0u8);
    q4u16 = vmull_u8(d23u8, d0u8);
    q5u16 = vmull_u8(d24u8, d0u8);
    q6u16 = vmull_u8(d25u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

    q7u16 = vmull_u8(d25u8, d3u8);
    q8u16 = vmull_u8(d26u8, d3u8);
    q9u16 = vmull_u8(d27u8, d3u8);
    q10u16 = vmull_u8(d28u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (9x8)
  if (yoffset == 0)  // firstpass_filter8x4_only
    src = src_ptr - 2;
  else
    src = src_ptr - 2 - (src_pixels_per_line * 2);
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);
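
  // Horizontal filter: vext selects the row pixels at each tap offset
  // (1, 4, 2, 5 and finally 3).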
  q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

  q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
  q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

  q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
  q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

  q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
  q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

  q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
  q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

  q3u16 = vmull_u8(d28u8, d3u8);
  q4u16 = vmull_u8(d29u8, d3u8);
  q5u16 = vmull_u8(d30u8, d3u8);
  q6u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);

  q7s16 = vqaddq_s16(q7s16, q3s16);
  q8s16 = vqaddq_s16(q8s16, q4s16);
  q9s16 = vqaddq_s16(q9s16, q5s16);
  q10s16 = vqaddq_s16(q10s16, q6s16);

  d22u8 = vqrshrun_n_s16(q7s16, 7);
  d23u8 = vqrshrun_n_s16(q8s16, 7);
  d24u8 = vqrshrun_n_s16(q9s16, 7);
  d25u8 = vqrshrun_n_s16(q10s16, 7);

  if (yoffset == 0) {  // firstpass_filter8x4_only
    vst1_u8(dst_ptr, d22u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d23u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d24u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d25u8);
    return;
  }

  // First pass on the remaining 5 lines of data.
  src += src_pixels_per_line;
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q7u8 = vld1q_u8(src);

  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

  q3u16 = vmull_u8(d27u8, d3u8);
  q4u16 = vmull_u8(d28u8, d3u8);
  q5u16 = vmull_u8(d29u8, d3u8);
  q6u16 = vmull_u8(d30u8, d3u8);
  q7u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);
  q11s16 = vreinterpretq_s16_u16(q11u16);
  q12s16 = vreinterpretq_s16_u16(q12u16);

  q8s16 = vqaddq_s16(q8s16, q3s16);
  q9s16 = vqaddq_s16(q9s16, q4s16);
  q10s16 = vqaddq_s16(q10s16, q5s16);
  q11s16 = vqaddq_s16(q11s16, q6s16);
  q12s16 = vqaddq_s16(q12s16, q7s16);

  d26u8 = vqrshrun_n_s16(q8s16, 7);
  d27u8 = vqrshrun_n_s16(q9s16, 7);
  d28u8 = vqrshrun_n_s16(q10s16, 7);
  d29u8 = vqrshrun_n_s16(q11s16, 7);
  d30u8 = vqrshrun_n_s16(q12s16, 7);

  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  q3u16 = vmull_u8(d22u8, d0u8);
  q4u16 = vmull_u8(d23u8, d0u8);
  q5u16 = vmull_u8(d24u8, d0u8);
  q6u16 = vmull_u8(d25u8, d0u8);

  q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
  q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
  q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
  q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

  q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
  q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
  q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
  q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

  q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
  q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
  q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
  q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

  q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
  q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
  q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
  q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

  q7u16 = vmull_u8(d25u8, d3u8);
  q8u16 = vmull_u8(d26u8, d3u8);
  q9u16 = vmull_u8(d27u8, d3u8);
  q10u16 = vmull_u8(d28u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);

  q7s16 = vqaddq_s16(q7s16, q3s16);
  q8s16 = vqaddq_s16(q8s16, q4s16);
  q9s16 = vqaddq_s16(q9s16, q5s16);
  q10s16 = vqaddq_s16(q10s16, q6s16);

  d6u8 = vqrshrun_n_s16(q7s16, 7);
  d7u8 = vqrshrun_n_s16(q8s16, 7);
  d8u8 = vqrshrun_n_s16(q9s16, 7);
  d9u8 = vqrshrun_n_s16(q10s16, 7);

  vst1_u8(dst_ptr, d6u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d7u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d8u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d9u8);
}

void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
  unsigned char *src, *tmpp;
  unsigned char tmp[64];
  int i;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
  uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;

  if (xoffset == 0) {  // secondpass_filter8x8_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    src = src_ptr - src_pixels_per_line * 2;
    d18u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d19u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d20u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d21u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d22u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d23u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d24u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d25u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d26u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d27u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d28u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d29u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d30u8 = vld1_u8(src);
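
    // 13 rows are needed: 8 output rows plus the 5 extra context lines
    // required by the 6 tap filter.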
    for (i = 2; i > 0; i--) {
      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);
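
      // Slide the window down four rows for the second trip through the loop.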
      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;
      d23u8 = d27u8;
      d24u8 = d28u8;
      d25u8 = d29u8;
      d26u8 = d30u8;

      vst1_u8(dst_ptr, d6u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d7u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d8u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d9u8);
      dst_ptr += dst_pitch;
    }
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (13x8)
  if (yoffset == 0)  // firstpass_filter8x8_only
    src = src_ptr - 2;
  else
    src = src_ptr - 2 - (src_pixels_per_line * 2);

  tmpp = tmp;
  for (i = 2; i > 0; i--) {
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

    q3u16 = vmull_u8(d28u8, d3u8);
    q4u16 = vmull_u8(d29u8, d3u8);
    q5u16 = vmull_u8(d30u8, d3u8);
    q6u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d22u8 = vqrshrun_n_s16(q7s16, 7);
    d23u8 = vqrshrun_n_s16(q8s16, 7);
    d24u8 = vqrshrun_n_s16(q9s16, 7);
    d25u8 = vqrshrun_n_s16(q10s16, 7);

    if (yoffset == 0) {  // firstpass_filter8x8_only
      vst1_u8(dst_ptr, d22u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d23u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d24u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d25u8);
      dst_ptr += dst_pitch;
    } else {
      vst1_u8(tmpp, d22u8);
      tmpp += 8;
      vst1_u8(tmpp, d23u8);
      tmpp += 8;
      vst1_u8(tmpp, d24u8);
      tmpp += 8;
      vst1_u8(tmpp, d25u8);
      tmpp += 8;
    }
  }

  if (yoffset == 0) return;

  // First pass on the remaining 5 lines of data.
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q7u8 = vld1q_u8(src);

  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

  q3u16 = vmull_u8(d27u8, d3u8);
  q4u16 = vmull_u8(d28u8, d3u8);
  q5u16 = vmull_u8(d29u8, d3u8);
  q6u16 = vmull_u8(d30u8, d3u8);
  q7u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);
  q11s16 = vreinterpretq_s16_u16(q11u16);
  q12s16 = vreinterpretq_s16_u16(q12u16);

  q8s16 = vqaddq_s16(q8s16, q3s16);
  q9s16 = vqaddq_s16(q9s16, q4s16);
  q10s16 = vqaddq_s16(q10s16, q5s16);
  q11s16 = vqaddq_s16(q11s16, q6s16);
  q12s16 = vqaddq_s16(q12s16, q7s16);

  d26u8 = vqrshrun_n_s16(q8s16, 7);
  d27u8 = vqrshrun_n_s16(q9s16, 7);
  d28u8 = vqrshrun_n_s16(q10s16, 7);
  d29u8 = vqrshrun_n_s16(q11s16, 7);
  d30u8 = vqrshrun_n_s16(q12s16, 7);

  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  tmpp = tmp;
  q9u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q10u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q11u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q12u8 = vld1q_u8(tmpp);

  d18u8 = vget_low_u8(q9u8);
  d19u8 = vget_high_u8(q9u8);
  d20u8 = vget_low_u8(q10u8);
  d21u8 = vget_high_u8(q10u8);
  d22u8 = vget_low_u8(q11u8);
  d23u8 = vget_high_u8(q11u8);
  d24u8 = vget_low_u8(q12u8);
  d25u8 = vget_high_u8(q12u8);
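
  // d26u8-d30u8 still hold the last five first-pass rows, so together with
  // the eight rows reloaded from tmp[] all 13 input rows are available.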
  for (i = 2; i > 0; i--) {
    q3u16 = vmull_u8(d18u8, d0u8);
    q4u16 = vmull_u8(d19u8, d0u8);
    q5u16 = vmull_u8(d20u8, d0u8);
    q6u16 = vmull_u8(d21u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

    q7u16 = vmull_u8(d21u8, d3u8);
    q8u16 = vmull_u8(d22u8, d3u8);
    q9u16 = vmull_u8(d23u8, d3u8);
    q10u16 = vmull_u8(d24u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    d18u8 = d22u8;
    d19u8 = d23u8;
    d20u8 = d24u8;
    d21u8 = d25u8;
    d22u8 = d26u8;
    d23u8 = d27u8;
    d24u8 = d28u8;
    d25u8 = d29u8;
    d26u8 = d30u8;

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    dst_ptr += dst_pitch;
  }
}

void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
                                  int src_pixels_per_line, int xoffset,
                                  int yoffset, unsigned char *dst_ptr,
                                  int dst_pitch) {
  unsigned char *src, *src_tmp, *dst, *tmpp;
  unsigned char tmp[336];
  int i, j;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
  uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
  uint8x8_t d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint8x16_t q3u8, q4u8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
  uint16x8_t q11u16, q12u16, q13u16, q15u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
  int16x8_t q11s16, q12s16, q13s16, q15s16;

  if (xoffset == 0) {  // secondpass_filter16x16_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
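
    // Process the 16-wide block as two 8-wide halves (i), each in four
    // groups of four output rows (j).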
    src_tmp = src_ptr - src_pixels_per_line * 2;
    for (i = 0; i < 2; ++i) {
      src = src_tmp + i * 8;
      dst = dst_ptr + i * 8;
      d18u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d19u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d20u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d21u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d22u8 = vld1_u8(src);
      src += src_pixels_per_line;
      for (j = 0; j < 4; ++j) {
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;

        q3u16 = vmull_u8(d18u8, d0u8);
        q4u16 = vmull_u8(d19u8, d0u8);
        q5u16 = vmull_u8(d20u8, d0u8);
        q6u16 = vmull_u8(d21u8, d0u8);

        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

        q7u16 = vmull_u8(d21u8, d3u8);
        q8u16 = vmull_u8(d22u8, d3u8);
        q9u16 = vmull_u8(d23u8, d3u8);
        q10u16 = vmull_u8(d24u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        d18u8 = d22u8;
        d19u8 = d23u8;
        d20u8 = d24u8;
        d21u8 = d25u8;
        d22u8 = d26u8;

        vst1_u8(dst, d6u8);
        dst += dst_pitch;
        vst1_u8(dst, d7u8);
        dst += dst_pitch;
        vst1_u8(dst, d8u8);
        dst += dst_pitch;
        vst1_u8(dst, d9u8);
        dst += dst_pitch;
      }
    }
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (21x16)
  if (yoffset == 0) {  // firstpass_filter16x16_only
    src = src_ptr - 2;
    dst = dst_ptr;
    for (i = 0; i < 8; ++i) {
      d6u8 = vld1_u8(src);
      d7u8 = vld1_u8(src + 8);
      d8u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;
      d9u8 = vld1_u8(src);
      d10u8 = vld1_u8(src + 8);
      d11u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;

      __builtin_prefetch(src);
      __builtin_prefetch(src + src_pixels_per_line);

      q6u16 = vmull_u8(d6u8, d0u8);
      q7u16 = vmull_u8(d7u8, d0u8);
      q8u16 = vmull_u8(d9u8, d0u8);
      q9u16 = vmull_u8(d10u8, d0u8);

      d20u8 = vext_u8(d6u8, d7u8, 1);
      d21u8 = vext_u8(d9u8, d10u8, 1);
      d22u8 = vext_u8(d7u8, d8u8, 1);
      d23u8 = vext_u8(d10u8, d11u8, 1);
      d24u8 = vext_u8(d6u8, d7u8, 4);
      d25u8 = vext_u8(d9u8, d10u8, 4);
      d26u8 = vext_u8(d7u8, d8u8, 4);
      d27u8 = vext_u8(d10u8, d11u8, 4);
      d28u8 = vext_u8(d6u8, d7u8, 5);
      d29u8 = vext_u8(d9u8, d10u8, 5);

      q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
      q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
      q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
      q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
      q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
      q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
      q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
      q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
      q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

      d20u8 = vext_u8(d7u8, d8u8, 5);
      d21u8 = vext_u8(d10u8, d11u8, 5);
      d22u8 = vext_u8(d6u8, d7u8, 2);
      d23u8 = vext_u8(d9u8, d10u8, 2);
      d24u8 = vext_u8(d7u8, d8u8, 2);
      d25u8 = vext_u8(d10u8, d11u8, 2);
      d26u8 = vext_u8(d6u8, d7u8, 3);
      d27u8 = vext_u8(d9u8, d10u8, 3);
      d28u8 = vext_u8(d7u8, d8u8, 3);
      d29u8 = vext_u8(d10u8, d11u8, 3);

      q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
      q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
      q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
      q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
      q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

      q10u16 = vmull_u8(d26u8, d3u8);
      q11u16 = vmull_u8(d27u8, d3u8);
      q12u16 = vmull_u8(d28u8, d3u8);
      q15u16 = vmull_u8(d29u8, d3u8);

      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);
      q11s16 = vreinterpretq_s16_u16(q11u16);
      q12s16 = vreinterpretq_s16_u16(q12u16);
      q15s16 = vreinterpretq_s16_u16(q15u16);

      q6s16 = vqaddq_s16(q6s16, q10s16);
      q8s16 = vqaddq_s16(q8s16, q11s16);
      q7s16 = vqaddq_s16(q7s16, q12s16);
      q9s16 = vqaddq_s16(q9s16, q15s16);

      d6u8 = vqrshrun_n_s16(q6s16, 7);
      d7u8 = vqrshrun_n_s16(q7s16, 7);
      d8u8 = vqrshrun_n_s16(q8s16, 7);
      d9u8 = vqrshrun_n_s16(q9s16, 7);

      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);
      vst1q_u8(dst, q3u8);
      dst += dst_pitch;
      vst1q_u8(dst, q4u8);
      dst += dst_pitch;
    }
    return;
  }

  src = src_ptr - 2 - src_pixels_per_line * 2;
  tmpp = tmp;
  for (i = 0; i < 7; ++i) {
    d6u8 = vld1_u8(src);
    d7u8 = vld1_u8(src + 8);
    d8u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d9u8 = vld1_u8(src);
    d10u8 = vld1_u8(src + 8);
    d11u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d12u8 = vld1_u8(src);
    d13u8 = vld1_u8(src + 8);
    d14u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

    q8u16 = vmull_u8(d6u8, d0u8);
    q9u16 = vmull_u8(d7u8, d0u8);
    q10u16 = vmull_u8(d9u8, d0u8);
    q11u16 = vmull_u8(d10u8, d0u8);
    q12u16 = vmull_u8(d12u8, d0u8);
    q13u16 = vmull_u8(d13u8, d0u8);

    d28u8 = vext_u8(d6u8, d7u8, 1);
    d29u8 = vext_u8(d9u8, d10u8, 1);
    d30u8 = vext_u8(d12u8, d13u8, 1);
    q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
    d28u8 = vext_u8(d7u8, d8u8, 1);
    d29u8 = vext_u8(d10u8, d11u8, 1);
    d30u8 = vext_u8(d13u8, d14u8, 1);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

    d28u8 = vext_u8(d6u8, d7u8, 4);
    d29u8 = vext_u8(d9u8, d10u8, 4);
    d30u8 = vext_u8(d12u8, d13u8, 4);
    q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
    d28u8 = vext_u8(d7u8, d8u8, 4);
    d29u8 = vext_u8(d10u8, d11u8, 4);
    d30u8 = vext_u8(d13u8, d14u8, 4);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

    d28u8 = vext_u8(d6u8, d7u8, 5);
    d29u8 = vext_u8(d9u8, d10u8, 5);
    d30u8 = vext_u8(d12u8, d13u8, 5);
    q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
    d28u8 = vext_u8(d7u8, d8u8, 5);
    d29u8 = vext_u8(d10u8, d11u8, 5);
    d30u8 = vext_u8(d13u8, d14u8, 5);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

    d28u8 = vext_u8(d6u8, d7u8, 2);
    d29u8 = vext_u8(d9u8, d10u8, 2);
    d30u8 = vext_u8(d12u8, d13u8, 2);
    q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
    d28u8 = vext_u8(d7u8, d8u8, 2);
    d29u8 = vext_u8(d10u8, d11u8, 2);
    d30u8 = vext_u8(d13u8, d14u8, 2);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

    d28u8 = vext_u8(d6u8, d7u8, 3);
    d29u8 = vext_u8(d9u8, d10u8, 3);
    d30u8 = vext_u8(d12u8, d13u8, 3);
    d15u8 = vext_u8(d7u8, d8u8, 3);
    d31u8 = vext_u8(d10u8, d11u8, 3);
    d6u8 = vext_u8(d13u8, d14u8, 3);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q12s16 = vqaddq_s16(q12s16, q6s16);

    q6u16 = vmull_u8(d15u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);
    q3u16 = vmull_u8(d6u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q13s16 = vreinterpretq_s16_u16(q13u16);
    q9s16 = vqaddq_s16(q9s16, q6s16);
    q11s16 = vqaddq_s16(q11s16, q7s16);
    q13s16 = vqaddq_s16(q13s16, q3s16);

    d6u8 = vqrshrun_n_s16(q8s16, 7);
    d7u8 = vqrshrun_n_s16(q9s16, 7);
    d8u8 = vqrshrun_n_s16(q10s16, 7);
    d9u8 = vqrshrun_n_s16(q11s16, 7);
    d10u8 = vqrshrun_n_s16(q12s16, 7);
    d11u8 = vqrshrun_n_s16(q13s16, 7);

    vst1_u8(tmpp, d6u8);
    tmpp += 8;
    vst1_u8(tmpp, d7u8);
    tmpp += 8;
    vst1_u8(tmpp, d8u8);
    tmpp += 8;
    vst1_u8(tmpp, d9u8);
    tmpp += 8;
    vst1_u8(tmpp, d10u8);
    tmpp += 8;
    vst1_u8(tmpp, d11u8);
    tmpp += 8;
  }
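
  // tmp[] now holds 21 rows of 16 filtered pixels (7 iterations x 3 rows),
  // which the second pass reads back as two 8-wide columns.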

  // Second pass: 16x16
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  for (i = 0; i < 2; ++i) {
    dst = dst_ptr + 8 * i;
    tmpp = tmp + 8 * i;
    d18u8 = vld1_u8(tmpp);
    tmpp += 16;
    d19u8 = vld1_u8(tmpp);
    tmpp += 16;
    d20u8 = vld1_u8(tmpp);
    tmpp += 16;
    d21u8 = vld1_u8(tmpp);
    tmpp += 16;
    d22u8 = vld1_u8(tmpp);
    tmpp += 16;
    for (j = 0; j < 4; ++j) {
      d23u8 = vld1_u8(tmpp);
      tmpp += 16;
      d24u8 = vld1_u8(tmpp);
      tmpp += 16;
      d25u8 = vld1_u8(tmpp);
      tmpp += 16;
      d26u8 = vld1_u8(tmpp);
      tmpp += 16;

      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);

      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;

      vst1_u8(dst, d6u8);
      dst += dst_pitch;
      vst1_u8(dst, d7u8);
      dst += dst_pitch;
      vst1_u8(dst, d8u8);
      dst += dst_pitch;
      vst1_u8(dst, d9u8);
      dst += dst_pitch;
    }
  }
}