2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
13 #include "./vpx_config.h"
14 #include "vpx_dsp/arm/mem_neon.h"
15 #include "vpx_ports/mem.h"
17 static const int8_t vp8_sub_pel_filters[8][8] = {
18 { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
19 { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
20 { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
21 { 0, -9, 93, 50, -6, 0, 0, 0 },
22 { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
23 { 0, -6, 50, 93, -9, 0, 0, 0 },
24 { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
25 { 0, -1, 12, 123, -6, 0, 0, 0 },
28 // This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
29 // Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
30 // Elements 1 and 4 are either 0 or negative. The code accounts for this with
31 // multiply/accumulates which either add or subtract as needed. The other
32 // functions will be updated to use this table later.
33 // It is also expanded to 8 elements to allow loading into 64 bit neon registers.
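// In scalar terms, each output pixel computed below is, clamped to [0, 255],
//   (f0 * s[-2] - f1 * s[-1] + f2 * s[0] + f3 * s[1] - f4 * s[2] + f5 * s[3] + 64) >> 7
// with f0..f5 taken from this table. The signed taps of each row sum to 128,
// and vqrshrun_n_s16(x, 7) supplies the rounding shift and the clamp.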
35 static const uint8_t abs_filters[8][8] = {
36 { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 },
37 { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
38 { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
39 { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
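// load_and_shift() below loads 8 bytes and moves the 4 useful ones into the
// upper half of the vector so that a later vext can pair this row with the
// following, unshifted, row.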
42 static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
43 return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
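// filter_add_accumulate() and filter_sub_accumulate() pick the first 4 bytes
// of each half of 'a' and 'b' (one row per half), pack them together, and
// multiply-accumulate (or multiply-subtract) against a single duplicated
// filter tap, so each vmlal/vmlsl covers two rows.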
46 static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
47 const uint8x8_t filter, uint16x8_t *c,
49 const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
50 vreinterpret_u32_u8(vget_high_u8(a)));
51 const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
52 vreinterpret_u32_u8(vget_high_u8(b)));
53 *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
54 *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
57 static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
58 const uint8x8_t filter, uint16x8_t *c,
60 const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
61 vreinterpret_u32_u8(vget_high_u8(a)));
62 const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
63 vreinterpret_u32_u8(vget_high_u8(b)));
64 *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
65 *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
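// yonly4x4() applies only the vertical (second-pass) 6-tap filter to a 4x4
// block: 9 source rows (2 above, the 4 output rows, 3 below) are read, packed
// two rows per 8-byte vector, and filtered.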
68 static INLINE void yonly4x4(const unsigned char *src, int src_stride,
69 int filter_offset, unsigned char *dst,
71 uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
72 uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
73 uint16x8_t c0, c1, c2, c3;
77 const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
78 const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
79 const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
80 const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
81 const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
82 const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
83 const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
85 src -= src_stride * 2;
86 // Shift the even rows to allow using 'vext' to combine the vectors. armv8
87 // has vcopy_lane which would be interesting. This started as just a
88 // horrible workaround for clang adding alignment hints to 32bit loads:
89 // https://llvm.org/bugs/show_bug.cgi?id=24421
90 // But it turns out it is almost identical to casting the loads.
91 a0 = load_and_shift(src);
95 a2 = load_and_shift(src);
99 a4 = load_and_shift(src);
103 a6 = load_and_shift(src);
109 // Combine the rows so we can operate on 8 at a time.
110 b0 = vext_u8(a0, a1, 4);
111 b2 = vext_u8(a2, a3, 4);
112 b4 = vext_u8(a4, a5, 4);
113 b6 = vext_u8(a6, a7, 4);
116 // To keep with the 8-at-a-time theme, combine *alternate* rows. This
117 // allows combining the odd rows with the even.
118 b1 = vext_u8(b0, b2, 4);
119 b3 = vext_u8(b2, b4, 4);
120 b5 = vext_u8(b4, b6, 4);
121 b7 = vext_u8(b6, b8, 4);
123 // Multiply and expand to 16 bits.
124 c0 = vmull_u8(b0, filter0);
125 c1 = vmull_u8(b2, filter0);
126 c2 = vmull_u8(b5, filter5);
127 c3 = vmull_u8(b7, filter5);
129 // Multiply, subtract and accumulate for filters 1 and 4 (the negative ones).
131 c0 = vmlsl_u8(c0, b4, filter4);
132 c1 = vmlsl_u8(c1, b6, filter4);
133 c2 = vmlsl_u8(c2, b1, filter1);
134 c3 = vmlsl_u8(c3, b3, filter1);
136 // Add more positive ones. vmlal should really return a signed type.
137 // It's doing signed math internally, as evidenced by the fact we can do
138 // subtractions followed by more additions. Ideally we could use
139 // vqmlal/sl but that instruction doesn't exist. Might be able to
140 // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
141 c0 = vmlal_u8(c0, b2, filter2);
142 c1 = vmlal_u8(c1, b4, filter2);
143 c2 = vmlal_u8(c2, b3, filter3);
144 c3 = vmlal_u8(c3, b5, filter3);
146 // Use signed saturation math because vmlsl may have left some negative values in the accumulators.
148 d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
149 d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
151 // Use signed again because numbers like -200 need to be saturated to 0.
152 e0 = vqrshrun_n_s16(d0, 7);
153 e1 = vqrshrun_n_s16(d1, 7);
155 store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
158 void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
159 int xoffset, int yoffset,
160 unsigned char *dst_ptr, int dst_pitch) {
161 uint8x16_t s0, s1, s2, s3, s4;
163 // Variables to hold src[] elements for the given filter[]
164 uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
165 uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
166 uint8x16_t s01_f0, s23_f0;
167 uint64x2_t s01_f3, s23_f3;
168 uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
169 // Accumulator variables.
170 uint16x8_t d0123, d4567, d89;
171 uint16x8_t d0123_a, d4567_a, d89_a;
172 int16x8_t e0123, e4567, e89;
173 // Second pass intermediates.
174 uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
175 uint16x8_t c0, c1, c2, c3;
178 uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
180 if (xoffset == 0) { // Second pass only.
181 yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
185 if (yoffset == 0) { // First pass only.
187 } else { // Add context for the second pass. 2 extra lines on top.
188 src_ptr -= 2 + (src_pixels_per_line * 2);
191 filter = vld1_u8(abs_filters[xoffset]);
192 filter0 = vdup_lane_u8(filter, 0);
193 filter1 = vdup_lane_u8(filter, 1);
194 filter2 = vdup_lane_u8(filter, 2);
195 filter3 = vdup_lane_u8(filter, 3);
196 filter4 = vdup_lane_u8(filter, 4);
197 filter5 = vdup_lane_u8(filter, 5);
199 // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
200 // garbage. So much effort for that last single bit.
201 // The low values of each pair are for filter0.
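// Because src_ptr was moved back by 2, the byte offsets feeding each tap are:
//   filter0: bytes 0-3   filter1: bytes 1-4   filter2: bytes 2-5
//   filter3: bytes 3-6   filter4: bytes 4-7   filter5: bytes 5-8
// hence the 8/16/24/32 bit shifts and the vext by 5 below.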
202 s0 = vld1q_u8(src_ptr);
203 src_ptr += src_pixels_per_line;
204 s1 = vld1q_u8(src_ptr);
205 src_ptr += src_pixels_per_line;
206 s2 = vld1q_u8(src_ptr);
207 src_ptr += src_pixels_per_line;
208 s3 = vld1q_u8(src_ptr);
209 src_ptr += src_pixels_per_line;
211 // Shift to extract values for filter[5]
212 // Relative to src_ptr before the -2 adjustment, this puts pixels
213 // 3 4 5 6 7 8 9 10 in s0_f5
214 // Can't use vshr.u64 because it crosses the double word boundary.
215 s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
216 s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
217 s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
218 s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
220 s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
221 s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
223 s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
224 s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
225 d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
226 d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
228 // Keep original src data as 64 bits to simplify shifting and extracting.
229 s01 = vreinterpretq_u64_u8(s01_f0);
230 s23 = vreinterpretq_u64_u8(s23_f0);
233 filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
235 // Shift over one to use -1, 0, 1, 2 for filter1
236 // -1 0 1 2 * filter1
237 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
238 vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
242 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
243 vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
247 filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
248 vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
252 s01_f3 = vshrq_n_u64(s01, 24);
253 s23_f3 = vshrq_n_u64(s23, 24);
254 s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
255 vreinterpret_u32_u64(vget_high_u64(s01_f3)));
256 s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
257 vreinterpret_u32_u64(vget_high_u64(s23_f3)));
258 // Accumulate into different registers so it can use saturated addition.
259 d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
260 d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
263 vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
265 vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
268 b0 = vqrshrun_n_s16(e0123, 7);
269 b2 = vqrshrun_n_s16(e4567, 7);
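// b0 now holds the horizontally filtered rows 0-1 and b2 rows 2-3, either
// stored directly (yoffset == 0) or fed to the vertical pass below.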
271 if (yoffset == 0) { // firstpass_filter4x4_only
272 store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
276 // Load additional context when doing both filters.
277 s0 = vld1q_u8(src_ptr);
278 src_ptr += src_pixels_per_line;
279 s1 = vld1q_u8(src_ptr);
280 src_ptr += src_pixels_per_line;
281 s2 = vld1q_u8(src_ptr);
282 src_ptr += src_pixels_per_line;
283 s3 = vld1q_u8(src_ptr);
284 src_ptr += src_pixels_per_line;
285 s4 = vld1q_u8(src_ptr);
287 s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
288 s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
289 s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
290 s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
291 s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
294 s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
295 s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
297 s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
298 s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
299 // But this time instead of 16 pixels to filter, there are 20. So an extra
300 // run with a doubleword register.
301 d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
302 d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
303 d89 = vmull_u8(s4_f5, filter5);
305 // Save a copy as u64 for shifting.
306 s01 = vreinterpretq_u64_u8(s01_f0);
307 s23 = vreinterpretq_u64_u8(s23_f0);
309 filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
310 d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
312 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
313 vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
315 s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
316 d89 = vmlsl_u8(d89, s4_f1, filter1);
318 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
319 vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
321 s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
322 d89 = vmlsl_u8(d89, s4_f4, filter4);
324 filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
325 vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
327 s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
328 d89 = vmlal_u8(d89, s4_f2, filter2);
330 s01_f3 = vshrq_n_u64(s01, 24);
331 s23_f3 = vshrq_n_u64(s23, 24);
332 s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
333 vreinterpret_u32_u64(vget_high_u64(s01_f3)));
334 s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
335 vreinterpret_u32_u64(vget_high_u64(s23_f3)));
336 s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
337 d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
338 d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
339 d89_a = vmull_u8(s4_f3, filter3);
342 vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
344 vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
345 e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
347 b4 = vqrshrun_n_s16(e0123, 7);
348 b6 = vqrshrun_n_s16(e4567, 7);
349 b8 = vqrshrun_n_s16(e89, 7);
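// b4, b6 and b8 extend the horizontally filtered data to 9 rows (two per
// vector, one in b8), which is what the vertical 6-tap pass below needs.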
352 filter = vld1_u8(abs_filters[yoffset]);
353 filter0 = vdup_lane_u8(filter, 0);
354 filter1 = vdup_lane_u8(filter, 1);
355 filter2 = vdup_lane_u8(filter, 2);
356 filter3 = vdup_lane_u8(filter, 3);
357 filter4 = vdup_lane_u8(filter, 4);
358 filter5 = vdup_lane_u8(filter, 5);
360 b1 = vext_u8(b0, b2, 4);
361 b3 = vext_u8(b2, b4, 4);
362 b5 = vext_u8(b4, b6, 4);
363 b7 = vext_u8(b6, b8, 4);
365 c0 = vmull_u8(b0, filter0);
366 c1 = vmull_u8(b2, filter0);
367 c2 = vmull_u8(b5, filter5);
368 c3 = vmull_u8(b7, filter5);
370 c0 = vmlsl_u8(c0, b4, filter4);
371 c1 = vmlsl_u8(c1, b6, filter4);
372 c2 = vmlsl_u8(c2, b1, filter1);
373 c3 = vmlsl_u8(c3, b3, filter1);
375 c0 = vmlal_u8(c0, b2, filter2);
376 c1 = vmlal_u8(c1, b4, filter2);
377 c2 = vmlal_u8(c2, b3, filter3);
378 c3 = vmlal_u8(c3, b5, filter3);
380 d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
381 d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
383 e0 = vqrshrun_n_s16(d0, 7);
384 e1 = vqrshrun_n_s16(d1, 7);
386 store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
389 void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
390 int xoffset, int yoffset,
391 unsigned char *dst_ptr, int dst_pitch) {
393 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
394 uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
395 uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
396 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
397 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
398 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
399 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
400 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
401 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
403 if (xoffset == 0) { // secondpass_filter8x4_only
404 // load second_pass filter
405 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
406 d0s8 = vdup_lane_s8(dtmps8, 0);
407 d1s8 = vdup_lane_s8(dtmps8, 1);
408 d2s8 = vdup_lane_s8(dtmps8, 2);
409 d3s8 = vdup_lane_s8(dtmps8, 3);
410 d4s8 = vdup_lane_s8(dtmps8, 4);
411 d5s8 = vdup_lane_s8(dtmps8, 5);
412 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
413 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
414 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
415 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
416 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
417 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
420 src = src_ptr - src_pixels_per_line * 2;
421 d22u8 = vld1_u8(src);
422 src += src_pixels_per_line;
423 d23u8 = vld1_u8(src);
424 src += src_pixels_per_line;
425 d24u8 = vld1_u8(src);
426 src += src_pixels_per_line;
427 d25u8 = vld1_u8(src);
428 src += src_pixels_per_line;
429 d26u8 = vld1_u8(src);
430 src += src_pixels_per_line;
431 d27u8 = vld1_u8(src);
432 src += src_pixels_per_line;
433 d28u8 = vld1_u8(src);
434 src += src_pixels_per_line;
435 d29u8 = vld1_u8(src);
436 src += src_pixels_per_line;
437 d30u8 = vld1_u8(src);
439 q3u16 = vmull_u8(d22u8, d0u8);
440 q4u16 = vmull_u8(d23u8, d0u8);
441 q5u16 = vmull_u8(d24u8, d0u8);
442 q6u16 = vmull_u8(d25u8, d0u8);
444 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
445 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
446 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
447 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
449 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
450 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
451 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
452 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
454 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
455 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
456 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
457 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
459 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
460 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
461 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
462 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
464 q7u16 = vmull_u8(d25u8, d3u8);
465 q8u16 = vmull_u8(d26u8, d3u8);
466 q9u16 = vmull_u8(d27u8, d3u8);
467 q10u16 = vmull_u8(d28u8, d3u8);
469 q3s16 = vreinterpretq_s16_u16(q3u16);
470 q4s16 = vreinterpretq_s16_u16(q4u16);
471 q5s16 = vreinterpretq_s16_u16(q5u16);
472 q6s16 = vreinterpretq_s16_u16(q6u16);
473 q7s16 = vreinterpretq_s16_u16(q7u16);
474 q8s16 = vreinterpretq_s16_u16(q8u16);
475 q9s16 = vreinterpretq_s16_u16(q9u16);
476 q10s16 = vreinterpretq_s16_u16(q10u16);
478 q7s16 = vqaddq_s16(q7s16, q3s16);
479 q8s16 = vqaddq_s16(q8s16, q4s16);
480 q9s16 = vqaddq_s16(q9s16, q5s16);
481 q10s16 = vqaddq_s16(q10s16, q6s16);
483 d6u8 = vqrshrun_n_s16(q7s16, 7);
484 d7u8 = vqrshrun_n_s16(q8s16, 7);
485 d8u8 = vqrshrun_n_s16(q9s16, 7);
486 d9u8 = vqrshrun_n_s16(q10s16, 7);
488 vst1_u8(dst_ptr, d6u8);
489 dst_ptr += dst_pitch;
490 vst1_u8(dst_ptr, d7u8);
491 dst_ptr += dst_pitch;
492 vst1_u8(dst_ptr, d8u8);
493 dst_ptr += dst_pitch;
494 vst1_u8(dst_ptr, d9u8);
498 // load first_pass filter
499 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
500 d0s8 = vdup_lane_s8(dtmps8, 0);
501 d1s8 = vdup_lane_s8(dtmps8, 1);
502 d2s8 = vdup_lane_s8(dtmps8, 2);
503 d3s8 = vdup_lane_s8(dtmps8, 3);
504 d4s8 = vdup_lane_s8(dtmps8, 4);
505 d5s8 = vdup_lane_s8(dtmps8, 5);
506 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
507 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
508 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
509 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
510 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
511 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
513 // First pass: output_height lines x output_width columns (9x8)
514 if (yoffset == 0) // firstpass_filter8x4_only
517 src = src_ptr - 2 - (src_pixels_per_line * 2);
518 q3u8 = vld1q_u8(src);
519 src += src_pixels_per_line;
520 q4u8 = vld1q_u8(src);
521 src += src_pixels_per_line;
522 q5u8 = vld1q_u8(src);
523 src += src_pixels_per_line;
524 q6u8 = vld1q_u8(src);
526 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
527 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
528 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
529 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
531 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
532 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
533 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
534 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
536 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
537 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
538 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
539 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
541 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
542 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
543 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
544 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
546 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
547 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
548 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
549 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
551 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
552 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
553 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
554 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
556 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
557 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
558 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
559 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
561 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
562 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
563 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
564 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
566 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
567 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
568 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
569 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
571 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
572 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
573 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
574 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
576 q3u16 = vmull_u8(d28u8, d3u8);
577 q4u16 = vmull_u8(d29u8, d3u8);
578 q5u16 = vmull_u8(d30u8, d3u8);
579 q6u16 = vmull_u8(d31u8, d3u8);
581 q3s16 = vreinterpretq_s16_u16(q3u16);
582 q4s16 = vreinterpretq_s16_u16(q4u16);
583 q5s16 = vreinterpretq_s16_u16(q5u16);
584 q6s16 = vreinterpretq_s16_u16(q6u16);
585 q7s16 = vreinterpretq_s16_u16(q7u16);
586 q8s16 = vreinterpretq_s16_u16(q8u16);
587 q9s16 = vreinterpretq_s16_u16(q9u16);
588 q10s16 = vreinterpretq_s16_u16(q10u16);
590 q7s16 = vqaddq_s16(q7s16, q3s16);
591 q8s16 = vqaddq_s16(q8s16, q4s16);
592 q9s16 = vqaddq_s16(q9s16, q5s16);
593 q10s16 = vqaddq_s16(q10s16, q6s16);
595 d22u8 = vqrshrun_n_s16(q7s16, 7);
596 d23u8 = vqrshrun_n_s16(q8s16, 7);
597 d24u8 = vqrshrun_n_s16(q9s16, 7);
598 d25u8 = vqrshrun_n_s16(q10s16, 7);
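// d22-d25 hold the first four horizontally filtered rows; if a vertical pass
// follows, the remaining five rows are filtered next.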
600 if (yoffset == 0) { // firstpass_filter8x4_only
601 vst1_u8(dst_ptr, d22u8);
602 dst_ptr += dst_pitch;
603 vst1_u8(dst_ptr, d23u8);
604 dst_ptr += dst_pitch;
605 vst1_u8(dst_ptr, d24u8);
606 dst_ptr += dst_pitch;
607 vst1_u8(dst_ptr, d25u8);
611 // First pass on the remaining 5 lines of data
612 src += src_pixels_per_line;
613 q3u8 = vld1q_u8(src);
614 src += src_pixels_per_line;
615 q4u8 = vld1q_u8(src);
616 src += src_pixels_per_line;
617 q5u8 = vld1q_u8(src);
618 src += src_pixels_per_line;
619 q6u8 = vld1q_u8(src);
620 src += src_pixels_per_line;
621 q7u8 = vld1q_u8(src);
623 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
624 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
625 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
626 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
627 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
629 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
630 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
631 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
632 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
633 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
635 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
636 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
637 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
638 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
639 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
641 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
642 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
643 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
644 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
645 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
647 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
648 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
649 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
650 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
651 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
653 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
654 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
655 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
656 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
657 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
659 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
660 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
661 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
662 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
663 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
665 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
666 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
667 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
668 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
669 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
671 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
672 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
673 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
674 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
675 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
677 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
678 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
679 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
680 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
681 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
683 q3u16 = vmull_u8(d27u8, d3u8);
684 q4u16 = vmull_u8(d28u8, d3u8);
685 q5u16 = vmull_u8(d29u8, d3u8);
686 q6u16 = vmull_u8(d30u8, d3u8);
687 q7u16 = vmull_u8(d31u8, d3u8);
689 q3s16 = vreinterpretq_s16_u16(q3u16);
690 q4s16 = vreinterpretq_s16_u16(q4u16);
691 q5s16 = vreinterpretq_s16_u16(q5u16);
692 q6s16 = vreinterpretq_s16_u16(q6u16);
693 q7s16 = vreinterpretq_s16_u16(q7u16);
694 q8s16 = vreinterpretq_s16_u16(q8u16);
695 q9s16 = vreinterpretq_s16_u16(q9u16);
696 q10s16 = vreinterpretq_s16_u16(q10u16);
697 q11s16 = vreinterpretq_s16_u16(q11u16);
698 q12s16 = vreinterpretq_s16_u16(q12u16);
700 q8s16 = vqaddq_s16(q8s16, q3s16);
701 q9s16 = vqaddq_s16(q9s16, q4s16);
702 q10s16 = vqaddq_s16(q10s16, q5s16);
703 q11s16 = vqaddq_s16(q11s16, q6s16);
704 q12s16 = vqaddq_s16(q12s16, q7s16);
706 d26u8 = vqrshrun_n_s16(q8s16, 7);
707 d27u8 = vqrshrun_n_s16(q9s16, 7);
708 d28u8 = vqrshrun_n_s16(q10s16, 7);
709 d29u8 = vqrshrun_n_s16(q11s16, 7);
710 d30u8 = vqrshrun_n_s16(q12s16, 7);
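// d22-d30 now hold all nine horizontally filtered rows required by the
// vertical pass.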
713 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
714 d0s8 = vdup_lane_s8(dtmps8, 0);
715 d1s8 = vdup_lane_s8(dtmps8, 1);
716 d2s8 = vdup_lane_s8(dtmps8, 2);
717 d3s8 = vdup_lane_s8(dtmps8, 3);
718 d4s8 = vdup_lane_s8(dtmps8, 4);
719 d5s8 = vdup_lane_s8(dtmps8, 5);
720 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
721 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
722 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
723 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
724 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
725 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
727 q3u16 = vmull_u8(d22u8, d0u8);
728 q4u16 = vmull_u8(d23u8, d0u8);
729 q5u16 = vmull_u8(d24u8, d0u8);
730 q6u16 = vmull_u8(d25u8, d0u8);
732 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
733 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
734 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
735 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
737 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
738 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
739 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
740 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
742 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
743 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
744 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
745 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
747 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
748 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
749 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
750 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
752 q7u16 = vmull_u8(d25u8, d3u8);
753 q8u16 = vmull_u8(d26u8, d3u8);
754 q9u16 = vmull_u8(d27u8, d3u8);
755 q10u16 = vmull_u8(d28u8, d3u8);
757 q3s16 = vreinterpretq_s16_u16(q3u16);
758 q4s16 = vreinterpretq_s16_u16(q4u16);
759 q5s16 = vreinterpretq_s16_u16(q5u16);
760 q6s16 = vreinterpretq_s16_u16(q6u16);
761 q7s16 = vreinterpretq_s16_u16(q7u16);
762 q8s16 = vreinterpretq_s16_u16(q8u16);
763 q9s16 = vreinterpretq_s16_u16(q9u16);
764 q10s16 = vreinterpretq_s16_u16(q10u16);
766 q7s16 = vqaddq_s16(q7s16, q3s16);
767 q8s16 = vqaddq_s16(q8s16, q4s16);
768 q9s16 = vqaddq_s16(q9s16, q5s16);
769 q10s16 = vqaddq_s16(q10s16, q6s16);
771 d6u8 = vqrshrun_n_s16(q7s16, 7);
772 d7u8 = vqrshrun_n_s16(q8s16, 7);
773 d8u8 = vqrshrun_n_s16(q9s16, 7);
774 d9u8 = vqrshrun_n_s16(q10s16, 7);
776 vst1_u8(dst_ptr, d6u8);
777 dst_ptr += dst_pitch;
778 vst1_u8(dst_ptr, d7u8);
779 dst_ptr += dst_pitch;
780 vst1_u8(dst_ptr, d8u8);
781 dst_ptr += dst_pitch;
782 vst1_u8(dst_ptr, d9u8);
786 void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
787 int xoffset, int yoffset,
788 unsigned char *dst_ptr, int dst_pitch) {
789 unsigned char *src, *tmpp;
790 unsigned char tmp[64];
792 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
793 uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
794 uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
795 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
796 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
797 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
798 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
799 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
800 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
802 if (xoffset == 0) { // secondpass_filter8x8_only
803 // load second_pass filter
804 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
805 d0s8 = vdup_lane_s8(dtmps8, 0);
806 d1s8 = vdup_lane_s8(dtmps8, 1);
807 d2s8 = vdup_lane_s8(dtmps8, 2);
808 d3s8 = vdup_lane_s8(dtmps8, 3);
809 d4s8 = vdup_lane_s8(dtmps8, 4);
810 d5s8 = vdup_lane_s8(dtmps8, 5);
811 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
812 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
813 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
814 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
815 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
816 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
819 src = src_ptr - src_pixels_per_line * 2;
820 d18u8 = vld1_u8(src);
821 src += src_pixels_per_line;
822 d19u8 = vld1_u8(src);
823 src += src_pixels_per_line;
824 d20u8 = vld1_u8(src);
825 src += src_pixels_per_line;
826 d21u8 = vld1_u8(src);
827 src += src_pixels_per_line;
828 d22u8 = vld1_u8(src);
829 src += src_pixels_per_line;
830 d23u8 = vld1_u8(src);
831 src += src_pixels_per_line;
832 d24u8 = vld1_u8(src);
833 src += src_pixels_per_line;
834 d25u8 = vld1_u8(src);
835 src += src_pixels_per_line;
836 d26u8 = vld1_u8(src);
837 src += src_pixels_per_line;
838 d27u8 = vld1_u8(src);
839 src += src_pixels_per_line;
840 d28u8 = vld1_u8(src);
841 src += src_pixels_per_line;
842 d29u8 = vld1_u8(src);
843 src += src_pixels_per_line;
844 d30u8 = vld1_u8(src);
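// Thirteen source rows (8 output rows plus 5 of context) are now in d18-d30;
// each pass of the loop below filters and stores four output rows.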
846 for (i = 2; i > 0; i--) {
847 q3u16 = vmull_u8(d18u8, d0u8);
848 q4u16 = vmull_u8(d19u8, d0u8);
849 q5u16 = vmull_u8(d20u8, d0u8);
850 q6u16 = vmull_u8(d21u8, d0u8);
852 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
853 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
854 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
855 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
857 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
858 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
859 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
860 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
862 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
863 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
864 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
865 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
867 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
868 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
869 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
870 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
872 q7u16 = vmull_u8(d21u8, d3u8);
873 q8u16 = vmull_u8(d22u8, d3u8);
874 q9u16 = vmull_u8(d23u8, d3u8);
875 q10u16 = vmull_u8(d24u8, d3u8);
877 q3s16 = vreinterpretq_s16_u16(q3u16);
878 q4s16 = vreinterpretq_s16_u16(q4u16);
879 q5s16 = vreinterpretq_s16_u16(q5u16);
880 q6s16 = vreinterpretq_s16_u16(q6u16);
881 q7s16 = vreinterpretq_s16_u16(q7u16);
882 q8s16 = vreinterpretq_s16_u16(q8u16);
883 q9s16 = vreinterpretq_s16_u16(q9u16);
884 q10s16 = vreinterpretq_s16_u16(q10u16);
886 q7s16 = vqaddq_s16(q7s16, q3s16);
887 q8s16 = vqaddq_s16(q8s16, q4s16);
888 q9s16 = vqaddq_s16(q9s16, q5s16);
889 q10s16 = vqaddq_s16(q10s16, q6s16);
891 d6u8 = vqrshrun_n_s16(q7s16, 7);
892 d7u8 = vqrshrun_n_s16(q8s16, 7);
893 d8u8 = vqrshrun_n_s16(q9s16, 7);
894 d9u8 = vqrshrun_n_s16(q10s16, 7);
906 vst1_u8(dst_ptr, d6u8);
907 dst_ptr += dst_pitch;
908 vst1_u8(dst_ptr, d7u8);
909 dst_ptr += dst_pitch;
910 vst1_u8(dst_ptr, d8u8);
911 dst_ptr += dst_pitch;
912 vst1_u8(dst_ptr, d9u8);
913 dst_ptr += dst_pitch;
918 // load first_pass filter
919 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
920 d0s8 = vdup_lane_s8(dtmps8, 0);
921 d1s8 = vdup_lane_s8(dtmps8, 1);
922 d2s8 = vdup_lane_s8(dtmps8, 2);
923 d3s8 = vdup_lane_s8(dtmps8, 3);
924 d4s8 = vdup_lane_s8(dtmps8, 4);
925 d5s8 = vdup_lane_s8(dtmps8, 5);
926 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
927 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
928 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
929 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
930 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
931 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
933 // First pass: output_height lines x output_width columns (13x8)
934 if (yoffset == 0) // firstpass_filter8x8_only
937 src = src_ptr - 2 - (src_pixels_per_line * 2);
940 for (i = 2; i > 0; i--) {
941 q3u8 = vld1q_u8(src);
942 src += src_pixels_per_line;
943 q4u8 = vld1q_u8(src);
944 src += src_pixels_per_line;
945 q5u8 = vld1q_u8(src);
946 src += src_pixels_per_line;
947 q6u8 = vld1q_u8(src);
948 src += src_pixels_per_line;
950 __builtin_prefetch(src);
951 __builtin_prefetch(src + src_pixels_per_line);
952 __builtin_prefetch(src + src_pixels_per_line * 2);
954 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
955 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
956 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
957 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
959 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
960 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
961 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
962 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
964 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
965 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
966 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
967 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
969 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
970 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
971 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
972 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
974 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
975 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
976 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
977 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
979 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
980 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
981 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
982 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
984 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
985 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
986 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
987 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
989 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
990 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
991 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
992 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
994 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
995 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
996 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
997 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
999 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1000 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1001 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1002 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1004 q3u16 = vmull_u8(d28u8, d3u8);
1005 q4u16 = vmull_u8(d29u8, d3u8);
1006 q5u16 = vmull_u8(d30u8, d3u8);
1007 q6u16 = vmull_u8(d31u8, d3u8);
1009 q3s16 = vreinterpretq_s16_u16(q3u16);
1010 q4s16 = vreinterpretq_s16_u16(q4u16);
1011 q5s16 = vreinterpretq_s16_u16(q5u16);
1012 q6s16 = vreinterpretq_s16_u16(q6u16);
1013 q7s16 = vreinterpretq_s16_u16(q7u16);
1014 q8s16 = vreinterpretq_s16_u16(q8u16);
1015 q9s16 = vreinterpretq_s16_u16(q9u16);
1016 q10s16 = vreinterpretq_s16_u16(q10u16);
1018 q7s16 = vqaddq_s16(q7s16, q3s16);
1019 q8s16 = vqaddq_s16(q8s16, q4s16);
1020 q9s16 = vqaddq_s16(q9s16, q5s16);
1021 q10s16 = vqaddq_s16(q10s16, q6s16);
1023 d22u8 = vqrshrun_n_s16(q7s16, 7);
1024 d23u8 = vqrshrun_n_s16(q8s16, 7);
1025 d24u8 = vqrshrun_n_s16(q9s16, 7);
1026 d25u8 = vqrshrun_n_s16(q10s16, 7);
1028 if (yoffset == 0) { // firstpass_filter8x8_only
1029 vst1_u8(dst_ptr, d22u8);
1030 dst_ptr += dst_pitch;
1031 vst1_u8(dst_ptr, d23u8);
1032 dst_ptr += dst_pitch;
1033 vst1_u8(dst_ptr, d24u8);
1034 dst_ptr += dst_pitch;
1035 vst1_u8(dst_ptr, d25u8);
1036 dst_ptr += dst_pitch;
1038 vst1_u8(tmpp, d22u8);
1040 vst1_u8(tmpp, d23u8);
1042 vst1_u8(tmpp, d24u8);
1044 vst1_u8(tmpp, d25u8);
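// These four rows are staged in the 64-byte tmp buffer; after both loop
// iterations tmp holds the first eight filtered rows, and the remaining five
// are produced after the loop.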
1048 if (yoffset == 0) return;
1050 // First pass on the remaining 5 lines of data
1051 q3u8 = vld1q_u8(src);
1052 src += src_pixels_per_line;
1053 q4u8 = vld1q_u8(src);
1054 src += src_pixels_per_line;
1055 q5u8 = vld1q_u8(src);
1056 src += src_pixels_per_line;
1057 q6u8 = vld1q_u8(src);
1058 src += src_pixels_per_line;
1059 q7u8 = vld1q_u8(src);
1061 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
1062 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
1063 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
1064 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
1065 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
1067 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
1068 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
1069 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
1070 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
1071 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
1073 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
1074 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1075 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1076 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
1077 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
1079 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
1080 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
1081 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
1082 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
1083 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
1085 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
1086 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1087 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1088 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
1089 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
1091 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1092 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1093 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1094 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1095 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
1097 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
1098 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1099 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1100 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
1101 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
1103 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1104 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1105 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1106 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1107 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
1109 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
1110 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1111 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1112 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
1113 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
1115 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1116 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1117 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1118 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1119 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
1121 q3u16 = vmull_u8(d27u8, d3u8);
1122 q4u16 = vmull_u8(d28u8, d3u8);
1123 q5u16 = vmull_u8(d29u8, d3u8);
1124 q6u16 = vmull_u8(d30u8, d3u8);
1125 q7u16 = vmull_u8(d31u8, d3u8);
1127 q3s16 = vreinterpretq_s16_u16(q3u16);
1128 q4s16 = vreinterpretq_s16_u16(q4u16);
1129 q5s16 = vreinterpretq_s16_u16(q5u16);
1130 q6s16 = vreinterpretq_s16_u16(q6u16);
1131 q7s16 = vreinterpretq_s16_u16(q7u16);
1132 q8s16 = vreinterpretq_s16_u16(q8u16);
1133 q9s16 = vreinterpretq_s16_u16(q9u16);
1134 q10s16 = vreinterpretq_s16_u16(q10u16);
1135 q11s16 = vreinterpretq_s16_u16(q11u16);
1136 q12s16 = vreinterpretq_s16_u16(q12u16);
1138 q8s16 = vqaddq_s16(q8s16, q3s16);
1139 q9s16 = vqaddq_s16(q9s16, q4s16);
1140 q10s16 = vqaddq_s16(q10s16, q5s16);
1141 q11s16 = vqaddq_s16(q11s16, q6s16);
1142 q12s16 = vqaddq_s16(q12s16, q7s16);
1144 d26u8 = vqrshrun_n_s16(q8s16, 7);
1145 d27u8 = vqrshrun_n_s16(q9s16, 7);
1146 d28u8 = vqrshrun_n_s16(q10s16, 7);
1147 d29u8 = vqrshrun_n_s16(q11s16, 7);
1148 d30u8 = vqrshrun_n_s16(q12s16, 7);
1151 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1152 d0s8 = vdup_lane_s8(dtmps8, 0);
1153 d1s8 = vdup_lane_s8(dtmps8, 1);
1154 d2s8 = vdup_lane_s8(dtmps8, 2);
1155 d3s8 = vdup_lane_s8(dtmps8, 3);
1156 d4s8 = vdup_lane_s8(dtmps8, 4);
1157 d5s8 = vdup_lane_s8(dtmps8, 5);
1158 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1159 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1160 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1161 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1162 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1163 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1166 q9u8 = vld1q_u8(tmpp);
1168 q10u8 = vld1q_u8(tmpp);
1170 q11u8 = vld1q_u8(tmpp);
1172 q12u8 = vld1q_u8(tmpp);
1174 d18u8 = vget_low_u8(q9u8);
1175 d19u8 = vget_high_u8(q9u8);
1176 d20u8 = vget_low_u8(q10u8);
1177 d21u8 = vget_high_u8(q10u8);
1178 d22u8 = vget_low_u8(q11u8);
1179 d23u8 = vget_high_u8(q11u8);
1180 d24u8 = vget_low_u8(q12u8);
1181 d25u8 = vget_high_u8(q12u8);
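// The eight staged rows come back as d18-d25; together with d26-d30 kept from
// the first pass they supply the 13 rows the vertical filter needs.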
1183 for (i = 2; i > 0; i--) {
1184 q3u16 = vmull_u8(d18u8, d0u8);
1185 q4u16 = vmull_u8(d19u8, d0u8);
1186 q5u16 = vmull_u8(d20u8, d0u8);
1187 q6u16 = vmull_u8(d21u8, d0u8);
1189 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1190 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1191 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1192 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1194 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1195 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1196 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1197 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1199 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1200 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1201 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1202 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1204 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1205 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1206 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1207 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1209 q7u16 = vmull_u8(d21u8, d3u8);
1210 q8u16 = vmull_u8(d22u8, d3u8);
1211 q9u16 = vmull_u8(d23u8, d3u8);
1212 q10u16 = vmull_u8(d24u8, d3u8);
1214 q3s16 = vreinterpretq_s16_u16(q3u16);
1215 q4s16 = vreinterpretq_s16_u16(q4u16);
1216 q5s16 = vreinterpretq_s16_u16(q5u16);
1217 q6s16 = vreinterpretq_s16_u16(q6u16);
1218 q7s16 = vreinterpretq_s16_u16(q7u16);
1219 q8s16 = vreinterpretq_s16_u16(q8u16);
1220 q9s16 = vreinterpretq_s16_u16(q9u16);
1221 q10s16 = vreinterpretq_s16_u16(q10u16);
1223 q7s16 = vqaddq_s16(q7s16, q3s16);
1224 q8s16 = vqaddq_s16(q8s16, q4s16);
1225 q9s16 = vqaddq_s16(q9s16, q5s16);
1226 q10s16 = vqaddq_s16(q10s16, q6s16);
1228 d6u8 = vqrshrun_n_s16(q7s16, 7);
1229 d7u8 = vqrshrun_n_s16(q8s16, 7);
1230 d8u8 = vqrshrun_n_s16(q9s16, 7);
1231 d9u8 = vqrshrun_n_s16(q10s16, 7);
1243 vst1_u8(dst_ptr, d6u8);
1244 dst_ptr += dst_pitch;
1245 vst1_u8(dst_ptr, d7u8);
1246 dst_ptr += dst_pitch;
1247 vst1_u8(dst_ptr, d8u8);
1248 dst_ptr += dst_pitch;
1249 vst1_u8(dst_ptr, d9u8);
1250 dst_ptr += dst_pitch;
1255 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
1256 int src_pixels_per_line, int xoffset,
1257 int yoffset, unsigned char *dst_ptr,
1259 unsigned char *src, *src_tmp, *dst, *tmpp;
1260 unsigned char tmp[336];
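// tmp holds up to 21 rows x 16 bytes (16 output rows plus 5 rows of context)
// of first-pass output when both passes run.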
1262 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
1263 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
1264 uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
1265 uint8x8_t d28u8, d29u8, d30u8, d31u8;
1266 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
1267 uint8x16_t q3u8, q4u8;
1268 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
1269 uint16x8_t q11u16, q12u16, q13u16, q15u16;
1270 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
1271 int16x8_t q11s16, q12s16, q13s16, q15s16;
1273 if (xoffset == 0) { // secondpass_filter16x16_only
1274 // load second_pass filter
1275 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1276 d0s8 = vdup_lane_s8(dtmps8, 0);
1277 d1s8 = vdup_lane_s8(dtmps8, 1);
1278 d2s8 = vdup_lane_s8(dtmps8, 2);
1279 d3s8 = vdup_lane_s8(dtmps8, 3);
1280 d4s8 = vdup_lane_s8(dtmps8, 4);
1281 d5s8 = vdup_lane_s8(dtmps8, 5);
1282 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1283 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1284 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1285 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1286 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1287 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1290 src_tmp = src_ptr - src_pixels_per_line * 2;
1291 for (i = 0; i < 2; ++i) {
1292 src = src_tmp + i * 8;
1293 dst = dst_ptr + i * 8;
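// Each outer iteration filters one 8-pixel-wide half of the 16x16 block.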
1294 d18u8 = vld1_u8(src);
1295 src += src_pixels_per_line;
1296 d19u8 = vld1_u8(src);
1297 src += src_pixels_per_line;
1298 d20u8 = vld1_u8(src);
1299 src += src_pixels_per_line;
1300 d21u8 = vld1_u8(src);
1301 src += src_pixels_per_line;
1302 d22u8 = vld1_u8(src);
1303 src += src_pixels_per_line;
1304 for (j = 0; j < 4; ++j) {
1305 d23u8 = vld1_u8(src);
1306 src += src_pixels_per_line;
1307 d24u8 = vld1_u8(src);
1308 src += src_pixels_per_line;
1309 d25u8 = vld1_u8(src);
1310 src += src_pixels_per_line;
1311 d26u8 = vld1_u8(src);
1312 src += src_pixels_per_line;
1314 q3u16 = vmull_u8(d18u8, d0u8);
1315 q4u16 = vmull_u8(d19u8, d0u8);
1316 q5u16 = vmull_u8(d20u8, d0u8);
1317 q6u16 = vmull_u8(d21u8, d0u8);
1319 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1320 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1321 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1322 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1324 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1325 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1326 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1327 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1329 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1330 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1331 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1332 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1334 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1335 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1336 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1337 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1339 q7u16 = vmull_u8(d21u8, d3u8);
1340 q8u16 = vmull_u8(d22u8, d3u8);
1341 q9u16 = vmull_u8(d23u8, d3u8);
1342 q10u16 = vmull_u8(d24u8, d3u8);
1344 q3s16 = vreinterpretq_s16_u16(q3u16);
1345 q4s16 = vreinterpretq_s16_u16(q4u16);
1346 q5s16 = vreinterpretq_s16_u16(q5u16);
1347 q6s16 = vreinterpretq_s16_u16(q6u16);
1348 q7s16 = vreinterpretq_s16_u16(q7u16);
1349 q8s16 = vreinterpretq_s16_u16(q8u16);
1350 q9s16 = vreinterpretq_s16_u16(q9u16);
1351 q10s16 = vreinterpretq_s16_u16(q10u16);
1353 q7s16 = vqaddq_s16(q7s16, q3s16);
1354 q8s16 = vqaddq_s16(q8s16, q4s16);
1355 q9s16 = vqaddq_s16(q9s16, q5s16);
1356 q10s16 = vqaddq_s16(q10s16, q6s16);
1358 d6u8 = vqrshrun_n_s16(q7s16, 7);
1359 d7u8 = vqrshrun_n_s16(q8s16, 7);
1360 d8u8 = vqrshrun_n_s16(q9s16, 7);
1361 d9u8 = vqrshrun_n_s16(q10s16, 7);
1382 // load first_pass filter
1383 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
1384 d0s8 = vdup_lane_s8(dtmps8, 0);
1385 d1s8 = vdup_lane_s8(dtmps8, 1);
1386 d2s8 = vdup_lane_s8(dtmps8, 2);
1387 d3s8 = vdup_lane_s8(dtmps8, 3);
1388 d4s8 = vdup_lane_s8(dtmps8, 4);
1389 d5s8 = vdup_lane_s8(dtmps8, 5);
1390 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1391 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1392 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1393 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1394 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1395 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1397 // First pass: output_height lines x output_width columns (21x16)
1398 if (yoffset == 0) { // firstpass_filter16x16_only
1401 for (i = 0; i < 8; ++i) {
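// Each iteration filters two 16-wide rows; the three 8-byte loads per row
// cover the 21 source bytes (16 outputs plus 5 of context) that are needed.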
1402 d6u8 = vld1_u8(src);
1403 d7u8 = vld1_u8(src + 8);
1404 d8u8 = vld1_u8(src + 16);
1405 src += src_pixels_per_line;
1406 d9u8 = vld1_u8(src);
1407 d10u8 = vld1_u8(src + 8);
1408 d11u8 = vld1_u8(src + 16);
1409 src += src_pixels_per_line;
1411 __builtin_prefetch(src);
1412 __builtin_prefetch(src + src_pixels_per_line);
1414 q6u16 = vmull_u8(d6u8, d0u8);
1415 q7u16 = vmull_u8(d7u8, d0u8);
1416 q8u16 = vmull_u8(d9u8, d0u8);
1417 q9u16 = vmull_u8(d10u8, d0u8);
1419 d20u8 = vext_u8(d6u8, d7u8, 1);
1420 d21u8 = vext_u8(d9u8, d10u8, 1);
1421 d22u8 = vext_u8(d7u8, d8u8, 1);
1422 d23u8 = vext_u8(d10u8, d11u8, 1);
1423 d24u8 = vext_u8(d6u8, d7u8, 4);
1424 d25u8 = vext_u8(d9u8, d10u8, 4);
1425 d26u8 = vext_u8(d7u8, d8u8, 4);
1426 d27u8 = vext_u8(d10u8, d11u8, 4);
1427 d28u8 = vext_u8(d6u8, d7u8, 5);
1428 d29u8 = vext_u8(d9u8, d10u8, 5);
1430 q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
1431 q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
1432 q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
1433 q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
1434 q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
1435 q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
1436 q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
1437 q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
1438 q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
1439 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
1441 d20u8 = vext_u8(d7u8, d8u8, 5);
1442 d21u8 = vext_u8(d10u8, d11u8, 5);
1443 d22u8 = vext_u8(d6u8, d7u8, 2);
1444 d23u8 = vext_u8(d9u8, d10u8, 2);
1445 d24u8 = vext_u8(d7u8, d8u8, 2);
1446 d25u8 = vext_u8(d10u8, d11u8, 2);
1447 d26u8 = vext_u8(d6u8, d7u8, 3);
1448 d27u8 = vext_u8(d9u8, d10u8, 3);
1449 d28u8 = vext_u8(d7u8, d8u8, 3);
1450 d29u8 = vext_u8(d10u8, d11u8, 3);
1452 q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
1453 q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
1454 q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
1455 q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
1456 q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
1457 q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
1459 q10u16 = vmull_u8(d26u8, d3u8);
1460 q11u16 = vmull_u8(d27u8, d3u8);
1461 q12u16 = vmull_u8(d28u8, d3u8);
1462 q15u16 = vmull_u8(d29u8, d3u8);
1464 q6s16 = vreinterpretq_s16_u16(q6u16);
1465 q7s16 = vreinterpretq_s16_u16(q7u16);
1466 q8s16 = vreinterpretq_s16_u16(q8u16);
1467 q9s16 = vreinterpretq_s16_u16(q9u16);
1468 q10s16 = vreinterpretq_s16_u16(q10u16);
1469 q11s16 = vreinterpretq_s16_u16(q11u16);
1470 q12s16 = vreinterpretq_s16_u16(q12u16);
1471 q15s16 = vreinterpretq_s16_u16(q15u16);
1473 q6s16 = vqaddq_s16(q6s16, q10s16);
1474 q8s16 = vqaddq_s16(q8s16, q11s16);
1475 q7s16 = vqaddq_s16(q7s16, q12s16);
1476 q9s16 = vqaddq_s16(q9s16, q15s16);
1478 d6u8 = vqrshrun_n_s16(q6s16, 7);
1479 d7u8 = vqrshrun_n_s16(q7s16, 7);
1480 d8u8 = vqrshrun_n_s16(q8s16, 7);
1481 d9u8 = vqrshrun_n_s16(q9s16, 7);
1483 q3u8 = vcombine_u8(d6u8, d7u8);
1484 q4u8 = vcombine_u8(d8u8, d9u8);
1485 vst1q_u8(dst, q3u8);
1487 vst1q_u8(dst, q4u8);
1493 src = src_ptr - 2 - src_pixels_per_line * 2;
1495 for (i = 0; i < 7; ++i) {
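// Seven iterations of three rows each produce the 21 horizontally filtered
// rows that the vertical pass reads from tmp.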
1496 d6u8 = vld1_u8(src);
1497 d7u8 = vld1_u8(src + 8);
1498 d8u8 = vld1_u8(src + 16);
1499 src += src_pixels_per_line;
1500 d9u8 = vld1_u8(src);
1501 d10u8 = vld1_u8(src + 8);
1502 d11u8 = vld1_u8(src + 16);
1503 src += src_pixels_per_line;
1504 d12u8 = vld1_u8(src);
1505 d13u8 = vld1_u8(src + 8);
1506 d14u8 = vld1_u8(src + 16);
1507 src += src_pixels_per_line;
1509 __builtin_prefetch(src);
1510 __builtin_prefetch(src + src_pixels_per_line);
1511 __builtin_prefetch(src + src_pixels_per_line * 2);
1513 q8u16 = vmull_u8(d6u8, d0u8);
1514 q9u16 = vmull_u8(d7u8, d0u8);
1515 q10u16 = vmull_u8(d9u8, d0u8);
1516 q11u16 = vmull_u8(d10u8, d0u8);
1517 q12u16 = vmull_u8(d12u8, d0u8);
1518 q13u16 = vmull_u8(d13u8, d0u8);
1520 d28u8 = vext_u8(d6u8, d7u8, 1);
1521 d29u8 = vext_u8(d9u8, d10u8, 1);
1522 d30u8 = vext_u8(d12u8, d13u8, 1);
1523 q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
1524 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1525 q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
1526 d28u8 = vext_u8(d7u8, d8u8, 1);
1527 d29u8 = vext_u8(d10u8, d11u8, 1);
1528 d30u8 = vext_u8(d13u8, d14u8, 1);
1529 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1530 q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
1531 q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
1533 d28u8 = vext_u8(d6u8, d7u8, 4);
1534 d29u8 = vext_u8(d9u8, d10u8, 4);
1535 d30u8 = vext_u8(d12u8, d13u8, 4);
1536 q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
1537 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1538 q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
1539 d28u8 = vext_u8(d7u8, d8u8, 4);
1540 d29u8 = vext_u8(d10u8, d11u8, 4);
1541 d30u8 = vext_u8(d13u8, d14u8, 4);
1542 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1543 q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
1544 q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
1546 d28u8 = vext_u8(d6u8, d7u8, 5);
1547 d29u8 = vext_u8(d9u8, d10u8, 5);
1548 d30u8 = vext_u8(d12u8, d13u8, 5);
1549 q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
1550 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1551 q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
1552 d28u8 = vext_u8(d7u8, d8u8, 5);
1553 d29u8 = vext_u8(d10u8, d11u8, 5);
1554 d30u8 = vext_u8(d13u8, d14u8, 5);
1555 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1556 q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
1557 q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
1559 d28u8 = vext_u8(d6u8, d7u8, 2);
1560 d29u8 = vext_u8(d9u8, d10u8, 2);
1561 d30u8 = vext_u8(d12u8, d13u8, 2);
1562 q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
1563 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1564 q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
1565 d28u8 = vext_u8(d7u8, d8u8, 2);
1566 d29u8 = vext_u8(d10u8, d11u8, 2);
1567 d30u8 = vext_u8(d13u8, d14u8, 2);
1568 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1569 q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
1570 q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
1572 d28u8 = vext_u8(d6u8, d7u8, 3);
1573 d29u8 = vext_u8(d9u8, d10u8, 3);
1574 d30u8 = vext_u8(d12u8, d13u8, 3);
1575 d15u8 = vext_u8(d7u8, d8u8, 3);
1576 d31u8 = vext_u8(d10u8, d11u8, 3);
1577 d6u8 = vext_u8(d13u8, d14u8, 3);
1578 q4u16 = vmull_u8(d28u8, d3u8);
1579 q5u16 = vmull_u8(d29u8, d3u8);
1580 q6u16 = vmull_u8(d30u8, d3u8);
1581 q4s16 = vreinterpretq_s16_u16(q4u16);
1582 q5s16 = vreinterpretq_s16_u16(q5u16);
1583 q6s16 = vreinterpretq_s16_u16(q6u16);
1584 q8s16 = vreinterpretq_s16_u16(q8u16);
1585 q10s16 = vreinterpretq_s16_u16(q10u16);
1586 q12s16 = vreinterpretq_s16_u16(q12u16);
1587 q8s16 = vqaddq_s16(q8s16, q4s16);
1588 q10s16 = vqaddq_s16(q10s16, q5s16);
1589 q12s16 = vqaddq_s16(q12s16, q6s16);
1591 q6u16 = vmull_u8(d15u8, d3u8);
1592 q7u16 = vmull_u8(d31u8, d3u8);
1593 q3u16 = vmull_u8(d6u8, d3u8);
1594 q3s16 = vreinterpretq_s16_u16(q3u16);
1595 q6s16 = vreinterpretq_s16_u16(q6u16);
1596 q7s16 = vreinterpretq_s16_u16(q7u16);
1597 q9s16 = vreinterpretq_s16_u16(q9u16);
1598 q11s16 = vreinterpretq_s16_u16(q11u16);
1599 q13s16 = vreinterpretq_s16_u16(q13u16);
1600 q9s16 = vqaddq_s16(q9s16, q6s16);
1601 q11s16 = vqaddq_s16(q11s16, q7s16);
1602 q13s16 = vqaddq_s16(q13s16, q3s16);
1604 d6u8 = vqrshrun_n_s16(q8s16, 7);
1605 d7u8 = vqrshrun_n_s16(q9s16, 7);
1606 d8u8 = vqrshrun_n_s16(q10s16, 7);
1607 d9u8 = vqrshrun_n_s16(q11s16, 7);
1608 d10u8 = vqrshrun_n_s16(q12s16, 7);
1609 d11u8 = vqrshrun_n_s16(q13s16, 7);
1611 vst1_u8(tmpp, d6u8);
1613 vst1_u8(tmpp, d7u8);
1615 vst1_u8(tmpp, d8u8);
1617 vst1_u8(tmpp, d9u8);
1619 vst1_u8(tmpp, d10u8);
1621 vst1_u8(tmpp, d11u8);
1625 // Second pass: 16x16
1626 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1627 d0s8 = vdup_lane_s8(dtmps8, 0);
1628 d1s8 = vdup_lane_s8(dtmps8, 1);
1629 d2s8 = vdup_lane_s8(dtmps8, 2);
1630 d3s8 = vdup_lane_s8(dtmps8, 3);
1631 d4s8 = vdup_lane_s8(dtmps8, 4);
1632 d5s8 = vdup_lane_s8(dtmps8, 5);
1633 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1634 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1635 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1636 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1637 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1638 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1640 for (i = 0; i < 2; ++i) {
1641 dst = dst_ptr + 8 * i;
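// Second pass: each 8-wide half of the 21-row tmp buffer is filtered
// vertically, four output rows per inner iteration.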
1643 d18u8 = vld1_u8(tmpp);
1645 d19u8 = vld1_u8(tmpp);
1647 d20u8 = vld1_u8(tmpp);
1649 d21u8 = vld1_u8(tmpp);
1651 d22u8 = vld1_u8(tmpp);
1653 for (j = 0; j < 4; ++j) {
1654 d23u8 = vld1_u8(tmpp);
1656 d24u8 = vld1_u8(tmpp);
1658 d25u8 = vld1_u8(tmpp);
1660 d26u8 = vld1_u8(tmpp);
1663 q3u16 = vmull_u8(d18u8, d0u8);
1664 q4u16 = vmull_u8(d19u8, d0u8);
1665 q5u16 = vmull_u8(d20u8, d0u8);
1666 q6u16 = vmull_u8(d21u8, d0u8);
1668 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1669 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1670 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1671 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1673 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1674 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1675 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1676 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1678 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1679 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1680 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1681 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1683 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1684 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1685 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1686 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1688 q7u16 = vmull_u8(d21u8, d3u8);
1689 q8u16 = vmull_u8(d22u8, d3u8);
1690 q9u16 = vmull_u8(d23u8, d3u8);
1691 q10u16 = vmull_u8(d24u8, d3u8);
1693 q3s16 = vreinterpretq_s16_u16(q3u16);
1694 q4s16 = vreinterpretq_s16_u16(q4u16);
1695 q5s16 = vreinterpretq_s16_u16(q5u16);
1696 q6s16 = vreinterpretq_s16_u16(q6u16);
1697 q7s16 = vreinterpretq_s16_u16(q7u16);
1698 q8s16 = vreinterpretq_s16_u16(q8u16);
1699 q9s16 = vreinterpretq_s16_u16(q9u16);
1700 q10s16 = vreinterpretq_s16_u16(q10u16);
1702 q7s16 = vqaddq_s16(q7s16, q3s16);
1703 q8s16 = vqaddq_s16(q8s16, q4s16);
1704 q9s16 = vqaddq_s16(q9s16, q5s16);
1705 q10s16 = vqaddq_s16(q10s16, q6s16);
1707 d6u8 = vqrshrun_n_s16(q7s16, 7);
1708 d7u8 = vqrshrun_n_s16(q8s16, 7);
1709 d8u8 = vqrshrun_n_s16(q9s16, 7);
1710 d9u8 = vqrshrun_n_s16(q10s16, 7);