granicus.if.org Git - libvpx/blob - vp8/common/arm/neon/vp8_loopfilter_neon.c

   1 /*
   2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <arm_neon.h>
  12 #include "./vpx_config.h"
  13 #include "vpx_ports/arm.h"
  14
  15 static INLINE void vp8_loop_filter_neon(
  16         uint8x16_t qblimit,  // flimit
  17         uint8x16_t qlimit,   // limit
  18         uint8x16_t qthresh,  // thresh
  19         uint8x16_t q3,       // p3
  20         uint8x16_t q4,       // p2
  21         uint8x16_t q5,       // p1
  22         uint8x16_t q6,       // p0
  23         uint8x16_t q7,       // q0
  24         uint8x16_t q8,       // q1
  25         uint8x16_t q9,       // q2
  26         uint8x16_t q10,      // q3
  27         uint8x16_t *q5r,     // p1
  28         uint8x16_t *q6r,     // p0
  29         uint8x16_t *q7r,     // q0
  30         uint8x16_t *q8r) {   // q1
  31     uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  32     int16x8_t q2s16, q11s16;
  33     uint16x8_t q4u16;
  34     int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
  35     int8x8_t d2s8, d3s8;
  36
  37     q11u8 = vabdq_u8(q3, q4);
  38     q12u8 = vabdq_u8(q4, q5);
  39     q13u8 = vabdq_u8(q5, q6);
  40     q14u8 = vabdq_u8(q8, q7);
  41     q3    = vabdq_u8(q9, q8);
  42     q4    = vabdq_u8(q10, q9);
  43
  44     q11u8 = vmaxq_u8(q11u8, q12u8);
  45     q12u8 = vmaxq_u8(q13u8, q14u8);
  46     q3    = vmaxq_u8(q3, q4);
  47     q15u8 = vmaxq_u8(q11u8, q12u8);
  48
  49     q9 = vabdq_u8(q6, q7);
  50
  51     // vp8_hevmask
  52     q13u8 = vcgtq_u8(q13u8, qthresh);
  53     q14u8 = vcgtq_u8(q14u8, qthresh);
  54     q15u8 = vmaxq_u8(q15u8, q3);
  55
  56     q2u8 = vabdq_u8(q5, q8);
  57     q9 = vqaddq_u8(q9, q9);
  58
  59     q15u8 = vcgeq_u8(qlimit, q15u8);
  60
  61     // vp8_filter() function
  62     // convert to signed
  63     q10 = vdupq_n_u8(0x80);
  64     q8 = veorq_u8(q8, q10);
  65     q7 = veorq_u8(q7, q10);
  66     q6 = veorq_u8(q6, q10);
  67     q5 = veorq_u8(q5, q10);
  68
  69     q2u8 = vshrq_n_u8(q2u8, 1);
  70     q9 = vqaddq_u8(q9, q2u8);
  71
  72     q10 = vdupq_n_u8(3);
  73
  74     q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
  75                      vget_low_s8(vreinterpretq_s8_u8(q6)));
  76     q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
  77                       vget_high_s8(vreinterpretq_s8_u8(q6)));
  78
  79     q9 = vcgeq_u8(qblimit, q9);
  80
  81     q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
  82                     vreinterpretq_s8_u8(q8));
  83
  84     q14u8 = vorrq_u8(q13u8, q14u8);
  85
  86     q4u16 = vmovl_u8(vget_low_u8(q10));
  87     q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
  88     q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
  89
  90     q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
  91     q15u8 = vandq_u8(q15u8, q9);
  92
  93     q1s8 = vreinterpretq_s8_u8(q1u8);
  94     q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  95     q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
  96
  97     q9 = vdupq_n_u8(4);
  98     // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
  99     d2s8 = vqmovn_s16(q2s16);
 100     d3s8 = vqmovn_s16(q11s16);
 101     q1s8 = vcombine_s8(d2s8, d3s8);
 102     q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
 103     q1s8 = vreinterpretq_s8_u8(q1u8);
 104
 105     q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
 106     q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
 107     q2s8 = vshrq_n_s8(q2s8, 3);
 108     q1s8 = vshrq_n_s8(q1s8, 3);
 109
 110     q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
 111     q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
 112
 113     q1s8 = vrshrq_n_s8(q1s8, 1);
 114     q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
 115
 116     q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
 117     q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
 118
 119     q0u8 = vdupq_n_u8(0x80);
 120     *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
 121     *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
 122     *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
 123     *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
 124     return;
 125 }
 126
 127 void vp8_loop_filter_horizontal_edge_y_neon(
 128         unsigned char *src,
 129         int pitch,
 130         unsigned char blimit,
 131         unsigned char limit,
 132         unsigned char thresh) {
 133     uint8x16_t qblimit, qlimit, qthresh, q3, q4;
 134     uint8x16_t q5, q6, q7, q8, q9, q10;
 135
 136     qblimit = vdupq_n_u8(blimit);
 137     qlimit  = vdupq_n_u8(limit);
 138     qthresh = vdupq_n_u8(thresh);
 139     src -= (pitch << 2);
 140
 141     q3 = vld1q_u8(src);
 142     src += pitch;
 143     q4 = vld1q_u8(src);
 144     src += pitch;
 145     q5 = vld1q_u8(src);
 146     src += pitch;
 147     q6 = vld1q_u8(src);
 148     src += pitch;
 149     q7 = vld1q_u8(src);
 150     src += pitch;
 151     q8 = vld1q_u8(src);
 152     src += pitch;
 153     q9 = vld1q_u8(src);
 154     src += pitch;
 155     q10 = vld1q_u8(src);
 156
 157     vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
 158                          q5, q6, q7, q8, q9, q10,
 159                          &q5, &q6, &q7, &q8);
 160
 161     src -= (pitch * 5);
 162     vst1q_u8(src, q5);
 163     src += pitch;
 164     vst1q_u8(src, q6);
 165     src += pitch;
 166     vst1q_u8(src, q7);
 167     src += pitch;
 168     vst1q_u8(src, q8);
 169     return;
 170 }
 171
 172 void vp8_loop_filter_horizontal_edge_uv_neon(
 173         unsigned char *u,
 174         int pitch,
 175         unsigned char blimit,
 176         unsigned char limit,
 177         unsigned char thresh,
 178         unsigned char *v) {
 179     uint8x16_t qblimit, qlimit, qthresh, q3, q4;
 180     uint8x16_t q5, q6, q7, q8, q9, q10;
 181     uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
 182     uint8x8_t d15, d16, d17, d18, d19, d20, d21;
 183
 184     qblimit = vdupq_n_u8(blimit);
 185     qlimit  = vdupq_n_u8(limit);
 186     qthresh = vdupq_n_u8(thresh);
 187
 188     u -= (pitch << 2);
 189     v -= (pitch << 2);
 190
 191     d6  = vld1_u8(u);
 192     u += pitch;
 193     d7  = vld1_u8(v);
 194     v += pitch;
 195     d8  = vld1_u8(u);
 196     u += pitch;
 197     d9  = vld1_u8(v);
 198     v += pitch;
 199     d10 = vld1_u8(u);
 200     u += pitch;
 201     d11 = vld1_u8(v);
 202     v += pitch;
 203     d12 = vld1_u8(u);
 204     u += pitch;
 205     d13 = vld1_u8(v);
 206     v += pitch;
 207     d14 = vld1_u8(u);
 208     u += pitch;
 209     d15 = vld1_u8(v);
 210     v += pitch;
 211     d16 = vld1_u8(u);
 212     u += pitch;
 213     d17 = vld1_u8(v);
 214     v += pitch;
 215     d18 = vld1_u8(u);
 216     u += pitch;
 217     d19 = vld1_u8(v);
 218     v += pitch;
 219     d20 = vld1_u8(u);
 220     d21 = vld1_u8(v);
 221
 222     q3 = vcombine_u8(d6, d7);
 223     q4 = vcombine_u8(d8, d9);
 224     q5 = vcombine_u8(d10, d11);
 225     q6 = vcombine_u8(d12, d13);
 226     q7 = vcombine_u8(d14, d15);
 227     q8 = vcombine_u8(d16, d17);
 228     q9 = vcombine_u8(d18, d19);
 229     q10 = vcombine_u8(d20, d21);
 230
 231     vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
 232                          q5, q6, q7, q8, q9, q10,
 233                          &q5, &q6, &q7, &q8);
 234
 235     u -= (pitch * 5);
 236     vst1_u8(u, vget_low_u8(q5));
 237     u += pitch;
 238     vst1_u8(u, vget_low_u8(q6));
 239     u += pitch;
 240     vst1_u8(u, vget_low_u8(q7));
 241     u += pitch;
 242     vst1_u8(u, vget_low_u8(q8));
 243
 244     v -= (pitch * 5);
 245     vst1_u8(v, vget_high_u8(q5));
 246     v += pitch;
 247     vst1_u8(v, vget_high_u8(q6));
 248     v += pitch;
 249     vst1_u8(v, vget_high_u8(q7));
 250     v += pitch;
 251     vst1_u8(v, vget_high_u8(q8));
 252     return;
 253 }
 254
 255 static INLINE void write_4x8(unsigned char *dst, int pitch,
 256                              const uint8x8x4_t result) {
 257 #ifdef VPX_INCOMPATIBLE_GCC
 258     /*
 259      * uint8x8x4_t result
 260     00 01 02 03 | 04 05 06 07
 261     10 11 12 13 | 14 15 16 17
 262     20 21 22 23 | 24 25 26 27
 263     30 31 32 33 | 34 35 36 37
 264     ---
 265     * after vtrn_u16
 266     00 01 20 21 | 04 05 24 25
 267     02 03 22 23 | 06 07 26 27
 268     10 11 30 31 | 14 15 34 35
 269     12 13 32 33 | 16 17 36 37
 270     ---
 271     * after vtrn_u8
 272     00 10 20 30 | 04 14 24 34
 273     01 11 21 31 | 05 15 25 35
 274     02 12 22 32 | 06 16 26 36
 275     03 13 23 33 | 07 17 27 37
 276     */
 277     const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
 278                                           vreinterpret_u16_u8(result.val[2]));
 279     const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
 280                                           vreinterpret_u16_u8(result.val[3]));
 281     const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
 282                                        vreinterpret_u8_u16(r13_u16.val[0]));
 283     const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
 284                                        vreinterpret_u8_u16(r13_u16.val[1]));
 285     const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
 286     const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
 287     const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
 288     const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
 289     vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
 290     dst += pitch;
 291     vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
 292     dst += pitch;
 293     vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
 294     dst += pitch;
 295     vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
 296     dst += pitch;
 297     vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
 298     dst += pitch;
 299     vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
 300     dst += pitch;
 301     vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
 302     dst += pitch;
 303     vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
 304 #else
 305     vst4_lane_u8(dst, result, 0);
 306     dst += pitch;
 307     vst4_lane_u8(dst, result, 1);
 308     dst += pitch;
 309     vst4_lane_u8(dst, result, 2);
 310     dst += pitch;
 311     vst4_lane_u8(dst, result, 3);
 312     dst += pitch;
 313     vst4_lane_u8(dst, result, 4);
 314     dst += pitch;
 315     vst4_lane_u8(dst, result, 5);
 316     dst += pitch;
 317     vst4_lane_u8(dst, result, 6);
 318     dst += pitch;
 319     vst4_lane_u8(dst, result, 7);
 320 #endif  // VPX_INCOMPATIBLE_GCC
 321 }
 322
 323 void vp8_loop_filter_vertical_edge_y_neon(
 324         unsigned char *src,
 325         int pitch,
 326         unsigned char blimit,
 327         unsigned char limit,
 328         unsigned char thresh) {
 329     unsigned char *s, *d;
 330     uint8x16_t qblimit, qlimit, qthresh, q3, q4;
 331     uint8x16_t q5, q6, q7, q8, q9, q10;
 332     uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
 333     uint8x8_t d15, d16, d17, d18, d19, d20, d21;
 334     uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
 335     uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
 336     uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
 337     uint8x8x4_t q4ResultH, q4ResultL;
 338
 339     qblimit = vdupq_n_u8(blimit);
 340     qlimit  = vdupq_n_u8(limit);
 341     qthresh = vdupq_n_u8(thresh);
 342
 343     s = src - 4;
 344     d6  = vld1_u8(s);
 345     s += pitch;
 346     d8  = vld1_u8(s);
 347     s += pitch;
 348     d10 = vld1_u8(s);
 349     s += pitch;
 350     d12 = vld1_u8(s);
 351     s += pitch;
 352     d14 = vld1_u8(s);
 353     s += pitch;
 354     d16 = vld1_u8(s);
 355     s += pitch;
 356     d18 = vld1_u8(s);
 357     s += pitch;
 358     d20 = vld1_u8(s);
 359     s += pitch;
 360     d7  = vld1_u8(s);
 361     s += pitch;
 362     d9  = vld1_u8(s);
 363     s += pitch;
 364     d11 = vld1_u8(s);
 365     s += pitch;
 366     d13 = vld1_u8(s);
 367     s += pitch;
 368     d15 = vld1_u8(s);
 369     s += pitch;
 370     d17 = vld1_u8(s);
 371     s += pitch;
 372     d19 = vld1_u8(s);
 373     s += pitch;
 374     d21 = vld1_u8(s);
 375
 376     q3 = vcombine_u8(d6, d7);
 377     q4 = vcombine_u8(d8, d9);
 378     q5 = vcombine_u8(d10, d11);
 379     q6 = vcombine_u8(d12, d13);
 380     q7 = vcombine_u8(d14, d15);
 381     q8 = vcombine_u8(d16, d17);
 382     q9 = vcombine_u8(d18, d19);
 383     q10 = vcombine_u8(d20, d21);
 384
 385     q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
 386     q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
 387     q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
 388     q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
 389
 390     q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
 391                        vreinterpretq_u16_u32(q2tmp2.val[0]));
 392     q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
 393                        vreinterpretq_u16_u32(q2tmp3.val[0]));
 394     q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
 395                        vreinterpretq_u16_u32(q2tmp2.val[1]));
 396     q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
 397                        vreinterpretq_u16_u32(q2tmp3.val[1]));
 398
 399     q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
 400                        vreinterpretq_u8_u16(q2tmp5.val[0]));
 401     q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
 402                        vreinterpretq_u8_u16(q2tmp5.val[1]));
 403     q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
 404                        vreinterpretq_u8_u16(q2tmp7.val[0]));
 405     q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
 406                        vreinterpretq_u8_u16(q2tmp7.val[1]));
 407
 408     q3 = q2tmp8.val[0];
 409     q4 = q2tmp8.val[1];
 410     q5 = q2tmp9.val[0];
 411     q6 = q2tmp9.val[1];
 412     q7 = q2tmp10.val[0];
 413     q8 = q2tmp10.val[1];
 414     q9 = q2tmp11.val[0];
 415     q10 = q2tmp11.val[1];
 416
 417     vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
 418                          q5, q6, q7, q8, q9, q10,
 419                          &q5, &q6, &q7, &q8);
 420
 421     q4ResultL.val[0] = vget_low_u8(q5);   // d10
 422     q4ResultL.val[1] = vget_low_u8(q6);   // d12
 423     q4ResultL.val[2] = vget_low_u8(q7);   // d14
 424     q4ResultL.val[3] = vget_low_u8(q8);   // d16
 425     q4ResultH.val[0] = vget_high_u8(q5);  // d11
 426     q4ResultH.val[1] = vget_high_u8(q6);  // d13
 427     q4ResultH.val[2] = vget_high_u8(q7);  // d15
 428     q4ResultH.val[3] = vget_high_u8(q8);  // d17
 429
 430     d = src - 2;
 431     write_4x8(d, pitch, q4ResultL);
 432     d += pitch * 8;
 433     write_4x8(d, pitch, q4ResultH);
 434 }
 435
 436 void vp8_loop_filter_vertical_edge_uv_neon(
 437         unsigned char *u,
 438         int pitch,
 439         unsigned char blimit,
 440         unsigned char limit,
 441         unsigned char thresh,
 442         unsigned char *v) {
 443     unsigned char *us, *ud;
 444     unsigned char *vs, *vd;
 445     uint8x16_t qblimit, qlimit, qthresh, q3, q4;
 446     uint8x16_t q5, q6, q7, q8, q9, q10;
 447     uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
 448     uint8x8_t d15, d16, d17, d18, d19, d20, d21;
 449     uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
 450     uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
 451     uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
 452     uint8x8x4_t q4ResultH, q4ResultL;
 453
 454     qblimit = vdupq_n_u8(blimit);
 455     qlimit  = vdupq_n_u8(limit);
 456     qthresh = vdupq_n_u8(thresh);
 457
 458     us = u - 4;
 459     d6 = vld1_u8(us);
 460     us += pitch;
 461     d8 = vld1_u8(us);
 462     us += pitch;
 463     d10 = vld1_u8(us);
 464     us += pitch;
 465     d12 = vld1_u8(us);
 466     us += pitch;
 467     d14 = vld1_u8(us);
 468     us += pitch;
 469     d16 = vld1_u8(us);
 470     us += pitch;
 471     d18 = vld1_u8(us);
 472     us += pitch;
 473     d20 = vld1_u8(us);
 474
 475     vs = v - 4;
 476     d7 = vld1_u8(vs);
 477     vs += pitch;
 478     d9 = vld1_u8(vs);
 479     vs += pitch;
 480     d11 = vld1_u8(vs);
 481     vs += pitch;
 482     d13 = vld1_u8(vs);
 483     vs += pitch;
 484     d15 = vld1_u8(vs);
 485     vs += pitch;
 486     d17 = vld1_u8(vs);
 487     vs += pitch;
 488     d19 = vld1_u8(vs);
 489     vs += pitch;
 490     d21 = vld1_u8(vs);
 491
 492     q3 = vcombine_u8(d6, d7);
 493     q4 = vcombine_u8(d8, d9);
 494     q5 = vcombine_u8(d10, d11);
 495     q6 = vcombine_u8(d12, d13);
 496     q7 = vcombine_u8(d14, d15);
 497     q8 = vcombine_u8(d16, d17);
 498     q9 = vcombine_u8(d18, d19);
 499     q10 = vcombine_u8(d20, d21);
 500
 501     q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
 502     q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
 503     q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
 504     q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
 505
 506     q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
 507                        vreinterpretq_u16_u32(q2tmp2.val[0]));
 508     q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
 509                        vreinterpretq_u16_u32(q2tmp3.val[0]));
 510     q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
 511                        vreinterpretq_u16_u32(q2tmp2.val[1]));
 512     q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
 513                        vreinterpretq_u16_u32(q2tmp3.val[1]));
 514
 515     q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
 516                        vreinterpretq_u8_u16(q2tmp5.val[0]));
 517     q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
 518                        vreinterpretq_u8_u16(q2tmp5.val[1]));
 519     q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
 520                        vreinterpretq_u8_u16(q2tmp7.val[0]));
 521     q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
 522                        vreinterpretq_u8_u16(q2tmp7.val[1]));
 523
 524     q3 = q2tmp8.val[0];
 525     q4 = q2tmp8.val[1];
 526     q5 = q2tmp9.val[0];
 527     q6 = q2tmp9.val[1];
 528     q7 = q2tmp10.val[0];
 529     q8 = q2tmp10.val[1];
 530     q9 = q2tmp11.val[0];
 531     q10 = q2tmp11.val[1];
 532
 533     vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
 534                          q5, q6, q7, q8, q9, q10,
 535                          &q5, &q6, &q7, &q8);
 536
 537     q4ResultL.val[0] = vget_low_u8(q5);   // d10
 538     q4ResultL.val[1] = vget_low_u8(q6);   // d12
 539     q4ResultL.val[2] = vget_low_u8(q7);   // d14
 540     q4ResultL.val[3] = vget_low_u8(q8);   // d16
 541     ud = u - 2;
 542     write_4x8(ud, pitch, q4ResultL);
 543
 544     q4ResultH.val[0] = vget_high_u8(q5);  // d11
 545     q4ResultH.val[1] = vget_high_u8(q6);  // d13
 546     q4ResultH.val[2] = vget_high_u8(q7);  // d15
 547     q4ResultH.val[3] = vget_high_u8(q8);  // d17
 548     vd = v - 2;
 549     write_4x8(vd, pitch, q4ResultH);
 550 }