Replace 14 with DCT_CONST_BITS in idct NEON functions' shifts

author Linfeng Zhang <linfengz@google.com>

Tue, 14 Feb 2017 20:44:57 +0000 (12:44 -0800)

committer James Zern <jzern@google.com>

Tue, 14 Feb 2017 21:08:41 +0000 (13:08 -0800)
author Linfeng Zhang <linfengz@google.com>
Tue, 14 Feb 2017 20:44:57 +0000 (12:44 -0800)
committer James Zern <jzern@google.com>
Tue, 14 Feb 2017 21:08:41 +0000 (13:08 -0800)
diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c

index 8ab2960c0ad6d7844fcb244dc2456f06ba0ed2ec..0f96165e1a50086a3657daa0f90b6b5d5c7dd838 100644 (file)
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -19,14 +19,14 @@ static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t,
                                                       int32x4x2_t *const d1) {
    int32x2x2_t t32[4];
  
-  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], 14);
-  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], 14);
-  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], 14);
-  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], 14);
-  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], 14);
-  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], 14);
-  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], 14);
-  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], 14);
+  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
    d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
    d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
    d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c

index 26fa3e216bbce2a5be305d4ea507efa36191f2f0..128f72b9c96d7a4a791bd399feea4c8581d28fd1 100644 (file)
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -82,10 +82,10 @@ static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
    b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
    b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
    b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
-  b0 = vrshrq_n_s32(b0, 14);
-  b1 = vrshrq_n_s32(b1, 14);
-  b2 = vrshrq_n_s32(b2, 14);
-  b3 = vrshrq_n_s32(b3, 14);
+  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
    *a0 = vaddq_s32(b0, b3);
    *a1 = vaddq_s32(b1, b2);
    *a2 = vsubq_s32(b1, b2);
@@ -119,10 +119,14 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
    c5 = vsubq_s64(c5, c9);
    c6 = vaddq_s64(c6, c10);
    c7 = vaddq_s64(c7, c11);
-  b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
-  b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
-  b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
-  b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
+                    vrshrn_n_s64(c1, DCT_CONST_BITS));
+  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
+                    vrshrn_n_s64(c3, DCT_CONST_BITS));
+  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
+                    vrshrn_n_s64(c5, DCT_CONST_BITS));
+  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
+                    vrshrn_n_s64(c7, DCT_CONST_BITS));
    *a0 = vaddq_s32(b0, b3);
    *a1 = vaddq_s32(b1, b2);
    *a2 = vsubq_s32(b1, b2);
diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c

index 141d2e68d24ee76e22ca0568551c9e04a218d744..f53f4c7fcad0fa2ce018de71e8acd5152f9bec43 100644 (file)
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -82,18 +82,18 @@ static INLINE void idct8x8_12_half1d_bd10(
    step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
    step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
    step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
  
    // stage 2
    step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
    step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
    step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
  
    step2[4] = vaddq_s32(step1[4], step1[5]);
    step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -109,8 +109,8 @@ static INLINE void idct8x8_12_half1d_bd10(
    step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
    step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
    step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
  
    // stage 4
    *io0 = vaddq_s32(step1[0], step2[7]);
@@ -154,14 +154,14 @@ static INLINE void idct8x8_12_half1d_bd12(
    t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
    t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
    t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
    step1[4] = vcombine_s32(t32[0], t32[1]);
    step1[5] = vcombine_s32(t32[2], t32[3]);
    step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -174,12 +174,12 @@ static INLINE void idct8x8_12_half1d_bd12(
    t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
    t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
    t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
    step2[1] = vcombine_s32(t32[2], t32[3]);
    step2[2] = vcombine_s32(t32[4], t32[5]);
    step2[3] = vcombine_s32(t32[6], t32[7]);
@@ -205,10 +205,10 @@ static INLINE void idct8x8_12_half1d_bd12(
        vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
    t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                            vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
    step1[5] = vcombine_s32(t32[0], t32[1]);
    step1[6] = vcombine_s32(t32[2], t32[3]);
  
@@ -377,10 +377,10 @@ static INLINE void idct8x8_64_half1d_bd10(
    step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
    step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
  
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
  
    // stage 2
    step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
@@ -392,10 +392,10 @@ static INLINE void idct8x8_64_half1d_bd10(
    step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
    step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
  
-  step2[0] = vrshrq_n_s32(step2[0], 14);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
  
    step2[4] = vaddq_s32(step1[4], step1[5]);
    step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -411,8 +411,8 @@ static INLINE void idct8x8_64_half1d_bd10(
    step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
    step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
    step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
  
    // stage 4
    *io0 = vaddq_s32(step1[0], step2[7]);
@@ -473,14 +473,14 @@ static INLINE void idct8x8_64_half1d_bd12(
    t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
    t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
    t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
    step1[4] = vcombine_s32(t32[0], t32[1]);
    step1[5] = vcombine_s32(t32[2], t32[3]);
    step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -501,14 +501,14 @@ static INLINE void idct8x8_64_half1d_bd12(
    t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
    t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
    t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
    step2[0] = vcombine_s32(t32[0], t32[1]);
    step2[1] = vcombine_s32(t32[2], t32[3]);
    step2[2] = vcombine_s32(t32[4], t32[5]);
@@ -535,10 +535,10 @@ static INLINE void idct8x8_64_half1d_bd12(
        vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
    t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                            vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
    step1[5] = vcombine_s32(t32[0], t32[1]);
    step1[6] = vcombine_s32(t32[2], t32[3]);
  
diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c

index 728ebaeef72c18cdbfc0b84346353b36ba90ab33..b2f516f41208054605e20b2f0790585496702760 100644 (file)
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -16,8 +16,8 @@
  
  static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                  int16x4_t *const d1) {
-  *d0 = vrshrn_n_s32(t32[0], 14);
-  *d1 = vrshrn_n_s32(t32[1], 14);
+  *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  }
  
  static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c

index de1bf978750b54af1b2436a1c3c9d8bc9fe8a87c..ae9457e18eeb378cdeaabac3e680053796161d2d 100644 (file)
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -147,8 +147,10 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
    q11s32 = vaddq_s32(q12s32, q11s32);
    q10s32 = vaddq_s32(q10s32, q15s32);
  
-  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
-  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q9s32, DCT_CONST_BITS));
+  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q10s32, DCT_CONST_BITS));
  }
  
  static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h

index 9f27e640420d49b2c3192ff579e1a2586ccb8b14..7f7f2f133a13ba94e9c08b61fa9898d2f044156c 100644 (file)
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -15,6 +15,7 @@
  
  #include "./vpx_config.h"
  #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
  #include "vpx_dsp/vpx_dsp_common.h"
  
  DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
@@ -93,21 +94,21 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  
  //------------------------------------------------------------------------------
  
-// Multiply a by a_const. Saturate, shift and narrow by 14.
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
  static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                        const int16_t a_const) {
-  // Shift by 14 + rounding will be within 16 bits for well formed streams.
-  // See WRAPLOW and dct_const_round_shift for details.
+  // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+  // streams. See WRAPLOW and dct_const_round_shift for details.
    // This instruction doubles the result and returns the high half, essentially
    // resulting in a right shift by 15. By multiplying the constant first that
-  // becomes a right shift by 14.
+  // becomes a right shift by DCT_CONST_BITS.
    // The largest possible value used here is
    // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) which falls *just*
    // within the range of int16_t (+32767 / -32768) even when negated.
    return vqrdmulhq_n_s16(a, a_const * 2);
  }
  
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
  static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
      const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
    // In both add_ and its pair, sub_, the input for well-formed streams will be
@@ -121,21 +122,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
    int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
    temp_low = vmulq_n_s32(temp_low, ab_const);
    temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
  }
  
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
  static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
      const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
    int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
    int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
    temp_low = vmulq_n_s32(temp_low, ab_const);
    temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
  }
  
  // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
  static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
      const int16x8_t a, const int16_t a_const, const int16x8_t b,
      const int16_t b_const) {
@@ -143,7 +147,8 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
    int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
    temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
    temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
  }
  
  // Shift the output down by 6 and add it to the destination buffer.
@@ -233,10 +238,10 @@ static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
    c3 = vmull_lane_s16(b2, cospis, 1);
    c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
    c3 = vmlal_lane_s16(c3, b3, cospis, 3);
-  b0 = vrshrn_n_s32(c0, 14);
-  b1 = vrshrn_n_s32(c1, 14);
-  b2 = vrshrn_n_s32(c2, 14);
-  b3 = vrshrn_n_s32(c3, 14);
+  b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
+  b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
+  b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
+  b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
    d0 = vcombine_s16(b0, b1);
    d1 = vcombine_s16(b3, b2);
    *a0 = vaddq_s16(d0, d1);
@@ -278,8 +283,8 @@ static INLINE void idct8x8_12_pass1_bd8(
    t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
    t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
    t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
-  step1[5] = vrshrn_n_s32(t32[0], 14);
-  step1[6] = vrshrn_n_s32(t32[1], 14);
+  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  
    // stage 4
    *io0 = vadd_s16(step1[0], step2[7]);
@@ -337,10 +342,10 @@ static INLINE void idct8x8_12_pass2_bd8(
    t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
    t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
    t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
    step1[5] = vcombine_s16(t16[0], t16[1]);
    step1[6] = vcombine_s16(t16[2], t16[3]);
  
@@ -405,14 +410,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
    t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
    t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
    t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
    step1[4] = vcombine_s16(t16[0], t16[1]);
    step1[5] = vcombine_s16(t16[2], t16[3]);
    step1[6] = vcombine_s16(t16[4], t16[5]);
@@ -433,14 +438,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
    t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
    t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
    t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
    step2[0] = vcombine_s16(t16[0], t16[1]);
    step2[1] = vcombine_s16(t16[2], t16[3]);
    step2[2] = vcombine_s16(t16[4], t16[5]);
@@ -463,10 +468,10 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
    t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
    t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
    t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
    step1[5] = vcombine_s16(t16[0], t16[1]);
    step1[6] = vcombine_s16(t16[2], t16[3]);
  
@@ -486,10 +491,10 @@ static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
                                                int16x8_t *const d1) {
    int16x4_t t16[4];
  
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
    *d0 = vcombine_s16(t16[0], t16[1]);
    *d1 = vcombine_s16(t16[2], t16[3]);
  }
author	Linfeng Zhang <linfengz@google.com>
	Tue, 14 Feb 2017 20:44:57 +0000 (12:44 -0800)
committer	James Zern <jzern@google.com>
	Tue, 14 Feb 2017 21:08:41 +0000 (13:08 -0800)
vpx_dsp/arm/highbd_idct16x16_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct4x4_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct8x8_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct16x16_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct32x32_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct_neon.h		patch \| blob \| history