Merge "Renaming in MB_MODE_INFO"

author Paul Wilkins <paulwilkins@google.com>

Thu, 15 Aug 2013 09:12:48 +0000 (02:12 -0700)

committer Gerrit Code Review <gerrit@gerrit.golo.chromium.org>

Thu, 15 Aug 2013 09:12:48 +0000 (02:12 -0700)
author Paul Wilkins <paulwilkins@google.com>
Thu, 15 Aug 2013 09:12:48 +0000 (02:12 -0700)
committer Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
Thu, 15 Aug 2013 09:12:48 +0000 (02:12 -0700)
diff --git a/build/make/armlink_adapter.sh b/build/make/armlink_adapter.sh

index b53669c9ba24ff07f6ccb19d728c7542dc299149..75c342e97ce3923eff0eec8647a90823b16c4d4f 100755 (executable)
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
  ##
  ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ##
@@ -13,20 +13,20 @@
  verbose=0
  set -- $*
  for i; do
-    if [ "$i" == "-o" ]; then
+    if [ "$i" = "-o" ]; then
          on_of=1
-    elif [ "$i" == "-v" ]; then
+    elif [ "$i" = "-v" ]; then
          verbose=1
-    elif [ "$i" == "-g" ]; then
+    elif [ "$i" = "-g" ]; then
          args="${args} --debug"
-    elif [ "$on_of" == "1" ]; then
+    elif [ "$on_of" = "1" ]; then
          outfile=$i
          on_of=0
      elif [ -f "$i" ]; then
          infiles="$infiles $i"
-    elif [ "${i:0:2}" == "-l" ]; then
+    elif [ "${i#-l}" != "$i" ]; then
          libs="$libs ${i#-l}"
-    elif [ "${i:0:2}" == "-L" ]; then
+    elif [ "${i#-L}" != "$i" ]; then
          libpaths="${libpaths} ${i#-L}"
      else
          args="${args} ${i}"
diff --git a/build/make/configure.sh b/build/make/configure.sh

index e2566b0a7aaae2f8bcb57e4258abe9b3ce1acc9e..d8e65e32d1f1025a106fd8d623cb45b6a31b0d1e 100755 (executable)
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
  ##
  ##  configure.sh
  ##
@@ -264,12 +264,13 @@ elif test ! -z "$TEMPDIR" ; then
  else
      TMPDIRx="/tmp"
  fi
-TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
-TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
-TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
-TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
-TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
-TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
+RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
+TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
+TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc"
+TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o"
+TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
+TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
  
  clean_temp_files() {
      rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
@@ -805,7 +806,7 @@ process_common_toolchain() {
              arch_int=${arch_int%%te}
              check_add_asflags --defsym ARCHITECTURE=${arch_int}
              tune_cflags="-mtune="
-            if [ ${tgt_isa} == "armv7" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
                  if [ -z "${float_abi}" ]; then
                      check_cpp <<EOF && float_abi=hard || float_abi=softfp
  #ifndef __ARM_PCS_VFP
@@ -855,7 +856,7 @@ EOF
              tune_cflags="--cpu="
              tune_asflags="--cpu="
              if [ -z "${tune_cpu}" ]; then
-                if [ ${tgt_isa} == "armv7" ]; then
+                if [ ${tgt_isa} = "armv7" ]; then
                      if enabled neon
                      then
                          check_add_cflags --fpu=softvfp+vfpv3
@@ -915,7 +916,7 @@ EOF
  
              enable pic
              soft_enable realtime_only
-            if [ ${tgt_isa} == "armv7" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
                  soft_enable runtime_cpu_detect
              fi
              if enabled runtime_cpu_detect; then
@@ -1191,7 +1192,7 @@ EOF
  
      # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
      echo "  checking here for x86inc \"${tgt_isa}\" \"$pic\" "
-    if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! ${tgt_os:0:6} = darwin ]; then
+    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}"  ]; then
        soft_enable use_x86inc
      fi
  
@@ -1287,8 +1288,8 @@ print_config_h() {
  
  print_webm_license() {
      local destination=$1
-    local prefix=$2
-    local suffix=$3
+    local prefix="$2"
+    local suffix="$3"
      shift 3
      cat <<EOF > ${destination}
  ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
diff --git a/build/make/gen_asm_deps.sh b/build/make/gen_asm_deps.sh

index 0b4e3aa84a56686a1448d50de7b6f0ae2e84c461..6a7bff9ebce4c11eccae70e76619c8332f4353b6 100755 (executable)
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
  ##
  ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ##
diff --git a/build/make/version.sh b/build/make/version.sh

index 3efb956bb783ea3afaad8dcc7a36fab5ed71cb8b..e31e568aad4355945fa8cd984e1c008dd3d5a64a 100755 (executable)
--- a/build/make/version.sh
+++ b/build/make/version.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
  ##
  ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ##
diff --git a/configure b/configure

index 64f016547fcf537d646241503b434f2f9b7eec23..f25fa204adb5fc07aa92bbfafeb4501fc1b28b30 100755 (executable)
--- a/configure
+++ b/configure
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
  ##
  ##  configure
  ##
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c

new file mode 100644 (file)

index 0000000..319d58f
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
+                                               int16_t *output,
+                                               int output_stride);
+extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+                                               int16_t *output,
+                                               int16_t *pass1Output,
+                                               int16_t skip_adding,
+                                               uint8_t *dest,
+                                               int dest_stride);
+extern void save_registers();
+extern void restore_registers();
+
+// 16x16 IDCT + add: row pass on two 8-row halves, then column pass into dest.
+void vp9_short_idct16x16_add_neon(int16_t *input,
+                                  uint8_t *dest, int dest_stride) {
+  int16_t pass1_output[16*16] = {0};
+  int16_t row_idct_output[16*16] = {0};
+
+  // Save d8-d15 (q4-q7): the pass1/pass2 assembly uses them as scratch.
+  save_registers();
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // them with the pass1 result (pass1_output) to compute the stage 7 result,
+  // which will be saved into row_idct_output.
+  vp9_short_idct16x16_add_neon_pass2(input+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the lower 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // them with the pass1 result (pass1_output) to compute the stage 7 result,
+  // which will be saved into row_idct_output.
+  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // them with the pass1 result (pass1_output) to compute the stage 7 result.
+  // Despite its name, a nonzero skip_adding makes pass2 add the result to dest.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // them with the pass1 result (pass1_output) to compute the stage 7 result.
+  // Despite its name, a nonzero skip_adding makes pass2 add the result to dest.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,
+                                     dest+8,
+                                     dest_stride);
+
+  // restore d8-d15 register values.
+  restore_registers();
+
+  return;
+}
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm

new file mode 100644 (file)

index 0000000..8d0eab9
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
@@ -0,0 +1,809 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_idct16x16_add_neon_pass1|
+    EXPORT  |vp9_short_idct16x16_add_neon_pass2|
+    EXPORT  |save_registers|
+    EXPORT  |restore_registers|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Transpose an 8x8 matrix of 16-bit elements held in q8-q15, in place.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24                  ; swap the off-diagonal 4x4 blocks
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10                   ; transpose at 32-bit granularity
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9                    ; transpose at 16-bit granularity
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input,
+;                                          int16_t *output, int output_stride)
+;
+; r0  int16_t *input
+; r1  int16_t *output
+; r2  int  output_stride (bytes added to r1 after each 8-byte store)
+
+; idct16 stage1 - stage6 on the even-indexed elements loaded in q8-q15. The
+; results are left in q8-q15 and stored to output. q0-q7 are used as scratch;
+; q4-q7 (d8-d15) are not saved here.
+|vp9_short_idct16x16_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!            ; evens -> q8, odds -> q9
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!            ; q15 has no successor register
+    vmov.s16        q15, q1                   ; so the last evens are copied in
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0xc00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r12, #0x3e00
+    add             r12, #0xc5
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r12                   ; duplicate cospi_4_64
+
+    ; preloading to avoid stall
+    ; generate cospi_12_64 = 13623
+    mov             r3, #0x3500
+    add             r3, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r12, #0x2300
+    add             r12, #0x8e
+
+    ; step2[4] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; step2[4] * cospi_28_64 - step2[7] * cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; step2[4] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; step2[4] * cospi_4_64 + step2[7] * cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    vdup.16         d0, r3;                   ; duplicate cospi_12_64
+    vdup.16         d1, r12;                  ; duplicate cospi_20_64
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; step2[5] * cospi_12_64
+    vmull.s16       q2, d26, d0
+    vmull.s16       q3, d27, d0
+
+    ; step2[5] * cospi_12_64 - step2[6] * cospi_20_64
+    vmlsl.s16       q2, d22, d1
+    vmlsl.s16       q3, d23, d1
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d10, q2, #14              ; >> 14
+    vqrshrn.s32     d11, q3, #14              ; >> 14
+
+    ; step2[5] * cospi_20_64
+    vmull.s16       q2, d26, d1
+    vmull.s16       q3, d27, d1
+
+    ; step2[5] * cospi_20_64 + step2[6] * cospi_12_64
+    vmlal.s16       q2, d22, d0
+    vmlal.s16       q3, d23, d0
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q3, #14              ; >> 14
+
+    ; stage 4
+    vdup.16         d30, r3                   ; cospi_16_64
+
+    ; step1[0] * cospi_16_64
+    vmull.s16       q2, d16, d30
+    vmull.s16       q11, d17, d30
+
+    ; step1[1] * cospi_16_64
+    vmull.s16       q0, d24, d30
+    vmull.s16       q1, d25, d30
+
+    ; (step1[0] + step1[1]) * cospi_16_64;
+    vadd.s32        q3, q2, q0
+    vadd.s32        q15, q11, q1
+
+    ; step2[0] = dct_const_round_shift(temp1)
+    vqrshrn.s32     d16, q3, #14              ; >> 14
+    vqrshrn.s32     d17, q15, #14             ; >> 14
+
+    ; (step1[0] - step1[1]) * cospi_16_64
+    vsub.s32        q3, q2, q0
+    vsub.s32        q15, q11, q1
+
+    ; step2[1] = dct_const_round_shift(temp2)
+    vqrshrn.s32     d18, q3, #14              ; >> 14
+    vqrshrn.s32     d19, q15, #14             ; >> 14
+
+    ; generate cospi_8_64 = 15137
+    mov             r3, #0x3b00
+    add             r3, #0x21
+
+    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+    vdup.16         d30, r12                  ; duplicate cospi_24_64
+    vdup.16         d31, r3                   ; duplicate cospi_8_64
+
+    ; step1[2] * cospi_8_64
+    vmull.s16       q0, d20, d31
+    vmull.s16       q1, d21, d31
+
+    ; step1[2] * cospi_8_64 + step1[3] * cospi_24_64
+    vmlal.s16       q0, d28, d30
+    vmlal.s16       q1, d29, d30
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d22, q0, #14              ; >> 14
+    vqrshrn.s32     d23, q1, #14              ; >> 14
+
+    ; step1[2] * cospi_24_64
+    vmull.s16       q0, d20, d30
+    vmull.s16       q1, d21, d30
+
+    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64
+    vmlsl.s16       q0, d28, d31
+    vmlsl.s16       q1, d29, d31
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d20, q0, #14              ; >> 14
+    vqrshrn.s32     d21, q1, #14              ; >> 14
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
+    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; stage 5
+    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
+    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
+    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
+    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
+
+    vdup.16         d16, r3;                  ; duplicate cospi_16_64
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q11, d26, d16
+    vmull.s16       q12, d27, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q6, q9, q11
+    vsub.s32        q7, q10, q12
+
+    ; step1[5] = dct_const_round_shift(temp1);
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+    vqrshrn.s32     d11, q7, #14              ; >> 14
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64;
+    vadd.s32        q9, q9, q11
+    vadd.s32        q10, q10, q12
+
+    ; step1[6] = dct_const_round_shift(temp2);
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; step1[7] = step2[7];
+    vmov.s16         q7, q15
+
+    ; stage 6
+    vadd.s16        q8, q0, q7                ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; step2[7] = step1[0] - step1[7];
+
+    ; store q8-q15 to output, one d register (8 bytes) per store, r1 += r2
+    vst1.64         {d16}, [r1], r2
+    vst1.64         {d17}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |vp9_short_idct16x16_add_neon_pass1|
+
+;void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+;                                        int16_t *output,
+;                                        int16_t *pass1Output,
+;                                        int16_t skip_adding,
+;                                        uint8_t *dest,
+;                                        int dest_stride)
+;
+; r0  int16_t *src
+; r1  int16_t *output,
+; r2  int16_t *pass1Output,
+; r3  int16_t skip_adding,
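+; [sp]      uint8_t *dest       (stack argument; read at [sp, #28] after push)
+; [sp, #4]  int dest_stride     (stack argument; read at [sp, #32] after push)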
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_short_idct16x16_add_neon_pass2| PROC
+    push            {r3-r9}
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q0,q1}, [r0]!
+    vmov.s16        q15, q0;
+
+    ; generate  cospi_30_64 = 1606
+    mov             r3, #0x0600
+    add             r3, #0x46
+
+    ; generate cospi_2_64  = 16305
+    mov             r12, #0x3f00
+    add             r12, #0xb1
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d12, r3                   ; duplicate cospi_30_64
+    vdup.16         d13, r12                  ; duplicate cospi_2_64
+
+    ; preloading to avoid stall
+    ; generate cospi_14_64 = 12665
+    mov             r3, #0x3100
+    add             r3, #0x79
+
+    ; generate cospi_18_64 = 10394
+    mov             r12, #0x2800
+    add             r12, #0x9a
+
+    ; step1[8] * cospi_30_64 = input[1] * cospi_30_64
+    vmull.s16       q2, d16, d12
+    vmull.s16       q3, d17, d12
+
+    ; step1[8] * cospi_30_64 - step1[15] * cospi_2_64
+    vmlsl.s16       q2, d30, d13
+    vmlsl.s16       q3, d31, d13
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d0, q2, #14               ; >> 14
+    vqrshrn.s32     d1, q3, #14               ; >> 14
+
+    ; step1[8] * cospi_2_64
+    vmull.s16       q2, d16, d13
+    vmull.s16       q3, d17, d13
+
+    ; step1[8] * cospi_2_64 + step1[15] * cospi_30_64
+    vmlal.s16       q2, d30, d12
+    vmlal.s16       q3, d31, d12
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    vdup.16         d30, r3                   ; duplicate cospi_14_64
+    vdup.16         d31, r12                  ; duplicate cospi_18_64
+
+    ; preloading to avoid stall
+    ; generate cospi_22_64 = 7723
+    mov             r3, #0x1e00
+    add             r3, #0x2b
+
+    ; generate cospi_10_64 = 14449
+    mov             r12, #0x3800
+    add             r12, #0x71
+
+    ; step1[9] * cospi_14_64 = input[9] * cospi_14_64
+    vmull.s16       q2, d24, d30
+    vmull.s16       q3, d25, d30
+
+    ; step1[9] * cospi_14_64 - step1[14] * cospi_18_64
+    vmlsl.s16       q2, d22, d31
+    vmlsl.s16       q3, d23, d31
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d2, q2, #14               ; >> 14
+    vqrshrn.s32     d3, q3, #14               ; >> 14
+
+    ; step1[9] * cospi_18_64
+    vmull.s16       q2, d24, d31
+    vmull.s16       q3, d25, d31
+
+    ; step1[9] * cospi_18_64 + step1[14] * cospi_14_64
+    vmlal.s16       q2, d22, d30
+    vmlal.s16       q3, d23, d30
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q3, #14              ; >> 14
+
+    vdup.16         d30, r3                   ; duplicate cospi_22_64
+    vdup.16         d31, r12                  ; duplicate cospi_10_64
+
+    ; step1[10] * cospi_22_64
+    vmull.s16       q11, d20, d30
+    vmull.s16       q12, d21, d30
+
+    ; step1[10] * cospi_22_64 - step1[13] * cospi_10_64
+    vmlsl.s16       q11, d26, d31
+    vmlsl.s16       q12, d27, d31
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; step1[10] * cospi_10_64
+    vmull.s16       q11, d20, d31
+    vmull.s16       q12, d21, d31
+
+    ; step1[10] * cospi_10_64 + step1[13] * cospi_22_64
+    vmlal.s16       q11, d26, d30
+    vmlal.s16       q12, d27, d30
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d10, q11, #14             ; >> 14
+    vqrshrn.s32     d11, q12, #14             ; >> 14
+
+    ; preloading to avoid stall
+    ; generate cospi_6_64 = 15679
+    mov             r3, #0x3d00
+    add             r3, #0x3f
+
+    ; generate cospi_26_64 = 4756
+    mov             r12, #0x1200
+    add             r12, #0x94
+
+    vdup.16         d30, r3                   ; duplicate cospi_6_64
+    vdup.16         d31, r12                  ; duplicate cospi_26_64
+
+    ; step1[11] * cospi_6_64 = input[13] * cospi_6_64
+    vmull.s16       q10, d28, d30
+    vmull.s16       q11, d29, d30
+
+    ; step1[11] * cospi_6_64 - step1[12] * cospi_26_64
+    vmlsl.s16       q10, d18, d31
+    vmlsl.s16       q11, d19, d31
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q11, #14              ; >> 14
+
+    ; step1[11] * cospi_26_64
+    vmull.s16       q10, d28, d31
+    vmull.s16       q11, d29, d31
+
+    ; step1[12] * cospi_6_64
+    vmlal.s16       q10, d18, d30
+    vmlal.s16       q11, d19, d30
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d8, q10, #14              ; >> 14
+    vqrshrn.s32     d9, q11, #14              ; >> 14
+
+    ; stage 3
+    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
+    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
+    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
+    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
+    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
+    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
+    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+step2[15]
+    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
+
+    ; stage 4
+    ; generate cospi_24_64 = 6270
+    mov             r3, #0x1800
+    add             r3, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r12, #0x3b00
+    add             r12, #0x21
+
+    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+    vdup.16         d30, r12                  ; duplicate cospi_8_64
+    vdup.16         d31, r3                   ; duplicate cospi_24_64
+
+    ; step1[14] * cospi_24_64
+    vmull.s16       q4, d28, d31
+    vmull.s16       q5, d29, d31
+
+    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vmlsl.s16       q4, d18, d30
+    vmlsl.s16       q5, d19, d30
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q4, #14               ; >> 14
+    vqrshrn.s32     d3, q5, #14               ; >> 14
+
+    ; step1[9] * cospi_24_64
+    vmull.s16       q2, d18, d31
+    vmull.s16       q3, d19, d31
+
+    ; step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+    vmlal.s16       q2, d28, d30
+    vmlal.s16       q3, d29, d30
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q3, #14              ; >> 14
+
+    vmov.s16        q3, q11
+    vmov.s16        q4, q12
+
+    ; -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+    rsb              r12, #0
+    vdup.16         d30, r12                  ; duplicate -cospi_8_64
+
+    ; - step1[13] * cospi_8_64
+    vmull.s16       q11, d26, d30
+    vmull.s16       q12, d27, d30
+
+    ; -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+    vmlsl.s16       q11, d20, d31
+    vmlsl.s16       q12, d21, d31
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; -step1[10] * cospi_8_64
+    vmull.s16       q8, d20, d30
+    vmull.s16       q9, d21, d30
+
+    ; -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlal.s16       q8, d26, d31
+    vmlal.s16       q9, d27, d31
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q8, #14              ; >> 14
+    vqrshrn.s32     d11, q9, #14              ; >> 14
+
+    ; stage 5
+    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
+    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
+    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
+    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
+    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
+    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
+    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
+    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
+
+    ; stage 6. The following instruction may not be needed, but they will make
+    ; the data more easy to manage as data will be grouped in q0 - q7.
+    vmov.s16        q0, q8
+    vmov.s16        q1, q9
+    vmov.s16        q6, q14
+    vmov.s16        q7, q15
+
+    ; generate cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    vdup.16         d30, r12                  ; duplicate cospi_16_64
+
+    ; step1[13] * cospi_16_64
+    vmull.s16       q3, d26, d30
+    vmull.s16       q4, d27, d30
+
+    ; step1[10] * cospi_16_64
+    vmull.s16       q8, d20, d30
+    vmull.s16       q9, d21, d30
+
+    ; (-step1[10] + step1[13]) * cospi_16_64;
+    vsub.s32        q5, q3, q8
+    vsub.s32        q14, q4, q9
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q5, #14               ; >> 14
+    vqrshrn.s32     d5, q14, #14              ; >> 14
+
+    ; (step1[10] + step1[13]) * cospi_16_64
+    vadd.s32        q5, q3, q8
+    vadd.s32        q14, q4, q9
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q14, #14             ; >> 14
+
+    ; (-step1[11] + step1[12]) * cospi_16_64;
+    ; step1[11] * cospi_16_64
+    vmull.s16       q8, d22, d30
+    vmull.s16       q9, d23, d30
+
+    ; step1[12] * cospi_16_64
+    vmull.s16       q13, d24, d30
+    vmull.s16       q14, d25, d30
+
+    ; (-step1[11] + step1[12]) * cospi_16_64
+    vsub.s32        q10, q13, q8
+    vsub.s32        q4, q14, q9
+
+    ; dct_const_round_shift(temp1);
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; (step1[11] + step1[12]) * cospi_16_64
+    vadd.s32        q13, q13, q8
+    vadd.s32        q14, q14, q9
+
+    ; dct_const_round_shift(temp2);
+    vqrshrn.s32     d8, q13, #14              ; >> 14
+    vqrshrn.s32     d9, q14, #14              ; >> 14
+
+    mov              r4, #16                  ; pass1Output stride
+    ldr              r3, [sp]                 ; load skip_adding
+    cmp              r3, #0                   ; check if need adding dest data
+    beq              skip_adding_dest
+
+    ldr              r7, [sp, #28]            ; dest used to save element 0-7
+    mov              r9, r7                   ; save dest pointer for later use
+    ldr              r8, [sp, #32]            ; load dest_stride
+
+    ; stage 7
+    ; load the data in pass1
+    vld1.s16        {q8}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q9}, [r2], r4            ; load data step2[1]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vadd.s16        q12, q8, q7               ; step2[0] + step2[15]
+    vadd.s16        q13, q9, q6               ; step2[1] + step2[14]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d28             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d29             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q12                  ; clip pixel
+    vqmovun.s16     d29, q13                  ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vst1.64         {d29}, [r9], r8           ; store the data
+    vsub.s16        q6, q9, q6                ; step2[1] - step2[14]
+    vsub.s16        q7, q8, q7                ; step2[0] - step2[15]
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d28             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d29             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q12                  ; clip pixel
+    vqmovun.s16     d29, q13                  ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vst1.64         {d29}, [r9], r8           ; store the data
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q8}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q9}, [r2], r4            ; load data step2[5]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vadd.s16        q12, q8, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q9, q2               ; step2[5] + step2[10]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d28             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d29             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q12                  ; clip pixel
+    vqmovun.s16     d29, q13                  ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vst1.64         {d29}, [r9], r8           ; store the data
+    vsub.s16        q2, q9, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q8, q3                ; step2[4] - step2[11]
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vadd.s16        q12, q10, q1              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q0              ; step2[7] + step2[8]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d28             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d29             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q12                  ; clip pixel
+    vqmovun.s16     d29, q13                  ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vst1.64         {d29}, [r9], r8           ; store the data
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vsub.s16        q0, q11, q0               ; step2[7] - step2[8]
+    vsub.s16        q1, q10, q1               ; step2[6] - step2[9]
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vrshr.s16       q0, q0, #6                ; ROUND_POWER_OF_TWO
+    vaddw.u8        q0, q0, d28               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q0                   ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vrshr.s16       q1, q1, #6
+    vaddw.u8        q1, q1, d29               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d29, q1                   ; clip pixel
+    vst1.64         {d29}, [r9], r8           ; store the data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vrshr.s16       q2, q2, #6
+    vaddw.u8        q2, q2, d28               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q2                   ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vrshr.s16       q3, q3, #6
+    vaddw.u8        q3, q3, d29               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d29, q3                   ; clip pixel
+    vst1.64         {d29}, [r9], r8           ; store the data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vrshr.s16       q4, q4, #6
+    vaddw.u8        q4, q4, d28               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q4                   ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vrshr.s16       q5, q5, #6
+    vaddw.u8        q5, q5, d29               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d29, q5                   ; clip pixel
+    vst1.64         {d29}, [r9], r8           ; store the data
+    vld1.64         {d29}, [r7], r8           ; load destination data
+    vrshr.s16       q6, q6, #6
+    vaddw.u8        q6, q6, d28               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d28, q6                   ; clip pixel
+    vst1.64         {d28}, [r9], r8           ; store the data
+    vld1.64         {d28}, [r7], r8           ; load destination data
+    vrshr.s16       q7, q7, #6
+    vaddw.u8        q7, q7, d29               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d29, q7                   ; clip pixel
+    vst1.64         {d29}, [r9], r8           ; store the data
+    b               end_idct16x16_pass2
+
+skip_adding_dest
+    ; stage 7
+    ; load the data in pass1
+    mov              r5, #24
+    mov              r3, #8
+
+    vld1.s16        {q8}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q9}, [r2], r4            ; load data step2[1]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vadd.s16        q12, q8, q7               ; step2[0] + step2[15]
+    vadd.s16        q13, q9, q6               ; step2[1] + step2[14]
+    vst1.64         {d24}, [r1], r3           ; store output[0]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[1]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q6, q9, q6                ; step2[1] - step2[14]
+    vsub.s16        q7, q8, q7                ; step2[0] - step2[15]
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vst1.64         {d24}, [r1], r3           ; store output[2]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[3]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q8}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q9}, [r2], r4            ; load data step2[5]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vadd.s16        q12, q8, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q9, q2               ; step2[5] + step2[10]
+    vst1.64         {d24}, [r1], r3           ; store output[4]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[5]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q2, q9, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q8, q3                ; step2[4] - step2[11]
+    vadd.s16        q12, q10, q1              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q0              ; step2[7] + step2[8]
+    vst1.64         {d24}, [r1], r3           ; store output[6]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[7]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q0, q11, q0               ; step2[7] - step2[8]
+    vsub.s16        q1, q10, q1               ; step2[6] - step2[9]
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vst1.64         {d0}, [r1], r3
+    vst1.64         {d1}, [r1], r5
+    vst1.64         {d2}, [r1], r3
+    vst1.64         {d3}, [r1], r5
+    vst1.64         {d4}, [r1], r3
+    vst1.64         {d5}, [r1], r5
+    vst1.64         {d6}, [r1], r3
+    vst1.64         {d7}, [r1], r5
+    vst1.64         {d8}, [r1], r3
+    vst1.64         {d9}, [r1], r5
+    vst1.64         {d10}, [r1], r3
+    vst1.64         {d11}, [r1], r5
+    vst1.64         {d12}, [r1], r3
+    vst1.64         {d13}, [r1], r5
+    vst1.64         {d14}, [r1], r3
+    vst1.64         {d15}, [r1], r5
+
+end_idct16x16_pass2
+    pop             {r3-r9}
+    bx              lr
+    ENDP  ; |vp9_short_idct16x16_add_neon_pass2|
+;void |save_registers|()
+|save_registers| PROC
+    vpush           {d8-d15}
+    bx              lr
+    ENDP  ; |save_registers|
+
+;void |restore_registers|()
+|restore_registers| PROC
+    vpop           {d8-d15}
+    bx             lr
+    ENDP  ; |restore_registers|
+    END
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h

index dc04a5ea66fcac088996db0b47d3464aaab46bdd..b22f8c0485c7024def4345022fa38f7e76b2f69f 100644 (file)
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -393,7 +393,7 @@ static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
  }
  
  static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize,
-                                            struct macroblockd_plane *pd) {
+                           const struct macroblockd_plane *pd) {
    BLOCK_SIZE_TYPE bs = ss_size_lookup[bsize]
                                       [pd->subsampling_x][pd->subsampling_y];
    assert(bs < BLOCK_SIZES);
@@ -418,7 +418,7 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block,
  static INLINE void foreach_transformed_block_in_plane(
      const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
      foreach_transformed_block_visitor visit, void *arg) {
-  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
  
    // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
    // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
@@ -426,56 +426,45 @@ static INLINE void foreach_transformed_block_in_plane(
    const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
    const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
                                  : mbmi->txfm_size;
-  const int block_size_b = bw + bh;
+  const int bw = b_width_log2(bsize) - pd->subsampling_x;
+  const int bh = b_height_log2(bsize) - pd->subsampling_y;
    const int txfrm_size_b = tx_size * 2;
-
-  // subsampled size of the block
-  const int ss_sum = xd->plane[plane].subsampling_x
-      + xd->plane[plane].subsampling_y;
-  const int ss_block_size = block_size_b - ss_sum;
-
    const int step = 1 << txfrm_size_b;
-
    int i;
  
-  assert(txfrm_size_b <= block_size_b);
-  assert(txfrm_size_b <= ss_block_size);
-
    // If mb_to_right_edge is < 0 we are in a situation in which
    // the current block size extends into the UMV and we won't
    // visit the sub blocks that are wholly within the UMV.
    if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
      int r, c;
-    const int sw = bw - xd->plane[plane].subsampling_x;
-    const int sh = bh - xd->plane[plane].subsampling_y;
-    int max_blocks_wide = 1 << sw;
-    int max_blocks_high = 1 << sh;
+
+    int max_blocks_wide = 1 << bw;
+    int max_blocks_high = 1 << bh;
  
      // xd->mb_to_right_edge is in units of pixels * 8.  This converts
      // it to 4x4 block sizes.
      if (xd->mb_to_right_edge < 0)
-      max_blocks_wide +=
-          (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
+      max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
  
      if (xd->mb_to_bottom_edge < 0)
-      max_blocks_high +=
-          (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
+      max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
  
      i = 0;
      // Unlike the normal case - in here we have to keep track of the
      // row and column of the blocks we use so that we know if we are in
      // the unrestricted motion border.
-    for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
-      for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
+    for (r = 0; r < (1 << bh); r += (1 << tx_size)) {
+      for (c = 0; c < (1 << bw); c += (1 << tx_size)) {
          if (r < max_blocks_high && c < max_blocks_wide)
            visit(plane, i, bsize, txfrm_size_b, arg);
          i += step;
        }
      }
    } else {
-    for (i = 0; i < (1 << ss_block_size); i += step) {
+    const int ss_block_size = bw + bh;
+    assert(txfrm_size_b <= ss_block_size);
+    for (i = 0; i < (1 << ss_block_size); i += step)
        visit(plane, i, bsize, txfrm_size_b, arg);
-    }
    }
  }
  
@@ -484,10 +473,8 @@ static INLINE void foreach_transformed_block(
      foreach_transformed_block_visitor visit, void *arg) {
    int plane;
  
-  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
-    foreach_transformed_block_in_plane(xd, bsize, plane,
-                                       visit, arg);
-  }
+  for (plane = 0; plane < MAX_MB_PLANE; plane++)
+    foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
  }
  
  static INLINE void foreach_transformed_block_uv(
@@ -495,10 +482,8 @@ static INLINE void foreach_transformed_block_uv(
      foreach_transformed_block_visitor visit, void *arg) {
    int plane;
  
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    foreach_transformed_block_in_plane(xd, bsize, plane,
-                                       visit, arg);
-  }
+  for (plane = 1; plane < MAX_MB_PLANE; plane++)
+    foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
  }
  
  // TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
@@ -548,9 +533,8 @@ static INLINE void foreach_predicted_block(
      foreach_predicted_block_visitor visit, void *arg) {
    int plane;
  
-  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+  for (plane = 0; plane < MAX_MB_PLANE; plane++)
      foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
-  }
  }
  
  static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
@@ -577,7 +561,7 @@ static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
                                         int plane, int block,
                                         int ss_txfrm_size) {
    const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int txwl = ss_txfrm_size / 2;
+  const int txwl = ss_txfrm_size >> 1;
    const int tx_cols_log2 = bwl - txwl;
    const int tx_cols = 1 << tx_cols_log2;
    const int raster_mb = block >> ss_txfrm_size;
@@ -592,7 +576,7 @@ static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
                                       int ss_txfrm_size,
                                       int *x, int *y) {
    const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int txwl = ss_txfrm_size / 2;
+  const int txwl = ss_txfrm_size >> 1;
    const int tx_cols_log2 = bwl - txwl;
    const int tx_cols = 1 << tx_cols_log2;
    const int raster_mb = block >> ss_txfrm_size;
@@ -656,14 +640,14 @@ static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
    if (xd->mb_to_right_edge < 0)
      mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
  
+  if (xd->mb_to_bottom_edge < 0)
+    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
    // this code attempts to avoid copying into contexts that are outside
    // our border.  Any blocks that do are set to 0...
    if (above_contexts + aoff > mi_blocks_wide)
      above_contexts = mi_blocks_wide - aoff;
  
-  if (xd->mb_to_bottom_edge < 0)
-    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
    if (left_contexts + loff > mi_blocks_high)
      left_contexts = mi_blocks_high - loff;
  
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c

index 6209e181543cc99988d259c6d077e4e67757ca15..84a4b544255d589a4d111dbc0266cd6531bcfe2d 100644 (file)
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -8,14 +8,15 @@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
-#include <stdio.h>
-
  #include "./vpx_config.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+
  #include "vp9_rtcd.h"
+
  #include "vp9/common/vp9_reconintra.h"
  #include "vp9/common/vp9_onyxc_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
  
  const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
      DCT_DCT,    // DC
@@ -35,284 +36,254 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
  };
  
  #define intra_pred_sized(type, size) \
-void vp9_##type##_predictor_##size##x##size##_c(uint8_t *pred_ptr, \
-                                                ptrdiff_t stride, \
-                                                uint8_t *above_row, \
-                                                uint8_t *left_col) { \
-  type##_predictor(pred_ptr, stride, size, above_row, left_col); \
-}
+  void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  const uint8_t *above, \
+                                                  const uint8_t *left) { \
+    type##_predictor(dst, stride, size, above, left); \
+  }
+
  #define intra_pred_allsizes(type) \
    intra_pred_sized(type, 4) \
    intra_pred_sized(type, 8) \
    intra_pred_sized(type, 16) \
    intra_pred_sized(type, 32)
  
-static INLINE void d27_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                 uint8_t *above_row, uint8_t *left_col) {
+static INLINE void d27_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
    int r, c;
+
    // first column
    for (r = 0; r < bs - 1; ++r)
-    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] + left_col[r + 1], 1);
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
  
-  pred_ptr[(bs - 1) * stride] = left_col[bs - 1];
-  pred_ptr++;
    // second column
    for (r = 0; r < bs - 2; ++r)
-    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
-                                              left_col[r + 1] * 2 +
-                                              left_col[r + 2], 2);
-
-  pred_ptr[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left_col[bs - 2] +
-                                                      left_col[bs - 1] * 3,
-                                                      2);
-  pred_ptr[(bs - 1) * stride] = left_col[bs - 1];
-  pred_ptr++;
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 +
+                                         left[r + 2], 2);
+  dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] +
+                                              left[bs - 1] * 3, 2);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
  
    // rest of last row
    for (c = 0; c < bs - 2; ++c)
-    pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1];
+    dst[(bs - 1) * stride + c] = left[bs - 1];
  
    for (r = bs - 2; r >= 0; --r)
      for (c = 0; c < bs - 2; ++c)
-      pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2];
+      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
  }
  intra_pred_allsizes(d27)
  
-static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                 uint8_t *above_row, uint8_t *left_col) {
+static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
    int r, c;
    for (r = 0; r < bs; ++r) {
      for (c = 0; c < bs; ++c)
-      pred_ptr[c] = r & 1 ? ROUND_POWER_OF_TWO(above_row[r/2 + c] +
-                                               above_row[r/2 + c + 1] * 2 +
-                                               above_row[r/2 + c + 2], 2)
-                          : ROUND_POWER_OF_TWO(above_row[r/2 + c] +
-                                               above_row[r/2+ c + 1], 1);
-    pred_ptr += stride;
+      dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] +
+                                          above[r/2 + c + 1] * 2 +
+                                          above[r/2 + c + 2], 2)
+                     : ROUND_POWER_OF_TWO(above[r/2 + c] +
+                                          above[r/2 + c + 1], 1);
+    dst += stride;
    }
  }
  intra_pred_allsizes(d63)
  
-static INLINE void d45_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                 uint8_t *above_row, uint8_t *left_col) {
+static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
    int r, c;
    for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      if (r + c + 2 < bs * 2)
-        pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r + c] +
-                                         above_row[r + c + 1] * 2 +
-                                         above_row[r + c + 2], 2);
-      else
-        pred_ptr[c] = above_row[bs * 2 - 1];
-    }
-    pred_ptr += stride;
+    for (c = 0; c < bs; ++c)
+      dst[c] = r + c + 2 < bs * 2 ?  ROUND_POWER_OF_TWO(above[r + c] +
+                                                        above[r + c + 1] * 2 +
+                                                        above[r + c + 2], 2)
+                                  : above[bs * 2 - 1];
+    dst += stride;
    }
  }
  intra_pred_allsizes(d45)
  
-static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                  uint8_t *above_row, uint8_t *left_col) {
+static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
    int r, c;
+
    // first row
    for (c = 0; c < bs; c++)
-    pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + above_row[c], 1);
-  pred_ptr += stride;
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1);
+  dst += stride;
  
    // second row
-  pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
-                                   above_row[-1] * 2 +
-                                   above_row[0], 2);
+  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
    for (c = 1; c < bs; c++)
-    pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] +
-                                     above_row[c - 1] * 2 +
-                                     above_row[c], 2);
-  pred_ptr += stride;
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
+  dst += stride;
  
    // the rest of first col
-  pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] +
-                                   left_col[0] * 2 +
-                                   left_col[1], 2);
+  dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
    for (r = 3; r < bs; ++r)
-    pred_ptr[(r - 2) * stride] = ROUND_POWER_OF_TWO(left_col[r - 3] +
-                                                    left_col[r - 2] * 2 +
-                                                    left_col[r - 1], 2);
+    dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 +
+                                               left[r - 1], 2);
+
    // the rest of the block
    for (r = 2; r < bs; ++r) {
      for (c = 1; c < bs; c++)
-      pred_ptr[c] = pred_ptr[-2 * stride + c - 1];
-    pred_ptr += stride;
+      dst[c] = dst[-2 * stride + c - 1];
+    dst += stride;
    }
  }
  intra_pred_allsizes(d117)
  
-static INLINE void d135_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                  uint8_t *above_row, uint8_t *left_col) {
+static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
    int r, c;
-  pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
-                                   above_row[-1] * 2 +
-                                   above_row[0], 2);
+  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
    for (c = 1; c < bs; c++)
-    pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] +
-                                     above_row[c - 1] * 2 +
-                                     above_row[c], 2);
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
  
-  pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] +
-                                        left_col[0] * 2 +
-                                        left_col[1], 2);
+  dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
    for (r = 2; r < bs; ++r)
-    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] +
-                                              left_col[r - 1] * 2 +
-                                              left_col[r], 2);
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
+                                         left[r], 2);
  
-  pred_ptr += stride;
+  dst += stride;
    for (r = 1; r < bs; ++r) {
      for (c = 1; c < bs; c++)
-      pred_ptr[c] = pred_ptr[-stride + c - 1];
-    pred_ptr += stride;
+      dst[c] = dst[-stride + c - 1];
+    dst += stride;
    }
  }
  intra_pred_allsizes(d135)
  
-static INLINE void d153_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                  uint8_t *above_row, uint8_t *left_col) {
+static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
    int r, c;
-  pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + left_col[0], 1);
+  dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1);
    for (r = 1; r < bs; r++)
-    pred_ptr[r * stride] =
-        ROUND_POWER_OF_TWO(left_col[r - 1] + left_col[r], 1);
-  pred_ptr++;
-
-  pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
-                                   above_row[-1] * 2 +
-                                   above_row[0], 2);
-  pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] +
-                                        left_col[0] * 2 +
-                                        left_col[1], 2);
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1);
+  dst++;
+
+  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
+  dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
    for (r = 2; r < bs; r++)
-    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] +
-                                              left_col[r - 1] * 2 +
-                                              left_col[r], 2);
-  pred_ptr++;
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
+                                         left[r], 2);
+  dst++;
  
    for (c = 0; c < bs - 2; c++)
-    pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] +
-                                     above_row[c] * 2 +
-                                     above_row[c + 1], 2);
-  pred_ptr += stride;
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2);
+  dst += stride;
+
    for (r = 1; r < bs; ++r) {
      for (c = 0; c < bs - 2; c++)
-      pred_ptr[c] = pred_ptr[-stride + c - 2];
-    pred_ptr += stride;
+      dst[c] = dst[-stride + c - 2];
+    dst += stride;
    }
  }
  intra_pred_allsizes(d153)
  
-static INLINE void v_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                       uint8_t *above_row, uint8_t *left_col) {
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
    int r;
  
    for (r = 0; r < bs; r++) {
-    vpx_memcpy(pred_ptr, above_row, bs);
-    pred_ptr += stride;
+    vpx_memcpy(dst, above, bs);
+    dst += stride;
    }
  }
  intra_pred_allsizes(v)
  
-static INLINE void h_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                               uint8_t *above_row, uint8_t *left_col) {
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
    int r;
  
    for (r = 0; r < bs; r++) {
-    vpx_memset(pred_ptr, left_col[r], bs);
-    pred_ptr += stride;
+    vpx_memset(dst, left[r], bs);
+    dst += stride;
    }
  }
  intra_pred_allsizes(h)
  
-static INLINE void tm_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                uint8_t *above_row, uint8_t *left_col) {
+static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
    int r, c;
-  int ytop_left = above_row[-1];
+  int ytop_left = above[-1];
  
    for (r = 0; r < bs; r++) {
      for (c = 0; c < bs; c++)
-      pred_ptr[c] = clip_pixel(left_col[r] + above_row[c] - ytop_left);
-    pred_ptr += stride;
+      dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
+    dst += stride;
    }
  }
  intra_pred_allsizes(tm)
  
-static INLINE void dc_128_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                    uint8_t *above_row, uint8_t *left_col) {
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
    int r;
  
    for (r = 0; r < bs; r++) {
-    vpx_memset(pred_ptr, 128, bs);
-    pred_ptr += stride;
+    vpx_memset(dst, 128, bs);
+    dst += stride;
    }
  }
  intra_pred_allsizes(dc_128)
  
-static INLINE void dc_left_predictor(uint8_t *pred_ptr, ptrdiff_t stride,
-                                     int bs,
-                                     uint8_t *above_row, uint8_t *left_col) {
-  int i, r;
-  int expected_dc = 128;
-  int average = 0;
-  const int count = bs;
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  int i, r, expected_dc, sum = 0;
  
    for (i = 0; i < bs; i++)
-    average += left_col[i];
-  expected_dc = (average + (count >> 1)) / count;
+    sum += left[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
  
    for (r = 0; r < bs; r++) {
-    vpx_memset(pred_ptr, expected_dc, bs);
-    pred_ptr += stride;
+    vpx_memset(dst, expected_dc, bs);
+    dst += stride;
    }
  }
  intra_pred_allsizes(dc_left)
  
-static INLINE void dc_top_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                    uint8_t *above_row, uint8_t *left_col) {
-  int i, r;
-  int expected_dc = 128;
-  int average = 0;
-  const int count = bs;
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
+  int i, r, expected_dc, sum = 0;
  
    for (i = 0; i < bs; i++)
-    average += above_row[i];
-  expected_dc = (average + (count >> 1)) / count;
+    sum += above[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
  
    for (r = 0; r < bs; r++) {
-    vpx_memset(pred_ptr, expected_dc, bs);
-    pred_ptr += stride;
+    vpx_memset(dst, expected_dc, bs);
+    dst += stride;
    }
  }
  intra_pred_allsizes(dc_top)
  
-static INLINE void dc_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
-                                uint8_t *above_row, uint8_t *left_col) {
-  int i, r;
-  int expected_dc = 128;
-  int average = 0;
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
+  int i, r, expected_dc, sum = 0;
    const int count = 2 * bs;
  
-  for (i = 0; i < bs; i++)
-    average += above_row[i];
-  for (i = 0; i < bs; i++)
-    average += left_col[i];
-  expected_dc = (average + (count >> 1)) / count;
+  for (i = 0; i < bs; i++) {
+    sum += above[i];
+    sum += left[i];
+  }
+
+  expected_dc = (sum + (count >> 1)) / count;
  
    for (r = 0; r < bs; r++) {
-    vpx_memset(pred_ptr, expected_dc, bs);
-    pred_ptr += stride;
+    vpx_memset(dst, expected_dc, bs);
+    dst += stride;
    }
  }
  intra_pred_allsizes(dc)
  #undef intra_pred_allsizes
  
-typedef void (*intra_pred_fn)(uint8_t *pred_ptr, ptrdiff_t stride,
-                              uint8_t *above_row, uint8_t *left_col);
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
  
  static intra_pred_fn pred[VP9_INTRA_MODES][4];
  static intra_pred_fn dc_pred[2][2][4];
@@ -342,16 +313,17 @@ static void init_intra_pred_fn_ptrs(void) {
  #undef intra_pred_allsizes
  }
  
-static void build_intra_predictors(uint8_t *src, int src_stride,
-                                   uint8_t *pred_ptr, int stride,
-                                   MB_PREDICTION_MODE mode, TX_SIZE txsz,
+static void build_intra_predictors(const uint8_t *ref, int ref_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   MB_PREDICTION_MODE mode, TX_SIZE tx_size,
                                     int up_available, int left_available,
                                     int right_available) {
    int i;
    DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, yabove_data, 128 + 16);
-  uint8_t *above_row = yabove_data + 16;
-  const int bs = 4 << txsz;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
+  uint8_t *above_row = above_data + 16;
+  const uint8_t *const_above_row = above_row;
+  const int bs = 4 << tx_size;
  
    // 127 127 127 .. 127 127 127 127 127 127
    // 129  A   B  ..  Y   Z
@@ -361,45 +333,46 @@ static void build_intra_predictors(uint8_t *src, int src_stride,
    // ..
  
    once(init_intra_pred_fn_ptrs);
+
+  // left
    if (left_available) {
      for (i = 0; i < bs; i++)
-      left_col[i] = src[i * src_stride - 1];
+      left_col[i] = ref[i * ref_stride - 1];
    } else {
      vpx_memset(left_col, 129, bs);
    }
  
+  // above
    if (up_available) {
-    uint8_t *above_ptr = src - src_stride;
+    const uint8_t *above_ref = ref - ref_stride;
      if (bs == 4 && right_available && left_available) {
-      above_row = above_ptr;
+      const_above_row = above_ref;
      } else {
-      vpx_memcpy(above_row, above_ptr, bs);
+      vpx_memcpy(above_row, above_ref, bs);
        if (bs == 4 && right_available)
-        vpx_memcpy(above_row + bs, above_ptr + bs, bs);
+        vpx_memcpy(above_row + bs, above_ref + bs, bs);
        else
          vpx_memset(above_row + bs, above_row[bs - 1], bs);
-      above_row[-1] = left_available ? above_ptr[-1] : 129;
+      above_row[-1] = left_available ? above_ref[-1] : 129;
      }
    } else {
      vpx_memset(above_row, 127, bs * 2);
      above_row[-1] = 127;
    }
  
+  // predict
    if (mode == DC_PRED) {
-    dc_pred[left_available][up_available][txsz](pred_ptr, stride,
-                                                above_row, left_col);
+    dc_pred[left_available][up_available][tx_size](dst, dst_stride,
+                                                   const_above_row, left_col);
    } else {
-    pred[mode][txsz](pred_ptr, stride, above_row, left_col);
+    pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
    }
  }
  
-void vp9_predict_intra_block(MACROBLOCKD *xd,
-                            int block_idx,
-                            int bwl_in,
-                            TX_SIZE tx_size,
-                            int mode,
-                            uint8_t *reference, int ref_stride,
-                            uint8_t *predictor, int pre_stride) {
+void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
+                            TX_SIZE tx_size, int mode,
+                            const uint8_t *ref, int ref_stride,
+                            uint8_t *dst, int dst_stride) {
    const int bwl = bwl_in - tx_size;
    const int wmask = (1 << bwl) - 1;
    const int have_top = (block_idx >> bwl) || xd->up_available;
@@ -407,10 +380,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
    const int have_right = ((block_idx & wmask) != wmask);
  
    assert(bwl >= 0);
-  build_intra_predictors(reference, ref_stride,
-                         predictor, pre_stride,
-                         mode,
-                         tx_size,
-                         have_top, have_left,
-                         have_right);
+  build_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size,
+                         have_top, have_left, have_right);
  }
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h

index e369a7192d712e772dac92430bd8b84c3787bf03..e9d0dbf04cf2c76fe81119b6f3df5edf3a3555f5 100644 (file)
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,17 +14,8 @@
  #include "vpx/vpx_integer.h"
  #include "vp9/common/vp9_blockd.h"
  
-MB_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                               int stride, int n,
-                                               int tx, int ty);
-
-MB_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block,
-                                          uint8_t *ptr, int stride);
-
-void vp9_predict_intra_block(MACROBLOCKD *xd,
-                            int block_idx,
-                            int bwl_in,
-                            TX_SIZE tx_size,
-                            int mode, uint8_t *ref, int ref_stride,
-                            uint8_t *predictor, int pre_stride);
+void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
+                            TX_SIZE tx_size, int mode,
+                            const uint8_t *ref, int ref_stride,
+                            uint8_t *dst, int dst_stride);
  #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh

index c5ae358066952ddd5c1c70cc4a48fcfa748cc5c8..6f9c4bb614fb08f398ef7974e0900e446bc89987 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -46,160 +46,160 @@ specialize vp9_idct_add_32x32
  #
  # RECON
  #
-prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d27_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d27_predictor_4x4
  
-prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d45_predictor_4x4 $ssse3_x86inc
  
-prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d63_predictor_4x4
  
-prototype void vp9_h_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_h_predictor_4x4 $ssse3_x86inc
  
-prototype void vp9_d117_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d117_predictor_4x4
  
-prototype void vp9_d135_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d135_predictor_4x4
  
-prototype void vp9_d153_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d153_predictor_4x4
  
-prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_v_predictor_4x4 $sse_x86inc
  
-prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_tm_predictor_4x4 $sse_x86inc
  
-prototype void vp9_dc_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_predictor_4x4 $sse_x86inc
  
-prototype void vp9_dc_top_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_top_predictor_4x4
  
-prototype void vp9_dc_left_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_left_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_left_predictor_4x4
  
-prototype void vp9_dc_128_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_128_predictor_4x4
  
-prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d27_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d27_predictor_8x8
  
-prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d45_predictor_8x8 $ssse3_x86inc
  
-prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d63_predictor_8x8
  
-prototype void vp9_h_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_h_predictor_8x8 $ssse3_x86inc
  
-prototype void vp9_d117_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d117_predictor_8x8
  
-prototype void vp9_d135_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d135_predictor_8x8
  
-prototype void vp9_d153_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d153_predictor_8x8
  
-prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_v_predictor_8x8 $sse_x86inc
  
-prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_tm_predictor_8x8 $sse2_x86inc
  
-prototype void vp9_dc_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_predictor_8x8 $sse_x86inc
  
-prototype void vp9_dc_top_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_top_predictor_8x8
  
-prototype void vp9_dc_left_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_left_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_left_predictor_8x8
  
-prototype void vp9_dc_128_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_128_predictor_8x8
  
-prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d27_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d27_predictor_16x16
  
-prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d45_predictor_16x16 $ssse3_x86inc
  
-prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d63_predictor_16x16
  
-prototype void vp9_h_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_h_predictor_16x16 $ssse3_x86inc
  
-prototype void vp9_d117_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d117_predictor_16x16
  
-prototype void vp9_d135_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d135_predictor_16x16
  
-prototype void vp9_d153_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d153_predictor_16x16
  
-prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_v_predictor_16x16 $sse2_x86inc
  
-prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_tm_predictor_16x16 $sse2_x86inc
  
-prototype void vp9_dc_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_predictor_16x16 $sse2_x86inc
  
-prototype void vp9_dc_top_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_top_predictor_16x16
  
-prototype void vp9_dc_left_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_left_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_left_predictor_16x16
  
-prototype void vp9_dc_128_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_128_predictor_16x16
  
-prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d27_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d27_predictor_32x32
  
-prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d45_predictor_32x32 $ssse3_x86inc
  
-prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d63_predictor_32x32
  
-prototype void vp9_h_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_h_predictor_32x32 $ssse3 x86inc
  
-prototype void vp9_d117_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d117_predictor_32x32
  
-prototype void vp9_d135_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d135_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d135_predictor_32x32
  
-prototype void vp9_d153_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_d153_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_d153_predictor_32x32
  
-prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_v_predictor_32x32 $sse2_x86inc
  
-prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_tm_predictor_32x32 $sse2_x86_64
  
-prototype void vp9_dc_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_predictor_32x32 $sse2_x86inc
  
-prototype void vp9_dc_top_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_top_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_top_predictor_32x32
  
-prototype void vp9_dc_left_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_left_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_left_predictor_32x32
  
-prototype void vp9_dc_128_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
  specialize vp9_dc_128_predictor_32x32
  
  if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
@@ -313,7 +313,7 @@ prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int des
  specialize vp9_short_idct16x16_1_add sse2
  
  prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_add sse2
+specialize vp9_short_idct16x16_add sse2 neon
  
  prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
  specialize vp9_short_idct10_16x16_add sse2
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c

index d7c06a768fe3ce57ccb2a0686581146af3337dc3..93a5c49c30f4702e35e797fc7da232323b93c919 100644 (file)
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -99,8 +99,9 @@ static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
    uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
                                                   raster_block,
                                                   pd->dst.buf, stride);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
  
-  switch (ss_txfrm_size / 2) {
+  switch (tx_size) {
      case TX_4X4: {
        const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
        if (tx_type == DCT_DCT)
@@ -120,6 +121,8 @@ static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
      case TX_32X32:
        vp9_idct_add_32x32(qcoeff, dst, stride, eob);
        break;
+    default:
+      assert(!"Invalid transform size");
    }
  }
  
@@ -134,7 +137,7 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
    uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
                                                   raster_block,
                                                   pd->dst.buf, pd->dst.stride);
-  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
    int b_mode;
    int plane_b_size;
    const int tx_ib = raster_block >> tx_size;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c

index 021eb11e35208cf74d49061bec19105b05af7981..fcc204fa2ea3653b1caca4b333f7408a33c229ec 100644 (file)
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -260,19 +260,19 @@ static void decode_block(int plane, int block,
    MACROBLOCKD *xd = &arg->pbi->mb;
    struct macroblockd_plane* pd = &xd->plane[plane];
    const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const TX_SIZE ss_tx_size = ss_txfrm_size / 2;
+  const TX_SIZE tx_size = ss_txfrm_size >> 1;
    const int seg_eob = get_eob(&xd->seg, segment_id, 16 << ss_txfrm_size);
    const int off = block >> ss_txfrm_size;
-  const int mod = bw - ss_tx_size - pd->subsampling_x;
-  const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size;
-  const int loff = (off >> mod) << ss_tx_size;
-  const int tx_size_in_blocks = 1 << ss_tx_size;
+  const int mod = bw - tx_size - pd->subsampling_x;
+  const int aoff = (off & ((1 << mod) - 1)) << tx_size;
+  const int loff = (off >> mod) << tx_size;
+  const int tx_size_in_blocks = 1 << tx_size;
    ENTROPY_CONTEXT *A = pd->above_context + aoff;
    ENTROPY_CONTEXT *L = pd->left_context + loff;
    const int eob = decode_coefs(&arg->pbi->common, xd, arg->r, block,
                                 pd->plane_type, seg_eob,
                                 BLOCK_OFFSET(pd->qcoeff, block),
-                               ss_tx_size, pd->dequant, A, L);
+                               tx_size, pd->dequant, A, L);
  
    if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
      set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c

index 80b0cb573a71a58908c171e65a97a90e1940b6be..33ebdb77562fb25e8001abc6022ee7bd622b84a8 100644 (file)
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -374,13 +374,14 @@ void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
                      int ss_txfrm_size, MACROBLOCK *mb,
                      struct optimize_ctx *ctx) {
    MACROBLOCKD *const xd = &mb->e_mbd;
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
    int x, y;
  
    // find current entropy context
    txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
  
-  optimize_b(mb, plane, block, bsize,
-             &ctx->ta[plane][x], &ctx->tl[plane][y], ss_txfrm_size / 2);
+  optimize_b(mb, plane, block, bsize, &ctx->ta[plane][x], &ctx->tl[plane][y],
+             tx_size);
  }
  
  static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -460,7 +461,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
    int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    const int16_t *scan, *iscan;
    uint16_t *eob = &pd->eobs[block];
-  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
    const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl;
    const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
    int xoff, yoff;
@@ -534,17 +535,17 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
    uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane,
                                                   raster_block,
                                                   pd->dst.buf, pd->dst.stride);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
+
    xform_quant(plane, block, bsize, ss_txfrm_size, arg);
  
    if (x->optimize)
      vp9_optimize_b(plane, block, bsize, ss_txfrm_size, x, args->ctx);
  
-  if (x->skip_encode)
-    return;
-  if (pd->eobs[block] == 0)
+  if (x->skip_encode || pd->eobs[block] == 0)
      return;
  
-  switch (ss_txfrm_size / 2) {
+  switch (tx_size) {
      case TX_32X32:
        vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
        break;
@@ -563,6 +564,8 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
        inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
                                    dst, pd->dst.stride);
        break;
+    default:
+      assert(!"Invalid transform size");
    }
  }
  
@@ -630,7 +633,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
    MACROBLOCK *const x = args->x;
    MACROBLOCKD *const xd = &x->e_mbd;
    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
    struct macroblock_plane *const p = &x->plane[plane];
    struct macroblockd_plane *const pd = &xd->plane[plane];
    int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c

index b75c422d8dbb25fc30d4e98348b68f858b6aa6d0..9bd53b3dc013beaa8452f91ef7a304318b04bc3f 100644 (file)
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -100,8 +100,8 @@ struct tokenize_b_args {
  static void set_entropy_context_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
                                    int ss_txfrm_size, void *arg) {
    struct tokenize_b_args* const args = arg;
-  TX_SIZE tx_size = ss_txfrm_size >> 1;
-  MACROBLOCKD *xd = args->xd;
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
+  MACROBLOCKD *const xd = args->xd;
    const int bwl = b_width_log2(bsize);
    const int off = block >> (2 * tx_size);
    const int mod = bwl - tx_size - xd->plane[plane].subsampling_x;
@@ -127,7 +127,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
    VP9_COMP *cpi = args->cpi;
    MACROBLOCKD *xd = args->xd;
    TOKENEXTRA **tp = args->tp;
-  const TX_SIZE tx_size = ss_txfrm_size >> 1;
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size >> 1);
    const int tx_size_in_blocks = 1 << tx_size;
    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
    int pt; /* near block/prev token context index */
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk

index d6b3ec087772a1822763777e32b70b1b85ecdea5..9d2e82919f4bc3dd019c44af90c0930702fe56b3 100644 (file)
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -91,12 +91,14 @@ endif
  VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
  
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
  VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
  
  $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
author	Paul Wilkins <paulwilkins@google.com>
	Thu, 15 Aug 2013 09:12:48 +0000 (02:12 -0700)
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
	Thu, 15 Aug 2013 09:12:48 +0000 (02:12 -0700)
build/make/armlink_adapter.sh		patch \| blob \| history
build/make/configure.sh		patch \| blob \| history
build/make/gen_asm_deps.sh		patch \| blob \| history
build/make/version.sh		patch \| blob \| history
configure		patch \| blob \| history
vp9/common/arm/neon/vp9_idct16x16_neon.c	[new file with mode: 0644]	patch \| blob
vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm	[new file with mode: 0644]	patch \| blob
vp9/common/vp9_blockd.h		patch \| blob \| history
vp9/common/vp9_reconintra.c		patch \| blob \| history
vp9/common/vp9_reconintra.h		patch \| blob \| history
vp9/common/vp9_rtcd_defs.sh		patch \| blob \| history
vp9/decoder/vp9_decodframe.c		patch \| blob \| history
vp9/decoder/vp9_detokenize.c		patch \| blob \| history
vp9/encoder/vp9_encodemb.c		patch \| blob \| history
vp9/encoder/vp9_tokenize.c		patch \| blob \| history
vp9/vp9_common.mk		patch \| blob \| history