granicus.if.org Git - libvpx/commitdiff
Merge changes I8a106dd6,Iec442603
author     Jim Bankoski <jimbankoski@google.com>
           Mon, 7 Oct 2013 03:11:24 +0000 (20:11 -0700)
committer  Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
           Mon, 7 Oct 2013 03:11:24 +0000 (20:11 -0700)
* changes:
  d153 intra prediction (16x16) ssse3 using bytes
  d153 intra prediction ssse3 using bytes

121 files changed:
build/make/rtcd.sh
build/make/thumb.pm
libs.mk
test/convolve_test.cc
test/fdct4x4_test.cc
test/test-data.sha1
test/test.mk
test/test_vector_test.cc
vp8/common/filter.c
vp8/common/filter.h
vp8/common/x86/filter_x86.c
vp8/common/x86/filter_x86.h
vp9/common/arm/neon/vp9_idct16x16_neon.c
vp9/common/arm/neon/vp9_save_reg_neon.asm [new file with mode: 0644]
vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
vp9/common/generic/vp9_systemdependent.c
vp9/common/mips/dspr2/vp9_common_dspr2.h [new file with mode: 0644]
vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c [new file with mode: 0644]
vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c [new file with mode: 0644]
vp9/common/mips/dspr2/vp9_convolve8_dspr2.c [new file with mode: 0644]
vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c [new file with mode: 0644]
vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c [new file with mode: 0644]
vp9/common/vp9_alloccommon.c
vp9/common/vp9_blockd.h
vp9/common/vp9_common_data.h
vp9/common/vp9_convolve.h
vp9/common/vp9_debugmodes.c
vp9/common/vp9_default_coef_probs.h
vp9/common/vp9_entropy.c
vp9/common/vp9_entropy.h
vp9/common/vp9_entropymode.c
vp9/common/vp9_entropymode.h
vp9/common/vp9_entropymv.c
vp9/common/vp9_entropymv.h
vp9/common/vp9_filter.c
vp9/common/vp9_filter.h
vp9/common/vp9_findnearmv.c
vp9/common/vp9_findnearmv.h
vp9/common/vp9_idct.c
vp9/common/vp9_idct.h
vp9/common/vp9_loopfilter.c
vp9/common/vp9_loopfilter.h
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_mvref_common.c
vp9/common/vp9_onyx.h
vp9/common/vp9_onyxc_int.h
vp9/common/vp9_postproc.c
vp9/common/vp9_pred_common.c
vp9/common/vp9_pred_common.h
vp9/common/vp9_quant_common.c
vp9/common/vp9_reconinter.c
vp9/common/vp9_reconintra.c
vp9/common/vp9_rtcd.c
vp9/common/vp9_rtcd_defs.sh
vp9/common/vp9_scale.h
vp9/common/vp9_subpelvar.h [deleted file]
vp9/common/vp9_treecoder.c
vp9/common/x86/vp9_idct_intrin_sse2.c
vp9/common/x86/vp9_postproc_x86.h
vp9/common/x86/vp9_subpixel_8t_ssse3.asm
vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm [deleted file]
vp9/decoder/vp9_decodemv.c
vp9/decoder/vp9_decodframe.c
vp9/decoder/vp9_detokenize.c
vp9/decoder/vp9_dsubexp.c
vp9/decoder/vp9_dsubexp.h
vp9/decoder/vp9_idct_blk.c [deleted file]
vp9/decoder/vp9_idct_blk.h [deleted file]
vp9/decoder/vp9_onyxd.h
vp9/decoder/vp9_onyxd_if.c
vp9/decoder/vp9_onyxd_int.h
vp9/decoder/vp9_read_bit_buffer.h
vp9/decoder/vp9_thread.c
vp9/decoder/vp9_thread.h
vp9/decoder/vp9_treereader.h
vp9/decoder/x86/vp9_dequantize_sse2.c [deleted file]
vp9/encoder/vp9_bitstream.c
vp9/encoder/vp9_block.h
vp9/encoder/vp9_boolhuff.c
vp9/encoder/vp9_dct.c
vp9/encoder/vp9_encodeframe.c
vp9/encoder/vp9_encodeframe.h
vp9/encoder/vp9_encodeintra.c
vp9/encoder/vp9_encodemb.c
vp9/encoder/vp9_encodemb.h
vp9/encoder/vp9_encodemv.c
vp9/encoder/vp9_firstpass.c
vp9/encoder/vp9_lookahead.c
vp9/encoder/vp9_mbgraph.c
vp9/encoder/vp9_mcomp.c
vp9/encoder/vp9_mcomp.h
vp9/encoder/vp9_onyx_if.c
vp9/encoder/vp9_onyx_int.h
vp9/encoder/vp9_picklpf.c
vp9/encoder/vp9_psnr.c
vp9/encoder/vp9_ratectrl.c
vp9/encoder/vp9_rdopt.c
vp9/encoder/vp9_rdopt.h
vp9/encoder/vp9_ssim.c
vp9/encoder/vp9_temporal_filter.c
vp9/encoder/vp9_tokenize.c
vp9/encoder/vp9_variance.h
vp9/encoder/vp9_variance_c.c
vp9/encoder/x86/vp9_dct32x32_sse2.c
vp9/encoder/x86/vp9_variance_mmx.c
vp9/vp9_common.mk
vp9/vp9_cx_iface.c
vp9/vp9_dx_iface.c
vp9/vp9dx.mk
vpx/vp8.h
vpx/vp8cx.h
vpx/vp8dx.h
vpx/vpx_codec.mk
vpx/vpx_codec_impl_bottom.h [deleted file]
vpx/vpx_codec_impl_top.h [deleted file]
vpx_scale/mips/dspr2/yv12extend_dspr2.c [new file with mode: 0644]
vpx_scale/vpx_scale.mk
vpx_scale/vpx_scale_rtcd.sh

diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index 6cc36843b1a6266e8d16e317138e3bb03c525166..c531e95a11cd2d38826959dcc2aeac0ebba27b41 100755 (executable)
@@ -290,9 +290,15 @@ static void setup_rtcd_internal(void)
 {
 $(set_function_pointers c $ALL_ARCHS)
 #if HAVE_DSPR2
+#if CONFIG_VP8
 void dsputil_static_init();
 dsputil_static_init();
 #endif
+#if CONFIG_VP9
+void vp9_dsputil_static_init();
+vp9_dsputil_static_init();
+#endif
+#endif
 }
 #endif
 $(common_bottom)
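
Note: with this change, a DSPr2 build that enables both codecs ends up with generated C along these lines (a minimal sketch; the per-architecture function-pointer assignments emitted by set_function_pointers are elided):

    static void setup_rtcd_internal(void)
    {
        /* function pointer assignments for the detected ISA, elided */
    #if HAVE_DSPR2
    #if CONFIG_VP8
        void dsputil_static_init();
        dsputil_static_init();
    #endif
    #if CONFIG_VP9
        void vp9_dsputil_static_init();
        vp9_dsputil_static_init();
    #endif
    #endif
    }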
diff --git a/build/make/thumb.pm b/build/make/thumb.pm
index 545f59f43920316dd5de555a8fa593fbdeb2bbac..e1f34c1ec48c2ddd345653e04b0108498c3831ed 100644 (file)
@@ -47,7 +47,7 @@ sub FixThumbInstructions($$)
     # this is used, it's used for two subsequent load instructions,
     # where a hand-written version of it could merge two subsequent
     # add and sub instructions.
-    s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,)?\s*\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g;
+    s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,\s*)?\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g;
 
     # Convert register post indexing to a separate add instruction.
     # This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
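
As an illustration of the pattern this substitution rewrites (registers hypothetical), a negative pre-indexed load such as

    ldrne   r9, [r0, -r2]

becomes a Thumb-compatible three-instruction sequence:

    subne   r0, r0, r2
    ldrne   r9, [r0]
    addne   r0, r0, r2

The tightened capture group (r\d+,\s*) now includes the whitespace after the destination register, so that whitespace is preserved in the rewritten load instruction.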
diff --git a/libs.mk b/libs.mk
index 43545e38a35ef96dc1f05b277d8a7bac83defe8d..4691a125846419481083ef95cdbd0adfb0780a91 100644 (file)
--- a/libs.mk
+++ b/libs.mk
@@ -183,8 +183,6 @@ CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
 INSTALL-LIBS-yes += include/vpx/vpx_codec.h
 INSTALL-LIBS-yes += include/vpx/vpx_image.h
 INSTALL-LIBS-yes += include/vpx/vpx_integer.h
-INSTALL-LIBS-yes += include/vpx/vpx_codec_impl_top.h
-INSTALL-LIBS-yes += include/vpx/vpx_codec_impl_bottom.h
 INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
 INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 3100571da4ae3051c776c98fb5ab67eb2db2c762..f0b412deaa80fd4ed253b50041c7656833283166 100644 (file)
@@ -642,4 +642,26 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
     make_tuple(32, 64, &convolve8_neon),
     make_tuple(64, 64, &convolve8_neon)));
 #endif
+
+#if HAVE_DSPR2
+const ConvolveFunctions convolve8_dspr2(
+    vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2,
+    vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2,
+    vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2);
+
+INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_dspr2),
+    make_tuple(8, 4, &convolve8_dspr2),
+    make_tuple(4, 8, &convolve8_dspr2),
+    make_tuple(8, 8, &convolve8_dspr2),
+    make_tuple(16, 8, &convolve8_dspr2),
+    make_tuple(8, 16, &convolve8_dspr2),
+    make_tuple(16, 16, &convolve8_dspr2),
+    make_tuple(32, 16, &convolve8_dspr2),
+    make_tuple(16, 32, &convolve8_dspr2),
+    make_tuple(32, 32, &convolve8_dspr2),
+    make_tuple(64, 32, &convolve8_dspr2),
+    make_tuple(32, 64, &convolve8_dspr2),
+    make_tuple(64, 64, &convolve8_dspr2)));
+#endif
 }  // namespace
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index ea40ca62aeae7de2858ea6aca8625cb2a130f78e..d34c79112360f4d57472f1bb68c8552a456d4819 100644 (file)
@@ -31,7 +31,7 @@ void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
 }
 void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                  int stride, int /*tx_type*/) {
-  vp9_short_idct4x4_add_c(out, dst, stride >> 1);
+  vp9_idct4x4_16_add_c(out, dst, stride >> 1);
 }
 void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
             int stride, int tx_type) {
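
(The new name follows the coefficient-count convention used throughout this change: _16 is the full 4x4 inverse transform, _1 the DC-only shortcut, mirroring vp9_idct4x4_1_add_neon below.) A sketch of how a caller would choose between the two, assuming the eob-based dispatch vp9 uses; the dispatch site itself is illustrative, not part of this change:

    if (eob == 1)
      vp9_idct4x4_1_add(input, dest, stride);    /* only the DC coefficient is non-zero */
    else
      vp9_idct4x4_16_add(input, dest, stride);   /* all 16 coefficients */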
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index fd5982a2cc1343a24ef9c0f52f2eed3ed1ffac8d..a8af8b96fdb1f86ec199fe7d2c40d609ecd8a613 100644 (file)
@@ -524,6 +524,8 @@ b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
 65f45ec9a55537aac76104818278e0978f94a678  vp90-2-03-deltaq.webm.md5
 4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba  vp90-2-05-resize.ivf
 7f6d8879336239a43dbb6c9f13178cb11cf7ed09  vp90-2-05-resize.ivf.md5
+bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe  vp90-2-06-bilinear.webm
+f6235f937552e11d8eb331ec55da6b3aa596b9ac  vp90-2-06-bilinear.webm.md5
 495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
 65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
 
diff --git a/test/test.mk b/test/test.mk
index fd27506ddc218397c1f9bd6295d4397ccc4337e2..58ced874f1de04987a7433caf1f02539bc0fcada 100644 (file)
@@ -633,5 +633,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index a3341061ebacdc3785af923b58a207851b0fc3eb..de0adf70baa77e780cc30200a1fbef7629b57e62 100644 (file)
@@ -160,7 +160,7 @@ const char *kVP9TestVectors[] = {
   "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
   "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
   "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
-  "vp90-2-05-resize.ivf",
+  "vp90-2-05-resize.ivf",        "vp90-2-06-bilinear.webm",
 #if CONFIG_NON420
   "vp91-2-04-yv444.webm"
 #endif
diff --git a/vp8/common/filter.c b/vp8/common/filter.c
index 1901ea3b6664aaa453f568b9cad6d47cea6011cf..25266f868272b9bfc398de01ad91807b8aeba59a 100644 (file)
@@ -9,9 +9,7 @@
  */
 
 
-#include <stdlib.h>
 #include "filter.h"
-#include "vpx_ports/mem.h"
 
 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
diff --git a/vp8/common/filter.h b/vp8/common/filter.h
index b7591f268be7e27492c89cf2d22746e9143849e9..ccda7c8d0202bb2c3c3ef4364b4de7ca58e7eacb 100644 (file)
 #ifndef FILTER_H
 #define FILTER_H
 
+#include "vpx_ports/mem.h"
+
 #define BLOCK_HEIGHT_WIDTH 4
 #define VP8_FILTER_WEIGHT 128
 #define VP8_FILTER_SHIFT  7
 
-extern const short vp8_bilinear_filters[8][2];
-extern const short vp8_sub_pel_filters[8][6];
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
+extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
 
 #endif
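
The extern declarations need the alignment attribute because translation units that only see the header must still be allowed to assume 16-byte alignment (e.g. for aligned SSE loads of the filter tables). For reference, on GCC-style compilers DECLARE_ALIGNED expands roughly to the following (a sketch; vpx_ports/mem.h carries per-compiler variants):

    #define DECLARE_ALIGNED(n, typ, val)  typ val __attribute__((aligned(n)))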
diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
index ebab814f4aaa384497f82af882323b47750d4b71..7f496ed7dba754c6e60b65a27ba16651438c7bbc 100644 (file)
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
 
 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
 {
diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h
index efcc4dc2aeca5d9d96b2e8977a8fa42a3fa6adf7..cfadaeecbc12c34f36d449a632cfb7157a817054 100644 (file)
 #ifndef FILTER_X86_H
 #define FILTER_X86_H
 
+#include "vpx_ports/mem.h"
+
 /* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
  * duplicated values */
-extern const short vp8_bilinear_filters_x86_4[8][8];  /* duplicated 4x */
-extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */
+
+/* duplicated 4x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
+
+/* duplicated 8x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
 
 #endif /* FILTER_X86_H */
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 3e3e400a4b820a232bd97c96aabfddab56cf8c48..fb7b5cdc49c255628f02ec6d3967133cc55aac4b 100644 (file)
@@ -20,26 +20,28 @@ extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
                                                int16_t skip_adding,
                                                uint8_t *dest,
                                                int dest_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
+extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input,
                                                int16_t *output,
                                                int output_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
+extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,
                                                int16_t *output,
                                                int16_t *pass1Output,
                                                int16_t skip_adding,
                                                uint8_t *dest,
                                                int dest_stride);
-extern void save_neon_registers();
-extern void restore_neon_registers();
 
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
 
 void vp9_short_idct16x16_add_neon(int16_t *input,
                                   uint8_t *dest, int dest_stride) {
+  int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
 
   // save d8-d15 register values.
-  save_neon_registers();
+  vp9_push_neon(store_reg);
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
                                      dest_stride);
 
   // restore d8-d15 register values.
-  restore_neon_registers();
+  vp9_pop_neon(store_reg);
 
   return;
 }
 
-void vp9_short_idct10_16x16_add_neon(int16_t *input,
+void vp9_short_idct16x16_10_add_neon(int16_t *input,
                                   uint8_t *dest, int dest_stride) {
+  int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
 
   // save d8-d15 register values.
-  save_neon_registers();
+  vp9_push_neon(store_reg);
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
+  vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_short_idct10_16x16_add_neon_pass2(input+1,
+  vp9_short_idct16x16_10_add_neon_pass2(input+1,
                                         row_idct_output,
                                         pass1_output,
                                         0,
@@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
                                      dest_stride);
 
   // restore d8-d15 register values.
-  restore_neon_registers();
+  vp9_pop_neon(store_reg);
 
   return;
 }
diff --git a/vp9/common/arm/neon/vp9_save_reg_neon.asm b/vp9/common/arm/neon/vp9_save_reg_neon.asm
new file mode 100644 (file)
index 0000000..71c3e70
--- /dev/null
@@ -0,0 +1,36 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_push_neon|
+    EXPORT  |vp9_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_push_neon| PROC
+    vst1.i64            {d8, d9, d10, d11}, [r0]!
+    vst1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+|vp9_pop_neon| PROC
+    vld1.i64            {d8, d9, d10, d11}, [r0]!
+    vld1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+    END
+
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
index 7464e800f3438f5be6c211957eb782018b23de0c..df2a0526ca4795c0d9045cde41ce9267c4f51e60 100644 (file)
 
     EXPORT  |vp9_short_idct16x16_add_neon_pass1|
     EXPORT  |vp9_short_idct16x16_add_neon_pass2|
-    EXPORT  |vp9_short_idct10_16x16_add_neon_pass1|
-    EXPORT  |vp9_short_idct10_16x16_add_neon_pass2|
-    EXPORT  |save_neon_registers|
-    EXPORT  |restore_neon_registers|
+    EXPORT  |vp9_short_idct16x16_10_add_neon_pass1|
+    EXPORT  |vp9_short_idct16x16_10_add_neon_pass2|
     ARM
     REQUIRE8
     PRESERVE8
@@ -788,7 +786,7 @@ end_idct16x16_pass2
     bx              lr
     ENDP  ; |vp9_short_idct16x16_add_neon_pass2|
 
-;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input,
+;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input,
 ;                                             int16_t *output, int output_stride)
 ;
 ; r0  int16_t input
@@ -798,7 +796,7 @@ end_idct16x16_pass2
 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_short_idct10_16x16_add_neon_pass1| PROC
+|vp9_short_idct16x16_10_add_neon_pass1| PROC
 
     ; TODO(hkuang): Find a better way to load the elements.
     ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -907,9 +905,9 @@ end_idct16x16_pass2
     vst1.64         {d31}, [r1], r2
 
     bx              lr
-    ENDP  ; |vp9_short_idct10_16x16_add_neon_pass1|
+    ENDP  ; |vp9_short_idct16x16_10_add_neon_pass1|
 
-;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
+;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,
 ;                                           int16_t *output,
 ;                                           int16_t *pass1Output,
 ;                                           int16_t skip_adding,
@@ -926,7 +924,7 @@ end_idct16x16_pass2
 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_short_idct10_16x16_add_neon_pass2| PROC
+|vp9_short_idct16x16_10_add_neon_pass2| PROC
     push            {r3-r9}
 
     ; TODO(hkuang): Find a better way to load the elements.
@@ -1177,15 +1175,5 @@ end_idct16x16_pass2
 end_idct10_16x16_pass2
     pop             {r3-r9}
     bx              lr
-    ENDP  ; |vp9_short_idct10_16x16_add_neon_pass2|
-;void |save_neon_registers|()
-|save_neon_registers| PROC
-    vpush           {d8-d15}
-    bx              lr
-    ENDP  ; |save_registers|
-;void |restore_neon_registers|()
-|restore_neon_registers| PROC
-    vpop           {d8-d15}
-    bx             lr
-    ENDP  ; |restore_registers|
+    ENDP  ; |vp9_short_idct16x16_10_add_neon_pass2|
     END
diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
index 869ee5f3f6aea6b975ec5aaf794d21f6c9e10629..0d4a721c4d329dc3c289659883097df2a2657af4 100644 (file)
@@ -8,21 +8,21 @@
 ;
 
 
-    EXPORT  |vp9_short_idct4x4_1_add_neon|
+    EXPORT  |vp9_idct4x4_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                  int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct4x4_1_add_neon| PROC
+|vp9_idct4x4_1_add_neon| PROC
     ldrsh            r0, [r0]
 
     ; generate cospi_16_64 = 11585
@@ -63,6 +63,6 @@
     vst1.32          {d7[1]}, [r12]
 
     bx               lr
-    ENDP             ; |vp9_short_idct4x4_1_add_neon|
+    ENDP             ; |vp9_idct4x4_1_add_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
index 640fb93569cd19674e22eab4ed7dc1aaff37b534..00283fc8d780b2c19fafcca398b2f0a2eed4c604 100644 (file)
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_idct4x4_add_neon|
+    EXPORT  |vp9_idct4x4_16_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
     AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct4x4_add_neon| PROC
+|vp9_idct4x4_16_add_neon| PROC
 
     ; The 2D transform is done with two passes which are actually pretty
     ; similar. We first transform the rows. This is done by transposing
     vst1.32 {d26[1]}, [r1], r2
     vst1.32 {d26[0]}, [r1]  ; no post-increment
     bx              lr
-    ENDP  ; |vp9_short_idct4x4_add_neon|
+    ENDP  ; |vp9_idct4x4_16_add_neon|
 
     END
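
The two-pass scheme described in the comment above is the standard separable formulation: transpose, run the 1D transform on rows, then repeat so the second pass effectively covers the columns. A plain-C sketch of the control flow (helper names hypothetical, clamping elided):

    int16_t tmp[4 * 4];
    transpose_4x4(input, tmp);   /* rows of tmp are the columns of input */
    idct4_1d_rows(tmp);          /* pass 1 */
    transpose_4x4(tmp, tmp);
    idct4_1d_rows(tmp);          /* pass 2 */
    add_residual_4x4(tmp, dest, dest_stride);  /* add to predictor, clamp, store */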
diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index a744f59dbdf6cafdabfbf5d4857d6e3be55347b5..c02251a3dab49f9582bb9380d5946cfeedbfa588 100644 (file)
@@ -9,7 +9,7 @@
 ;
 
     EXPORT  |vp9_short_idct8x8_add_neon|
-    EXPORT  |vp9_short_idct10_8x8_add_neon|
+    EXPORT  |vp9_short_idct8x8_10_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
     bx              lr
     ENDP  ; |vp9_short_idct8x8_add_neon|
 
-;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_short_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct10_8x8_add_neon| PROC
+|vp9_short_idct8x8_10_add_neon| PROC
     push            {r4-r9}
     vpush           {d8-d15}
     vld1.s16        {q8,q9}, [r0]!
     vpop            {d8-d15}
     pop             {r4-r9}
     bx              lr
-    ENDP  ; |vp9_short_idct10_8x8_add_neon|
+    ENDP  ; |vp9_short_idct8x8_10_add_neon|
 
     END
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
index f144721139e21ae11ba9ddd1156af59d5fa870ae..536febb6522374237b8d3681fe0b26f4d7f46a4c 100644 (file)
@@ -10,7 +10,7 @@
 
 
 #include "./vpx_config.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 void vp9_machine_specific_config(VP9_COMMON *cm) {
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
new file mode 100644 (file)
index 0000000..d2fa4c1
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_COMMON_DSPR2_H_
+#define VP9_COMMON_VP9_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+extern uint8_t *vp9_ff_cropTbl;
+
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
+                                                                               \
+  int32_t tmp, out;                                                            \
+  int     dct_cost_rounding = DCT_CONST_ROUNDING;                              \
+  int     in = input;                                                          \
+                                                                               \
+  __asm__ __volatile__ (                                                       \
+      /* out = dct_const_round_shift(input_dc * cospi_16_64); */               \
+      "mtlo     %[dct_cost_rounding],   $ac1                              \n\t"\
+      "mthi     $zero,                  $ac1                              \n\t"\
+      "madd     $ac1,                   %[in],            %[cospi_16_64]  \n\t"\
+      "extp     %[tmp],                 $ac1,             31              \n\t"\
+                                                                               \
+      /* out = dct_const_round_shift(out * cospi_16_64); */                    \
+      "mtlo     %[dct_cost_rounding],   $ac2                              \n\t"\
+      "mthi     $zero,                  $ac2                              \n\t"\
+      "madd     $ac2,                   %[tmp],           %[cospi_16_64]  \n\t"\
+      "extp     %[out],                 $ac2,             31              \n\t"\
+                                                                               \
+      : [tmp] "=&r" (tmp), [out] "=r" (out)                                    \
+      : [in] "r" (in),                                                         \
+        [dct_cost_rounding] "r" (dct_cost_rounding),                           \
+        [cospi_16_64] "r" (cospi_16_64)                                        \
+   );                                                                          \
+  out;                                                                    })
+
+static INLINE void vp9_prefetch_load(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   0,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void vp9_prefetch_store(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   1,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+
+static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   4,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   5,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+
+#endif  // #if HAVE_DSPR2
+#endif  // VP9_COMMON_VP9_COMMON_DSPR2_H_
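
A plain-C sketch of what the DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64 macro computes, assuming the usual vp9_idct.h constants (DCT_CONST_BITS = 14, hence DCT_CONST_ROUNDING = 1 << 13, and cospi_16_64 = 11585, the value also noted in the 4x4 NEON file above):

    static int32_t round_shift_twice_cospi_16_64(int32_t in) {
      /* out = dct_const_round_shift(in * cospi_16_64); */
      int32_t tmp = (in * 11585 + (1 << 13)) >> 14;
      /* out = dct_const_round_shift(out * cospi_16_64); */
      return (tmp * 11585 + (1 << 13)) >> 14;
    }

(The pref hint values used by the helpers, 0/1/4/5, are the MIPS load, store, streamed-load, and streamed-store hints, matching the function names.)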
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
new file mode 100644 (file)
index 0000000..0930ad1
--- /dev/null
@@ -0,0 +1,689 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_vert_4_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_y,
+                                      int32_t w,
+                                      int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2, load3, load4;
+  uint32_t      p1, p2;
+  uint32_t      n1, n2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       vector1b, vector2b, vector3b, vector4b;
+  int32_t       Temp1, Temp2;
+
+  vector1b = ((const int32_t *)filter_y)[0];
+  vector2b = ((const int32_t *)filter_y)[1];
+  vector3b = ((const int32_t *)filter_y)[2];
+  vector4b = ((const int32_t *)filter_y)[3];
+
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_avg_vert_64_dspr2(const uint8_t *src,
+                                       int32_t src_stride,
+                                       uint8_t *dst,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_y,
+                                       int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2, load3, load4;
+  uint32_t      p1, p2;
+  uint32_t      n1, n2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       vector1b, vector2b, vector3b, vector4b;
+  int32_t       Temp1, Temp2;
+
+  vector1b = ((const int32_t *)filter_y)[0];
+  vector2b = ((const int32_t *)filter_y)[1];
+  vector3b = ((const int32_t *)filter_y)[2];
+  vector4b = ((const int32_t *)filter_y)[3];
+
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+    vp9_prefetch_store(dst + dst_stride + 32);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  if (((const int32_t *)filter_y)[1] == 0x800000) {
+    vp9_convolve_avg(src, src_stride,
+                     dst, dst_stride,
+                     filter_x, x_step_q4,
+                     filter_y, y_step_q4,
+                     w, h);
+  } else {
+    if (16 == y_step_q4) {
+      uint32_t pos = 38;
+
+      /* bit position for extract from acc */
+      __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+      );
+
+      vp9_prefetch_store(dst);
+
+      switch (w) {
+        case 4:
+        case 8:
+        case 16:
+        case 32:
+          convolve_avg_vert_4_dspr2(src, src_stride,
+                                    dst, dst_stride,
+                                    filter_y, w, h);
+          break;
+        case 64:
+          vp9_prefetch_store(dst + 32);
+          convolve_avg_vert_64_dspr2(src, src_stride,
+                                     dst, dst_stride,
+                                     filter_y, h);
+          break;
+        default:
+          vp9_convolve8_avg_vert_c(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, x_step_q4,
+                                   filter_y, y_step_q4,
+                                   w, h);
+          break;
+      }
+    } else {
+      vp9_convolve8_avg_vert_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+    }
+  }
+}
+
+void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h) {
+  /* Fixed size intermediate buffer places limits on parameters. */
+  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+
+  assert(w <= 64);
+  assert(h <= 64);
+
+  if (intermediate_height < h)
+    intermediate_height = h;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_avg_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+
+  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
+                      temp, 64,
+                      filter_x, x_step_q4,
+                      filter_y, y_step_q4,
+                      w, intermediate_height);
+
+  vp9_convolve8_avg_vert(temp + (64*3), 64,
+                         dst, dst_stride,
+                         filter_x, x_step_q4,
+                         filter_y, y_step_q4,
+                         w, h);
+}
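
Worked numbers for the buffer bound above, at the largest supported block (w = h = 64):

    intermediate_height = ((h * y_step_q4) >> 4) + 7
                        = ((64 * 16) >> 4) + 7 = 71     unscaled case, y_step_q4 == 16
                        = ((64 * 32) >> 4) + 7 = 135    2:1 scaling, the 64 * 135 buffer bound

(This function itself falls back to vp9_convolve8_avg_c for any step other than 16; the 64 * 135 sizing leaves room for steps up to 32, matching the bound used by the generic convolve path.)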
+
+void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int filter_x_stride,
+                            const int16_t *filter_y, int filter_y_stride,
+                            int w, int h) {
+  int x, y;
+  uint32_t tp1, tp2, tn1;
+  uint32_t tp3, tp4, tn2;
+
+  /* prefetch data to cache memory */
+  vp9_prefetch_load(src);
+  vp9_prefetch_load(src + 32);
+  vp9_prefetch_store(dst);
+
+  switch (w) {
+    case 4:
+      /* 1 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+
+            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+              [tp2] "=&r" (tp2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 8:
+      /* 2 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 16:
+      /* 4 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         8(%[src])      \n\t"
+            "ulw              %[tp2],         8(%[dst])      \n\t"
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+            "ulw              %[tp3],         12(%[src])     \n\t"
+            "ulw              %[tp4],         12(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 32:
+      /* 8 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         8(%[src])      \n\t"
+            "ulw              %[tp2],         8(%[dst])      \n\t"
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+            "ulw              %[tp3],         12(%[src])     \n\t"
+            "ulw              %[tp4],         12(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         16(%[src])     \n\t"
+            "ulw              %[tp2],         16(%[dst])     \n\t"
+            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         20(%[src])     \n\t"
+            "ulw              %[tp4],         20(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         24(%[src])     \n\t"
+            "ulw              %[tp2],         24(%[dst])     \n\t"
+            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         28(%[src])     \n\t"
+            "ulw              %[tp4],         28(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 64:
+      vp9_prefetch_load(src + 64);
+      vp9_prefetch_store(dst + 32);
+
+      /* 16 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_load(src + src_stride + 64);
+        vp9_prefetch_store(dst + dst_stride);
+        vp9_prefetch_store(dst + dst_stride + 32);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         8(%[src])      \n\t"
+            "ulw              %[tp2],         8(%[dst])      \n\t"
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+            "ulw              %[tp3],         12(%[src])     \n\t"
+            "ulw              %[tp4],         12(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         16(%[src])     \n\t"
+            "ulw              %[tp2],         16(%[dst])     \n\t"
+            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         20(%[src])     \n\t"
+            "ulw              %[tp4],         20(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         24(%[src])     \n\t"
+            "ulw              %[tp2],         24(%[dst])     \n\t"
+            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         28(%[src])     \n\t"
+            "ulw              %[tp4],         28(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         32(%[src])     \n\t"
+            "ulw              %[tp2],         32(%[dst])     \n\t"
+            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         36(%[src])     \n\t"
+            "ulw              %[tp4],         36(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         40(%[src])     \n\t"
+            "ulw              %[tp2],         40(%[dst])     \n\t"
+            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         44(%[src])     \n\t"
+            "ulw              %[tp4],         44(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         48(%[src])     \n\t"
+            "ulw              %[tp2],         48(%[dst])     \n\t"
+            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         52(%[src])     \n\t"
+            "ulw              %[tp4],         52(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         56(%[src])     \n\t"
+            "ulw              %[tp2],         56(%[dst])     \n\t"
+            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         60(%[src])     \n\t"
+            "ulw              %[tp4],         60(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    default:
+      for (y = h; y > 0; --y) {
+        for (x = 0; x < w; ++x) {
+          dst[x] = (dst[x] + src[x] + 1) >> 1;
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+  }
+}
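+
+/* For reference, each adduh_r.qb above averages four byte lanes at
+ * once; assuming the DSP-R2 rounded halving-add semantics, one lane
+ * is equivalent to the scalar fallback in the default case:
+ *
+ *   static uint8_t avg_round_byte(uint8_t a, uint8_t b) {
+ *     return (uint8_t)(((uint32_t)a + (uint32_t)b + 1) >> 1);
+ *   }
+ */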
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
new file mode 100644 (file)
index 0000000..37c665b
--- /dev/null
@@ -0,0 +1,1032 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
+                                       int32_t src_stride,
+                                       uint8_t *dst,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_x0,
+                                       int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  int32_t  vector1b, vector2b, vector3b, vector4b;
+  int32_t  Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4;
+  uint32_t n1, n2, n3, n4;
+  uint32_t tn1, tn2;
+
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
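+
+  /* Each 32-bit word read above packs two adjacent 16-bit taps of the
+   * 8-tap filter (on a little-endian target, vector1b holds
+   * filter_x0[0] in its low half and filter_x0[1] in its high half),
+   * matching the paired-halfword operands of dpa.w.ph below, which
+   * accumulates two tap products per instruction. */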
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "ulw              %[tn2],         8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
+        "balign           %[tn1],         %[tn2],         3              \n\t"
+        "balign           %[tn2],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
+
+        /* odd 1. pixel */
+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
+        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
+        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
+        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
+        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp4],       $ac2,           31             \n\t"
+
+        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
+        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
+
+        /* clamp, average and store */
+        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
+        "lbux             %[n2],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
+        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
+
+        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
+        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
+
+        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t"  /* average odd 2 */
+        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
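+
+/* A scalar sketch of what the loop above computes per output pixel
+ * (the asm interleaves four pixels, preloads the rounding term 64
+ * into the accumulator via mtlo, and assumes the caller has preset
+ * the DSPControl position so that extp performs the final shift):
+ *
+ *   int32_t sum = 64;
+ *   for (k = 0; k < 8; ++k)
+ *     sum += src[x + k] * filter_x0[k];
+ *   px = cm[sum >> 7];                 // clamp via vp9_ff_cropTbl
+ *   dst[x] = (dst[x] + px + 1) >> 1;   // rounded average, addqh_r.w
+ */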
+
+static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
+                                       int32_t src_stride,
+                                       uint8_t *dst,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_x0,
+                                       int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector4a = 64;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4, n1;
+  uint32_t tn1, tn2, tn3;
+  uint32_t st0, st1;
+
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "ulw              %[tn2],         8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+        "lbu              %[Temp2],       0(%[dst])                      \n\t"
+        "lbu              %[tn3],         2(%[dst])                      \n\t"
+
+        /* even 2. pixel */
+        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
+        "ulw              %[tn1],         12(%[src])                     \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* even 3. pixel */
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
+        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac1,           31             \n\t"
+
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+        "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
+        "sb               %[Temp2],       0(%[dst])                      \n\t"
+        "sb               %[tn3],         2(%[dst])                      \n\t"
+
+        /* even 4. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+
+        "balign           %[tn3],         %[tn1],         3              \n\t"
+        "balign           %[tn1],         %[tn2],         3              \n\t"
+        "balign           %[tn2],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "lbu              %[Temp2],       4(%[dst])                      \n\t"
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* odd 1. pixel */
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "sb               %[Temp2],       4(%[dst])                      \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tp1],         6(%[dst])                      \n\t"
+
+        /* odd 2. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
+        "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
+        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac1,           31             \n\t"
+
+        "lbu              %[tp2],         1(%[dst])                      \n\t"
+        "lbu              %[tn2],         3(%[dst])                      \n\t"
+        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
+        "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
+        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tn3],         5(%[dst])                      \n\t"
+
+        /* odd 4. pixel */
+        "sb               %[tp2],         1(%[dst])                      \n\t"
+        "sb               %[tp1],         6(%[dst])                      \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac2,           31             \n\t"
+
+        "lbu              %[tn1],         7(%[dst])                      \n\t"
+
+        /* clamp and average */
+        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
+        "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
+
+        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
+        "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
+
+        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
+        "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
+
+        /* store bytes */
+        "sb               %[tn2],         3(%[dst])                      \n\t"
+        "sb               %[tn3],         5(%[dst])                      \n\t"
+        "sb               %[tn1],         7(%[dst])                      \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
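+
+/* Note on store order above: the four even outputs (dst[0], dst[2],
+ * dst[4], dst[6]) come from the aligned source words and the four odd
+ * outputs (dst[1], dst[3], dst[5], dst[7]) from the balign-shifted
+ * words, so dst bytes are loaded, averaged and written back in that
+ * interleaved order rather than left to right. */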
+
+static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        uint8_t *dst_ptr,
+                                        int32_t dst_stride,
+                                        const int16_t *filter_x0,
+                                        int32_t h,
+                                        int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
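+
+/* The 16-wide kernel handles one 16-pixel tile per inner iteration;
+ * count is presumably the number of tiles per row (1 for w == 16,
+ * 2 for w == 32). The three accumulators $ac1..$ac3 are software-
+ * pipelined: while one pixel's sum is extracted and clamped, the dot
+ * products of the next two pixels are already accumulating. */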
+
+static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        uint8_t *dst_ptr,
+                                        int32_t dst_stride,
+                                        const int16_t *filter_x0,
+                                        int32_t h) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_load(src_ptr + src_stride + 64);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
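
For readers not fluent in DSPr2: the per-pixel arithmetic that the block above pipelines across the three accumulators is small. Each output is an 8-tap sum rounded by the 64 preloaded via mtlo, shifted down by 7 bits via extp (using the extract position of 38 set by wrdsp in this file), clamped through the cm[] crop table via lbux, and then combined with the existing dst byte via addqh_r.w, a rounded halving add. A minimal scalar sketch, not part of the patch (names are illustrative):

    #include <stdint.h>

    /* clamp to [0, 255] -- the job done by the cm[] table + lbux */
    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* one averaged output byte: 8-tap filter, +64 and >> 7 rounding,
       clamp, then rounded average with the pixel already in dst */
    static uint8_t convolve_avg_pixel(const uint8_t *src,
                                      const int16_t *filter,
                                      uint8_t dst_pixel) {
      int k, sum = 64;                 /* rounding term, FILTER_BITS == 7 */
      for (k = 0; k < 8; ++k)
        sum += src[k] * filter[k];
      return (uint8_t)((clip_pixel(sum >> 7) + dst_pixel + 1) >> 1);
    }
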
+
+void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  if (((const int32_t *)filter_x)[1] == 0x800000) {
+    vp9_convolve_avg(src, src_stride,
+                     dst, dst_stride,
+                     filter_x, x_step_q4,
+                     filter_y, y_step_q4,
+                     w, h);
+  } else {
+    if (16 == x_step_q4) {
+      uint32_t pos = 38;
+
+      src -= 3;
+
+      /* bit position for extract from acc */
+      __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+      );
+
+      /* prefetch data to cache memory */
+      vp9_prefetch_load(src);
+      vp9_prefetch_load(src + 32);
+      vp9_prefetch_store(dst);
+
+      switch (w) {
+        case 4:
+          convolve_avg_horiz_4_dspr2(src, src_stride,
+                                     dst, dst_stride,
+                                     filter_x, h);
+          break;
+        case 8:
+          convolve_avg_horiz_8_dspr2(src, src_stride,
+                                     dst, dst_stride,
+                                     filter_x, h);
+          break;
+        case 16:
+          convolve_avg_horiz_16_dspr2(src, src_stride,
+                                      dst, dst_stride,
+                                      filter_x, h, 1);
+          break;
+        case 32:
+          convolve_avg_horiz_16_dspr2(src, src_stride,
+                                      dst, dst_stride,
+                                      filter_x, h, 2);
+          break;
+        case 64:
+          vp9_prefetch_load(src + 64);
+          vp9_prefetch_store(dst + 32);
+
+          convolve_avg_horiz_64_dspr2(src, src_stride,
+                                      dst, dst_stride,
+                                      filter_x, h);
+          break;
+        default:
+          vp9_convolve8_avg_horiz_c(src + 3, src_stride,
+                                    dst, dst_stride,
+                                    filter_x, x_step_q4,
+                                    filter_y, y_step_q4,
+                                    w, h);
+          break;
+      }
+    } else {
+      vp9_convolve8_avg_horiz_c(src, src_stride,
+                                dst, dst_stride,
+                                filter_x, x_step_q4,
+                                filter_y, y_step_q4,
+                                w, h);
+    }
+  }
+}
+#endif
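
The early-out at the top of vp9_convolve8_avg_horiz_dspr2 reads taps 2 and 3 of filter_x as one packed word: assuming the little-endian layout the packed compare relies on, 0x800000 means tap 2 == 0 and tap 3 == 128 (unity in the Q7 tap format), i.e. the copy filter, so plain vp9_convolve_avg suffices. An equivalent unpacked form of the test, as a sketch:

    #include <stdint.h>

    static int is_copy_filter(const int16_t *filter_x) {
      /* same condition as ((const int32_t *)filter_x)[1] == 0x800000
         on a little-endian target */
      return filter_x[2] == 0 && filter_x[3] == 0x80;
    }
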
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
new file mode 100644 (file)
index 0000000..2c48bd0
--- /dev/null
@@ -0,0 +1,1281 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *vp9_ff_cropTbl;
+
+void vp9_dsputil_static_init(void) {
+  int i;
+
+  for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+  for (i = 0; i < CROP_WIDTH; i++) {
+    vp9_ff_cropTbl_a[i] = 0;
+    vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+  }
+
+  vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
+}
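
vp9_ff_cropTbl points CROP_WIDTH bytes into the backing array, so it can be indexed with any filter output in [-CROP_WIDTH, 255 + CROP_WIDTH) and yields that value clamped to [0, 255]; this is what lets the assembly clamp with a single lbux load instead of a compare-and-branch. A throwaway check of that property, illustrative only:

    #include <assert.h>

    static void check_crop_table(void) {
      int i;
      vp9_dsputil_static_init();
      for (i = -CROP_WIDTH; i < 256 + CROP_WIDTH; ++i) {
        int expected = (i < 0) ? 0 : (i > 255) ? 255 : i;
        assert(vp9_ff_cropTbl[i] == expected);
      }
    }
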
+
+static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int16_t *filter_x0,
+                                              int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *dst_ptr;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4;
+  uint32_t tn1, tn2;
+
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    dst_ptr = dst;
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "ulw              %[tn2],         8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
+        "balign           %[tn1],         %[tn2],         3              \n\t"
+        "balign           %[tn2],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* odd 1. pixel */
+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp4],       $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
+        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+
+        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+
+        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+
+        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+          [dst_ptr] "+r" (dst_ptr)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += 1;
+  }
+}
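
The _transposed kernels in this file perform the horizontal 8-tap filter but store the results down a column: dst advances by dst_stride per output pixel and by one byte per input row, so input pixel (y, x) lands at dst[x * dst_stride + y]. Writing the intermediate transposed lets the separable 2D filter be built from two passes of the same horizontal-style code. A scalar model of the 4-wide kernel, as a sketch (src is assumed to already point at the first tap):

    #include <stdint.h>

    static void convolve_horiz_4_transposed_c(const uint8_t *src,
                                              int src_stride,
                                              uint8_t *dst, int dst_stride,
                                              const int16_t *filter, int h) {
      int x, y, k;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < 4; ++x) {
          int sum = 64;                /* rounding, FILTER_BITS == 7 */
          for (k = 0; k < 8; ++k)
            sum += src[y * src_stride + x + k] * filter[k];
          sum >>= 7;
          dst[x * dst_stride + y] =
              (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
      }
    }
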
+
+static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int16_t *filter_x0,
+                                              int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *dst_ptr;
+  uint32_t vector4a = 64;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2, tp3;
+  uint32_t p1, p2, p3, p4, n1;
+  uint8_t *odd_dst;
+  uint32_t dst_pitch_2 = (dst_stride << 1);
+
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+
+    dst_ptr = dst;
+    odd_dst = (dst_ptr + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp2],         0(%[src])                       \n\t"
+        "ulw              %[tp1],         4(%[src])                       \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
+        "ulw              %[tp3],         8(%[src])                       \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
+        "extp             %[Temp1],       $ac3,           31              \n\t"
+
+        /* even 2. pixel */
+        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
+        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
+        "ulw              %[tp2],         12(%[src])                      \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
+        "extp             %[Temp3],       $ac2,           31              \n\t"
+
+        /* even 3. pixel */
+        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
+        "mtlo             %[vector4a],    $ac1                            \n\t"
+        "mthi             $zero,          $ac1                            \n\t"
+        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
+        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
+        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
+        "extp             %[p3],          $ac1,           31              \n\t"
+
+        /* even 4. pixel */
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+
+        "ulw              %[tp1],         1(%[src])                       \n\t"
+        "ulw              %[tp3],         5(%[src])                       \n\t"
+
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
+        "extp             %[Temp3],       $ac2,           31              \n\t"
+
+        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
+
+        /* odd 1. pixel */
+        "mtlo             %[vector4a],    $ac1                            \n\t"
+        "mthi             $zero,          $ac1                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
+        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "ulw              %[tp2],         9(%[src])                       \n\t"
+
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
+        "extp             %[Temp2],       $ac3,           31              \n\t"
+
+        /* odd 2. pixel */
+        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
+        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
+        "ulw              %[Temp1],       13(%[src])                      \n\t"
+        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
+        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
+        "extp             %[Temp3],       $ac1,           31              \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
+        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
+        "extp             %[Temp2],       $ac3,           31              \n\t"
+
+        /* odd 4. pixel */
+        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
+        "extp             %[Temp1],       $ac2,           31              \n\t"
+
+        /* clamp */
+        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
+        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
+        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
+
+        /* store bytes */
+        "sb               %[p4],          0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+        "sb               %[p2],          0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+        "sb               %[n1],          0(%[odd_dst])                   \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a), [cm] "r" (cm),
+          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += 1;
+  }
+}
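
The 8-wide transposed kernel splits its stores between two pointers: even-numbered outputs go through dst_ptr and odd-numbered ones through odd_dst = dst_ptr + dst_stride, both stepping by dst_pitch_2 = 2 * dst_stride, matching the even-then-odd order in which the accumulators finish. The resulting write pattern, sketched:

    #include <stdint.h>

    /* store one row's eight transposed outputs out[0..7] into column `col`
       of dst, in the even-then-odd order the asm uses */
    static void store_transposed_8(uint8_t *dst, int dst_stride, int col,
                                   const uint8_t out[8]) {
      uint8_t *even_dst = dst + col;               /* dst_ptr in the asm */
      uint8_t *odd_dst = dst + dst_stride + col;   /* odd_dst in the asm */
      int k;
      for (k = 0; k < 4; ++k) {
        even_dst[2 * k * dst_stride] = out[2 * k];
        odd_dst[2 * k * dst_stride] = out[2 * k + 1];
      }
    }
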
+
+static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+                                               int32_t src_stride,
+                                               uint8_t *dst_ptr,
+                                               int32_t dst_stride,
+                                               const int16_t *filter_x0,
+                                               int32_t h,
+                                               int32_t count) {
+  int32_t c, y;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t  filter12, filter34, filter56, filter78;
+  int32_t  Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  uint32_t dst_pitch_2 = (dst_stride << 1);
+  uint8_t  *odd_dst;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    odd_dst = (dst + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        16(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        17(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64), [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+      );
+
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+
+    dst_ptr += 1;
+  }
+}
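
One subtlety of the 16-wide (and 64-wide) transposed kernels: because the store is transposed, moving 16 pixels right in the input row means moving 16 rows down in the output, which is why each chunk recomputes dst = dst_ptr + (c + 1) * 16 * dst_stride rather than doing dst += 16. The addressing, as a sketch:

    #include <stddef.h>

    /* byte offset in the transposed destination for input row y,
       chunk c, lane i (0..15) within the chunk */
    static size_t transposed_offset(int y, int c, int i, int dst_stride) {
      return (size_t)(((c * 16) + i) * dst_stride + y);
    }
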
+
+static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+                                               int32_t src_stride,
+                                               uint8_t *dst_ptr,
+                                               int32_t dst_stride,
+                                               const int16_t *filter_x0,
+                                               int32_t h) {
+  int32_t c, y;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t  filter12, filter34, filter56, filter78;
+  int32_t  Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  uint32_t dst_pitch_2 = (dst_stride << 1);
+  uint8_t  *odd_dst;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_load(src_ptr + src_stride + 64);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    odd_dst = (dst + dst_stride);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        16(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
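+          /* Odd output pixels go through odd_dst (dst + dst_stride) so they
+             interleave with the even pixels stored above; both pointers
+             advance by dst_pitch_2 (two transposed rows) per byte stored. */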
+          /* ODD pixels */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        17(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64), [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+      );
+
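+      /* advance to the next group of 16 input pixels; in the transposed
+         destination that is 16 output rows further down */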
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+
+    dst_ptr += 1;
+  }
+}
+
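+/* Generic C fallback for widths without a DSPR2 specialization: an 8-tap
+ * horizontal convolution that writes its output transposed (output rows
+ * become columns), matching the layout produced by the routines above. */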
+void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter, int w, int h) {
+  int x, y, k;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      int sum = 0;
+
+      for (k = 0; k < 8; ++k)
+        sum += src[x + k] * filter[k];
+
+      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+
+    src += src_stride;
+    dst += 1;
+  }
+}
+
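+/* Two-pass 8-tap convolution: the horizontal filter writes a transposed
+ * 64x135 intermediate buffer, then the same horizontal routines are run
+ * again on that buffer, which amounts to filtering vertically while
+ * transposing the result back to its natural orientation. */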
+void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
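+  /* the 8-tap filter reads 3 rows above and 4 rows below each output row,
+     hence the 7 extra intermediate rows */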
+  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+  uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  if (intermediate_height < h)
+    intermediate_height = h;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_c(src, src_stride,
+                           dst, dst_stride,
+                           filter_x, x_step_q4,
+                           filter_y, y_step_q4,
+                           w, h);
+
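+  /* filter taps 2 and 3 read as one word (little-endian layout assumed by
+     this code): 0x800000 corresponds to taps { 0, 128 }, i.e. the identity
+     filter in both directions, so a plain copy suffices */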
+  if ((((const int32_t *)filter_x)[1] == 0x800000)
+      && (((const int32_t *)filter_y)[1] == 0x800000))
+    return vp9_convolve_copy(src, src_stride,
+                             dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4,
+                             w, h);
+
+  /* identity horizontal filter: transpose-copy src into the intermediate */
+  if (filter_x[3] == 0x80) {
+    int32_t y;
+    int32_t c;
+    const uint8_t *src_ptr = src - src_stride * 3;
+    uint8_t *dst_ptr = temp;
+
+    for (y = intermediate_height; y--;) {
+      for (c = 0; c < w; c++) {
+        dst_ptr[c * intermediate_height] = src_ptr[c];
+      }
+
+      /* next row... */
+      src_ptr += src_stride;
+      dst_ptr += 1;
+    }
+  } else {
+    src -= (src_stride * 3 + 3);
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src);
+    vp9_prefetch_load(src + 32);
+
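+    /* first pass: horizontal filter into the transposed intermediate */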
+    switch (w) {
+      case 4:
+        convolve_horiz_4_transposed_dspr2(src, src_stride,
+                                          temp, intermediate_height,
+                                          filter_x, intermediate_height);
+        break;
+      case 8:
+        convolve_horiz_8_transposed_dspr2(src, src_stride,
+                                          temp, intermediate_height,
+                                          filter_x, intermediate_height);
+        break;
+      case 16:
+      case 32:
+        convolve_horiz_16_transposed_dspr2(src, src_stride,
+                                           temp, intermediate_height,
+                                           filter_x, intermediate_height,
+                                           (w/16));
+        break;
+      case 64:
+        vp9_prefetch_load(src + 32);
+        convolve_horiz_64_transposed_dspr2(src, src_stride,
+                                           temp, intermediate_height,
+                                           filter_x, intermediate_height);
+        break;
+      default:
+        convolve_horiz_transposed(src, src_stride,
+                                  temp, intermediate_height,
+                                  filter_x, w, intermediate_height);
+        break;
+    }
+  }
+
+  /* identity vertical filter: transpose the intermediate back out to dst */
+  if (filter_y[3] == 0x80) {
+    int32_t y;
+    int32_t c;
+    uint8_t *src_ptr = temp + 3;
+    uint8_t *dst_ptr = dst;
+
+    for (y = w; y--;) {
+      for (c = 0; c < h; c++) {
+        dst_ptr[c * dst_stride] = src_ptr[c];
+      }
+
+      /* next row... */
+      src_ptr += intermediate_height;
+      dst_ptr += 1;
+    }
+  } else {
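+    /* second pass: filter the transposed intermediate with filter_y,
+       transposing the result back into dst */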
+    switch (h) {
+      case 4:
+        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
+                                          dst, dst_stride,
+                                          filter_y, w);
+        break;
+      case 8:
+        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
+                                          dst, dst_stride,
+                                          filter_y, w);
+        break;
+      case 16:
+      case 32:
+        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
+                                           dst, dst_stride,
+                                           filter_y, w, (h/16));
+        break;
+      case 64:
+        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
+                                           dst, dst_stride,
+                                           filter_y, w);
+        break;
+      default:
+        convolve_horiz_transposed(temp, intermediate_height,
+                                  dst, dst_stride,
+                                  filter_y, h, w);
+        break;
+    }
+  }
+}
+
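+/* Straight copy: each row is moved four bytes at a time with unaligned word
+ * loads (ulw) and word stores, with a byte-by-byte fallback for widths that
+ * have no specialized case. */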
+void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int filter_x_stride,
+                             const int16_t *filter_y, int filter_y_stride,
+                             int w, int h) {
+  int x, y;
+
+  /* prefetch data to cache memory */
+  vp9_prefetch_load(src);
+  vp9_prefetch_load(src + 32);
+  vp9_prefetch_store(dst);
+
+  switch (w) {
+    case 4:
+      {
+      uint32_t tp1;
+
+      /* 1 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         (%[src])      \n\t"
+            "sw               %[tp1],         (%[dst])      \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 8:
+      {
+      uint32_t tp1, tp2;
+
+      /* 2 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 16:
+      {
+      uint32_t tp1, tp2, tp3, tp4;
+
+      /* 4 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "ulw              %[tp3],         8(%[src])      \n\t"
+            "ulw              %[tp4],         12(%[src])     \n\t"
+
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
+            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 32:
+      {
+      uint32_t tp1, tp2, tp3, tp4;
+      uint32_t tp5, tp6, tp7, tp8;
+
+      /* 8 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "ulw              %[tp3],         8(%[src])      \n\t"
+            "ulw              %[tp4],         12(%[src])     \n\t"
+            "ulw              %[tp5],         16(%[src])     \n\t"
+            "ulw              %[tp6],         20(%[src])     \n\t"
+            "ulw              %[tp7],         24(%[src])     \n\t"
+            "ulw              %[tp8],         28(%[src])     \n\t"
+
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
+            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
+            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
+            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
+            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
+            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 64:
+      {
+      uint32_t tp1, tp2, tp3, tp4;
+      uint32_t tp5, tp6, tp7, tp8;
+
+      vp9_prefetch_load(src + 64);
+      vp9_prefetch_store(dst + 32);
+
+      /* 16 word storage */
+      for (y = h; y--; ) {
+        vp9_prefetch_load(src + src_stride);
+        vp9_prefetch_load(src + src_stride + 32);
+        vp9_prefetch_load(src + src_stride + 64);
+        vp9_prefetch_store(dst + dst_stride);
+        vp9_prefetch_store(dst + dst_stride + 32);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "ulw              %[tp3],         8(%[src])      \n\t"
+            "ulw              %[tp4],         12(%[src])     \n\t"
+            "ulw              %[tp5],         16(%[src])     \n\t"
+            "ulw              %[tp6],         20(%[src])     \n\t"
+            "ulw              %[tp7],         24(%[src])     \n\t"
+            "ulw              %[tp8],         28(%[src])     \n\t"
+
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
+            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
+            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
+            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
+            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
+            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
+
+            "ulw              %[tp1],         32(%[src])     \n\t"
+            "ulw              %[tp2],         36(%[src])     \n\t"
+            "ulw              %[tp3],         40(%[src])     \n\t"
+            "ulw              %[tp4],         44(%[src])     \n\t"
+            "ulw              %[tp5],         48(%[src])     \n\t"
+            "ulw              %[tp6],         52(%[src])     \n\t"
+            "ulw              %[tp7],         56(%[src])     \n\t"
+            "ulw              %[tp8],         60(%[src])     \n\t"
+
+            "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
+            "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
+            "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
+            "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
+            "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
+            "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
+            "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
+            "sw               %[tp8],         60(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    default:
+      for (y = h; y--; ) {
+        for (x = 0; x < w; ++x) {
+          dst[x] = src[x];
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+  }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
new file mode 100644 (file)
index 0000000..743d641
--- /dev/null
@@ -0,0 +1,917 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
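+/* 4-wide horizontal 8-tap filter: each dpa.w.ph multiply-accumulates two
+ * packed 16-bit taps, and results are clamped to 8 bits through the
+ * vp9_ff_cropTbl lookup (lbux). */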
+static void convolve_horiz_4_dspr2(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int16_t *filter_x0,
+                                   int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4;
+  uint32_t n1, n2, n3, n4;
+  uint32_t tn1, tn2;
+
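+  /* load the eight 16-bit taps as four packed tap-pair words for dpa.w.ph */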
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+        "ulw              %[tn2],      8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
+        "balign           %[tn1],      %[tn2],         3              \n\t"
+        "balign           %[tn2],      %[tp2],         3              \n\t"
+        "balign           %[tp2],      %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* odd 1. pixel */
+        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
+        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp4],    $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
+        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[tp1],      0(%[dst])                      \n\t"
+        "sb               %[tn1],      1(%[dst])                      \n\t"
+        "sb               %[tp2],      2(%[dst])                      \n\t"
+        "sb               %[n2],       3(%[dst])                      \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_horiz_8_dspr2(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int16_t *filter_x0,
+                                   int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector4a = 64;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4, n1;
+  uint32_t tn1, tn2, tn3;
+  uint32_t st0, st1;
+
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+        "ulw              %[tn2],      8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
+        "ulw              %[tn1],      12(%[src])                     \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* even 3. pixel */
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
+        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac1,           31             \n\t"
+
+        /* even 4. pixel */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "sb               %[st0],      0(%[dst])                      \n\t"
+        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
+
+        "balign           %[tn3],      %[tn1],         3              \n\t"
+        "balign           %[tn1],      %[tn2],         3              \n\t"
+        "balign           %[tn2],      %[tp2],         3              \n\t"
+        "balign           %[tp2],      %[tp1],         3              \n\t"
+
+        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+
+        /* odd 1. pixel */
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "sb               %[st1],      2(%[dst])                      \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
+        "sb               %[st0],      4(%[dst])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
+        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
+        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac1,           31             \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
+        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 4. pixel */
+        "sb               %[st1],      1(%[dst])                      \n\t"
+        "sb               %[st0],      6(%[dst])                      \n\t"
+        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
+        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
+        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[p4],       3(%[dst])                      \n\t"
+        "sb               %[p2],       5(%[dst])                      \n\t"
+        "sb               %[n1],       7(%[dst])                      \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
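+/* 16-wide horizontal pass (count = 2 covers w == 32): the three DSP
+ * accumulators ac1..ac3 are software-pipelined, so the extp/lbux/sb chain
+ * for one pixel overlaps the dpa.w.ph chain of the next; the stage tags in
+ * the trailing comments track which pixel each instruction belongs to. */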
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
+                                    int32_t src_stride,
+                                    uint8_t *dst_ptr,
+                                    int32_t dst_stride,
+                                    const int16_t *filter_x0,
+                                    int32_t h,
+                                    int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst),
+            [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
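
For reference, every one of these horizontal kernels computes the plain 8-tap filter: the "even" half produces outputs 0, 2, ..., 14 from words loaded at src + 0, 4, ..., the "odd" half reloads at src + 1, 5, ... for outputs 1, 3, ..., 15, and three accumulators are kept in flight so each lbux/sb retires a result computed two stages earlier. A minimal scalar sketch of one row (the helper name is hypothetical; the caller has already rewound src by 3):

    #include <stdint.h>

    static void convolve_horiz_row_ref(const uint8_t *src, uint8_t *dst,
                                       const int16_t *filter, int w) {
      int x, k;
      for (x = 0; x < w; ++x) {
        int32_t sum = 0;
        for (k = 0; k < 8; ++k)
          sum += filter[k] * src[x + k];   /* 8-tap dot product */
        sum = (sum + 64) >> 7;             /* mtlo 64 ... extp ..., 31 */
        /* clamp to [0, 255], as lbux through vp9_ff_cropTbl does */
        dst[x] = (sum < 0) ? 0 : (sum > 255) ? 255 : (uint8_t)sum;
      }
    }
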
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
+                                    int32_t src_stride,
+                                    uint8_t *dst_ptr,
+                                    int32_t dst_stride,
+                                    const int16_t *filter_x0,
+                                    int32_t h) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_load(src_ptr + src_stride + 64);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst),
+            [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  if (((const int32_t *)filter_x)[1] == 0x800000) {
+    vp9_convolve_copy(src, src_stride,
+                      dst, dst_stride,
+                      filter_x, x_step_q4,
+                      filter_y, y_step_q4,
+                      w, h);
+  } else {
+    if (16 == x_step_q4) {
+      uint32_t pos = 38;
+
+      vp9_prefetch_load((const uint8_t *)filter_x);
+      src -= 3;
+
+      /* bit position for extract from acc */
+      __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+      );
+
+      /* prefetch data to cache memory */
+      vp9_prefetch_load(src);
+      vp9_prefetch_load(src + 32);
+      vp9_prefetch_store(dst);
+
+      switch (w) {
+        case 4:
+          convolve_horiz_4_dspr2(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 filter_x, (int32_t)h);
+          break;
+        case 8:
+          convolve_horiz_8_dspr2(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 filter_x, (int32_t)h);
+          break;
+        case 16:
+          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filter_x, (int32_t)h, 1);
+          break;
+        case 32:
+          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filter_x, (int32_t)h, 2);
+          break;
+        case 64:
+          vp9_prefetch_load(src + 64);
+          vp9_prefetch_store(dst + 32);
+
+          convolve_horiz_64_dspr2(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filter_x, (int32_t)h);
+          break;
+        default:
+          vp9_convolve8_horiz_c(src + 3, src_stride,
+                                dst, dst_stride,
+                                filter_x, x_step_q4,
+                                filter_y, y_step_q4,
+                                w, h);
+          break;
+      }
+    } else {
+      vp9_convolve8_horiz_c(src, src_stride,
+                            dst, dst_stride,
+                            filter_x, x_step_q4,
+                            filter_y, y_step_q4,
+                            w, h);
+    }
+  }
+}
+#endif
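
The 0x800000 test at the top of vp9_convolve8_horiz_dspr2 is the packed form of the "copy" filter: the taps are read as int32 pairs throughout this file, so word 1 holds taps 2 and 3, and the zero-phase filter {0, 0, 0, 128, 0, 0, 0, 0} packs to (128 << 16) | 0 on little-endian MIPS. A self-contained check of that packing (a sketch; assumes little-endian, as the kernels themselves do):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      /* Zero-phase sub-pel filter: all 128 units of weight on tap 3. */
      static const int16_t copy_filter[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
      /* Word 1 packs tap 2 (low half) and tap 3 (high half). */
      assert(((const int32_t *)copy_filter)[1] == 0x800000);
      return 0;
    }
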
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
new file mode 100644 (file)
index 0000000..bdc7930
--- /dev/null
@@ -0,0 +1,390 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_vert_4_dspr2(const uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int16_t *filter_y,
+                                  int32_t w,
+                                  int32_t h) {
+  int32_t x, y;
+  const uint8_t *src_ptr;
+  uint8_t *dst_ptr;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector4a = 64;
+  uint32_t load1, load2, load3, load4;
+  uint32_t p1, p2;
+  uint32_t n1, n2;
+  uint32_t scratch1, scratch2;
+  uint32_t store1, store2;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2;
+
+  vector1b = ((const int32_t *)filter_y)[0];
+  vector2b = ((const int32_t *)filter_y)[1];
+  vector3b = ((const int32_t *)filter_y)[2];
+  vector4b = ((const int32_t *)filter_y)[3];
+
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
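
The vertical kernel reads one unaligned word from each of eight consecutive rows, then precrq.ph.w / append regroup the halfwords so that each packed register holds the same output column sampled from two adjacent rows; a single dpa.w.ph therefore applies two consecutive vertical taps, and $ac0..$ac3 each accumulate one of the four neighbouring output columns. Per output pixel this is equivalent to the following scalar sketch (hypothetical helper; src has already been rewound by 3 * src_stride):

    #include <stdint.h>

    static uint8_t convolve_vert_pixel_ref(const uint8_t *src,
                                           int32_t src_stride,
                                           const int16_t *filter) {
      int32_t sum = 0;
      int k;
      for (k = 0; k < 8; ++k)
        sum += filter[k] * src[k * src_stride];  /* 8 taps down one column */
      sum = (sum + 64) >> 7;                     /* same rounding as the extp path */
      return (sum < 0) ? 0 : (sum > 255) ? 255 : (uint8_t)sum;
    }
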
+static void convolve_vert_64_dspr2(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int16_t *filter_y,
+                                   int32_t h) {
+  int32_t x, y;
+  const uint8_t *src_ptr;
+  uint8_t *dst_ptr;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector4a = 64;
+  uint32_t load1, load2, load3, load4;
+  uint32_t p1, p2;
+  uint32_t n1, n2;
+  uint32_t scratch1, scratch2;
+  uint32_t store1, store2;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2;
+
+  vector1b = ((const int32_t *)filter_y)[0];
+  vector2b = ((const int32_t *)filter_y)[1];
+  vector3b = ((const int32_t *)filter_y)[2];
+  vector4b = ((const int32_t *)filter_y)[3];
+
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+    vp9_prefetch_store(dst + dst_stride + 32);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  if (((const int32_t *)filter_y)[1] == 0x800000) {
+    vp9_convolve_copy(src, src_stride,
+                      dst, dst_stride,
+                      filter_x, x_step_q4,
+                      filter_y, y_step_q4,
+                      w, h);
+  } else {
+    if (16 == y_step_q4) {
+      uint32_t pos = 38;
+
+      /* bit position for extract from acc */
+      __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+      );
+
+      vp9_prefetch_store(dst);
+
+      switch (w) {
+        case 4:
+        case 8:
+        case 16:
+        case 32:
+          convolve_vert_4_dspr2(src, src_stride,
+                                dst, dst_stride,
+                                filter_y, w, h);
+          break;
+        case 64:
+          vp9_prefetch_store(dst + 32);
+          convolve_vert_64_dspr2(src, src_stride,
+                                 dst, dst_stride,
+                                 filter_y, h);
+          break;
+        default:
+          vp9_convolve8_vert_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+          break;
+      }
+    } else {
+      vp9_convolve8_vert_c(src, src_stride,
+                           dst, dst_stride,
+                           filter_x, x_step_q4,
+                           filter_y, y_step_q4,
+                           w, h);
+    }
+  }
+}
+
+#endif
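
Both convolve files program the DSP control register once per call: wrdsp with mask 1 loads the pos field with 38, after which every "extp %[TempN], $acN, 31" extracts accumulator bits 38..7, which, together with the 64 preloaded into LO, amounts to a round-and-shift by FILTER_BITS (7). In scalar terms (a sketch, not part of the patch):

    #include <stdint.h>

    /* What "mtlo 64 / dpa.w.ph ... / extp rt, ac, 31" computes with pos == 38. */
    static int32_t round_filter_sum(int32_t sum_of_products) {
      int64_t acc = 64 + (int64_t)sum_of_products;  /* LO preloaded with 64 */
      return (int32_t)(acc >> 7);                   /* bits 38..7 of the acc */
    }
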
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 864e27e988ae07216c33be5c2af0af3875ddb07c..f0c653f72b9869e0e37c07078efd194bee683e06 100644 (file)
@@ -58,13 +58,13 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
 }
 
 static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
-  cm->mb_cols = (aligned_width + 8) >> 4;
-  cm->mb_rows = (aligned_height + 8) >> 4;
-  cm->MBs = cm->mb_rows * cm->mb_cols;
-
   cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
   cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
   cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
+
+  cm->mb_cols = (cm->mi_cols + 1) >> 1;
+  cm->mb_rows = (cm->mi_rows + 1) >> 1;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
 }
 
 static void setup_mi(VP9_COMMON *cm) {
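
The reordering above makes the macroblock counts a derived quantity: one 16x16 MB spans two 8-pixel MI units, so mb = (mi + 1) >> 1 rounds up, matching the old (aligned + 8) >> 4 arithmetic. For example, for a 1920x1080 frame (already a multiple of 8, so alignment leaves it unchanged, assuming MI_SIZE_LOG2 == 3):

    mi_cols = 1920 >> 3 = 240    mb_cols = (240 + 1) >> 1 = 120 = (1920 + 8) >> 4
    mi_rows = 1080 >> 3 = 135    mb_rows = (135 + 1) >> 1 =  68 = (1080 + 8) >> 4
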
@@ -170,13 +170,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
 void vp9_create_common(VP9_COMMON *cm) {
   vp9_machine_specific_config(cm);
 
-  vp9_init_mbmode_probs(cm);
-
   cm->tx_mode = ONLY_4X4;
   cm->comp_pred_mode = HYBRID_PREDICTION;
-
-  // Initialize reference frame sign bias structure to defaults
-  vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
 }
 
 void vp9_remove_common(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 9ab2cc31bec4f900a316b52a88e9c878a0cd5485..f116c0647d91e4962ba59d4349c52a9a1e962e53 100644 (file)
@@ -20,6 +20,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_common_data.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
@@ -55,14 +56,6 @@ typedef enum {
   NUM_FRAME_TYPES,
 } FRAME_TYPE;
 
-typedef enum {
-  EIGHTTAP = 0,
-  EIGHTTAP_SMOOTH = 1,
-  EIGHTTAP_SHARP = 2,
-  BILINEAR = 3,
-  SWITCHABLE = 4  /* should be the last one */
-} INTERPOLATIONFILTERTYPE;
-
 typedef enum {
   DC_PRED,         // Average of above and left pixels
   V_PRED,          // Vertical
@@ -101,10 +94,10 @@ static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
 
-union b_mode_info {
+typedef struct {
   MB_PREDICTION_MODE as_mode;
   int_mv as_mv[2];  // first, second inter predictor motion vectors
-};
+} b_mode_info;
 
 typedef enum {
   NONE = -1,
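
Making b_mode_info a struct instead of a union removes an aliasing hazard: in the union, as_mode and as_mv shared storage, so recording a sub-8x8 block's motion vectors destroyed its stored prediction mode. A reduced illustration of the old behaviour (hypothetical stand-in types, not the real definitions):

    #include <assert.h>
    #include <stdint.h>

    union old_b_mode_info {        /* stand-in for the former union */
      int32_t as_mode;
      int32_t as_mv[2];
    };

    int main(void) {
      union old_b_mode_info b;
      b.as_mode = 13;              /* record a prediction mode...         */
      b.as_mv[0] = 0;              /* ...then writing an MV clobbers it,  */
      assert(b.as_mode == 0);      /* since both members start at byte 0. */
      return 0;
    }
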
@@ -154,7 +147,7 @@ typedef struct {
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[4];
+  b_mode_info bmi[4];
 } MODE_INFO;
 
 static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
@@ -244,7 +237,6 @@ typedef struct macroblockd {
   unsigned char ab_index;   // index of 4x4 block inside the 8x8 block
 
   int q_index;
-
 } MACROBLOCKD;
 
 static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 3822bfc08ffe59b95dfb2fe7e5ec738d4306a1fe..2945cd2034820c82336771649218c48761ca9535 100644 (file)
@@ -29,4 +29,4 @@ extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
 extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
 extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
 
-#endif    // VP9_COMMON_VP9_COMMON_DATA_H
+#endif  // VP9_COMMON_VP9_COMMON_DATA_H
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index 13220e97e69e1eae167381e65299ffb6ee4f0f51..9a5caa6626912c5704a8f38aa0a49924b9a8053e 100644 (file)
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VP9_COMMON_CONVOLVE_H_
-#define VP9_COMMON_CONVOLVE_H_
+#ifndef VP9_COMMON_VP9_CONVOLVE_H_
+#define VP9_COMMON_VP9_CONVOLVE_H_
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
@@ -21,9 +21,4 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h);
 
-struct subpix_fn_table {
-  const int16_t (*filter_x)[8];
-  const int16_t (*filter_y)[8];
-};
-
-#endif  // VP9_COMMON_CONVOLVE_H_
+#endif  // VP9_COMMON_VP9_CONVOLVE_H_
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 79f769e40e07e47e0ff9be2aa70e1ddad7aa8d5a..355ac1a490089121525b5129fbef262d58f9e216 100644 (file)
@@ -63,9 +63,9 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
   print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
   print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
 
-  log_frame_info(cm, "Vectors ",mvs);
+  log_frame_info(cm, "Vectors ", mvs);
   for (mi_row = 0; mi_row < rows; mi_row++) {
-    fprintf(mvs,"V ");
+    fprintf(mvs, "V ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row,
                                mi_8x8[mi_index]->mbmi.mv[0].as_mv.col);
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 185fcedbe2a4b6417162cc90e6ca7dacb82156bf..3b512beb9d3fbb4537290fe52c2e6a4d1de40f82 100644 (file)
@@ -7,6 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
 */
+#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP9_COMMON_DEFAULT_COEF_PROBS_H_
 
 /*Generated file, included by vp9_entropy.c*/
 static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
@@ -694,3 +696,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
   }
 };
 
+#endif  // VP9_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 32d9e0cf7c87d77c9a5937090362cf2675064abe..72ea72e09176f13fa24484783307de54c9e58ddb 100644 (file)
@@ -107,101 +107,171 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
 };
 
 DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
-  0,  16,   1,  32,  17,   2,  48,  33,  18,   3,  64,  34,  49,  19,  65,  80,
-  50,   4,  35,  66,  20,  81,  96,  51,   5,  36,  82,  97,  67, 112,  21,  52,
-  98,  37,  83, 113,   6,  68, 128,  53,  22,  99, 114,  84,   7, 129,  38,  69,
-  100, 115, 144, 130,  85,  54,  23,   8, 145,  39,  70, 116, 101, 131, 160, 146,
-  55,  86,  24,  71, 132, 117, 161,  40,   9, 102, 147, 176, 162,  87,  56,  25,
-  133, 118, 177, 148,  72, 103,  41, 163,  10, 192, 178,  88,  57, 134, 149, 119,
-  26, 164,  73, 104, 193,  42, 179, 208,  11, 135,  89, 165, 120, 150,  58, 194,
-  180,  27,  74, 209, 105, 151, 136,  43,  90, 224, 166, 195, 181, 121, 210,  59,
-  12, 152, 106, 167, 196,  75, 137, 225, 211, 240, 182, 122,  91,  28, 197,  13,
-  226, 168, 183, 153,  44, 212, 138, 107, 241,  60,  29, 123, 198, 184, 227, 169,
-  242,  76, 213, 154,  45,  92,  14, 199, 139,  61, 228, 214, 170, 185, 243, 108,
-  77, 155,  30,  15, 200, 229, 124, 215, 244,  93,  46, 186, 171, 201, 109, 140,
-  230,  62, 216, 245,  31, 125,  78, 156, 231,  47, 187, 202, 217,  94, 246, 141,
-  63, 232, 172, 110, 247, 157,  79, 218, 203, 126, 233, 188, 248,  95, 173, 142,
-  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,
-  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,
+  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+  50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+  98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+  100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+  55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+  133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+  26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+  180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+  12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+  226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+  242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+  77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+  230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+  63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+  251,
+  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+  255,
 };
 
 DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
-  0,  16,  32,  48,   1,  64,  17,  80,  33,  96,  49,   2,  65, 112,  18,  81,
-  34, 128,  50,  97,   3,  66, 144,  19, 113,  35,  82, 160,  98,  51, 129,   4,
-  67, 176,  20, 114, 145,  83,  36,  99, 130,  52, 192,   5, 161,  68, 115,  21,
-  146,  84, 208, 177,  37, 131, 100,  53, 162, 224,  69,   6, 116, 193, 147,  85,
-  22, 240, 132,  38, 178, 101, 163,  54, 209, 117,  70,   7, 148, 194,  86, 179,
-  225,  23, 133,  39, 164,   8, 102, 210, 241,  55, 195, 118, 149,  71, 180,  24,
-  87, 226, 134, 165, 211,  40, 103,  56,  72, 150, 196, 242, 119,   9, 181, 227,
-  88, 166,  25, 135,  41, 104, 212,  57, 151, 197, 120,  73, 243, 182, 136, 167,
-  213,  89,  10, 228, 105, 152, 198,  26,  42, 121, 183, 244, 168,  58, 137, 229,
-  74, 214,  90, 153, 199, 184,  11, 106, 245,  27, 122, 230, 169,  43, 215,  59,
-  200, 138, 185, 246,  75,  12,  91, 154, 216, 231, 107,  28,  44, 201, 123, 170,
-  60, 247, 232,  76, 139,  13,  92, 217, 186, 248, 155, 108,  29, 124,  45, 202,
-  233, 171,  61,  14,  77, 140,  15, 249,  93,  30, 187, 156, 218,  46, 109, 125,
-  62, 172,  78, 203,  31, 141, 234,  94,  47, 188,  63, 157, 110, 250, 219,  79,
-  126, 204, 173, 142,  95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
-  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
+  0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+  34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+  67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+  146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+  22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+  225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+  87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+  88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+  213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+  74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+  200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+  60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+  233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+  62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+  126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
+  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
 };
 
 DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
-  0,   1,   2,  16,   3,  17,   4,  18,  32,   5,  33,  19,   6,  34,  48,  20,
-  49,   7,  35,  21,  50,  64,   8,  36,  65,  22,  51,  37,  80,   9,  66,  52,
-  23,  38,  81,  67,  10,  53,  24,  82,  68,  96,  39,  11,  54,  83,  97,  69,
-  25,  98,  84,  40, 112,  55,  12,  70,  99, 113,  85,  26,  41,  56, 114, 100,
-  13,  71, 128,  86,  27, 115, 101, 129,  42,  57,  72, 116,  14,  87, 130, 102,
-  144,  73, 131, 117,  28,  58,  15,  88,  43, 145, 103, 132, 146, 118,  74, 160,
-  89, 133, 104,  29,  59, 147, 119,  44, 161, 148,  90, 105, 134, 162, 120, 176,
-  75, 135, 149,  30,  60, 163, 177,  45, 121,  91, 106, 164, 178, 150, 192, 136,
-  165, 179,  31, 151, 193,  76, 122,  61, 137, 194, 107, 152, 180, 208,  46, 166,
-  167, 195,  92, 181, 138, 209, 123, 153, 224, 196,  77, 168, 210, 182, 240, 108,
-  197,  62, 154, 225, 183, 169, 211,  47, 139,  93, 184, 226, 212, 241, 198, 170,
-  124, 155, 199,  78, 213, 185, 109, 227, 200,  63, 228, 242, 140, 214, 171, 186,
-  156, 229, 243, 125,  94, 201, 244, 215, 216, 230, 141, 187, 202,  79, 172, 110,
-  157, 245, 217, 231,  95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158,
-  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
-  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
+  0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+  49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+  23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+  25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+  13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+  144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+  89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+  75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+  165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+  167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+  197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+  124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+  156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+  157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158,
+  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
+  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
 };
 
 DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
-  0,   32,    1,   64,   33,    2,   96,   65,   34,  128,    3,   97,   66,  160,  129,   35,   98,    4,   67,  130,  161,  192,   36,   99,  224,    5,  162,  193,   68,  131,   37,  100,
-  225,  194,  256,  163,   69,  132,    6,  226,  257,  288,  195,  101,  164,   38,  258,    7,  227,  289,  133,  320,   70,  196,  165,  290,  259,  228,   39,  321,  102,  352,    8,  197,
-  71,  134,  322,  291,  260,  353,  384,  229,  166,  103,   40,  354,  323,  292,  135,  385,  198,  261,   72,    9,  416,  167,  386,  355,  230,  324,  104,  293,   41,  417,  199,  136,
-  262,  387,  448,  325,  356,   10,   73,  418,  231,  168,  449,  294,  388,  105,  419,  263,   42,  200,  357,  450,  137,  480,   74,  326,  232,   11,  389,  169,  295,  420,  106,  451,
-  481,  358,  264,  327,  201,   43,  138,  512,  482,  390,  296,  233,  170,  421,   75,  452,  359,   12,  513,  265,  483,  328,  107,  202,  514,  544,  422,  391,  453,  139,   44,  234,
-  484,  297,  360,  171,   76,  515,  545,  266,  329,  454,   13,  423,  203,  108,  546,  485,  576,  298,  235,  140,  361,  330,  172,  547,   45,  455,  267,  577,  486,   77,  204,  362,
-  608,   14,  299,  578,  109,  236,  487,  609,  331,  141,  579,   46,   15,  173,  610,  363,   78,  205,   16,  110,  237,  611,  142,   47,  174,   79,  206,   17,  111,  238,   48,  143,
-  80,  175,  112,  207,   49,   18,  239,   81,  113,   19,   50,   82,  114,   51,   83,  115,  640,  516,  392,  268,  144,   20,  672,  641,  548,  517,  424,  393,  300,  269,  176,  145,
-  52,   21,  704,  673,  642,  580,  549,  518,  456,  425,  394,  332,  301,  270,  208,  177,  146,   84,   53,   22,  736,  705,  674,  643,  612,  581,  550,  519,  488,  457,  426,  395,
-  364,  333,  302,  271,  240,  209,  178,  147,  116,   85,   54,   23,  737,  706,  675,  613,  582,  551,  489,  458,  427,  365,  334,  303,  241,  210,  179,  117,   86,   55,  738,  707,
-  614,  583,  490,  459,  366,  335,  242,  211,  118,   87,  739,  615,  491,  367,  243,  119,  768,  644,  520,  396,  272,  148,   24,  800,  769,  676,  645,  552,  521,  428,  397,  304,
-  273,  180,  149,   56,   25,  832,  801,  770,  708,  677,  646,  584,  553,  522,  460,  429,  398,  336,  305,  274,  212,  181,  150,   88,   57,   26,  864,  833,  802,  771,  740,  709,
-  678,  647,  616,  585,  554,  523,  492,  461,  430,  399,  368,  337,  306,  275,  244,  213,  182,  151,  120,   89,   58,   27,  865,  834,  803,  741,  710,  679,  617,  586,  555,  493,
-  462,  431,  369,  338,  307,  245,  214,  183,  121,   90,   59,  866,  835,  742,  711,  618,  587,  494,  463,  370,  339,  246,  215,  122,   91,  867,  743,  619,  495,  371,  247,  123,
-  896,  772,  648,  524,  400,  276,  152,   28,  928,  897,  804,  773,  680,  649,  556,  525,  432,  401,  308,  277,  184,  153,   60,   29,  960,  929,  898,  836,  805,  774,  712,  681,
-  650,  588,  557,  526,  464,  433,  402,  340,  309,  278,  216,  185,  154,   92,   61,   30,  992,  961,  930,  899,  868,  837,  806,  775,  744,  713,  682,  651,  620,  589,  558,  527,
-  496,  465,  434,  403,  372,  341,  310,  279,  248,  217,  186,  155,  124,   93,   62,   31,  993,  962,  931,  869,  838,  807,  745,  714,  683,  621,  590,  559,  497,  466,  435,  373,
-  342,  311,  249,  218,  187,  125,   94,   63,  994,  963,  870,  839,  746,  715,  622,  591,  498,  467,  374,  343,  250,  219,  126,   95,  995,  871,  747,  623,  499,  375,  251,  127,
-  900,  776,  652,  528,  404,  280,  156,  932,  901,  808,  777,  684,  653,  560,  529,  436,  405,  312,  281,  188,  157,  964,  933,  902,  840,  809,  778,  716,  685,  654,  592,  561,
-  530,  468,  437,  406,  344,  313,  282,  220,  189,  158,  996,  965,  934,  903,  872,  841,  810,  779,  748,  717,  686,  655,  624,  593,  562,  531,  500,  469,  438,  407,  376,  345,
-  314,  283,  252,  221,  190,  159,  997,  966,  935,  873,  842,  811,  749,  718,  687,  625,  594,  563,  501,  470,  439,  377,  346,  315,  253,  222,  191,  998,  967,  874,  843,  750,
-  719,  626,  595,  502,  471,  378,  347,  254,  223,  999,  875,  751,  627,  503,  379,  255,  904,  780,  656,  532,  408,  284,  936,  905,  812,  781,  688,  657,  564,  533,  440,  409,
-  316,  285,  968,  937,  906,  844,  813,  782,  720,  689,  658,  596,  565,  534,  472,  441,  410,  348,  317,  286, 1000,  969,  938,  907,  876,  845,  814,  783,  752,  721,  690,  659,
-  628,  597,  566,  535,  504,  473,  442,  411,  380,  349,  318,  287, 1001,  970,  939,  877,  846,  815,  753,  722,  691,  629,  598,  567,  505,  474,  443,  381,  350,  319, 1002,  971,
-  878,  847,  754,  723,  630,  599,  506,  475,  382,  351, 1003,  879,  755,  631,  507,  383,  908,  784,  660,  536,  412,  940,  909,  816,  785,  692,  661,  568,  537,  444,  413,  972,
-  941,  910,  848,  817,  786,  724,  693,  662,  600,  569,  538,  476,  445,  414, 1004,  973,  942,  911,  880,  849,  818,  787,  756,  725,  694,  663,  632,  601,  570,  539,  508,  477,
-  446,  415, 1005,  974,  943,  881,  850,  819,  757,  726,  695,  633,  602,  571,  509,  478,  447, 1006,  975,  882,  851,  758,  727,  634,  603,  510,  479, 1007,  883,  759,  635,  511,
-  912,  788,  664,  540,  944,  913,  820,  789,  696,  665,  572,  541,  976,  945,  914,  852,  821,  790,  728,  697,  666,  604,  573,  542, 1008,  977,  946,  915,  884,  853,  822,  791,
-  760,  729,  698,  667,  636,  605,  574,  543, 1009,  978,  947,  885,  854,  823,  761,  730,  699,  637,  606,  575, 1010,  979,  886,  855,  762,  731,  638,  607, 1011,  887,  763,  639,
-  916,  792,  668,  948,  917,  824,  793,  700,  669,  980,  949,  918,  856,  825,  794,  732,  701,  670, 1012,  981,  950,  919,  888,  857,  826,  795,  764,  733,  702,  671, 1013,  982,
-  951,  889,  858,  827,  765,  734,  703, 1014,  983,  890,  859,  766,  735, 1015,  891,  767,  920,  796,  952,  921,  828,  797,  984,  953,  922,  860,  829,  798, 1016,  985,  954,  923,
-  892,  861,  830,  799, 1017,  986,  955,  893,  862,  831, 1018,  987,  894,  863, 1019,  895,  924,  956,  925,  988,  957,  926, 1020,  989,  958,  927, 1021,  990,  959, 1022,  991, 1023,
+  0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100,
+  225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197,
+  71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136,
+  262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451,
+  481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, 453, 139, 44, 234,
+  484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, 486, 77, 204, 362,
+  608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, 238, 48, 143,
+  80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, 393, 300, 269, 176, 145,
+  52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395,
+  364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, 210, 179, 117, 86, 55, 738, 707,
+  614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, 304,
+  273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, 864, 833, 802, 771, 740, 709,
+  678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493,
+  462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, 743, 619, 495, 371, 247, 123,
+  896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681,
+  650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527,
+  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373,
+  342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, 499, 375, 251, 127,
+  900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, 685, 654, 592, 561,
+  530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345,
+  314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, 967, 874, 843, 750,
+  719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, 440, 409,
+  316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659,
+  628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, 971,
+  878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, 537, 444, 413, 972,
+  941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477,
+  446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, 1007, 883, 759, 635, 511,
+  912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791,
+  760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, 1011, 887, 763, 639,
+  916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982,
+  951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, 1016, 985, 954, 923,
+  892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023,
 };
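
A scan order serializes the 2-D block of transform coefficients into the 1-D
order in which tokens are coded, front-loading the positions most likely to
hold non-zero values. A hedged sketch of how a decoder-side loop consumes such
a table (the function name is hypothetical, not from this change):

    #include <stdint.h>

    /* Copies the first eob coefficients of a block in scan order;
       scan[i] is the raster index of the i-th coded coefficient. */
    static void gather_in_scan_order(const int16_t *scan,
                                     const int16_t *coeffs,
                                     int16_t *out, int eob) {
      int i;
      for (i = 0; i < eob; ++i)
        out[i] = coeffs[scan[i]];
    }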
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
-const vp9_tree_index vp9_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
-{
+const vp9_tree_index vp9_coef_tree[ 22] = {
   -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
   -ZERO_TOKEN, 4,                             /* 1 = ZERO */
   -ONE_TOKEN, 6,                              /* 2 = ONE */
@@ -569,31 +639,6 @@ void vp9_init_neighbors() {
                       vp9_default_scan_32x32_neighbors);
 }
 
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
-  if (scan == vp9_default_scan_4x4) {
-    return vp9_default_scan_4x4_neighbors;
-  } else if (scan == vp9_row_scan_4x4) {
-    return vp9_row_scan_4x4_neighbors;
-  } else if (scan == vp9_col_scan_4x4) {
-    return vp9_col_scan_4x4_neighbors;
-  } else if (scan == vp9_default_scan_8x8) {
-    return vp9_default_scan_8x8_neighbors;
-  } else if (scan == vp9_row_scan_8x8) {
-    return vp9_row_scan_8x8_neighbors;
-  } else if (scan == vp9_col_scan_8x8) {
-    return vp9_col_scan_8x8_neighbors;
-  } else if (scan == vp9_default_scan_16x16) {
-    return vp9_default_scan_16x16_neighbors;
-  } else if (scan == vp9_row_scan_16x16) {
-    return vp9_row_scan_16x16_neighbors;
-  } else if (scan == vp9_col_scan_16x16) {
-    return vp9_col_scan_16x16_neighbors;
-  } else {
-    assert(scan == vp9_default_scan_32x32);
-    return vp9_default_scan_32x32_neighbors;
-  }
-}
-
 void vp9_coef_tree_initialize() {
   vp9_init_neighbors();
   init_bit_trees();
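
The deleted vp9_get_coef_neighbors_handle() recovered the neighbor table by
pointer-comparing the scan table, a chain of branches on every call. The
replacement pattern, visible in the get_scan_and_band() hunk below, hands the
scan and its neighbor table out together. A sketch of the idea (the struct and
function names here are illustrative, not the library's):

    typedef struct {
      const int16_t *scan;       /* coefficient visit order */
      const int16_t *neighbors;  /* per-position context neighbors */
    } scan_order_pair;

    /* One lookup returns both tables, so callers never have to map a scan
       pointer back to its neighbor table after the fact. */
    static scan_order_pair get_scan_pair_4x4_default(void) {
      scan_order_pair p = { vp9_default_scan_4x4,
                            vp9_default_scan_4x4_neighbors };
      return p;
    }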
index c1f2d782b116e08e353c6a870f883f701ee36d9e..ef9ea46f57bc85fbe5e76305a9293d064dda14bc 100644 (file)
@@ -190,9 +190,6 @@ static INLINE int get_coef_context(const int16_t *neighbors,
           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }
 
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
-
-
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
@@ -210,9 +207,6 @@ typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
 typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
                                           [PREV_COEF_CONTEXTS]
                                           [UNCONSTRAINED_NODES + 1];
-typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
-                                          [PREV_COEF_CONTEXTS]
-                                          [UNCONSTRAINED_NODES][2];
 
 void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
 
@@ -367,22 +361,24 @@ static int get_entropy_context(TX_SIZE tx_size,
 static void get_scan_and_band(const MACROBLOCKD *xd, TX_SIZE tx_size,
                               PLANE_TYPE type, int block_idx,
                               const int16_t **scan,
+                              const int16_t **scan_nb,
                               const uint8_t **band_translate) {
   switch (tx_size) {
     case TX_4X4:
-      *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
+      get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb);
       *band_translate = vp9_coefband_trans_4x4;
       break;
     case TX_8X8:
-      *scan = get_scan_8x8(get_tx_type_8x8(type, xd));
+      get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb);
       *band_translate = vp9_coefband_trans_8x8plus;
       break;
     case TX_16X16:
-      *scan = get_scan_16x16(get_tx_type_16x16(type, xd));
+      get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb);
       *band_translate = vp9_coefband_trans_8x8plus;
       break;
     case TX_32X32:
       *scan = vp9_default_scan_32x32;
+      *scan_nb = vp9_default_scan_32x32_neighbors;
       *band_translate = vp9_coefband_trans_8x8plus;
       break;
     default:
index 93c89b03ac4a9d1d0aaa693f4e611325e76e41ac..e1767961686e3347948a2b91b071e2ba8b564640 100644 (file)
@@ -286,7 +286,7 @@ static const struct tx_probs default_tx_probs = {
     { 66  } }
 };
 
-void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
                                       unsigned int (*ct_32x32p)[2]) {
   ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
   ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
@@ -299,7 +299,7 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
   ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
 }
 
-void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
                                       unsigned int (*ct_16x16p)[2]) {
   ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
   ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
@@ -307,7 +307,7 @@ void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
   ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
 }
 
-void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                     unsigned int (*ct_8x8p)[2]) {
   ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
   ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
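
The const-qualification changes above do not alter behavior. For reference, a
small worked example of what tx_counts_to_branch_counts_16x16() computes from
per-size counts (assuming the usual enum order TX_4X4 = 0, TX_8X8 = 1,
TX_16X16 = 2):

    #include <assert.h>

    static void tx_branch_count_example(void) {
      unsigned int tx_count[3] = { 3, 5, 2 };  /* TX_4X4, TX_8X8, TX_16X16 */
      unsigned int ct[2][2];
      tx_counts_to_branch_counts_16x16(tx_count, ct);
      assert(ct[0][0] == 3 && ct[0][1] == 7);  /* 4x4 vs. anything larger */
      assert(ct[1][0] == 5 && ct[1][1] == 2);  /* 8x8 vs. 16x16 */
    }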
@@ -356,17 +356,19 @@ void vp9_entropy_mode_init() {
 #define COUNT_SAT 20
 #define MAX_UPDATE_FACTOR 128
 
-static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) {
+static int update_ct(vp9_prob pre_prob, vp9_prob prob,
+                     const unsigned int ct[2]) {
   return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
 }
 
-static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) {
+static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) {
   return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
 }
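
update_ct()/update_ct2() wrap merge_probs()/merge_probs2(): the previous
frame-context probability is pulled toward the probability observed in the
counts, with the pull capped once COUNT_SAT events have been seen. A
behavioral sketch under those assumptions, not the library's exact arithmetic:

    #include <stdint.h>

    typedef uint8_t vp9_prob;

    #define COUNT_SAT 20
    #define MAX_UPDATE_FACTOR 128

    static vp9_prob merge_probs_sketch(vp9_prob pre_prob,
                                       const unsigned int ct[2]) {
      const unsigned int den = ct[0] + ct[1];
      const unsigned int count = den < COUNT_SAT ? den : COUNT_SAT;
      const unsigned int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
      /* maximum-likelihood estimate of the zero-branch probability */
      const int mle = den ? (int)((255ULL * ct[0] + den / 2) / den) : 128;
      const int est = mle < 1 ? 1 : (mle > 255 ? 255 : mle);
      /* blend: factor/256 of the new estimate, the rest from pre_prob */
      return (vp9_prob)((pre_prob * (256 - factor) + est * factor + 128) >> 8);
    }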
 
 static void update_mode_probs(int n_modes,
-                              const vp9_tree_index *tree, unsigned int *cnt,
-                              vp9_prob *pre_probs, vp9_prob *dst_probs,
+                              const vp9_tree_index *tree,
+                              const unsigned int *cnt,
+                              const vp9_prob *pre_probs, vp9_prob *dst_probs,
                               unsigned int tok0_offset) {
 #define MAX_PROBS 32
   vp9_prob probs[MAX_PROBS];
@@ -382,8 +384,8 @@ static void update_mode_probs(int n_modes,
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   int i, j;
   FRAME_CONTEXT *fc = &cm->fc;
-  FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-  FRAME_COUNTS *counts = &cm->counts;
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const FRAME_COUNTS *counts = &cm->counts;
 
   for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
     fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
index 31537c7f7f18bfcd4fefb6313dc0acd82ef90860..ccade2752ffb445c38c23e1b70ecd70812d4a795 100644 (file)
@@ -61,11 +61,11 @@ void vp9_init_mbmode_probs(struct VP9Common *cm);
 
 void vp9_adapt_mode_probs(struct VP9Common *cm);
 
-void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
                                       unsigned int (*ct_32x32p)[2]);
-void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
                                       unsigned int (*ct_16x16p)[2]);
-void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                     unsigned int (*ct_8x8p)[2]);
 
 #endif  // VP9_COMMON_VP9_ENTROPYMODE_H_
index 2e973e53f03d5dcfdaba6aed57a56a6df0fd8306..a9e25b727b74cb2b3231bdd959315d47b93823ef 100644 (file)
@@ -39,12 +39,12 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
 };
 struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
-const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
+const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2] = {
   -0, -1,
 };
 struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
-const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
+const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2] = {
   -0, 2,
   -1, 4,
   -2, -3
@@ -53,8 +53,8 @@ struct vp9_token vp9_mv_fp_encodings[4];
 
 static const nmv_context default_nmv_context = {
   {32, 64, 96},
-  {
-    { /* vert component */
+  { // NOLINT
+    { /* vert component */ // NOLINT
       128,                                                  /* sign */
       {224, 144, 192, 168, 192, 176, 192, 198, 198, 245},   /* class */
       {216},                                                /* class0 */
@@ -64,7 +64,7 @@ static const nmv_context default_nmv_context = {
       160,                                                  /* class0_hp bit */
       128,                                                  /* hp */
     },
-    { /* hor component */
+    { /* hor component */ // NOLINT
       128,                                                  /* sign */
       {216, 128, 176, 160, 176, 176, 192, 198, 198, 208},   /* class */
       {208},                                                /* class0 */
@@ -149,7 +149,7 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                              int incr, int usehp) {
   int s, z, c, o, d, e, f;
-  assert (v != 0);            /* should not be zero */
+  assert(v != 0);            /* should not be zero */
   s = v < 0;
   comp_counts->sign[s] += incr;
   z = (s ? -v : v) - 1;       /* magnitude - 1 */
@@ -198,8 +198,6 @@ static unsigned int adapt_probs(unsigned int i,
                                 vp9_prob this_probs[],
                                 const vp9_prob last_probs[],
                                 const unsigned int num_events[]) {
-
-
   const unsigned int left = tree[i] <= 0
           ? num_events[-tree[i]]
           : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
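
adapt_probs() above walks a vp9_tree_index array. The convention, used by
vp9_mv_class0_tree and vp9_mv_fp_tree earlier in this file, is that a
non-positive entry is a leaf storing the negated token and a positive entry is
the offset of the next node pair. A compact decode-side sketch of that
convention; read_bit is a hypothetical bit source:

    #include <stdint.h>

    typedef int8_t vp9_tree_index;  /* as in vp9_treecoder.h */

    static int read_tree_token(const vp9_tree_index *tree,
                               int (*read_bit)(void *ctx), void *ctx) {
      vp9_tree_index i = 0;
      while ((i = tree[i + read_bit(ctx)]) > 0)
        continue;
      return -i;  /* leaf reached: recover the token value */
    }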
index a10c933f60e4acb8559f560c5f78b63792c274bd..3b782ab0a4e340f74885aedf99b2ab097b8fd467 100644 (file)
@@ -13,7 +13,7 @@
 #define VP9_COMMON_VP9_ENTROPYMV_H_
 
 #include "vp9/common/vp9_treecoder.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 
 struct VP9Common;
@@ -73,6 +73,10 @@ extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 #define MV_MAX         ((1 << MV_MAX_BITS) - 1)
 #define MV_VALS        ((MV_MAX << 1) + 1)
 
+#define MV_IN_USE_BITS 14
+#define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
+#define MV_LOW   (-(1 << MV_IN_USE_BITS))
+
 extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
 extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
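
With the new MV_IN_USE_BITS of 14, the derived bounds work out to
MV_UPP = 16383 and MV_LOW = -16384; since motion vectors are in 1/8-pel units,
that confines them to roughly plus or minus 2048 full pels. A one-line check
(assumes the definitions added above):

    #include <assert.h>

    static void check_mv_bounds(void) {
      /* 1 << 14 == 16384: usable 1/8-pel range is [-16384, 16383]. */
      assert(MV_UPP == 16383 && MV_LOW == -16384);
    }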
 
index 4ac2bc93ff8a73000316ea27b093d79decf6094c..cedd44cad63e69420b9560e2a55716cabf40f274 100644 (file)
@@ -8,12 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_filter.h"
 
-DECLARE_ALIGNED(256, const int16_t,
-                vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+                vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },
@@ -33,8 +35,8 @@ DECLARE_ALIGNED(256, const int16_t,
 };
 
 // Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const int16_t,
-                vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+                vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
   { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -54,8 +56,8 @@ DECLARE_ALIGNED(256, const int16_t,
 };
 
 // DCT based filter
-DECLARE_ALIGNED(256, const int16_t,
-                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
   {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -75,8 +77,8 @@ DECLARE_ALIGNED(256, const int16_t,
 };
 
 // freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const int16_t,
-                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
   {-2, -2, 29,  63, 41,  2, -3,  0},
@@ -94,3 +96,20 @@ DECLARE_ALIGNED(256, const int16_t,
   { 0, -3,  2,  41, 63, 29, -2, -2},
   { 0, -3,  1,  38, 64, 32, -1, -3}
 };
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type) {
+  switch (type) {
+    case EIGHTTAP:
+      return vp9_sub_pel_filters_8;
+    case EIGHTTAP_SMOOTH:
+      return vp9_sub_pel_filters_8lp;
+    case EIGHTTAP_SHARP:
+      return vp9_sub_pel_filters_8s;
+    case BILINEAR:
+      return vp9_bilinear_filters;
+    default:
+      assert(!"Invalid filter type.");
+      return NULL;
+  }
+}
+
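
The new vp9_get_filter_kernel() centralizes the type-to-table mapping that
call sites previously open-coded (compare the vp9_setup_interp_filters hunk
near the end of this diff). An illustrative use, relying on the fact that
every phase of every table above sums to 128 (assumes vp9_filter.h as changed
here):

    #include <assert.h>

    static void check_kernel_normalization(void) {
      const subpel_kernel *k = vp9_get_filter_kernel(EIGHTTAP_SMOOTH);
      int phase, tap;
      for (phase = 0; phase < SUBPEL_SHIFTS; ++phase) {
        int sum = 0;
        for (tap = 0; tap < SUBPEL_TAPS; ++tap)
          sum += k[phase][tap];
        assert(sum == 128);  /* 7-bit filters: unity gain at DC */
      }
    }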
index 7b1ffaeda0f5b8e8f9fb60bf62008ab540361bb2..676b274b9423c131d234620b283ee2e1b2eda1c9 100644 (file)
@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_FILTER_H_
 #define VP9_COMMON_VP9_FILTER_H_
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
 #define SUBPEL_BITS 4
 #define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
 #define SUBPEL_TAPS 8
 
-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS];
+typedef enum {
+  EIGHTTAP = 0,
+  EIGHTTAP_SMOOTH = 1,
+  EIGHTTAP_SHARP = 2,
+  BILINEAR = 3,
+  SWITCHABLE = 4  /* should be the last one */
+} INTERPOLATIONFILTERTYPE;
+
+typedef const int16_t subpel_kernel[SUBPEL_TAPS];
+
+struct subpix_fn_table {
+  const subpel_kernel *filter_x;
+  const subpel_kernel *filter_y;
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type);
+
+extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
 
 // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
 // filter kernel as a 2 tap filter.
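
For context on the subpel_kernel typedef above (a const array of SUBPEL_TAPS
int16_t taps): a hedged sketch of applying one horizontal phase. FILTER_BITS
of 7 matches the 128-normalization of the tables; the tap window centered
between src[0] and src[1] is the usual convention, assumed here rather than
quoted from this change:

    #include <stdint.h>

    #define FILTER_BITS 7

    static uint8_t filter_one_pixel(const uint8_t *src,
                                    const int16_t *taps /* 8 entries */) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[k - 3] * taps[k];  /* window src[-3] .. src[4] */
      sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }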
index 73f6b4c1939b37a4206f31fa9e660f1f905b507a..b0c0c57ae77e660fb0ee3f7d732e32b243b3c27d 100644 (file)
@@ -57,7 +57,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
     vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
   } else if (block_idx == 1 || block_idx == 2) {
     int dst = 0, n;
-    union b_mode_info *bmi = mi->bmi;
+    b_mode_info *bmi = mi->bmi;
 
     dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
     for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
@@ -66,7 +66,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
         dst_list[dst++].as_int = mv_list[n].as_int;
   } else {
     int dst = 0, n;
-    union b_mode_info *bmi = mi->bmi;
+    b_mode_info *bmi = mi->bmi;
 
     assert(block_idx == 3);
     dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int;
index ad0d882b99db198c33ab87722dd7201f6ac218a8..50dfdc7fb7fd490dfa29b1dc24dc07b3c8c5e414 100644 (file)
@@ -55,13 +55,11 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb,
     if (!mi)
       return DC_PRED;
 
-    if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
+    if (is_inter_block(&mi->mbmi))
       return DC_PRED;
-    } else if (mi->mbmi.sb_type < BLOCK_8X8) {
-      return ((mi->bmi + 1 + b)->as_mode);
-    } else {
-      return mi->mbmi.mode;
-    }
+    else
+      return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 1 + b)->as_mode
+                                          : mi->mbmi.mode;
   }
   assert(b == 1 || b == 3);
   return (mi->bmi + b - 1)->as_mode;
@@ -77,13 +75,11 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
     if (!mi)
       return DC_PRED;
 
-    if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
+    if (is_inter_block(&mi->mbmi))
       return DC_PRED;
-    } else if (mi->mbmi.sb_type < BLOCK_8X8) {
-      return ((mi->bmi + 2 + b)->as_mode);
-    } else {
-      return mi->mbmi.mode;
-    }
+    else
+      return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 2 + b)->as_mode
+                                          : mi->mbmi.mode;
   }
 
   return (mi->bmi + b - 2)->as_mode;
index f06bf047b68afb7244f5385f26a2a16313c2ef66..46363700764d4b5b0231d9cd4570dcbec7604c82 100644 (file)
@@ -18,7 +18,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    0.5 shifts per pixel. */
   int i;
@@ -70,7 +70,7 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
-void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int a1, e1;
   int16_t tmp[4];
@@ -116,7 +116,7 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
   int i, j;
@@ -140,7 +140,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
-void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -420,7 +420,7 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
                                   + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
+void vp9_short_idct8x8_10_add_c(int16_t *input, uint8_t *dest,
                                 int dest_stride) {
   int16_t out[8 * 8] = { 0 };
   int16_t *outptr = out;
@@ -838,7 +838,7 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
                                   + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest,
                                   int dest_stride) {
   int16_t out[16 * 16] = { 0 };
   int16_t *outptr = out;
@@ -1269,8 +1269,107 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
-void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
+void vp9_short_idct32x32_1_add_c(int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  int i, j;
+  int a1;
+
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 6);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += dest_stride;
+  }
+}
+
+// idct
+void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+  if (eob > 1)
+    vp9_idct4x4_16_add(input, dest, stride);
+  else
+    vp9_idct4x4_1_add(input, dest, stride);
+}
+
+
+void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+  if (eob > 1)
+    vp9_iwht4x4_16_add(input, dest, stride);
+  else
+    vp9_iwht4x4_1_add(input, dest, stride);
+}
+
+void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob) {
+  // If dc is 1, then input[0] is the reconstructed value and does not need
+  // dequantization. Also, when dc is 1, it is counted in eobs, so eobs >= 1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  if (eob) {
+    if (eob == 1)
+      // DC only DCT coefficient
+      vp9_short_idct8x8_1_add(input, dest, stride);
+    else if (eob <= 10)
+      vp9_short_idct8x8_10_add(input, dest, stride);
+    else
+      vp9_short_idct8x8_add(input, dest, stride);
+  }
+}
+
+void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob) {
+  /* The calculation can be simplified if there are not many non-zero dct
+   * coefficients. Use eobs to separate different cases. */
+  if (eob) {
+    if (eob == 1)
+      /* DC only DCT coefficient. */
+      vp9_short_idct16x16_1_add(input, dest, stride);
+    else if (eob <= 10)
+      vp9_short_idct16x16_10_add(input, dest, stride);
+    else
+      vp9_short_idct16x16_add(input, dest, stride);
+  }
+}
+
+void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob) {
+  if (eob) {
+    if (eob == 1)
+      vp9_short_idct32x32_1_add(input, dest, stride);
+    else
+      vp9_short_idct32x32_add(input, dest, stride);
+  }
+}
+
+// iht
+void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
+                   int eob) {
+  if (tx_type == DCT_DCT)
+    vp9_idct4x4_add(input, dest, stride, eob);
+  else
+    vp9_short_iht4x4_add(input, dest, stride, tx_type);
+}
+
+void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                       int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add_8x8(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_short_iht8x8_add(input, dest, stride, tx_type);
+    }
+  }
+}
+
+void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                         int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add_16x16(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_short_iht16x16_add(input, dest, stride, tx_type);
+    }
+  }
 }
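
In the DC-only paths above, cospi_16_64 is 11585, about 2^14 * cos(pi/4), so
two dct_const_round_shift() passes scale input[0] by roughly 1/2 and the final
ROUND_POWER_OF_TWO(out, 6) brings the added constant to about input[0] / 128.
A self-contained check of that arithmetic, with the constants restated
locally:

    #include <assert.h>
    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) \
        (((value) + (1 << ((n) - 1))) >> (n))

    static const int cospi_16_64 = 11585;  /* 2^14 * cos(pi/4), rounded */

    static int16_t dct_const_round_shift(int input) {
      return (int16_t)ROUND_POWER_OF_TWO(input, 14);
    }

    static void check_dc_shortcut(void) {
      int16_t out = dct_const_round_shift(1024 * cospi_16_64);
      out = dct_const_round_shift(out * cospi_16_64);
      assert(ROUND_POWER_OF_TWO(out, 6) == 1024 / 128);  /* a1 == 8 */
    }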
index 59892cd03ec989523802b5df7dd833fa28d05107..a15b6d36f4032cad57e03557c4f4d25c06052b7d 100644 (file)
@@ -16,6 +16,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
 
 
 // Constants and Macros used by all idct/dct functions
@@ -86,4 +87,21 @@ typedef struct {
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
+
+void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob);
+
+void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                 int stride, int eob);
+
+void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                     int stride, int eob);
+
+void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                       int stride, int eob);
+
+
 #endif  // VP9_COMMON_VP9_IDCT_H_
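
A hypothetical caller of the dispatch helpers declared above, to show how eob
and tx_type select a kernel; the wrapper name and its surroundings are
illustrative, not from this change:

    #include <assert.h>
    #include "vp9/common/vp9_idct.h"  /* also pulls in vp9_enums.h now */

    static void reconstruct_block(TX_SIZE tx_size, TX_TYPE tx_type,
                                  int16_t *dqcoeff, uint8_t *dst,
                                  int stride, int eob) {
      switch (tx_size) {
        case TX_4X4:   vp9_iht_add(tx_type, dqcoeff, dst, stride, eob); break;
        case TX_8X8:   vp9_iht_add_8x8(tx_type, dqcoeff, dst, stride, eob);
                       break;
        case TX_16X16: vp9_iht_add_16x16(tx_type, dqcoeff, dst, stride, eob);
                       break;
        case TX_32X32: vp9_idct_add_32x32(dqcoeff, dst, stride, eob); break;
        default:       assert(0 && "unhandled tx_size");
      }
    }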
index 6e425e8fbd2f808573fd011dad9430bdb1630e81..85ac6d2bfd77b06c949846a3748e637817195dbf 100644 (file)
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_reconinter.h"
index 91d40ac9703689b0e5168db17442d46e2b7c772b..c698090d81288df90a1e2ac55d85e99cc795d3d8 100644 (file)
@@ -12,7 +12,7 @@
 #define VP9_COMMON_VP9_LOOPFILTER_H_
 
 #include "vpx_ports/mem.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_seg_common.h"
index 88130d801ed3421440f96186681a88a60722c751..2c4bf6cb2378933a7668964f0bd53d7abedb7a47 100644 (file)
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
index a444b85554b08dce5281d82735deac9947720c2d..659079639073a5f587a498dc2be51cba8e9308f7 100644 (file)
@@ -108,7 +108,7 @@ static const int idx_n_column_to_subblock[4][2] = {
 };
 
 // clamp_mv_ref
-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
 
 static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
   clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
index f424e6a12de72679f3bfd23805cfaaddc3ad8767..acb4724e5f76b7acdeacc5dd16b0e47d4b5640b8 100644 (file)
@@ -13,7 +13,7 @@
 
 #ifdef __cplusplus
 extern "C"
-{
+{ // NOLINT
 #endif
 
 #include "./vpx_config.h"
@@ -33,7 +33,6 @@ extern "C"
     FOURFIVE    = 1,
     THREEFIVE   = 2,
     ONETWO      = 3
-
   } VPX_SCALING;
 
   typedef enum {
@@ -71,42 +70,48 @@ extern "C"
                   //   3 - lowest quality/fastest decode
     int width;  // width of data passed to the compressor
     int height;  // height of data passed to the compressor
-    double framerate;       // set to passed in framerate
-    int64_t target_bandwidth;    // bandwidth to be used in kilobits per second
+    double framerate;  // set to passed in framerate
+    int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
 
-    int noise_sensitivity;   // parameter used for applying pre processing blur: recommendation 0
-    int Sharpness;          // parameter used for sharpening output: recommendation 0:
+    int noise_sensitivity;  // pre-processing blur: recommendation 0
+    int Sharpness;  // sharpening output: recommendation 0
     int cpu_used;
     unsigned int rc_max_intra_bitrate_pct;
 
     // mode ->
-    // (0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing
-    //    a television signal or feed from a live camera). ( speed setting controls how fast )
-    // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
-    //    encode the output. ( speed setting controls how fast )
-    // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
-    //    speed. The output is compressed at the highest possible quality. This option takes the longest
-    //    amount of time to encode. ( speed setting ignored )
-    // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
-    //    pass. ( speed setting controls how fast )
-    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
-    //    pass to create the compressed output. ( speed setting controls how fast )
-    // (5)=Two Pass - Second Pass Best.  The encoder uses the statistics that were generated in the first
-    //    encoding pass to create the compressed output using the highest possible quality, and taking a
+    // (0)=Realtime/Live Encoding. This mode is optimized for realtime
+    //     encoding (for example, capturing a television signal or feed from
+    //     a live camera). ( speed setting controls how fast )
+    // (1)=Good Quality Fast Encoding. The encoder balances quality with the
+    //     amount of time it takes to encode the output. ( speed setting
+    //     controls how fast )
+    // (2)=One Pass - Best Quality. The encoder places priority on the
+    //     quality of the output over encoding speed. The output is compressed
+    //     at the highest possible quality. This option takes the longest
+    //     amount of time to encode. ( speed setting ignored )
+    // (3)=Two Pass - First Pass. The encoder generates a file of statistics
+    //     for use in the second encoding pass. ( speed setting controls how
+    //     fast )
+    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were
+    //     generated in the first encoding pass to create the compressed
+    //     output. ( speed setting controls how fast )
+    // (5)=Two Pass - Second Pass Best.  The encoder uses the statistics that
+    //     were generated in the first encoding pass to create the compressed
+    //     output using the highest possible quality, and taking a
    //     longer amount of time to encode. ( speed setting ignored )
-    int Mode;               //
+    int Mode;
 
     // Key Framing Operations
-    int auto_key;            // automatically detect cut scenes and set the keyframes
-    int key_freq;            // maximum distance to key frame.
+    int auto_key;  // autodetect cut scenes and set the keyframes
+    int key_freq;  // maximum distance to key frame.
 
-    int allow_lag;           // allow lagged compression (if 0 lagin frames is ignored)
-    int lag_in_frames;        // how many frames lag before we start encoding
+    int allow_lag;  // allow lagged compression (if 0, lag_in_frames is ignored)
+    int lag_in_frames;  // how many frames lag before we start encoding
 
     // ----------------------------------------------------------------
     // DATARATE CONTROL OPTIONS
 
-    int end_usage; // vbr or cbr
+    int end_usage;  // vbr or cbr
 
     // buffer targeting aggressiveness
     int under_shoot_pct;
@@ -138,7 +143,7 @@ extern "C"
     int play_alternate;
     int alt_freq;
 
-    int encode_breakout;  // early breakout encode threshold : for video conf recommend 800
+    int encode_breakout;  // early breakout : for video conf recommend 800
 
     /* Bitfield defining the error resiliency features to enable.
      * Can provide decodable frames after losses in previous
@@ -173,8 +178,8 @@ extern "C"
 
   void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
 
-// receive a frames worth of data caller can assume that a copy of this frame is made
-// and not just a copy of the pointer..
+  // receive a frame's worth of data. Caller can assume that a copy of this
+  // frame is made and not just a copy of the pointer.
   int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
                             YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                             int64_t end_time_stamp);
index 44948ff4dd00da91ba3764c197907132767a690c..953764c859d7baf1cf247dd8f96d8a8369f22643 100644 (file)
@@ -11,9 +11,9 @@
 #ifndef VP9_COMMON_VP9_ONYXC_INT_H_
 #define VP9_COMMON_VP9_ONYXC_INT_H_
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
@@ -120,7 +120,7 @@ typedef struct VP9Common {
 
   YV12_BUFFER_CONFIG post_proc_buffer;
 
-  FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
+  FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
   FRAME_TYPE frame_type;
 
   int show_frame;
@@ -291,10 +291,6 @@ static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
   xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end);
 }
 
-static int get_token_alloc(int mb_rows, int mb_cols) {
-  return mb_rows * mb_cols * (48 * 16 + 4);
-}
-
 static void set_prev_mi(VP9_COMMON *cm) {
   const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
                                        cm->height == cm->last_height &&
index 955e6766a31b8280b094b4efa949e64631c66a09..212a28ab9764f290ca7eb252ceaafb2305b98deb 100644 (file)
@@ -8,6 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
 
 #include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-
 #define RGB_TO_YUV(t)                                            \
   ( (0.257*(float)(t >> 16))  + (0.504*(float)(t >> 8 & 0xff)) + \
     (0.098*(float)(t & 0xff)) + 16),                             \
@@ -155,7 +153,6 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
     p_dst = dst_ptr;
 
     for (col = 0; col < cols; col++) {
-
       int kernel = 4;
       int v = p_src[col];
 
@@ -257,7 +254,7 @@ void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
 void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
                             int rows, int cols, int flimit) {
   int r, c, i;
-  const short *rv3 = &vp9_rv[63 & rand()];
+  const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT
 
   for (c = 0; c < cols; c++) {
     uint8_t *s = &dst[c];
@@ -408,7 +405,6 @@ static void fillrd(struct postproc_state *state, int q, int a) {
 
         next = next + j;
       }
-
     }
 
     for (; next < 256; next++)
@@ -416,7 +412,7 @@ static void fillrd(struct postproc_state *state, int q, int a) {
   }
 
   for (i = 0; i < 3072; i++) {
-    state->noise[i] = char_dist[rand() & 0xff];
+    state->noise[i] = char_dist[rand() & 0xff];  // NOLINT
   }
 
   for (i = 0; i < 16; i++) {
@@ -680,13 +676,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
 #if 0 && CONFIG_POSTPROC_VISUALIZER
   if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
     char message[512];
-    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
-            (cm->frame_type == KEY_FRAME),
-            cm->refresh_golden_frame,
-            cm->base_qindex,
-            cm->filter_level,
-            flags,
-            cm->mb_cols, cm->mb_rows);
+    snprintf(message, sizeof(message) -1,
+             "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+             (cm->frame_type == KEY_FRAME),
+             cm->refresh_golden_frame,
+             cm->base_qindex,
+             cm->filter_level,
+             flags,
+             cm->mb_cols, cm->mb_rows);
     vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
                   cm->post_proc_buffer.y_stride);
   }
@@ -707,7 +704,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
       for (j = 0; j < mb_cols; j++) {
         char zz[4];
 
-        sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+        snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a');
 
         vp9_blit_text(zz, y_ptr, post->y_stride);
         mb_index++;
@@ -716,7 +713,6 @@ int vp9_post_proc_frame(struct VP9Common *cm,
 
       mb_index++; /* border */
       y_ptr += post->y_stride  * 16 - post->y_width;
-
     }
   }
 
@@ -740,9 +736,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
                         mi[mb_index].mbmi.skip_coeff);
 
         if (cm->frame_type == KEY_FRAME)
-          sprintf(zz, "a");
+          snprintf(zz, sizeof(zz) - 1, "a");
         else
-          sprintf(zz, "%c", dc_diff + '0');
+          snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0');
 
         vp9_blit_text(zz, y_ptr, post->y_stride);
         mb_index++;
@@ -751,7 +747,6 @@ int vp9_post_proc_frame(struct VP9Common *cm,
 
       mb_index++; /* border */
       y_ptr += post->y_stride  * 16 - post->y_width;
-
     }
   }
 
@@ -894,8 +889,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
 
             constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
             vp9_blit_line(lx0,  x1, ly0 + 1,  y1, y_buffer, y_stride);
-          } else
+          } else {
             vp9_blit_line(lx0,  x1, ly0,  y1, y_buffer, y_stride);
+          }
         }
 
         mi++;
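
A note on the sprintf-to-snprintf conversions above: snprintf's size argument
already includes room for the terminating NUL, so passing sizeof(buf) would be
sufficient; the sizeof(buf) - 1 used here is simply one byte more
conservative. For example:

    #include <stdio.h>

    static void snprintf_example(void) {
      char zz[4];
      /* writes "c" plus the terminator; never more than 3 bytes total here */
      snprintf(zz, sizeof(zz) - 1, "%c", 'a' + 2);
    }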
index 81fbf1f2664ee7ef03cf809da44d7a54cedb8328..e89683150de5026ab8793dcd214e248ce4abb5d2 100644 (file)
@@ -392,11 +392,6 @@ void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
   xd->this_mi->mbmi.seg_id_predicted = pred_flag;
 }
 
-void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                              uint8_t pred_flag) {
-  xd->this_mi->mbmi.skip_coeff = pred_flag;
-}
-
 int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
                        BLOCK_SIZE bsize, int mi_row, int mi_col) {
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
index 47ca8abd87a4f6cac6c0f0886185e86e60850f5d..9230c4531d332310333b02d3da1b4d8904a7899c 100644 (file)
@@ -52,9 +52,6 @@ static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
   return xd->this_mi->mbmi.skip_coeff;
 }
 
-void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                              uint8_t pred_flag);
-
 unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
 
 unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
@@ -69,8 +66,9 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
                                                     const MACROBLOCKD *xd);
 
 
-static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
-                                                          const MACROBLOCKD *xd) {
+static INLINE
+vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
+                                            const MACROBLOCKD *xd) {
   const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd);
   return cm->fc.comp_inter_prob[pred_context];
 }
index bc40854a3af59cad9e882d14edd984574420dfe8..6dbdb421623f7b0a00ae605e852648e72c83fcdf 100644 (file)
 
 #if 1
 static const int16_t dc_qlookup[QINDEX_RANGE] = {
-     4,    8,    8,    9,   10,   11,   12,   12,
-    13,   14,   15,   16,   17,   18,   19,   19,
-    20,   21,   22,   23,   24,   25,   26,   26,
-    27,   28,   29,   30,   31,   32,   32,   33,
-    34,   35,   36,   37,   38,   38,   39,   40,
-    41,   42,   43,   43,   44,   45,   46,   47,
-    48,   48,   49,   50,   51,   52,   53,   53,
-    54,   55,   56,   57,   57,   58,   59,   60,
-    61,   62,   62,   63,   64,   65,   66,   66,
-    67,   68,   69,   70,   70,   71,   72,   73,
-    74,   74,   75,   76,   77,   78,   78,   79,
-    80,   81,   81,   82,   83,   84,   85,   85,
-    87,   88,   90,   92,   93,   95,   96,   98,
-    99,  101,  102,  104,  105,  107,  108,  110,
-   111,  113,  114,  116,  117,  118,  120,  121,
-   123,  125,  127,  129,  131,  134,  136,  138,
-   140,  142,  144,  146,  148,  150,  152,  154,
-   156,  158,  161,  164,  166,  169,  172,  174,
-   177,  180,  182,  185,  187,  190,  192,  195,
-   199,  202,  205,  208,  211,  214,  217,  220,
-   223,  226,  230,  233,  237,  240,  243,  247,
-   250,  253,  257,  261,  265,  269,  272,  276,
-   280,  284,  288,  292,  296,  300,  304,  309,
-   313,  317,  322,  326,  330,  335,  340,  344,
-   349,  354,  359,  364,  369,  374,  379,  384,
-   389,  395,  400,  406,  411,  417,  423,  429,
-   435,  441,  447,  454,  461,  467,  475,  482,
-   489,  497,  505,  513,  522,  530,  539,  549,
-   559,  569,  579,  590,  602,  614,  626,  640,
-   654,  668,  684,  700,  717,  736,  755,  775,
-   796,  819,  843,  869,  896,  925,  955,  988,
+  4,       8,    8,    9,   10,   11,   12,   12,
+  13,     14,   15,   16,   17,   18,   19,   19,
+  20,     21,   22,   23,   24,   25,   26,   26,
+  27,     28,   29,   30,   31,   32,   32,   33,
+  34,     35,   36,   37,   38,   38,   39,   40,
+  41,     42,   43,   43,   44,   45,   46,   47,
+  48,     48,   49,   50,   51,   52,   53,   53,
+  54,     55,   56,   57,   57,   58,   59,   60,
+  61,     62,   62,   63,   64,   65,   66,   66,
+  67,     68,   69,   70,   70,   71,   72,   73,
+  74,     74,   75,   76,   77,   78,   78,   79,
+  80,     81,   81,   82,   83,   84,   85,   85,
+  87,     88,   90,   92,   93,   95,   96,   98,
+  99,    101,  102,  104,  105,  107,  108,  110,
+  111,   113,  114,  116,  117,  118,  120,  121,
+  123,   125,  127,  129,  131,  134,  136,  138,
+  140,   142,  144,  146,  148,  150,  152,  154,
+  156,   158,  161,  164,  166,  169,  172,  174,
+  177,   180,  182,  185,  187,  190,  192,  195,
+  199,   202,  205,  208,  211,  214,  217,  220,
+  223,   226,  230,  233,  237,  240,  243,  247,
+  250,   253,  257,  261,  265,  269,  272,  276,
+  280,   284,  288,  292,  296,  300,  304,  309,
+  313,   317,  322,  326,  330,  335,  340,  344,
+  349,   354,  359,  364,  369,  374,  379,  384,
+  389,   395,  400,  406,  411,  417,  423,  429,
+  435,   441,  447,  454,  461,  467,  475,  482,
+  489,   497,  505,  513,  522,  530,  539,  549,
+  559,   569,  579,  590,  602,  614,  626,  640,
+  654,   668,  684,  700,  717,  736,  755,  775,
+  796,   819,  843,  869,  896,  925,  955,  988,
   1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
 };
 
 static const int16_t ac_qlookup[QINDEX_RANGE] = {
-     4,    8,    9,   10,   11,   12,   13,   14,
-    15,   16,   17,   18,   19,   20,   21,   22,
-    23,   24,   25,   26,   27,   28,   29,   30,
-    31,   32,   33,   34,   35,   36,   37,   38,
-    39,   40,   41,   42,   43,   44,   45,   46,
-    47,   48,   49,   50,   51,   52,   53,   54,
-    55,   56,   57,   58,   59,   60,   61,   62,
-    63,   64,   65,   66,   67,   68,   69,   70,
-    71,   72,   73,   74,   75,   76,   77,   78,
-    79,   80,   81,   82,   83,   84,   85,   86,
-    87,   88,   89,   90,   91,   92,   93,   94,
-    95,   96,   97,   98,   99,  100,  101,  102,
-   104,  106,  108,  110,  112,  114,  116,  118,
-   120,  122,  124,  126,  128,  130,  132,  134,
-   136,  138,  140,  142,  144,  146,  148,  150,
-   152,  155,  158,  161,  164,  167,  170,  173,
-   176,  179,  182,  185,  188,  191,  194,  197,
-   200,  203,  207,  211,  215,  219,  223,  227,
-   231,  235,  239,  243,  247,  251,  255,  260,
-   265,  270,  275,  280,  285,  290,  295,  300,
-   305,  311,  317,  323,  329,  335,  341,  347,
-   353,  359,  366,  373,  380,  387,  394,  401,
-   408,  416,  424,  432,  440,  448,  456,  465,
-   474,  483,  492,  501,  510,  520,  530,  540,
-   550,  560,  571,  582,  593,  604,  615,  627,
-   639,  651,  663,  676,  689,  702,  715,  729,
-   743,  757,  771,  786,  801,  816,  832,  848,
-   864,  881,  898,  915,  933,  951,  969,  988,
+  4,       8,    9,   10,   11,   12,   13,   14,
+  15,     16,   17,   18,   19,   20,   21,   22,
+  23,     24,   25,   26,   27,   28,   29,   30,
+  31,     32,   33,   34,   35,   36,   37,   38,
+  39,     40,   41,   42,   43,   44,   45,   46,
+  47,     48,   49,   50,   51,   52,   53,   54,
+  55,     56,   57,   58,   59,   60,   61,   62,
+  63,     64,   65,   66,   67,   68,   69,   70,
+  71,     72,   73,   74,   75,   76,   77,   78,
+  79,     80,   81,   82,   83,   84,   85,   86,
+  87,     88,   89,   90,   91,   92,   93,   94,
+  95,     96,   97,   98,   99,  100,  101,  102,
+  104,   106,  108,  110,  112,  114,  116,  118,
+  120,   122,  124,  126,  128,  130,  132,  134,
+  136,   138,  140,  142,  144,  146,  148,  150,
+  152,   155,  158,  161,  164,  167,  170,  173,
+  176,   179,  182,  185,  188,  191,  194,  197,
+  200,   203,  207,  211,  215,  219,  223,  227,
+  231,   235,  239,  243,  247,  251,  255,  260,
+  265,   270,  275,  280,  285,  290,  295,  300,
+  305,   311,  317,  323,  329,  335,  341,  347,
+  353,   359,  366,  373,  380,  387,  394,  401,
+  408,   416,  424,  432,  440,  448,  456,  465,
+  474,   483,  492,  501,  510,  520,  530,  540,
+  550,   560,  571,  582,  593,  604,  615,  627,
+  639,   651,  663,  676,  689,  702,  715,  729,
+  743,   757,  771,  786,  801,  816,  832,  848,
+  864,   881,  898,  915,  933,  951,  969,  988,
   1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
   1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
   1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
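
Both lookup tables map the 8-bit quantizer index onto a transform-domain step
size, with the DC table rising slightly more slowly than the AC one. A minimal
sketch of the accessors that consume them, assuming the clamp() helper and the
MAXQ bound from vp9_common.h / vp9_quant_common.h:

    // Sketch: translate qindex plus a per-frame delta into a step size,
    // keeping the effective index inside the table bounds.
    int16_t vp9_dc_quant(int qindex, int delta) {
      return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
    }

    int16_t vp9_ac_quant(int qindex, int delta) {
      return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
    }
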
index 0f2e4e999e36e944a4626f4537ba505a6c5e500a..b3b9e1d8a14829ffb2f83f4a4c8a98bbe89b8dab 100644 (file)
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
-
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
   if (xd->mi_8x8 && xd->this_mi) {
-    MB_MODE_INFO * mbmi = &xd->this_mi->mbmi;
+    MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
 
-    set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
-                      cm->active_ref_scale);
+    set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME,
+                          mbmi->ref_frame[1] - LAST_FRAME,
+                          cm->active_ref_scale);
   } else {
     set_scale_factors(xd, -1, -1, cm->active_ref_scale);
   }
 
-  switch (mcomp_filter_type) {
-    case EIGHTTAP:
-    case SWITCHABLE:
-      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;
-      break;
-    case EIGHTTAP_SMOOTH:
-      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;
-      break;
-    case EIGHTTAP_SHARP:
-      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;
-      break;
-    case BILINEAR:
-      xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
-      break;
-  }
+  xd->subpix.filter_x = xd->subpix.filter_y =
+      vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ?
+                               EIGHTTAP : mcomp_filter_type);
+
   assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
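
The four-way switch over filter types is folded into vp9_get_filter_kernel();
SWITCHABLE is mapped to EIGHTTAP before the call. A plausible shape for the
helper, assuming it simply returns the tap tables the removed switch selected
(hypothetical body, shown for illustration only):

    // Sketch: one lookup replaces the per-call-site switch.
    const int16_t (*vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type))[8] {
      switch (type) {
        case EIGHTTAP_SMOOTH: return vp9_sub_pel_filters_8lp;
        case EIGHTTAP_SHARP:  return vp9_sub_pel_filters_8s;
        case BILINEAR:        return vp9_bilinear_filters;
        case EIGHTTAP:
        default:              return vp9_sub_pel_filters_8;
      }
    }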
 
@@ -132,7 +121,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
   const int x = 4 * (block & ((1 << bwl) - 1));
   const int y = 4 * (block >> bwl);
   const MODE_INFO *mi = xd->this_mi;
-  const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
+  const int is_compound = has_second_ref(&mi->mbmi);
   int ref;
 
   assert(x < bw);
@@ -140,7 +129,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
   assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
   assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
 
-  for (ref = 0; ref < 1 + use_second_ref; ++ref) {
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
     struct scale_factors *const scale = &xd->scale_factor[ref];
     struct buf_2d *const pre_buf = &pd->pre[ref];
     struct buf_2d *const dst_buf = &pd->dst;
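
has_second_ref() names the compound-prediction test that was previously
written out as ref_frame[1] > 0; since INTRA_FRAME is 0, the two forms agree.
A sketch of the presumed helper:

    // Sketch: a block is compound-predicted iff it carries a valid
    // second reference frame.
    static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[1] > INTRA_FRAME;
    }
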
index 4a451b909464080766ca4fc40da2727e4b43a697..bd609dcf0d43ca387c9c2ad1bfe540b4c3dd3d29 100644 (file)
@@ -13,7 +13,7 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/vpx_once.h"
 
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
index 72613ae079e8b31d1038c686d4d3e7dc0c59626e..dc15a84ff172ec27e3e0857eeea12c045bde9836 100644 (file)
@@ -7,9 +7,9 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #define RTCD_C
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
 void vpx_scale_rtcd(void);
index 864a84095cfc614632cc89e837ddde24fc70b754..67dced2102162fdc98e9328785adefe4eadba273 100644 (file)
@@ -27,22 +27,6 @@ forward_decls vp9_common_forward_decls
 # this variable is for functions that are 64 bit only.
 [ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
 
-#
-# Dequant
-#
-
-prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add_16x16
-
-prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add_8x8
-
-prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add
-
-prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob"
-specialize vp9_idct_add_32x32
-
 #
 # RECON
 #
@@ -202,17 +186,6 @@ specialize vp9_dc_left_predictor_32x32
 prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_128_predictor_32x32
 
-if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_8x8 sse2 neon
-
-prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_16x16 sse2 neon
-
-prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_32x32 sse2 neon
-fi
-
 #
 # Loopfilter
 #
@@ -268,37 +241,37 @@ specialize vp9_blend_b
 # Sub Pixel Filters
 #
 prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_copy $sse2_x86inc neon
+specialize vp9_convolve_copy $sse2_x86inc neon dspr2
 
 prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_avg $sse2_x86inc neon
+specialize vp9_convolve_avg $sse2_x86inc neon dspr2
 
 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3 neon
+specialize vp9_convolve8 ssse3 neon dspr2
 
 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz ssse3 neon
+specialize vp9_convolve8_horiz ssse3 neon dspr2
 
 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert ssse3 neon
+specialize vp9_convolve8_vert ssse3 neon dspr2
 
 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3 neon
+specialize vp9_convolve8_avg ssse3 neon dspr2
 
 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz ssse3 neon
+specialize vp9_convolve8_avg_horiz ssse3 neon dspr2
 
 prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert ssse3 neon
+specialize vp9_convolve8_avg_vert ssse3 neon dspr2
 
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_1_add sse2 neon
+prototype void vp9_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_1_add sse2 neon
 
-prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_add sse2 neon
+prototype void vp9_idct4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_16_add sse2 neon
 
 prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_1_add sse2 neon
@@ -306,8 +279,8 @@ specialize vp9_short_idct8x8_1_add sse2 neon
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2 neon
 
-prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_8x8_add sse2 neon
+prototype void vp9_short_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_10_add sse2 neon
 
 prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct16x16_1_add sse2 neon
@@ -315,14 +288,14 @@ specialize vp9_short_idct16x16_1_add sse2 neon
 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct16x16_add sse2 neon
 
-prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_16x16_add sse2 neon
+prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_10_add sse2 neon
 
 prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct32x32_add sse2 neon
 
-prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_32x32
+prototype void vp9_short_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_1_add sse2
 
 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht4x4_add sse2 neon
@@ -337,11 +310,11 @@ prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
 specialize vp9_idct4_1d sse2
 # dct and add
 
-prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_1_add
+prototype void vp9_iwht4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_1_add
 
-prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_add
+prototype void vp9_iwht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_16_add
 
 #
 # Encoder functions below this point.
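
Each prototype/specialize pair in this script is expanded by
build/make/rtcd.sh into the generated vp9_rtcd.h: one declaration per
implementation, plus a dispatch pointer that RTCD initialization routes to
the fastest variant the running CPU supports. Roughly, for one of the renamed
entries (illustrative excerpt, not the literal generated header):

    // Sketch of the generated run-time dispatch for a single entry.
    void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride);
    void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest,
                                 int dest_stride);
    RTCD_EXTERN void (*vp9_idct4x4_16_add)(int16_t *input, uint8_t *dest,
                                           int dest_stride);
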
index 7a720d035e5b8ead3adb3ec5f1e444f1254506b0..ece011477b288f09b8d5542bac6c57ca1a298a64 100644 (file)
@@ -48,4 +48,4 @@ static int vp9_is_scaled(const struct scale_factors *sf) {
          sf->y_scale_fp != REF_NO_SCALE;
 }
 
-#endif  //  VP9_COMMON_VP9_SCALE_H_
+#endif  // VP9_COMMON_VP9_SCALE_H_
diff --git a/vp9/common/vp9_subpelvar.h b/vp9/common/vp9_subpelvar.h
deleted file mode 100644 (file)
index fe75481..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SUBPELVAR_H_
-#define VP9_COMMON_VP9_SUBPELVAR_H_
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
-
-static void variance(const uint8_t *src_ptr,
-                     int  source_stride,
-                     const uint8_t *ref_ptr,
-                     int  recon_stride,
-                     int  w,
-                     int  h,
-                     unsigned int *sse,
-                     int *sum) {
-  int i, j;
-  int diff;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      diff = src_ptr[j] - ref_ptr[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
- *                  uint32_t src_pixels_per_line : Stride of input block.
- *                  uint32_t pixel_step        : Offset between filter input samples (see notes).
- *                  uint32_t output_height     : Input block height.
- *                  uint32_t output_width      : Input block width.
- *                  int32_t  *vp9_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : int32_t *output_ptr        : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
-                                              uint16_t *output_ptr,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const int16_t *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : int32_t  *src_ptr          : Pointer to source block.
- *                  uint32_t src_pixels_per_line : Stride of input block.
- *                  uint32_t pixel_step        : Offset between filter input samples (see notes).
- *                  uint32_t output_height     : Input block height.
- *                  uint32_t output_width      : Input block width.
- *                  int32_t  *vp9_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : uint16_t *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
-                                               uint8_t *output_ptr,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const int16_t *vp9_filter) {
-  unsigned int  i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-      src_ptr++;
-    }
-
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-#endif  // VP9_COMMON_VP9_SUBPELVAR_H_
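
Both of the deleted passes renormalize with ROUND_POWER_OF_TWO after the
two-tap multiply-accumulate. For reference, that macro (defined in
vp9_common.h) is the usual round-half-up shift:

    // Round to nearest: add half the divisor before shifting right.
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
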
index 2e21a5b3016f14cac93d125679a3571d5c393ef1..da1213d71530475c739f057ea83205ea2e78f70e 100644 (file)
@@ -25,8 +25,9 @@ static void tree2tok(struct vp9_token *const p, vp9_tree t,
     if (j <= 0) {
       p[-j].value = v;
       p[-j].len = l;
-    } else
+    } else {
       tree2tok(p, t, j, v, l);
+    }
   } while (++v & 1);
 }
 
@@ -65,11 +66,9 @@ static unsigned int convert_distribution(unsigned int i,
   return left + right;
 }
 
-void vp9_tree_probs_from_distribution(
-  vp9_tree tree,
-  vp9_prob probs          [ /* n-1 */ ],
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int tok0_offset) {
+void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */],
+                                      unsigned int branch_ct[/* n-1 */][2],
+                                      const unsigned int num_events[/* n */],
+                                      unsigned int tok0_offset) {
   convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset);
 }
index 8f740f4127f7de539af001a1b9d7a33a13f6314c..f03af33782e434a1ba991ccc5f2a52e0be1ce5a2 100644 (file)
@@ -15,7 +15,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -148,7 +148,7 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE4X4(dest, input3);
 }
 
-void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a;
@@ -985,7 +985,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
   RECON_AND_STORE(dest, in[7]);
 }
 
-void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_short_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -1014,7 +1014,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
 
   // Stage1
-  {
+  {  // NOLINT
     const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
     const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
 
@@ -1039,7 +1039,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   }
 
   // Stage2
-  {
+  {  // NOLINT
     const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
     const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
 
@@ -1069,7 +1069,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   }
 
   // Stage3
-  {
+  {  // NOLINT
     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
     stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
     stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
@@ -2456,7 +2456,7 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
   write_buffer_8x16(dest, in1, stride);
 }
 
-void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,
                                      int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3548,4 +3548,52 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
       dest += 8 - (stride * 32);
     }
   }
+}  // NOLINT
+
+void vp9_short_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 4; ++i) {
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    dest += 8 - (stride * 32);
+  }
 }
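
The new DC-only path mirrors the smaller block sizes: input[0] passes through
the 1-D DC butterfly twice (one cospi_16_64 scaling per dimension), is rounded
down to pixel precision, and the resulting constant is added to all 32x32
destination pixels. A scalar sketch of the same computation, assuming
clip_pixel() and dct_const_round_shift() from the common headers:

    // Sketch: C equivalent of the SSE2 DC-only 32x32 inverse transform.
    static void idct32x32_1_add(const int16_t *input, uint8_t *dest,
                                int stride) {
      int i, j;
      int a = dct_const_round_shift(input[0] * cospi_16_64);
      a = dct_const_round_shift(a * cospi_16_64);
      a = ROUND_POWER_OF_TWO(a, 6);  // final-stage rounding
      for (i = 0; i < 32; ++i) {
        for (j = 0; j < 32; ++j)
          dest[j] = clip_pixel(dest[j] + a);
        dest += stride;
      }
    }
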
index b0e8b181f0eba0c281fd8cf1d898203f0c371d3b..8870215a27b7640066857fd71d1a7c07c4f8c07a 100644 (file)
@@ -61,4 +61,4 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
 #endif
 #endif
 
-#endif
+#endif  // VP9_COMMON_X86_VP9_POSTPROC_X86_H_
index bbf9888caf67ba8e8414b2de7879788095c623c4..7a5cca056a387050d5fee195d13d5a41b15df055 100644 (file)
@@ -534,6 +534,21 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     ret
 
 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%macro HORIZx4_ROW 2
+    movdqa      %2,   %1
+    pshufb      %1,   [GLOBAL(shuf_t0t1)]
+    pshufb      %2,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   %1,   xmm6
+    pmaddubsw   %2,   xmm7
+
+    paddsw      %1,   %2
+    movdqa      %2,   %1
+    psrldq      %2,   8
+    paddsw      %1,   %2
+    paddsw      %1,   xmm5
+    psraw       %1,   7
+    packuswb    %1,   %1
+%endm
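
HORIZx4_ROW evaluates the 8-tap horizontal filter for four output pixels
using two pmaddubsw passes (taps packed as k0k1/k4k5 in one register and
k2k3/k6k7 in the other), a rounding add, and an arithmetic shift by
FILTER_BITS (7). Per output pixel, the arithmetic being vectorized is, in
scalar form (filter8 is an illustrative name):

    // Sketch: scalar equivalent of one 8-tap filtered output pixel.
    // src points three samples left of the output position.
    static uint8_t filter8(const uint8_t *src, const int16_t *filter) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[k] * filter[k];                    // pmaddubsw + paddsw
      return clip_pixel(ROUND_POWER_OF_TWO(sum, 7));  // paddsw krd; psraw 7
    }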
 
 %macro HORIZx4 1
     mov         rdx, arg(5)                 ;filter ptr
@@ -544,64 +559,84 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     movdqa      xmm4, [rdx]                 ;load filters
     movq        xmm5, rcx
     packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
+    pshuflw     xmm6, xmm4, 0b              ;k0_k1
+    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
+    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
+    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
+    pshufd      xmm5, xmm5, 0               ;rounding
 
     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
     movsxd      rdx, dword ptr arg(3)       ;output_pitch
     movsxd      rcx, dword ptr arg(4)       ;output_height
-
+    shr         rcx, 1
 .loop:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-    punpcklqdq  xmm0,   xmm3
+    ; Do two rows at once
+    movq        xmm0,   [rsi - 3]           ;load src
+    movq        xmm1,   [rsi + 5]
+    movq        xmm2,   [rsi + rax - 3]
+    movq        xmm3,   [rsi + rax + 5]
+    punpcklqdq  xmm0,   xmm1
+    punpcklqdq  xmm2,   xmm3
+
+    HORIZx4_ROW xmm0,   xmm1
+    HORIZx4_ROW xmm2,   xmm3
+%if %1
+    movd        xmm1,   [rdi]
+    pavgb       xmm0,   xmm1
+    movd        xmm3,   [rdi + rdx]
+    pavgb       xmm2,   xmm3
+%endif
+    movd        [rdi],  xmm0
+    movd        [rdi + rdx],  xmm2
 
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
+    lea         rsi,    [rsi + rax]
+    prefetcht0  [rsi + 4 * rax - 3]
+    lea         rsi,    [rsi + rax]
+    lea         rdi,    [rdi + 2 * rdx]
+    prefetcht0  [rsi + 2 * rax - 3]
 
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
+    dec         rcx
+    jnz         .loop
 
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
+    ; Do last row if output_height is odd
+    movsxd      rcx,    dword ptr arg(4)       ;output_height
+    and         rcx,    1
+    je          .done
 
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
+    movq        xmm0,   [rsi - 3]    ; load src
+    movq        xmm1,   [rsi + 5]
+    punpcklqdq  xmm0,   xmm1
 
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm4
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   krd
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
+    HORIZx4_ROW xmm0, xmm1
 %if %1
     movd        xmm1,   [rdi]
     pavgb       xmm0,   xmm1
 %endif
-    lea         rsi,    [rsi + rax]
     movd        [rdi],  xmm0
+.done:
+%endm
 
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .loop
+%macro HORIZx8_ROW 4
+    movdqa      %2,   %1
+    movdqa      %3,   %1
+    movdqa      %4,   %1
+
+    pshufb      %1,   [GLOBAL(shuf_t0t1)]
+    pshufb      %2,   [GLOBAL(shuf_t2t3)]
+    pshufb      %3,   [GLOBAL(shuf_t4t5)]
+    pshufb      %4,   [GLOBAL(shuf_t6t7)]
+
+    pmaddubsw   %1,   k0k1
+    pmaddubsw   %2,   k2k3
+    pmaddubsw   %3,   k4k5
+    pmaddubsw   %4,   k6k7
+
+    paddsw      %1,   %2
+    paddsw      %1,   %4
+    paddsw      %1,   %3
+    paddsw      %1,   krd
+    psraw       %1,   7
+    packuswb    %1,   %1
 %endm
 
 %macro HORIZx8 1
@@ -633,45 +668,51 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
     movsxd      rdx, dword ptr arg(3)       ;output_pitch
     movsxd      rcx, dword ptr arg(4)       ;output_height
+    shr         rcx, 1
 
 .loop:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
+    movq        xmm0,   [rsi - 3]           ;load src
+    movq        xmm3,   [rsi + 5]
+    movq        xmm4,   [rsi + rax - 3]
+    movq        xmm7,   [rsi + rax + 5]
     punpcklqdq  xmm0,   xmm3
+    punpcklqdq  xmm4,   xmm7
 
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
+    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
+    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
+%if %1
+    movq        xmm1,   [rdi]
+    movq        xmm2,   [rdi + rdx]
+    pavgb       xmm0,   xmm1
+    pavgb       xmm4,   xmm2
+%endif
+    movq        [rdi],  xmm0
+    movq        [rdi + rdx],  xmm4
 
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
+    lea         rsi,    [rsi + rax]
+    prefetcht0  [rsi + 4 * rax - 3]
+    lea         rsi,    [rsi + rax]
+    lea         rdi,    [rdi + 2 * rdx]
+    prefetcht0  [rsi + 2 * rax - 3]
+    dec         rcx
+    jnz         .loop
 
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
+    ; Do last row if output_height is odd
+    movsxd      rcx,    dword ptr arg(4)    ;output_height
+    and         rcx,    1
+    je          .done
 
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
+    movq        xmm0,   [rsi - 3]
+    movq        xmm3,   [rsi + 5]
+    punpcklqdq  xmm0,   xmm3
 
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm4
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   krd
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
+    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
 %if %1
     movq        xmm1,   [rdi]
     pavgb       xmm0,   xmm1
 %endif
-
-    lea         rsi,    [rsi + rax]
     movq        [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .loop
+.done:
 %endm
 
 %macro HORIZx16 1
@@ -705,60 +746,53 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     movsxd      rcx, dword ptr arg(4)       ;output_height
 
 .loop:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
+    prefetcht0  [rsi + 2 * rax -3]
 
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-    punpcklqdq  xmm0,   xmm3
+    movq        xmm0,   [rsi - 3]           ;load src data
+    movq        xmm4,   [rsi + 5]
+    movq        xmm7,   [rsi + 13]
+    punpcklqdq  xmm0,   xmm4
+    punpcklqdq  xmm4,   xmm7
 
     movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
+    movdqa      xmm2,   xmm0
+    movdqa      xmm3,   xmm0
+    movdqa      xmm5,   xmm4
+    movdqa      xmm6,   xmm4
+    movdqa      xmm7,   xmm4
 
-    movdqa      xmm2,   xmm1
+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
     pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
     pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
+    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
+    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
+    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
+    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
+    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
 
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
+    pmaddubsw   xmm0,   k0k1
+    pmaddubsw   xmm1,   k2k3
+    pmaddubsw   xmm2,   k4k5
+    pmaddubsw   xmm3,   k6k7
+    pmaddubsw   xmm4,   k0k1
+    pmaddubsw   xmm5,   k2k3
+    pmaddubsw   xmm6,   k4k5
+    pmaddubsw   xmm7,   k6k7
 
     paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm4
+    paddsw      xmm0,   xmm3
     paddsw      xmm0,   xmm2
+    paddsw      xmm4,   xmm5
+    paddsw      xmm4,   xmm7
+    paddsw      xmm4,   xmm6
+
     paddsw      xmm0,   krd
+    paddsw      xmm4,   krd
     psraw       xmm0,   7
+    psraw       xmm4,   7
     packuswb    xmm0,   xmm0
-
-
-    movq        xmm3,   [rsi +  5]
-    movq        xmm7,   [rsi + 13]
-    punpcklqdq  xmm3,   xmm7
-
-    movdqa      xmm1,   xmm3
-    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm3,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm3,   xmm1
-    paddsw      xmm3,   xmm4
-    paddsw      xmm3,   xmm2
-    paddsw      xmm3,   krd
-    psraw       xmm3,   7
-    packuswb    xmm3,   xmm3
-    punpcklqdq  xmm0,   xmm3
+    packuswb    xmm4,   xmm4
+    punpcklqdq  xmm0,   xmm4
 %if %1
     movdqa      xmm1,   [rdi]
     pavgb       xmm0,   xmm1
@@ -792,19 +826,8 @@ sym(vp9_filter_block1d4_h8_ssse3):
     push        rdi
     ; end prolog
 
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
     HORIZx4 0
 
-    add rsp, 16*5
-    pop rsp
-
     ; begin epilog
     pop rdi
     pop rsi
@@ -909,19 +932,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
     push        rdi
     ; end prolog
 
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
     HORIZx4 1
 
-    add rsp, 16*5
-    pop rsp
-
     ; begin epilog
     pop rdi
     pop rsi
diff --git a/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm b/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm
deleted file mode 100644 (file)
index 174e747..0000000
+++ /dev/null
@@ -1,230 +0,0 @@
-;
-;   Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;   Use of this source code is governed by a BSD-style license
-;   that can be found in the LICENSE file in the root of the source
-;   tree. An additional intellectual property rights grant can be found
-;   in the file PATENTS.  All contributing project authors may
-;   be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp9_add_constant_residual_8x8_neon|
-    EXPORT |vp9_add_constant_residual_16x16_neon|
-    EXPORT |vp9_add_constant_residual_32x32_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    MACRO
-    LD_16x8 $src, $stride
-    vld1.8              {q8},       [$src],     $stride
-    vld1.8              {q9},       [$src],     $stride
-    vld1.8              {q10},      [$src],     $stride
-    vld1.8              {q11},      [$src],     $stride
-    vld1.8              {q12},      [$src],     $stride
-    vld1.8              {q13},      [$src],     $stride
-    vld1.8              {q14},      [$src],     $stride
-    vld1.8              {q15},      [$src],     $stride
-    MEND
-
-    MACRO
-    ADD_DIFF_16x8 $diff
-    vqadd.u8            q8,         q8,         $diff
-    vqadd.u8            q9,         q9,         $diff
-    vqadd.u8            q10,        q10,        $diff
-    vqadd.u8            q11,        q11,        $diff
-    vqadd.u8            q12,        q12,        $diff
-    vqadd.u8            q13,        q13,        $diff
-    vqadd.u8            q14,        q14,        $diff
-    vqadd.u8            q15,        q15,        $diff
-    MEND
-
-    MACRO
-    SUB_DIFF_16x8 $diff
-    vqsub.u8            q8,         q8,         $diff
-    vqsub.u8            q9,         q9,         $diff
-    vqsub.u8            q10,        q10,        $diff
-    vqsub.u8            q11,        q11,        $diff
-    vqsub.u8            q12,        q12,        $diff
-    vqsub.u8            q13,        q13,        $diff
-    vqsub.u8            q14,        q14,        $diff
-    vqsub.u8            q15,        q15,        $diff
-    MEND
-
-    MACRO
-    ST_16x8 $dst, $stride
-    vst1.8              {q8},       [$dst],     $stride
-    vst1.8              {q9},       [$dst],     $stride
-    vst1.8              {q10},      [$dst],     $stride
-    vst1.8              {q11},      [$dst],     $stride
-    vst1.8              {q12},      [$dst],     $stride
-    vst1.8              {q13},      [$dst],     $stride
-    vst1.8              {q14},      [$dst],     $stride
-    vst1.8              {q15},      [$dst],     $stride
-    MEND
-
-; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
-;                             int width, int height) {
-;  int r, c;
-;
-;  for (r = 0; r < height; r++) {
-;    for (c = 0; c < width; c++)
-;      dest[c] = clip_pixel(diff + dest[c]);
-;
-;    dest += stride;
-;  }
-;}
-;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
-;                                     int stride) {
-;  add_constant_residual(diff, dest, stride, 8, 8);
-;}
-;       r0      : const int16_t diff
-;       r1      : const uint8_t *dest
-;       r2      : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_8x8_neon| PROC
-    mov                 r3,         r1                      ; r3: save dest to r3
-    vld1.8              {d0},       [r1],       r2
-    vld1.8              {d1},       [r1],       r2
-    vld1.8              {d2},       [r1],       r2
-    vld1.8              {d3},       [r1],       r2
-    vld1.8              {d4},       [r1],       r2
-    vld1.8              {d5},       [r1],       r2
-    vld1.8              {d6},       [r1],       r2
-    vld1.8              {d7},       [r1],       r2
-    cmp                 r0,         #0
-    bge                 DIFF_POSITIVE_8x8
-
-DIFF_NEGATIVE_8x8                                           ; diff < 0
-    neg                 r0,         r0
-    usat                r0,         #8,         r0
-    vdup.u8             q8,         r0
-
-    vqsub.u8            q0,         q0,         q8
-    vqsub.u8            q1,         q1,         q8
-    vqsub.u8            q2,         q2,         q8
-    vqsub.u8            q3,         q3,         q8
-    b                   DIFF_SAVE_8x8
-
-DIFF_POSITIVE_8x8                                           ; diff >= 0
-    usat                r0,         #8,         r0
-    vdup.u8             q8,         r0
-
-    vqadd.u8            q0,         q0,         q8
-    vqadd.u8            q1,         q1,         q8
-    vqadd.u8            q2,         q2,         q8
-    vqadd.u8            q3,         q3,         q8
-
-DIFF_SAVE_8x8
-    vst1.8              {d0},       [r3],       r2
-    vst1.8              {d1},       [r3],       r2
-    vst1.8              {d2},       [r3],       r2
-    vst1.8              {d3},       [r3],       r2
-    vst1.8              {d4},       [r3],       r2
-    vst1.8              {d5},       [r3],       r2
-    vst1.8              {d6},       [r3],       r2
-    vst1.8              {d7},       [r3],       r2
-
-    bx                  lr
-    ENDP
-
-;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
-;                                       int stride) {
-;  add_constant_residual(diff, dest, stride, 16, 16);
-;}
-;       r0      : const int16_t diff
-;       r1      : const uint8_t *dest
-;       r2      : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_16x16_neon| PROC
-    mov                 r3,         r1
-    LD_16x8             r1,         r2
-    cmp                 r0,         #0
-    bge                 DIFF_POSITIVE_16x16
-
-|DIFF_NEGATIVE_16x16|
-    neg                 r0,         r0
-    usat                r0,         #8,         r0
-    vdup.u8             q0,         r0
-
-    SUB_DIFF_16x8       q0
-    ST_16x8             r3,         r2
-    LD_16x8             r1,         r2
-    SUB_DIFF_16x8       q0
-    b                   DIFF_SAVE_16x16
-
-|DIFF_POSITIVE_16x16|
-    usat                r0,         #8,         r0
-    vdup.u8             q0,         r0
-
-    ADD_DIFF_16x8       q0
-    ST_16x8             r3,         r2
-    LD_16x8             r1,         r2
-    ADD_DIFF_16x8       q0
-
-|DIFF_SAVE_16x16|
-    ST_16x8             r3,         r2
-    bx                  lr
-    ENDP
-
-;void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest,
-;                                       int stride) {
-;  add_constant_residual(diff, dest, stride, 32, 32);
-;}
-;       r0      : const int16_t diff
-;       r1      : const uint8_t *dest
-;       r2      : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_32x32_neon| PROC
-    push                {r4,lr}
-    pld                 [r1]
-    mov                 r3,         r1
-    add                 r4,         r1,         #16         ; r4 dest + 16 for second loop
-    cmp                 r0,         #0
-    bge                 DIFF_POSITIVE_32x32
-
-|DIFF_NEGATIVE_32x32|
-    neg                 r0,         r0
-    usat                r0,         #8,         r0
-    vdup.u8             q0,         r0
-    mov                 r0,         #4
-
-|DIFF_NEGATIVE_32x32_LOOP|
-    sub                 r0,         #1
-    LD_16x8             r1,         r2
-    SUB_DIFF_16x8       q0
-    ST_16x8             r3,         r2
-
-    LD_16x8             r1,         r2
-    SUB_DIFF_16x8       q0
-    ST_16x8             r3,         r2
-    cmp                 r0,         #2
-    moveq               r1,         r4
-    moveq               r3,         r4
-    cmp                 r0,         #0
-    bne                 DIFF_NEGATIVE_32x32_LOOP
-    pop                 {r4,pc}
-
-|DIFF_POSITIVE_32x32|
-    usat                r0,         #8,         r0
-    vdup.u8             q0,         r0
-    mov                 r0,         #4
-
-|DIFF_POSITIVE_32x32_LOOP|
-    sub                 r0,         #1
-    LD_16x8             r1,         r2
-    ADD_DIFF_16x8       q0
-    ST_16x8             r3,         r2
-
-    LD_16x8             r1,         r2
-    ADD_DIFF_16x8       q0
-    ST_16x8             r3,         r2
-    cmp                 r0,         #2
-    moveq               r1,         r4
-    moveq               r3,         r4
-    cmp                 r0,         #0
-    bne                 DIFF_POSITIVE_32x32_LOOP
-    pop                 {r4,pc}
-    ENDP
-
-    END
index dc12876b18d2640f1c05e6cb39a6edf99c22b6ad..27e5f2cda06d528fe2eaac7d71c5a8c68c82294e 100644 (file)
@@ -30,10 +30,26 @@ static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
 }
 
+static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
+                                            int size_group) {
+  const MB_PREDICTION_MODE y_mode = read_intra_mode(r,
+                                        cm->fc.y_mode_prob[size_group]);
+  ++cm->counts.y_mode[size_group][y_mode];
+  return y_mode;
+}
+
+static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
+                                             MB_PREDICTION_MODE y_mode) {
+  const MB_PREDICTION_MODE uv_mode = read_intra_mode(r,
+                                         cm->fc.uv_mode_prob[y_mode]);
+  ++cm->counts.uv_mode[y_mode][uv_mode];
+  return uv_mode;
+}
+
 static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
                                           uint8_t context) {
-  MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
-                            cm->fc.inter_mode_probs[context]);
+  const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
+                                             cm->fc.inter_mode_probs[context]);
   ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
   return mode;
 }
@@ -200,7 +216,6 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
 
 static int read_mv_component(vp9_reader *r,
                              const nmv_component *mvcomp, int usehp) {
-
   int mag, d, fr, hp;
   const int sign = vp9_read(r, mvcomp->sign);
   const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
@@ -348,16 +363,15 @@ static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB,
+                           &fc->switchable_interp_prob[j][i]);
 }
 
 static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &fc->inter_mode_probs[i][j]);
 }
 
 static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
@@ -388,9 +402,7 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
   mbmi->ref_frame[1] = NONE;
 
   if (bsize >= BLOCK_8X8) {
-    const int size_group = size_group_lookup[bsize];
-    mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
-    cm->counts.y_mode[size_group][mbmi->mode]++;
+    mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
   } else {
      // Only 4x4, 4x8, 8x4 blocks
      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];  // 1 or 2
@@ -400,10 +412,8 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
      for (idy = 0; idy < 2; idy += num_4x4_h) {
        for (idx = 0; idx < 2; idx += num_4x4_w) {
          const int ib = idy * 2 + idx;
-         const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+         const int b_mode = read_intra_mode_y(cm, r, 0);
          mi->bmi[ib].as_mode = b_mode;
-         cm->counts.y_mode[0][b_mode]++;
-
          if (num_4x4_h == 2)
            mi->bmi[ib + 2].as_mode = b_mode;
          if (num_4x4_w == 2)
@@ -413,8 +423,46 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
     mbmi->mode = mi->bmi[3].as_mode;
   }
 
-  mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
-  cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++;
+  mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
+}
+
+static INLINE void assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
+                             int_mv mv[2], int_mv best_mv[2],
+                             int_mv nearest_mv[2], int_mv near_mv[2],
+                             int is_compound, int allow_hp, vp9_reader *r) {
+  int i;
+
+  switch (mode) {
+    case NEWMV:
+      read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+              &cm->fc.nmvc, &cm->counts.mv, allow_hp);
+      if (is_compound)
+        read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+                &cm->fc.nmvc, &cm->counts.mv, allow_hp);
+      break;
+    case NEARESTMV:
+      mv[0].as_int = nearest_mv[0].as_int;
+      if (is_compound)
+        mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    case NEARMV:
+      mv[0].as_int = near_mv[0].as_int;
+      if (is_compound)
+        mv[1].as_int = near_mv[1].as_int;
+      break;
+    case ZEROMV:
+      mv[0].as_int = 0;
+      if (is_compound)
+        mv[1].as_int = 0;
+      break;
+    default:
+      assert(!"Invalid inter mode value.");
+  }
+
+  for (i = 0; i < 1 + is_compound; ++i) {
+    assert(mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW);
+    assert(mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW);
+  }
 }
 
 static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
@@ -436,15 +484,11 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
                                        int mi_row, int mi_col, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  nmv_context *const nmvc = &cm->fc.nmvc;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
-  int_mv *const mv0 = &mbmi->mv[0];
-  int_mv *const mv1 = &mbmi->mv[1];
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = xd->allow_high_precision_mv;
 
-  int_mv nearest, nearby, best_mv;
-  int_mv nearest_second, nearby_second, best_mv_second;
+  int_mv nearest[2], nearmv[2], best[2];
   uint8_t inter_mode_ctx;
   MV_REFERENCE_FRAME ref0;
   int is_compound;
@@ -469,8 +513,8 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
 
   // nearest, nearby
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
-    vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby);
-    best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
+    vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest[0], &nearmv[0]);
+    best[0].as_int = nearest[0].as_int;
   }
 
   if (is_compound) {
@@ -479,9 +523,8 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
                      ref1, mbmi->ref_mvs[ref1], mi_row, mi_col);
 
     if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
-      vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1],
-                            &nearest_second, &nearby_second);
-      best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int;
+      vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], &nearest[1], &nearmv[1]);
+      best[1].as_int = nearest[1].as_int;
     }
   }
 
@@ -496,50 +539,27 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
     int b_mode;
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
-        int_mv blockmv, secondmv;
+        int_mv block[2];
         const int j = idy * 2 + idx;
         b_mode = read_inter_mode(cm, r, inter_mode_ctx);
 
         if (b_mode == NEARESTMV || b_mode == NEARMV) {
-          vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0,
+          vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest[0],
+                                        &nearmv[0], j, 0,
                                         mi_row, mi_col);
 
           if (is_compound)
-            vp9_append_sub8x8_mvs_for_idx(cm, xd,  &nearest_second,
-                                         &nearby_second, j, 1,
-                                         mi_row, mi_col);
+            vp9_append_sub8x8_mvs_for_idx(cm, xd,  &nearest[1],
+                                          &nearmv[1], j, 1,
+                                          mi_row, mi_col);
         }
 
-        switch (b_mode) {
-          case NEWMV:
-            read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
-                    &cm->counts.mv, allow_hp);
-
-            if (is_compound)
-              read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
-                      &cm->counts.mv, allow_hp);
-            break;
-          case NEARESTMV:
-            blockmv.as_int = nearest.as_int;
-            if (is_compound)
-              secondmv.as_int = nearest_second.as_int;
-            break;
-          case NEARMV:
-            blockmv.as_int = nearby.as_int;
-            if (is_compound)
-              secondmv.as_int = nearby_second.as_int;
-            break;
-          case ZEROMV:
-            blockmv.as_int = 0;
-            if (is_compound)
-              secondmv.as_int = 0;
-            break;
-          default:
-            assert(!"Invalid inter mode value");
-        }
-        mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+        assign_mv(cm, b_mode, block, best, nearest, nearmv,
+                  is_compound, allow_hp, r);
+
+        mi->bmi[j].as_mv[0].as_int = block[0].as_int;
         if (is_compound)
-          mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
+          mi->bmi[j].as_mv[1].as_int = block[1].as_int;
 
         if (num_4x4_h == 2)
           mi->bmi[j + 2] = mi->bmi[j];
@@ -549,37 +569,12 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
     }
 
     mi->mbmi.mode = b_mode;
-    mv0->as_int = mi->bmi[3].as_mv[0].as_int;
-    mv1->as_int = mi->bmi[3].as_mv[1].as_int;
-  } else {
-    switch (mbmi->mode) {
-      case NEARMV:
-        mv0->as_int = nearby.as_int;
-        if (is_compound)
-          mv1->as_int = nearby_second.as_int;
-        break;
-
-      case NEARESTMV:
-        mv0->as_int = nearest.as_int;
-        if (is_compound)
-          mv1->as_int = nearest_second.as_int;
-        break;
-
-      case ZEROMV:
-        mv0->as_int = 0;
-        if (is_compound)
-          mv1->as_int = 0;
-        break;
 
-      case NEWMV:
-        read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp);
-        if (is_compound)
-          read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv,
-                  allow_hp);
-        break;
-      default:
-        assert(!"Invalid inter mode value");
-    }
+    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  } else {
+    assign_mv(cm, mbmi->mode, mbmi->mv, best, nearest, nearmv,
+              is_compound, allow_hp, r);
   }
 }
 
@@ -611,21 +606,17 @@ static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
 
   if (cm->comp_pred_mode == HYBRID_PREDICTION)
     for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_inter_prob[i]);
 
   if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
     for (i = 0; i < REF_CONTEXTS; i++) {
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][0]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][1]);
     }
 
   if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
     for (i = 0; i < REF_CONTEXTS; i++)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_ref_prob[i]);
 }
 
 void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
@@ -635,8 +626,7 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
   // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
   // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
   for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-    if (vp9_read(r, MODE_UPDATE_PROB))
-      vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
+    vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.mbskip_probs[k]);
 
   if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
     nmv_context *const nmvc = &pbi->common.fc.nmvc;
@@ -649,20 +639,18 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
       read_switchable_interp_probs(&cm->fc, r);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.intra_inter_prob[i]);
 
     read_comp_pred(cm, r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
-        if (vp9_read(r, MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
+        vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.y_mode_prob[j][i]);
 
     for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
       for (i = 0; i < PARTITION_TYPES - 1; ++i)
-        if (vp9_read(r, MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
+        vp9_diff_update_prob(r, MODE_UPDATE_PROB,
+                             &cm->fc.partition_prob[INTER_FRAME][j][i]);
 
     read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
   }
index 34ed0c7593f58e42d18e92bbc7b1a87c4b07f018..9038748b976a62ea5ff7c00350d65dcd173a48bb 100644 (file)
@@ -19,6 +19,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconintra.h"
@@ -31,7 +32,6 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_idct_blk.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_thread.h"
@@ -63,18 +63,15 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 3; ++j)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p8x8[i][j]);
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 2; ++j)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p16x16[i][j]);
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 1; ++j)
-      if (vp9_read(r, MODE_UPDATE_PROB))
-        vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
+      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p32x32[i][j]);
 }
 
 static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
@@ -92,32 +89,44 @@ static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
   int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
   const int stride = pd->dst.stride;
   const int eob = pd->eobs[block];
-  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
-                                                       block);
-  uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
-                                                 pd->dst.buf, stride);
-  switch (tx_size) {
-    case TX_4X4: {
-      const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
-      if (tx_type == DCT_DCT)
-        xd->itxm_add(qcoeff, dst, stride, eob);
+  if (eob > 0) {
+    TX_TYPE tx_type;
+    const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
+                                                         block);
+    uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
+                                                   pd->dst.buf, stride);
+    switch (tx_size) {
+      case TX_4X4:
+        tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
+        if (tx_type == DCT_DCT)
+          xd->itxm_add(qcoeff, dst, stride, eob);
+        else
+          vp9_iht_add(tx_type, qcoeff, dst, stride, eob);
+        break;
+      case TX_8X8:
+        tx_type = get_tx_type_8x8(pd->plane_type, xd);
+        vp9_iht_add_8x8(tx_type, qcoeff, dst, stride, eob);
+        break;
+      case TX_16X16:
+        tx_type = get_tx_type_16x16(pd->plane_type, xd);
+        vp9_iht_add_16x16(tx_type, qcoeff, dst, stride, eob);
+        break;
+      case TX_32X32:
+        tx_type = DCT_DCT;
+        vp9_idct_add_32x32(qcoeff, dst, stride, eob);
+        break;
+      default:
+        assert(!"Invalid transform size");
+    }
+
+    if (eob == 1) {
+      *((int32_t *)qcoeff) = 0;
+    } else {
+      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+        vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0]));
       else
-        vp9_iht_add_c(tx_type, qcoeff, dst, stride, eob);
-      break;
+        vpx_memset(qcoeff, 0, (16 << (tx_size << 1)) * sizeof(qcoeff[0]));
     }
-    case TX_8X8:
-      vp9_iht_add_8x8_c(get_tx_type_8x8(pd->plane_type, xd), qcoeff, dst,
-                        stride, eob);
-      break;
-    case TX_16X16:
-      vp9_iht_add_16x16_c(get_tx_type_16x16(pd->plane_type, xd), qcoeff, dst,
-                          stride, eob);
-      break;
-    case TX_32X32:
-      vp9_idct_add_32x32(qcoeff, dst, stride, eob);
-      break;
-    default:
-      assert(!"Invalid transform size");
   }
 }
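
Note: the rewritten decode_block() above clears only as many coefficients as
the chosen inverse transform could have read: the eob <= 10 fast path assumes
all nonzero coefficients sit in the first four rows of the row-major array,
so 4 * row_width entries suffice, while the general path clears the whole
block. A self-contained sketch of the two sizes (illustration only):

    #include <stdio.h>

    int main(void) {
      /* tx_size: 0 = TX_4X4, 1 = TX_8X8, 2 = TX_16X16, 3 = TX_32X32;
         the eob <= 10 partial clear is only taken up to TX_16X16. */
      int tx_size;
      for (tx_size = 0; tx_size <= 3; ++tx_size) {
        const int partial = 4 * (4 << tx_size);   /* first four rows   */
        const int full = 16 << (tx_size << 1);    /* whole coeff block */
        printf("tx_size %d: partial %4d, full %4d\n", tx_size, partial, full);
      }
      return 0;
    }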
 
@@ -215,10 +224,10 @@ static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) {
 
 static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
                            vp9_reader *r, BLOCK_SIZE bsize) {
-  VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
+  int eobtotal;
 
   if (less8x8)
     if (xd->ab_index > 0)
@@ -232,33 +241,31 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
 
   // Has to be called after set_offsets
   mbmi = &xd->this_mi->mbmi;
+  eobtotal = decode_tokens(pbi, bsize, r);
 
   if (!is_inter_block(mbmi)) {
     // Intra reconstruction
-    decode_tokens(pbi, bsize, r);
     foreach_transformed_block(xd, bsize, decode_block_intra, xd);
   } else {
     // Inter reconstruction
-    int eobtotal;
+    const int decode_blocks = (eobtotal > 0);
+
+    if (!less8x8) {
+      assert(mbmi->sb_type == bsize);
+      if (eobtotal == 0)
+        mbmi->skip_coeff = 1;  // skip loopfilter
+    }
 
     set_ref(pbi, 0, mi_row, mi_col);
     if (has_second_ref(mbmi))
       set_ref(pbi, 1, mi_row, mi_col);
 
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+    xd->subpix.filter_x = xd->subpix.filter_y =
+        vp9_get_filter_kernel(mbmi->interp_filter);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-    eobtotal = decode_tokens(pbi, bsize, r);
-    if (less8x8) {
-      if (eobtotal >= 0)
-        foreach_transformed_block(xd, bsize, decode_block, xd);
-    } else {
-      assert(mbmi->sb_type == bsize);
-      if (eobtotal == 0)
-        // skip loopfilter
-        vp9_set_pred_flag_mbskip(xd, bsize, 1);
-      else if (eobtotal > 0)
-        foreach_transformed_block(xd, bsize, decode_block, xd);
-    }
+
+    if (decode_blocks)
+      foreach_transformed_block(xd, bsize, decode_block, xd);
   }
   xd->corrupted |= vp9_reader_has_error(r);
 }
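
Note: decode_tokens() is now called once, before the intra/inter split, so
eobtotal is known up front; for whole (non-sub-8x8) inter blocks a zero
eobtotal sets mbmi->skip_coeff directly instead of going through
vp9_set_pred_flag_mbskip(), and residual reconstruction is gated on
eobtotal > 0.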
@@ -364,8 +371,8 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
           for (l = 0; l < PREV_COEF_CONTEXTS; l++)
             if (k > 0 || l < 3)
               for (m = 0; m < UNCONSTRAINED_NODES; m++)
-                if (vp9_read(r, VP9_COEF_UPDATE_PROB))
-                  vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
+                vp9_diff_update_prob(r, VP9_COEF_UPDATE_PROB,
+                                     &coef_probs[i][j][k][l][m]);
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
@@ -436,7 +443,6 @@ static void setup_segmentation(struct segmentation *seg,
 
 static void setup_loopfilter(struct loopfilter *lf,
                              struct vp9_read_bit_buffer *rb) {
-
   lf->filter_level = vp9_rb_read_literal(rb, 6);
   lf->sharpness_level = vp9_rb_read_literal(rb, 3);
 
@@ -484,15 +490,15 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
 
-  xd->itxm_add = xd->lossless ? vp9_idct_add_lossless_c
-                              : vp9_idct_add;
+  xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 }
 
 static INTERPOLATIONFILTERTYPE read_interp_filter_type(
     struct vp9_read_bit_buffer *rb) {
   const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH,
                                                       EIGHTTAP,
-                                                      EIGHTTAP_SHARP };
+                                                      EIGHTTAP_SHARP,
+                                                      BILINEAR };
   return vp9_rb_read_bit(rb) ? SWITCHABLE
                              : literal_to_type[vp9_rb_read_literal(rb, 2)];
 }
index 3792b9c7849d1b0c89fd9c82f7bf6d0de653b82f..8fcf83ee327e60177bc92f1d74cae3426dc7b1ce 100644 (file)
@@ -106,8 +106,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
   const uint8_t *band_translate;
   uint8_t token_cache[1024];
   int pt = get_entropy_context(tx_size, A, L);
-  get_scan_and_band(xd, tx_size, type, block_idx, &scan, &band_translate);
-  nb = vp9_get_coef_neighbors_handle(scan);
+  get_scan_and_band(xd, tx_size, type, block_idx, &scan, &nb, &band_translate);
 
   while (1) {
     int val;
@@ -122,7 +121,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
     if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
       break;
 
-SKIP_START:
+  SKIP_START:
     if (c >= seg_eob)
       break;
     if (c)
index 8cc64f73e592c37bfd1f2e6a6985c568f9e71f06..6f01cead661bb3e23da8aada2873f8905eca96ed 100644 (file)
@@ -67,7 +67,6 @@ static int inv_remap_prob(int v, int m) {
     206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
     222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
     238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
-
   };
   // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM);
   v = inv_map_table[v];
@@ -100,7 +99,9 @@ static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
   return word;
 }
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
-  int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
-  *p = (vp9_prob)inv_remap_prob(delp, *p);
+void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p) {
+  if (vp9_read(r, update_prob)) {
+    const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
+    *p = (vp9_prob)inv_remap_prob(delp, *p);
+  }
 }
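
Note: moving the update gate into vp9_diff_update_prob() shrinks every call
site from a guarded two-liner to a single call. The pattern, taken directly
from the hunks above:

    /* before: */
    if (vp9_read(r, MODE_UPDATE_PROB))
      vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);

    /* after: */
    vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.mbskip_probs[k]);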
index aeb9399d0278a1284786a62a1ad1c7cdb7a25604..21ac313938d1208763f61034994ed253e1a9e5a6 100644 (file)
@@ -14,6 +14,6 @@
 
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
+void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p);
 
 #endif  // VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
deleted file mode 100644 (file)
index 395e636..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9_rtcd.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_idct_blk.h"
-
-static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
-                                  int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff + dest[c]);
-
-    dest += stride;
-  }
-}
-
-void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
-                                     int stride) {
-  add_constant_residual(diff, dest, stride, 8, 8);
-}
-
-void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
-                                       int stride) {
-  add_constant_residual(diff, dest, stride, 16, 16);
-}
-
-void vp9_add_constant_residual_32x32_c(const int16_t diff,  uint8_t *dest,
-                                       int stride) {
-  add_constant_residual(diff, dest, stride, 32, 32);
-}
-
-void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
-                   int eob) {
-  if (tx_type == DCT_DCT) {
-    vp9_idct_add(input, dest, stride, eob);
-  } else {
-    vp9_short_iht4x4_add(input, dest, stride, tx_type);
-    vpx_memset(input, 0, 32);
-  }
-}
-
-void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                       int stride, int eob) {
-  if (tx_type == DCT_DCT) {
-    vp9_idct_add_8x8(input, dest, stride, eob);
-  } else {
-    if (eob > 0) {
-      vp9_short_iht8x8_add(input, dest, stride, tx_type);
-      vpx_memset(input, 0, 128);
-    }
-  }
-}
-
-void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  if (eob > 1) {
-    vp9_short_idct4x4_add(input, dest, stride);
-    vpx_memset(input, 0, 32);
-  } else {
-    vp9_short_idct4x4_1_add(input, dest, stride);
-    ((int *)input)[0] = 0;
-  }
-}
-
-void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
-                             int eob) {
-  if (eob > 1) {
-    vp9_short_iwalsh4x4_add(input, dest, stride);
-    vpx_memset(input, 0, 32);
-  } else {
-    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
-    ((int *)input)[0] = 0;
-  }
-}
-
-void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  // If dc is 1, then input[0] is the reconstructed value, do not need
-  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
-
-  // The calculation can be simplified if there are not many non-zero dct
-  // coefficients. Use eobs to decide what to do.
-  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
-  // Combine that with code here.
-  if (eob) {
-    if (eob == 1) {
-      // DC only DCT coefficient
-      vp9_short_idct8x8_1_add(input, dest, stride);
-      input[0] = 0;
-    } else if (eob <= 10) {
-      vp9_short_idct10_8x8_add(input, dest, stride);
-      vpx_memset(input, 0, 128);
-    } else {
-      vp9_short_idct8x8_add(input, dest, stride);
-      vpx_memset(input, 0, 128);
-    }
-  }
-}
-
-void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                         int stride, int eob) {
-  if (tx_type == DCT_DCT) {
-    vp9_idct_add_16x16(input, dest, stride, eob);
-  } else {
-    if (eob > 0) {
-      vp9_short_iht16x16_add(input, dest, stride, tx_type);
-      vpx_memset(input, 0, 512);
-    }
-  }
-}
-
-void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  /* The calculation can be simplified if there are not many non-zero dct
-   * coefficients. Use eobs to separate different cases. */
-  if (eob) {
-    if (eob == 1) {
-      /* DC only DCT coefficient. */
-      vp9_short_idct16x16_1_add(input, dest, stride);
-      input[0] = 0;
-    } else if (eob <= 10) {
-      vp9_short_idct10_16x16_add(input, dest, stride);
-      vpx_memset(input, 0, 512);
-    } else {
-      vp9_short_idct16x16_add(input, dest, stride);
-      vpx_memset(input, 0, 512);
-    }
-  }
-}
-
-void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
-
-  if (eob) {
-    if (eob == 1) {
-      vp9_short_idct1_32x32(input, output);
-      vp9_add_constant_residual_32x32(output[0], dest, stride);
-      input[0] = 0;
-    } else {
-      vp9_short_idct32x32_add(input, dest, stride);
-      vpx_memset(input, 0, 2048);
-    }
-  }
-}
-
diff --git a/vp9/decoder/vp9_idct_blk.h b/vp9/decoder/vp9_idct_blk.h
deleted file mode 100644 (file)
index 00f1bc6..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_IDCT_BLK_H_
-#define VP9_DECODER_VP9_IDCT_BLK_H_
-
-#include "vp9/common/vp9_blockd.h"
-
-void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest,
-                             int stride, int eob);
-
-void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                   int stride, int eob);
-
-void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                       int stride, int eob);
-
-void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                         int stride, int eob);
-
-#endif  // VP9_DECODER_VP9_IDCT_BLK_H_
index cd5b7508f2123930ac5f3f02ebae250b287f060c..a4b9c24fc9b29772f13fcf1a6bf37d5a68adc894 100644 (file)
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_ONYXD_H_
-#define VP9_COMMON_VP9_ONYXD_H_
+#ifndef VP9_DECODER_VP9_ONYXD_H_
+#define VP9_DECODER_VP9_ONYXD_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,7 +40,7 @@ typedef enum {
 void vp9_initialize_dec();
 
 int vp9_receive_compressed_data(VP9D_PTR comp,
-                                uint64_t size, const uint8_t **dest,
+                                size_t size, const uint8_t **dest,
                                 int64_t time_stamp);
 
 int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
@@ -66,4 +66,4 @@ void vp9_remove_decompressor(VP9D_PTR comp);
 }
 #endif
 
-#endif  // VP9_COMMON_VP9_ONYXD_H_
+#endif  // VP9_DECODER_VP9_ONYXD_H_
index 17d5def33736ab07b7c47713d178e28e8eef58bf..a42c2cf30d64a0e6563afb8d877fbac43539ded1 100644 (file)
@@ -65,13 +65,12 @@ static void recon_write_yuv_frame(const char *name,
 #endif
 #if WRITE_RECON_BUFFER == 2
 void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
   // write the frame
   FILE *yframe;
   int i;
   char filename[255];
 
-  sprintf(filename, "dx\\y%04d.raw", this_frame);
+  snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame);
   yframe = fopen(filename, "wb");
 
   for (i = 0; i < frame->y_height; i++)
@@ -79,7 +78,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
            frame->y_width, 1, yframe);
 
   fclose(yframe);
-  sprintf(filename, "dx\\u%04d.raw", this_frame);
+  snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame);
   yframe = fopen(filename, "wb");
 
   for (i = 0; i < frame->uv_height; i++)
@@ -87,7 +86,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
            frame->uv_width, 1, yframe);
 
   fclose(yframe);
-  sprintf(filename, "dx\\v%04d.raw", this_frame);
+  snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame);
   yframe = fopen(filename, "wb");
 
   for (i = 0; i < frame->uv_height; i++)
@@ -214,13 +213,13 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
    * vpxenc --test-decode functionality working, and will be replaced in a
    * later commit that adds VP9-specific controls for this functionality.
    */
-  if (ref_frame_flag == VP9_LAST_FLAG)
+  if (ref_frame_flag == VP9_LAST_FLAG) {
     ref_fb_ptr = &pbi->common.active_ref_idx[0];
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
+  } else if (ref_frame_flag == VP9_GOLD_FLAG) {
     ref_fb_ptr = &pbi->common.active_ref_idx[1];
-  else if (ref_frame_flag == VP9_ALT_FLAG)
+  } else if (ref_frame_flag == VP9_ALT_FLAG) {
     ref_fb_ptr = &pbi->common.active_ref_idx[2];
-  else {
+  } else {
     vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
                        "Invalid reference frame");
     return pbi->common.error.error_code;
@@ -277,7 +276,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) {
 }
 
 int vp9_receive_compressed_data(VP9D_PTR ptr,
-                                uint64_t size, const uint8_t **psource,
+                                size_t size, const uint8_t **psource,
                                 int64_t time_stamp) {
   VP9D_COMP *pbi = (VP9D_COMP *) ptr;
   VP9_COMMON *cm = &pbi->common;
index a051971a119c75e9ce5ffd1ed02e40189ff3689a..76d7c5765ac245c503d14ba37cf18f31de40138e 100644 (file)
@@ -25,7 +25,7 @@ typedef struct VP9Decompressor {
   VP9D_CONFIG oxcf;
 
   const uint8_t *source;
-  uint32_t source_sz;
+  size_t source_sz;
 
   int64_t last_time_stamp;
   int ready_for_new_data;
@@ -41,4 +41,4 @@ typedef struct VP9Decompressor {
   VP9Worker lf_worker;
 } VP9D_COMP;
 
-#endif  // VP9_DECODER_VP9_TREEREADER_H_
+#endif  // VP9_DECODER_VP9_ONYXD_INT_H_
index c7fa3aa27c40796ff3a940177e8ef2327bec8da2..41a686837242251bf66e42daef247854ee04c816 100644 (file)
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_READ_BIT_BUFFER_
-#define VP9_READ_BIT_BUFFER_
+#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_
+#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_
 
 #include <limits.h>
 
@@ -57,4 +57,4 @@ static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
   return vp9_rb_read_bit(rb) ? -value : value;
 }
 
-#endif  // VP9_READ_BIT_BUFFER_
+#endif  // VP9_DECODER_VP9_READ_BIT_BUFFER_H_
index dc3b68196499016d242db5fb328341107c01f920..5442ddfa190ef4eb66ce8d31afa732eca6525372 100644 (file)
@@ -29,7 +29,7 @@ extern "C" {
 //------------------------------------------------------------------------------
 // simplistic pthread emulation layer
 
-#include <process.h>
+#include <process.h>  // NOLINT
 
 // _beginthreadex requires __stdcall
 #define THREADFN unsigned int __stdcall
index a8f7e046a5f580541bc247ba0e1c857f798ffe40..e5e6f606b87ec7b8c867faf0d55d2e9a2b09a3f1 100644 (file)
@@ -17,7 +17,7 @@
 #ifndef VP9_DECODER_VP9_THREAD_H_
 #define VP9_DECODER_VP9_THREAD_H_
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -27,7 +27,7 @@ extern "C" {
 
 #if defined(_WIN32)
 
-#include <windows.h>
+#include <windows.h>  // NOLINT
 typedef HANDLE pthread_t;
 typedef CRITICAL_SECTION pthread_mutex_t;
 typedef struct {
@@ -38,7 +38,7 @@ typedef struct {
 
 #else
 
-#include <pthread.h>
+#include <pthread.h> // NOLINT
 
 #endif    /* _WIN32 */
 #endif    /* CONFIG_MULTITHREAD */
@@ -90,4 +90,4 @@ void vp9_worker_end(VP9Worker* const worker);
 }    // extern "C"
 #endif
 
-#endif  /* VP9_DECODER_VP9_THREAD_H_ */
+#endif  // VP9_DECODER_VP9_THREAD_H_
index 710cc4cd05292feb2f0b7e7d87b83025f9d1e34e..f6124973f9febac9db89b665416cfa439bd68304 100644 (file)
@@ -23,7 +23,8 @@ static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
                       const vp9_prob *const p) {
   register vp9_tree_index i = 0;
 
-  while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0);
+  while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0)
+    continue;
 
   return -i;
 }
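
Note: treed_read() walks a vp9_tree_index array in which non-positive entries
are negated leaf symbols and positive entries index the next node pair. A
self-contained sketch using a hypothetical three-symbol tree, driven by a
plain bit array instead of the arithmetic decoder:

    #include <stdio.h>

    typedef signed char vp9_tree_index;

    /* Hypothetical tree: bit 0 at the root yields symbol 0; bit 1 moves to
       the pair at index 2, where bit 0 yields symbol 1 and bit 1 symbol 2. */
    static const vp9_tree_index tree[4] = { 0, 2, -1, -2 };

    static int treed_decode(const vp9_tree_index *t, const int *bits) {
      vp9_tree_index i = 0;
      int n = 0;
      while ((i = t[i + bits[n++]]) > 0)
        continue;
      return -i;   /* leaves are stored negated */
    }

    int main(void) {
      const int bits[2] = { 1, 1 };
      printf("symbol: %d\n", treed_decode(tree, bits));  /* prints 2 */
      return 0;
    }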
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
deleted file mode 100644 (file)
index 54ec67f..0000000
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
-
-void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
-                                        int stride) {
-  uint8_t abs_diff;
-  __m128i d;
-
-  // Prediction data.
-  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
-  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
-  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
-  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
-  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
-  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
-  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
-  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
-
-  p0 = _mm_unpacklo_epi64(p0, p1);
-  p2 = _mm_unpacklo_epi64(p2, p3);
-  p4 = _mm_unpacklo_epi64(p4, p5);
-  p6 = _mm_unpacklo_epi64(p6, p7);
-
-  // Clip diff value to [0, 255] range. Then, do addition or subtraction
-  // according to its sign.
-  if (diff >= 0) {
-    abs_diff = (diff > 255) ? 255 : diff;
-    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
-    p0 = _mm_adds_epu8(p0, d);
-    p2 = _mm_adds_epu8(p2, d);
-    p4 = _mm_adds_epu8(p4, d);
-    p6 = _mm_adds_epu8(p6, d);
-  } else {
-    abs_diff = (diff < -255) ? 255 : -diff;
-    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
-    p0 = _mm_subs_epu8(p0, d);
-    p2 = _mm_subs_epu8(p2, d);
-    p4 = _mm_subs_epu8(p4, d);
-    p6 = _mm_subs_epu8(p6, d);
-  }
-
-  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
-  p0 = _mm_srli_si128(p0, 8);
-  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
-  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
-  p2 = _mm_srli_si128(p2, 8);
-  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
-  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
-  p4 = _mm_srli_si128(p4, 8);
-  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
-  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
-  p6 = _mm_srli_si128(p6, 8);
-  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
-void vp9_add_constant_residual_16x16_sse2(const int16_t diff, uint8_t *dest,
-                                          int stride) {
-  uint8_t abs_diff;
-  __m128i d;
-
-  // Prediction data.
-  __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-  __m128i p1 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-  __m128i p2 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
-  __m128i p3 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-  __m128i p4 = _mm_load_si128((const __m128i *)(dest + 4 * stride));
-  __m128i p5 = _mm_load_si128((const __m128i *)(dest + 5 * stride));
-  __m128i p6 = _mm_load_si128((const __m128i *)(dest + 6 * stride));
-  __m128i p7 = _mm_load_si128((const __m128i *)(dest + 7 * stride));
-  __m128i p8 = _mm_load_si128((const __m128i *)(dest + 8 * stride));
-  __m128i p9 = _mm_load_si128((const __m128i *)(dest + 9 * stride));
-  __m128i p10 = _mm_load_si128((const __m128i *)(dest + 10 * stride));
-  __m128i p11 = _mm_load_si128((const __m128i *)(dest + 11 * stride));
-  __m128i p12 = _mm_load_si128((const __m128i *)(dest + 12 * stride));
-  __m128i p13 = _mm_load_si128((const __m128i *)(dest + 13 * stride));
-  __m128i p14 = _mm_load_si128((const __m128i *)(dest + 14 * stride));
-  __m128i p15 = _mm_load_si128((const __m128i *)(dest + 15 * stride));
-
-  // Clip diff value to [0, 255] range. Then, do addition or subtraction
-  // according to its sign.
-  if (diff >= 0) {
-    abs_diff = (diff > 255) ? 255 : diff;
-    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
-    p0 = _mm_adds_epu8(p0, d);
-    p1 = _mm_adds_epu8(p1, d);
-    p2 = _mm_adds_epu8(p2, d);
-    p3 = _mm_adds_epu8(p3, d);
-    p4 = _mm_adds_epu8(p4, d);
-    p5 = _mm_adds_epu8(p5, d);
-    p6 = _mm_adds_epu8(p6, d);
-    p7 = _mm_adds_epu8(p7, d);
-    p8 = _mm_adds_epu8(p8, d);
-    p9 = _mm_adds_epu8(p9, d);
-    p10 = _mm_adds_epu8(p10, d);
-    p11 = _mm_adds_epu8(p11, d);
-    p12 = _mm_adds_epu8(p12, d);
-    p13 = _mm_adds_epu8(p13, d);
-    p14 = _mm_adds_epu8(p14, d);
-    p15 = _mm_adds_epu8(p15, d);
-  } else {
-    abs_diff = (diff < -255) ? 255 : -diff;
-    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
-    p0 = _mm_subs_epu8(p0, d);
-    p1 = _mm_subs_epu8(p1, d);
-    p2 = _mm_subs_epu8(p2, d);
-    p3 = _mm_subs_epu8(p3, d);
-    p4 = _mm_subs_epu8(p4, d);
-    p5 = _mm_subs_epu8(p5, d);
-    p6 = _mm_subs_epu8(p6, d);
-    p7 = _mm_subs_epu8(p7, d);
-    p8 = _mm_subs_epu8(p8, d);
-    p9 = _mm_subs_epu8(p9, d);
-    p10 = _mm_subs_epu8(p10, d);
-    p11 = _mm_subs_epu8(p11, d);
-    p12 = _mm_subs_epu8(p12, d);
-    p13 = _mm_subs_epu8(p13, d);
-    p14 = _mm_subs_epu8(p14, d);
-    p15 = _mm_subs_epu8(p15, d);
-  }
-
-  // Store results
-  _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-  _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
-  _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
-  _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-  _mm_store_si128((__m128i *)(dest + 4 * stride), p4);
-  _mm_store_si128((__m128i *)(dest + 5 * stride), p5);
-  _mm_store_si128((__m128i *)(dest + 6 * stride), p6);
-  _mm_store_si128((__m128i *)(dest + 7 * stride), p7);
-  _mm_store_si128((__m128i *)(dest + 8 * stride), p8);
-  _mm_store_si128((__m128i *)(dest + 9 * stride), p9);
-  _mm_store_si128((__m128i *)(dest + 10 * stride), p10);
-  _mm_store_si128((__m128i *)(dest + 11 * stride), p11);
-  _mm_store_si128((__m128i *)(dest + 12 * stride), p12);
-  _mm_store_si128((__m128i *)(dest + 13 * stride), p13);
-  _mm_store_si128((__m128i *)(dest + 14 * stride), p14);
-  _mm_store_si128((__m128i *)(dest + 15 * stride), p15);
-}
-
-void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
-                                          int stride) {
-  uint8_t abs_diff;
-  __m128i d;
-  int i = 8;
-
-  if (diff >= 0) {
-    abs_diff = (diff > 255) ? 255 : diff;
-    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-  } else {
-    abs_diff = (diff < -255) ? 255 : -diff;
-    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-  }
-
-  do {
-    // Prediction data.
-    __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
-    __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
-    __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
-    __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
-    __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-    __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));
-
-    // Clip diff value to [0, 255] range. Then, do addition or subtraction
-    // according to its sign.
-    if (diff >= 0) {
-      p0 = _mm_adds_epu8(p0, d);
-      p1 = _mm_adds_epu8(p1, d);
-      p2 = _mm_adds_epu8(p2, d);
-      p3 = _mm_adds_epu8(p3, d);
-      p4 = _mm_adds_epu8(p4, d);
-      p5 = _mm_adds_epu8(p5, d);
-      p6 = _mm_adds_epu8(p6, d);
-      p7 = _mm_adds_epu8(p7, d);
-    } else {
-      p0 = _mm_subs_epu8(p0, d);
-      p1 = _mm_subs_epu8(p1, d);
-      p2 = _mm_subs_epu8(p2, d);
-      p3 = _mm_subs_epu8(p3, d);
-      p4 = _mm_subs_epu8(p4, d);
-      p5 = _mm_subs_epu8(p5, d);
-      p6 = _mm_subs_epu8(p6, d);
-      p7 = _mm_subs_epu8(p7, d);
-    }
-
-    // Store results
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-    _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
-    _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
-    _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
-    _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);
-
-    dest += 4 * stride;
-  } while (--i);
-}
index 20dd8e175059aea1e58825556ee0ed3ae700221b..f7778a453d2d20d875ff5f561a745aad2c218c67 100644 (file)
@@ -1152,7 +1152,7 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
 
 static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type,
                                      struct vp9_write_bit_buffer *wb) {
-  const int type_to_literal[] = { 1, 0, 2 };
+  const int type_to_literal[] = { 1, 0, 2, 3 };
 
   vp9_wb_write_bit(wb, type == SWITCHABLE);
   if (type != SWITCHABLE)
index 5a0d746c83a37d4416a6c96ddc91e1bc0079e458..2e28a2ed8d38b5ce65bed4c2188a607f0629131f 100644 (file)
@@ -30,7 +30,6 @@ typedef struct {
 } PARTITION_INFO;
 
 // Structure to hold snapshot of coding context during the mode picking process
-// TODO Do we need all of these?
 typedef struct {
   MODE_INFO mic;
   PARTITION_INFO partition_info;
index 0f1aa594e94582beb752acb348c4cbfead93605a..32c136e0f70e4c0da27c2ce80644de654f1d4604 100644 (file)
@@ -22,23 +22,28 @@ unsigned int active_section = 0;
 #endif
 
 const unsigned int vp9_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
-  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
-  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
-  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
-  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
-  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
-  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
-  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
-  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
-  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
-  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
-};
+  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
+  1129, 1099, 1072, 1046, 1023, 1000, 979,  959,  940,  922,  905,  889,
+  873,  858,  843,  829,  816,  803,  790,  778,  767,  755,  744,  733,
+  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,
+  534,  528,  522,  516,  511,  505,  499,  494,  488,  483,  477,  472,
+  467,  462,  457,  452,  447,  442,  437,  433,  428,  424,  419,  415,
+  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,
+  317,  314,  311,  307,  304,  301,  297,  294,  291,  288,  285,  281,
+  278,  275,  272,  269,  266,  263,  260,  257,  255,  252,  249,  246,
+  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,
+  181,  179,  177,  174,  172,  170,  168,  165,  163,  161,  159,  156,
+  154,  152,  150,  148,  145,  143,  141,  139,  137,  135,  133,  131,
+  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+  105,  103,  101,  99,   97,   95,   93,   92,   90,   88,   86,   84,
+  82,   81,   79,   77,   75,   73,   72,   70,   68,   66,   65,   63,
+  61,   60,   58,   56,   55,   53,   51,   50,   48,   46,   45,   43,
+  41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+  22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
+  4,    3,    1,    1};
 
 void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;
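
Note: the reflow of vp9_prob_cost[] above is formatting only; the values are
unchanged. Each entry is roughly the cost, in 1/256-bit units, of coding a
zero bit whose probability is p/256. An approximate generator (assumed
formula; the checked-in table differs from it by about one per entry due to
its rounding):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      int p;
      for (p = 1; p < 256; p += 63) {
        const int cost = (int)(256.0 * log2(256.0 / p));  /* ~1/256 bit */
        printf("p = %3d  cost ~ %d\n", p, cost);
      }
      return 0;
    }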
index b9c300033ba880d2eaf05326bc8acd318c6147a7..27e4cd07fc30adf4cb2408b382ff827e4e640d13 100644 (file)
@@ -593,11 +593,11 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
 
 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    pixel. */
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
+void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int pitch) {
   int i;
   int a1, b1, c1, d1, e1;
-  short *ip = input;
-  short *op = output;
+  int16_t *ip = input;
+  int16_t *op = output;
   int pitch_short = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
@@ -647,7 +647,7 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   }
 }
 
-void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
+void vp9_short_walsh8x4_c(int16_t *input, int16_t *output, int pitch) {
   vp9_short_walsh4x4_c(input,   output,    pitch);
   vp9_short_walsh4x4_c(input + 4, output + 16, pitch);
 }
index d138580b0ea2f4653df3b7c6567d2f4817b10938..bd48a9baa0a61d40d1cda1fe99bdd85d99d2009f 100644 (file)
@@ -173,8 +173,9 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
           tmp = sortlist[j - 1];
           sortlist[j - 1] = sortlist[j];
           sortlist[j] = tmp;
-        } else
-        break;
+        } else {
+          break;
+        }
       }
     }
 
@@ -246,13 +247,11 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
 #if OUTPUT_NORM_ACT_STATS
     fprintf(f, "\n");
 #endif
-
   }
 
 #if OUTPUT_NORM_ACT_STATS
   fclose(f);
 #endif
-
 }
 #endif  // USE_ACT_INDEX
 
@@ -317,7 +316,6 @@ static void build_activity_map(VP9_COMP *cpi) {
   // Calculate an activity index number of each mb
   calc_activity_index(cpi, x);
 #endif
-
 }
 
 // Macroblock activity masking
@@ -360,7 +358,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
 
   assert(mi->mbmi.mode < MB_MODE_COUNT);
-  assert(mb_mode_index < MAX_MODES);
   assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
   assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
   assert(mi->mbmi.sb_type == bsize);
@@ -422,7 +419,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
       THR_D207_PRED /*D207_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
-      THR_B_PRED /*I4X4_PRED*/,
     };
     cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++;
 #endif
@@ -461,18 +457,17 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 }
 
 void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
-                          int mb_row, int mb_col) {
-  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src
-      ->alpha_buffer};
-  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src
-      ->alpha_stride};
+                          int mi_row, int mi_col) {
+  uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                               src->alpha_buffer};
+  const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                          src->alpha_stride};
   int i;
 
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col,
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col,
                      NULL, x->e_mbd.plane[i].subsampling_x,
                      x->e_mbd.plane[i].subsampling_y);
-  }
 }
 
 static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
@@ -507,8 +502,7 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
   // cannot be used.
   xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
 
-  xd->this_mi =
-  xd->mi_8x8[0] = cm->mi + idx_str;
+  xd->this_mi = xd->mi_8x8[0] = cm->mi + idx_str;
 
   mbmi = &xd->this_mi->mbmi;
 
@@ -599,12 +593,17 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
 
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
-  if (cm->frame_type == KEY_FRAME)
+  if (cm->frame_type == KEY_FRAME) {
     vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
                               best_rd);
-  else
-    vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
-                              bsize, ctx, best_rd);
+  } else {
+    if (bsize >= BLOCK_8X8)
+      vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
+                                bsize, ctx, best_rd);
+    else
+      vp9_rd_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, totalrate,
+                                    totaldist, bsize, ctx, best_rd);
+  }
 }
 
 static void update_stats(VP9_COMP *cpi) {
@@ -683,7 +682,7 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
       return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
     default:
       assert(0);
-      return NULL ;
+      return NULL;
   }
 }
 
@@ -700,7 +699,7 @@ static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
       return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
     default:
       assert(0);
-      return NULL ;
+      return NULL;
   }
 }
 
@@ -951,321 +950,24 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
   }
 }
 
-static void set_block_size(VP9_COMMON * const cm, MODE_INFO **mi_8x8,
-                           BLOCK_SIZE bsize, int mis, int mi_row,
-                           int mi_col) {
-  int r, c;
-  const int bs = MAX(num_8x8_blocks_wide_lookup[bsize],
-                     num_8x8_blocks_high_lookup[bsize]);
-  const int idx_str = mis * mi_row + mi_col;
-  MODE_INFO **const mi2 = &mi_8x8[idx_str];
-
-  mi2[0] = cm->mi + idx_str;
-  mi2[0]->mbmi.sb_type = bsize;
-
-  for (r = 0; r < bs; r++)
-    for (c = 0; c < bs; c++)
-      if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols)
-        mi2[r * mis + c] = mi2[0];
-}
-
-typedef struct {
-  int64_t sum_square_error;
-  int64_t sum_error;
-  int count;
-  int variance;
-} var;
-
-typedef struct {
-  var none;
-  var horz[2];
-  var vert[2];
-} partition_variance;
-
-#define VT(TYPE, BLOCKSIZE) \
-  typedef struct { \
-    partition_variance vt; \
-    BLOCKSIZE split[4]; } TYPE;
-
-VT(v8x8, var)
-VT(v16x16, v8x8)
-VT(v32x32, v16x16)
-VT(v64x64, v32x32)
-
-typedef struct {
-  partition_variance *vt;
-  var *split[4];
-} vt_node;
-
-typedef enum {
-  V16X16,
-  V32X32,
-  V64X64,
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) {
-  int i;
-  switch (bsize) {
-    case BLOCK_64X64: {
-      v64x64 *vt = (v64x64 *) data;
-      node->vt = &vt->vt;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].vt.none;
-      break;
-    }
-    case BLOCK_32X32: {
-      v32x32 *vt = (v32x32 *) data;
-      node->vt = &vt->vt;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].vt.none;
-      break;
-    }
-    case BLOCK_16X16: {
-      v16x16 *vt = (v16x16 *) data;
-      node->vt = &vt->vt;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].vt.none;
-      break;
-    }
-    case BLOCK_8X8: {
-      v8x8 *vt = (v8x8 *) data;
-      node->vt = &vt->vt;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i];
-      break;
-    }
-    default:
-      node->vt = 0;
-      for (i = 0; i < 4; i++)
-        node->split[i] = 0;
-      assert(-1);
-  }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
-  v->sum_square_error = s2;
-  v->sum_error = s;
-  v->count = c;
-  if (c > 0)
-    v->variance = (int)(256
-        * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
-        / v->count);
-  else
-    v->variance = 0;
-}
-
-// Combine 2 variance structures by summing the sum_error, sum_square_error,
-// and counts and then calculating the new variance.
-void sum_2_variances(var *r, var *a, var*b) {
-  fill_variance(r, a->sum_square_error + b->sum_square_error,
-                a->sum_error + b->sum_error, a->count + b->count);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
-  vt_node node;
-  tree_to_node(data, bsize, &node);
-  sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
-  sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
-  sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
-  sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]);
-  sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]);
-}
-
-#if PERFORM_RANDOM_PARTITIONING
-static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
-    BLOCK_SIZE block_size, int mi_row,
-    int mi_col, int mi_size) {
-  VP9_COMMON * const cm = &cpi->common;
-  vt_node vt;
-  const int mis = cm->mode_info_stride;
-  int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex;
-
-  tree_to_node(data, block_size, &vt);
-
-  // split none is available only if we have more than half a block size
-  // in width and height inside the visible image
-  if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows &&
-      (rand() & 3) < 1) {
-    set_block_size(cm, m, block_size, mis, mi_row, mi_col);
-    return 1;
-  }
-
-  // vertical split is available on all but the bottom border
-  if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
-      && (rand() & 3) < 1) {
-    set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
-        mi_col);
-    return 1;
-  }
-
-  // horizontal split is available on all but the right border
-  if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
-      && (rand() & 3) < 1) {
-    set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
-        mi_col);
-    return 1;
-  }
-
-  return 0;
-}
-
-#else  // !PERFORM_RANDOM_PARTITIONING
-
-static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO **m,
-                               BLOCK_SIZE bsize, int mi_row,
-                               int mi_col, int mi_size) {
-  VP9_COMMON * const cm = &cpi->common;
-  vt_node vt;
-  const int mis = cm->mode_info_stride;
-  int64_t threshold = 50 * cpi->common.base_qindex;
-
-  tree_to_node(data, bsize, &vt);
-
-  // split none is available only if we have more than half a block size
-  // in width and height inside the visible image
-  if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
-      && vt.vt->none.variance < threshold) {
-    set_block_size(cm, m, bsize, mis, mi_row, mi_col);
-    return 1;
-  }
-
-  // vertical split is available on all but the bottom border
-  if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
-      && vt.vt->vert[1].variance < threshold) {
-    set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row,
-                   mi_col);
-    return 1;
-  }
-
-  // horizontal split is available on all but the right border
-  if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
-      && vt.vt->horz[1].variance < threshold) {
-    set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row,
-                   mi_col);
-    return 1;
-  }
-
-  return 0;
-}
-#endif  // PERFORM_RANDOM_PARTITIONING
-
-static void choose_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
-                                int mi_row, int mi_col) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) {
+  VP9_COMMON *const cm = &cpi->common;
   const int mis = cm->mode_info_stride;
-  // TODO(JBB): More experimentation or testing of this threshold;
-  int64_t threshold = 4;
-  int i, j, k;
-  v64x64 vt;
-  unsigned char * s;
-  int sp;
-  const unsigned char * d;
-  int dp;
-  int pixels_wide = 64, pixels_high = 64;
-
-  vp9_zero(vt);
-  set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
-
-  if (xd->mb_to_right_edge < 0)
-    pixels_wide += (xd->mb_to_right_edge >> 3);
-
-  if (xd->mb_to_bottom_edge < 0)
-    pixels_high += (xd->mb_to_bottom_edge >> 3);
-
-  s = x->plane[0].src.buf;
-  sp = x->plane[0].src.stride;
-
-  // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want
-  // but this needs more experimentation.
-  threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
-
-  d = vp9_64x64_zeros;
-  dp = 64;
-  if (cm->frame_type != KEY_FRAME) {
-    int_mv nearest_mv, near_mv;
-    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, LAST_FRAME)];
-    YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
-    YV12_BUFFER_CONFIG *second_ref_fb = NULL;
-
-    setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
-                     &xd->scale_factor[0]);
-    setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
-                     &xd->scale_factor[1]);
+  int block_row, block_col;
 
-    xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME;
-    xd->this_mi->mbmi.sb_type = BLOCK_64X64;
-    vp9_find_best_ref_mvs(xd,
-                          mi_8x8[0]->mbmi.ref_mvs[mi_8x8[0]->mbmi.ref_frame[0]],
-                          &nearest_mv, &near_mv);
-
-    xd->this_mi->mbmi.mv[0] = nearest_mv;
-    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
-
-    d = xd->plane[0].dst.buf;
-    dp = xd->plane[0].dst.stride;
-  }
-
-  // Fill in the entire tree of 8x8 variances for splits.
-  for (i = 0; i < 4; i++) {
-    const int x32_idx = ((i & 1) << 5);
-    const int y32_idx = ((i >> 1) << 5);
-    for (j = 0; j < 4; j++) {
-      const int x16_idx = x32_idx + ((j & 1) << 4);
-      const int y16_idx = y32_idx + ((j >> 1) << 4);
-      v16x16 *vst = &vt.split[i].split[j];
-      for (k = 0; k < 4; k++) {
-        int x_idx = x16_idx + ((k & 1) << 3);
-        int y_idx = y16_idx + ((k >> 1) << 3);
-        unsigned int sse = 0;
-        int sum = 0;
-        if (x_idx < pixels_wide && y_idx < pixels_high)
-          vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
-                              d + y_idx * dp + x_idx, dp, &sse, &sum);
-        fill_variance(&vst->split[k].vt.none, sse, sum, 64);
-      }
-    }
-  }
-  // Fill the rest of the variance tree by summing the split partition
-  // values.
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
-    }
-    fill_variance_tree(&vt.split[i], BLOCK_32X32);
-  }
-  fill_variance_tree(&vt, BLOCK_64X64);
-  // Now go through the entire structure,  splitting every block size until
-  // we get to one that's got a variance lower than our threshold,  or we
-  // hit 8x8.
-  if (!set_vt_partitioning(cpi, &vt, mi_8x8, BLOCK_64X64, mi_row, mi_col,
-                           4)) {
-    for (i = 0; i < 4; ++i) {
-      const int x32_idx = ((i & 1) << 2);
-      const int y32_idx = ((i >> 1) << 2);
-      if (!set_vt_partitioning(cpi, &vt.split[i], mi_8x8, BLOCK_32X32,
-                               (mi_row + y32_idx), (mi_col + x32_idx), 2)) {
-        for (j = 0; j < 4; ++j) {
-          const int x16_idx = ((j & 1) << 1);
-          const int y16_idx = ((j >> 1) << 1);
-          if (!set_vt_partitioning(cpi, &vt.split[i].split[j], mi_8x8,
-                                   BLOCK_16X16,
-                                   (mi_row + y32_idx + y16_idx),
-                                   (mi_col + x32_idx + x16_idx), 1)) {
-            for (k = 0; k < 4; ++k) {
-              const int x8_idx = (k & 1);
-              const int y8_idx = (k >> 1);
-              set_block_size(cm, mi_8x8, BLOCK_8X8, mis,
-                             (mi_row + y32_idx + y16_idx + y8_idx),
-                             (mi_col + x32_idx + x16_idx + x8_idx));
-            }
-          }
+  if (cm->prev_mi) {
+    for (block_row = 0; block_row < 8; ++block_row) {
+      for (block_col = 0; block_col < 8; ++block_col) {
+        MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
+        if (prev_mi) {
+          if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 ||
+              abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8)
+            return 1;
         }
       }
     }
   }
+  return 0;
 }
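
Note: motion vectors are stored in 1/8-pel units, so the >= 8 threshold in
sb_has_motion() flags any 64x64 block whose previous-frame motion reaches a
full pixel in either component; this feeds the new
LAST_FRAME_PARTITION_LOW_MOTION re-partitioning check added further down.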
 
 static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
@@ -1578,63 +1280,64 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8,
 static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col,
                                     BLOCK_SIZE *min_block_size,
                                     BLOCK_SIZE *max_block_size) {
+  VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   MODE_INFO ** mi_8x8 = xd->mi_8x8;
+  MODE_INFO ** prev_mi_8x8 = xd->prev_mi_8x8;
+
   const int left_in_image = xd->left_available && mi_8x8[-1];
   const int above_in_image = xd->up_available &&
                              mi_8x8[-xd->mode_info_stride];
   MODE_INFO ** above_sb64_mi_8x8;
   MODE_INFO ** left_sb64_mi_8x8;
 
-  // Frequency check
-  if (cpi->sf.auto_min_max_partition_count <= 0) {
-    cpi->sf.auto_min_max_partition_count =
-      cpi->sf.auto_min_max_partition_interval;
+  int row8x8_remaining = cm->cur_tile_mi_row_end - row;
+  int col8x8_remaining = cm->cur_tile_mi_col_end - col;
+  int bh, bw;
+
+  // Trap case where we do not have a prediction.
+  if (!left_in_image && !above_in_image &&
+      ((cm->frame_type == KEY_FRAME) || !cm->prev_mi)) {
     *min_block_size = BLOCK_4X4;
     *max_block_size = BLOCK_64X64;
   } else {
-    --cpi->sf.auto_min_max_partition_count;
-
-    // Set default values if no left or above neighbour
-    if (!left_in_image && !above_in_image) {
-      *min_block_size = BLOCK_4X4;
-      *max_block_size = BLOCK_64X64;
-    } else {
-      VP9_COMMON *const cm = &cpi->common;
-      int row8x8_remaining = cm->cur_tile_mi_row_end - row;
-      int col8x8_remaining = cm->cur_tile_mi_col_end - col;
-      int bh, bw;
-
-      // Default "min to max" and "max to min"
-      *min_block_size = BLOCK_64X64;
-      *max_block_size = BLOCK_4X4;
-
-      // Find the min and max partition sizes used in the left SB64
-      if (left_in_image) {
-        left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
-        get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
-                                    min_block_size, max_block_size);
-      }
-
-      // Find the min and max partition sizes used in the above SB64 taking
-      // the values found for left as a starting point.
-      if (above_in_image) {
-        above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
-        get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
-                                    min_block_size, max_block_size);
-      }
+    // Default "min to max" and "max to min"
+    *min_block_size = BLOCK_64X64;
+    *max_block_size = BLOCK_4X4;
+
+    // NOTE: each call to get_sb_partition_size_range() uses the previous
+    // passed in values for min and max as a starting point.
+    //
+    // Find the min and max partition used in previous frame at this location
+    if (cm->prev_mi && (cm->frame_type != KEY_FRAME)) {
+      get_sb_partition_size_range(cpi, prev_mi_8x8,
+                                  min_block_size, max_block_size);
+    }
 
-      // Give a bit of leaway either side of the observed min and max
-      *min_block_size = min_partition_size[*min_block_size];
-      *max_block_size = max_partition_size[*max_block_size];
+    // Find the min and max partition sizes used in the left SB64
+    if (left_in_image) {
+      left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
+      get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
+                                  min_block_size, max_block_size);
+    }
 
-      // Check border cases where max and min from neighbours may not be legal.
-      *max_block_size = find_partition_size(*max_block_size,
-                                            row8x8_remaining, col8x8_remaining,
-                                            &bh, &bw);
-      *min_block_size = MIN(*min_block_size, *max_block_size);
+    // Find the min and max partition sizes used in the above SB64.
+    if (above_in_image) {
+      above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
+      get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
+                                  min_block_size, max_block_size);
     }
   }
+
+  // Give a bit of leeway either side of the observed min and max
+  *min_block_size = min_partition_size[*min_block_size];
+  *max_block_size = max_partition_size[*max_block_size];
+
+  // Check border cases where max and min from neighbours may not be legal.
+  *max_block_size = find_partition_size(*max_block_size,
+                                        row8x8_remaining, col8x8_remaining,
+                                        &bh, &bw);
+  *min_block_size = MIN(*min_block_size, *max_block_size);
 }
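
The rewritten flow above seeds min = BLOCK_64X64 and max = BLOCK_4X4, tightens both against the partition sizes observed in the co-located SB64 of the previous frame and in the left and above SB64s, widens the result by a step of leeway, then legalizes the max against the tile border and forces min <= max. A sketch of the leeway tables this indexes, assuming each steps the observed extreme one size class outward (the names come from the surrounding file; the values are illustrative):

    /* Assumed shape of the min-side leeway table: widen the observed minimum
     * by one size class.  Indexed by BLOCK_SIZE; max_partition_size would
     * mirror this upward. */
    static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
      BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,    /* 4x4,   4x8,   8x4   */
      BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,    /* 8x8,   8x16,  16x8  */
      BLOCK_8X8,   BLOCK_8X8,   BLOCK_8X8,    /* 16x16, 16x32, 32x16 */
      BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  /* 32x32, 32x64, 64x32 */
      BLOCK_32X32                             /* 64x64               */
    };

Under that assumption, if the neighbours used only 16x16 and 32x32 partitions, the search range widens to [8x8, 64x64] before the border clamp.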
 
 static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
@@ -1885,12 +1588,12 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
         best_dist = sum_dist;
         best_rd = sum_rd;
         *(get_sb_partitioning(x, bsize)) = subsize;
-      } else {
-        // skip rectangular partition test when larger block size
-        // gives better rd cost
-        if (cpi->sf.less_rectangular_check)
-          do_rect &= !partition_none_allowed;
       }
+    } else {
+      // skip rectangular partition test when larger block size
+      // gives better rd cost
+      if (cpi->sf.less_rectangular_check)
+        do_rect &= !partition_none_allowed;
     }
     partition_split_done = 1;
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2055,12 +1758,12 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
     int dummy_rate;
     int64_t dummy_dist;
 
-    vpx_memset(cpi->mb.pred_mv, 0, sizeof(cpi->mb.pred_mv));
+    vp9_zero(cpi->mb.pred_mv);
 
     if (cpi->sf.reference_masking)
       rd_pick_reference_frame(cpi, mi_row, mi_col);
 
-    if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
+    if (cpi->sf.use_lastframe_partitioning ||
         cpi->sf.use_one_partition_size_always ) {
       const int idx_str = cm->mode_info_stride * mi_row + mi_col;
       MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
@@ -2072,17 +1775,16 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
         set_partitioning(cpi, mi_8x8, mi_row, mi_col);
         rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
-      } else if (cpi->sf.partition_by_variance) {
-        choose_partitioning(cpi, cm->mi_grid_visible, mi_row, mi_col);
-        rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1);
       } else {
         if ((cpi->common.current_video_frame
             % cpi->sf.last_partitioning_redo_frequency) == 0
             || cm->prev_mi == 0
             || cpi->common.show_frame == 0
             || cpi->common.frame_type == KEY_FRAME
-            || cpi->is_src_frame_alt_ref) {
+            || cpi->is_src_frame_alt_ref
+            || ((cpi->sf.use_lastframe_partitioning ==
+                 LAST_FRAME_PARTITION_LOW_MOTION) &&
+                 sb_has_motion(cpi, prev_mi_8x8))) {
           // If required set upper and lower partition size limits
           if (cpi->sf.auto_min_max_partition_size) {
             set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
@@ -2164,8 +1866,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
     // printf("Switching to lossless\n");
     cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_iwht4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_iwht4x4_16_add;
     cpi->mb.optimize = 0;
     cpi->common.lf.filter_level = 0;
     cpi->zbin_mode_boost_enabled = 0;
@@ -2174,8 +1876,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
     // printf("Not lossless\n");
     cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_idct4x4_16_add;
   }
 }
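
The lossless path swaps in the 4x4 Walsh-Hadamard transform pair, and the renames in this hunk appear to adopt a convention where the suffix counts the coefficients a variant handles. The mapping, as read off the diff itself:

    /* Renaming convention inferred from this patch (old -> new):
     *   vp9_short_iwalsh4x4_1_add -> vp9_iwht4x4_1_add    (DC only)
     *   vp9_short_iwalsh4x4_add   -> vp9_iwht4x4_16_add   (all 16 coeffs)
     *   vp9_short_idct4x4_1_add   -> vp9_idct4x4_1_add    (DC only)
     *   vp9_short_idct4x4_add     -> vp9_idct4x4_16_add   (all 16 coeffs)
     */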
 
@@ -2216,13 +1918,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
   xd->mi_8x8 = cm->mi_grid_visible;
   // required for vp9_frame_init_quantizer
-  xd->this_mi =
-  xd->mi_8x8[0] = cm->mi;
+  xd->this_mi = xd->mi_8x8[0] = cm->mi;
   xd->mic_stream_ptr = cm->mi;
 
   xd->last_mi = cm->prev_mi;
 
-
   vp9_zero(cpi->NMVcount);
   vp9_zero(cpi->coef_counts);
   vp9_zero(cm->counts.eob_branch);
@@ -2310,7 +2010,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   // Keep record of the total distortion this time around for future use
   cpi->last_frame_distortion = cpi->frame_distortion;
 #endif
-
 }
 
 static int check_dual_ref_flags(VP9_COMP *cpi) {
@@ -2736,7 +2435,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
     int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])];
     YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
     YV12_BUFFER_CONFIG *second_ref_fb = NULL;
-    if (mbmi->ref_frame[1] > 0) {
+    if (has_second_ref(mbmi)) {
       idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])];
       second_ref_fb = &cm->yv12_fb[idx];
     }
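
has_second_ref(mbmi) replaces the open-coded ref_frame[1] test verbatim, so the helper is presumably just that predicate behind a name; a minimal sketch under that assumption:

    /* Assumed definition: the expression this hunk replaces, behind a name. */
    static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[1] > 0;
    }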
index 399196927cb805c2aaa94a1c12f1d052e7437e62..3e9f5381c06ae66b0cc61cb6bca8891a168965ab 100644 (file)
@@ -17,6 +17,6 @@ struct yv12_buffer_config;
 
 void vp9_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src,
-                          int mb_row, int mb_col);
+                          int mi_row, int mi_col);
 
 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
index c5e5dff08b27f9303c503cbd6886598e193add6a..32b4593fcb864d2c0d635eec25e5cce2921e9074 100644 (file)
@@ -9,7 +9,7 @@
  */
 
 #include "./vpx_config.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/encoder/vp9_encodemb.h"
index 6b9109c944936497ea788f2734c07898c594ee91..1f36f161865f4ffbe2594eb476ca39e85a6b81b8 100644 (file)
@@ -8,16 +8,21 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
-#include "vp9/encoder/vp9_encodemb.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/common/vp9_systemdependent.h"
-#include "vp9_rtcd.h"
+
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_tokenize.h"
 
 DECLARE_ALIGNED(16, extern const uint8_t,
                 vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
@@ -47,28 +52,6 @@ static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
     xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
 
-static void inverse_transform_b_8x8_add(int eob,
-                                        int16_t *dqcoeff, uint8_t *dest,
-                                        int stride) {
-  if (eob <= 1)
-    vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
-  else if (eob <= 10)
-    vp9_short_idct10_8x8_add(dqcoeff, dest, stride);
-  else
-    vp9_short_idct8x8_add(dqcoeff, dest, stride);
-}
-
-static void inverse_transform_b_16x16_add(int eob,
-                                          int16_t *dqcoeff, uint8_t *dest,
-                                          int stride) {
-  if (eob <= 1)
-    vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
-  else if (eob <= 10)
-    vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
-  else
-    vp9_short_idct16x16_add(dqcoeff, dest, stride);
-}
-
 static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const MACROBLOCKD *const xd = &x->e_mbd;
@@ -97,8 +80,7 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   vp9_subtract_sbuv(x, bsize);
 }
 
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
 typedef struct vp9_token_state vp9_token_state;
 
 struct vp9_token_state {
@@ -109,7 +91,7 @@ struct vp9_token_state {
   short         qc;
 };
 
-// TODO: experiments to find optimal multiple numbers
+// TODO(jimbankoski): experiment to find optimal RD numbers.
 #define Y1_RD_MULT 4
 #define UV_RD_MULT 2
 
@@ -172,7 +154,7 @@ static void optimize_b(MACROBLOCK *mb,
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
   qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
-  get_scan_and_band(xd, tx_size, type, ib, &scan, &band_translate);
+  get_scan_and_band(xd, tx_size, type, ib, &scan, &nb, &band_translate);
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -191,7 +173,6 @@ static void optimize_b(MACROBLOCK *mb,
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
         qcoeff_ptr[scan[i]]].token];
-  nb = vp9_get_coef_neighbors_handle(scan);
 
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
@@ -290,11 +271,10 @@ static void optimize_b(MACROBLOCK *mb,
       best_index[i][1] = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
-    }
-    /* There's no choice to make for a zero coefficient, so we don't
-     *  add a new trellis node, but we do need to update the costs.
-     */
-    else {
+    } else {
+      /* There's no choice to make for a zero coefficient, so we don't
+       *  add a new trellis node, but we do need to update the costs.
+       */
       band = get_coef_band(band_translate, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
@@ -477,12 +457,10 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
       vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
       break;
     case TX_16X16:
-      inverse_transform_b_16x16_add(pd->eobs[block], dqcoeff, dst,
-                                    pd->dst.stride);
+      vp9_idct_add_16x16(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
       break;
     case TX_8X8:
-      inverse_transform_b_8x8_add(pd->eobs[block], dqcoeff, dst,
-                                  pd->dst.stride);
+      vp9_idct_add_8x8(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
       break;
     case TX_4X4:
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -598,12 +576,8 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
-      if (!x->skip_encode && *eob) {
-        if (tx_type == DCT_DCT)
-          inverse_transform_b_16x16_add(*eob, dqcoeff, dst, pd->dst.stride);
-        else
-          vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
-      }
+      if (!x->skip_encode && *eob)
+        vp9_iht_add_16x16(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
     case TX_8X8:
       tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -627,12 +601,8 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                      p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
-      if (!x->skip_encode && *eob) {
-        if (tx_type == DCT_DCT)
-          inverse_transform_b_8x8_add(*eob, dqcoeff, dst, pd->dst.stride);
-        else
-          vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
-      }
+      if (!x->skip_encode && *eob)
+        vp9_iht_add_8x8(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
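
The inverse_transform_b_{8x8,16x16}_add() helpers deleted above were eob-thresholded dispatchers, and the vp9_idct_add_{8x8,16x16}() calls replacing them take the same arguments with eob moved last, so they presumably keep the same dispatch. A sketch of the 8x8 case under that assumption:

    /* Hypothetical body of the consolidated helper: the same eob thresholds
     * as the removed inverse_transform_b_8x8_add(). */
    void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob) {
      if (eob <= 1)        /* only the DC coefficient survived quantization */
        vp9_short_idct8x8_1_add(input, dest, stride);
      else if (eob <= 10)  /* only low-frequency coefficients remain */
        vp9_short_idct10_8x8_add(input, dest, stride);
      else                 /* general case: full inverse transform */
        vp9_short_idct8x8_add(input, dest, stride);
    }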
index 54e69fd9d860f4054b1b1ba20f79bbd964f2880a..61dd7358e0e30f82d8043fd5c114eda6f5fd705a 100644 (file)
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-typedef enum {
-  RD_DC_PRED = DC_PRED,
-  RD_V_PRED =  V_PRED,
-  RD_H_PRED = H_PRED,
-  RD_D45_PRED = D45_PRED,
-  RD_D135_PRED = D135_PRED,
-  RD_D117_PRED = D117_PRED,
-  RD_D153_PRED = D153_PRED,
-  RD_D207_PRED = D207_PRED,
-  RD_D63_PRED = D63_PRED,
-  RD_TM_PRED = TM_PRED,
-  RD_NEARESTMV = NEARESTMV,
-  RD_NEARMV = NEARMV,
-  RD_ZEROMV = ZEROMV,
-  RD_NEWMV = NEWMV,
-  RD_I4X4_PRED,
-  RD_SPLITMV,
-  RD_MODE_COUNT
-} RD_PREDICTION_MODE;
-
 typedef struct {
-  RD_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE mode;
   MV_REFERENCE_FRAME ref_frame;
   MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
 
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME second_ref_frame;
+} REF_DEFINITION;
+
 struct optimize_ctx {
   ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
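
With the RD_* split modes gone, sub-8x8 blocks are evidently searched per reference pair rather than per mode, which is what REF_DEFINITION enumerates. A plausible table driving that loop, assuming the LAST, GOLD, ALTR, COMP_LA, COMP_GA, INTRA order used by the new thresh_mult_sub8x8 code further down in vp9_onyx_if.c (entries hypothetical; NONE marks a single-reference entry):

    static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
      { LAST_FRAME,   NONE },          /* THR_LAST    */
      { GOLDEN_FRAME, NONE },          /* THR_GOLD    */
      { ALTREF_FRAME, NONE },          /* THR_ALTR    */
      { LAST_FRAME,   ALTREF_FRAME },  /* THR_COMP_LA */
      { GOLDEN_FRAME, ALTREF_FRAME },  /* THR_COMP_GA */
      { INTRA_FRAME,  NONE },          /* THR_INTRA   */
    };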
index db08ee856f9ccc75683c3d663b12d432dbbd3483..04a4172a5e42ce043171d45b53c24584f3e6ac0c 100644 (file)
@@ -8,13 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
 
 #include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_encodemv.h"
 
-#include <math.h>
 
 #ifdef ENTROPY_STATS
 extern unsigned int active_section;
index eaa3bd1838b825665b783b8e871e38265be66733..471931349a8940f98f5993fb02055faf00bfc7ae 100644 (file)
@@ -569,7 +569,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
                      mb_row << 1,
                      1 << mi_height_log2(xd->this_mi->mbmi.sb_type),
                      mb_col << 1,
-                     1 << mi_height_log2(xd->this_mi->mbmi.sb_type));
+                     1 << mi_width_log2(xd->this_mi->mbmi.sb_type));
 
       // do intra 16x16 prediction
       this_error = vp9_encode_intra(x, use_dc_pred);
index 81445a97f8da607e7f627895b94d512e93f7b87a..c28c868457a004ac373ac65afe780dcdf489a220 100644 (file)
@@ -10,7 +10,7 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/common/vp9_extend.h"
@@ -77,7 +77,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
         goto bail;
   }
   return ctx;
-bail:
+ bail:
   vp9_lookahead_destroy(ctx);
   return NULL;
 }
index eb5211d16facd4525d886a3047941b3dc8a4c453..f83fcc5315731ca73e43fb615d77cba76a21a0d6 100644 (file)
@@ -49,9 +49,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
+  best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit,
                             0, &v_fn_ptr,
-                            0, ref_mv, dst_mv);
+                            0, &ref_mv->as_mv, &dst_mv->as_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -103,7 +103,8 @@ static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv,
     dst_mv->as_int = tmp_mv.as_int;
   }
 
-  // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+  // If the current best reference mv is not centered on 0,0 then do a 0,0
+  // based search as well.
   if (ref_mv->as_int) {
     unsigned int tmp_err;
     int_mv zero_ref_mv, tmp_mv;
@@ -217,7 +218,8 @@ static void update_mbgraph_mb_stats
     stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
   }
 
-  // Alt-ref frame MV search, if it exists and is different than last/golden frame
+  // Do an Alt-ref frame MV search, if it exists and is different than
+  // last/golden frame.
   if (alt_ref) {
     int a_motion_error;
     xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
@@ -246,7 +248,8 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
   int_mv arf_top_mv, gld_top_mv;
   MODE_INFO mi_local = { { 0 } };
 
-  // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+  // Set up limit values for motion vectors to prevent them extending outside
+  // the UMV borders.
   arf_top_mv.as_int = 0;
   gld_top_mv.as_int = 0;
   x->mv_row_min     = -BORDER_MV_PIXELS_B16;
@@ -266,7 +269,8 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
     int arf_y_in_offset = arf_y_offset;
     int gld_y_in_offset = gld_y_offset;
 
-    // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+    // Set up limit values for motion vectors to prevent them extending outside
+    // the UMV borders.
     arf_left_mv.as_int = arf_top_mv.as_int;
     gld_left_mv.as_int = gld_top_mv.as_int;
     x->mv_col_min      = -BORDER_MV_PIXELS_B16;
@@ -407,7 +411,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
   for (i = 0; i < n_frames; i++) {
     MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
     vpx_memset(frame_stats->mb_stats, 0,
-               cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+               cm->mb_rows * cm->mb_cols *
+               sizeof(*cpi->mbgraph_stats[i].mb_stats));
   }
 
   // do motion search to find contribution of each reference to data
index ad8ea70b760caed82c59b0e675fdfaa5ad543c2b..44eaa657ccd82256939ee90ed25697467576abbf 100644 (file)
@@ -680,10 +680,10 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
 
 #define CHECK_POINT \
   {\
-    if (this_mv.as_mv.col < x->mv_col_min) continue;\
-    if (this_mv.as_mv.col > x->mv_col_max) continue;\
-    if (this_mv.as_mv.row < x->mv_row_min) continue;\
-    if (this_mv.as_mv.row > x->mv_row_max) continue;\
+    if (this_mv.col < x->mv_col_min) continue;\
+    if (this_mv.col > x->mv_col_max) continue;\
+    if (this_mv.row < x->mv_row_min) continue;\
+    if (this_mv.row > x->mv_row_max) continue;\
   }
 
 #define CHECK_BETTER \
@@ -691,7 +691,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
     if (thissad < bestsad)\
     {\
       if (use_mvcost) \
-        thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, \
+        thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \
                                   mvjsadcost, mvsadcost, \
                                   sad_per_bit);\
       if (thissad < bestsad)\
@@ -716,14 +716,14 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
 static int vp9_pattern_search(MACROBLOCK *x,
-                              int_mv *ref_mv,
+                              MV *ref_mv,
                               int search_param,
                               int sad_per_bit,
                               int do_init_search,
                               int do_refine,
                               const vp9_variance_fn_ptr_t *vfp,
                               int use_mvcost,
-                              int_mv *center_mv, int_mv *best_mv,
+                              const MV *center_mv, MV *best_mv,
                               const int num_candidates[MAX_PATTERN_SCALES],
                               const MV candidates[MAX_PATTERN_SCALES]
                                                  [MAX_PATTERN_CANDIDATES]) {
@@ -736,7 +736,7 @@ static int vp9_pattern_search(MACROBLOCK *x,
   int what_stride = x->plane[0].src.stride;
   int in_what_stride = xd->plane[0].pre[0].stride;
   int br, bc;
-  int_mv this_mv;
+  MV this_mv;
   int bestsad = INT_MAX;
   int thissad;
   uint8_t *base_offset;
@@ -749,22 +749,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
   int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+  fcenter_mv.as_mv.row = center_mv->row >> 3;
+  fcenter_mv.as_mv.col = center_mv->col >> 3;
 
   // adjust ref_mv to make sure it is within MV range
-  clamp_mv(&ref_mv->as_mv,
-           x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  br = ref_mv->as_mv.row;
-  bc = ref_mv->as_mv.col;
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
 
   // Work out the start point for the search
   base_offset = (uint8_t *)(xd->plane[0].pre[0].buf);
   this_offset = base_offset + (br * in_what_stride) + bc;
-  this_mv.as_mv.row = br;
-  this_mv.as_mv.col = bc;
+  this_mv.row = br;
+  this_mv.col = bc;
   bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
-                + mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+                + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
                                  mvjsadcost, mvsadcost, sad_per_bit);
 
   // Search all possible scales up to the search param around the center point
@@ -778,21 +777,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
       CHECK_BOUNDS((1 << t))
       if (all_in) {
         for (i = 0; i < num_candidates[t]; i++) {
-          this_mv.as_mv.row = br + candidates[t][i].row;
-          this_mv.as_mv.col = bc + candidates[t][i].col;
-          this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
-              this_mv.as_mv.col;
+          this_mv.row = br + candidates[t][i].row;
+          this_mv.col = bc + candidates[t][i].col;
+          this_offset = base_offset + (this_mv.row * in_what_stride) +
+                                       this_mv.col;
           thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                              bestsad);
           CHECK_BETTER
         }
       } else {
         for (i = 0; i < num_candidates[t]; i++) {
-          this_mv.as_mv.row = br + candidates[t][i].row;
-          this_mv.as_mv.col = bc + candidates[t][i].col;
+          this_mv.row = br + candidates[t][i].row;
+          this_mv.col = bc + candidates[t][i].col;
           CHECK_POINT
-          this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
-                        this_mv.as_mv.col;
+          this_offset = base_offset + (this_mv.row * in_what_stride) +
+                                       this_mv.col;
           thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                              bestsad);
           CHECK_BETTER
@@ -822,21 +821,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
         CHECK_BOUNDS((1 << s))
         if (all_in) {
           for (i = 0; i < num_candidates[s]; i++) {
-            this_mv.as_mv.row = br + candidates[s][i].row;
-            this_mv.as_mv.col = bc + candidates[s][i].col;
-            this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
-                this_mv.as_mv.col;
+            this_mv.row = br + candidates[s][i].row;
+            this_mv.col = bc + candidates[s][i].col;
+            this_offset = base_offset + (this_mv.row * in_what_stride) +
+                                         this_mv.col;
             thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                                bestsad);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < num_candidates[s]; i++) {
-            this_mv.as_mv.row = br + candidates[s][i].row;
-            this_mv.as_mv.col = bc + candidates[s][i].col;
+            this_mv.row = br + candidates[s][i].row;
+            this_mv.col = bc + candidates[s][i].col;
             CHECK_POINT
-            this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
-                          this_mv.as_mv.col;
+            this_offset = base_offset + (this_mv.row * in_what_stride) +
+                                         this_mv.col;
             thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                                bestsad);
             CHECK_BETTER
@@ -860,25 +859,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
         get_next_chkpts(next_chkpts_indices, k, num_candidates[s]);
         if (all_in) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            this_mv.as_mv.row = br +
-                candidates[s][next_chkpts_indices[i]].row;
-            this_mv.as_mv.col = bc +
-                candidates[s][next_chkpts_indices[i]].col;
-            this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
-                this_mv.as_mv.col;
+            this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
+            this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
+            this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+                                         this_mv.col;
             thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                                bestsad);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            this_mv.as_mv.row = br +
-                candidates[s][next_chkpts_indices[i]].row;
-            this_mv.as_mv.col = bc +
-                candidates[s][next_chkpts_indices[i]].col;
+            this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
+            this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
             CHECK_POINT
-            this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
-                          this_mv.as_mv.col;
+            this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+                                         this_mv.col;
             thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                                bestsad);
             CHECK_BETTER
@@ -905,21 +900,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
       CHECK_BOUNDS(1)
       if (all_in) {
         for (i = 0; i < 4; i++) {
-          this_mv.as_mv.row = br + neighbors[i].row;
-          this_mv.as_mv.col = bc + neighbors[i].col;
-          this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
-              this_mv.as_mv.col;
+          this_mv.row = br + neighbors[i].row;
+          this_mv.col = bc + neighbors[i].col;
+          this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+                                       this_mv.col;
           thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                              bestsad);
           CHECK_BETTER
         }
       } else {
         for (i = 0; i < 4; i++) {
-          this_mv.as_mv.row = br + neighbors[i].row;
-          this_mv.as_mv.col = bc + neighbors[i].col;
+          this_mv.row = br + neighbors[i].row;
+          this_mv.col = bc + neighbors[i].col;
           CHECK_POINT
-          this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
-                        this_mv.as_mv.col;
+          this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+                                       this_mv.col;
           thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
                              bestsad);
           CHECK_BETTER
@@ -935,32 +930,32 @@ static int vp9_pattern_search(MACROBLOCK *x,
     }
   }
 
-  best_mv->as_mv.row = br;
-  best_mv->as_mv.col = bc;
+  best_mv->row = br;
+  best_mv->col = bc;
 
-  this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) +
-      best_mv->as_mv.col;
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
+  this_offset = base_offset + (best_mv->row * in_what_stride) +
+                               best_mv->col;
+  this_mv.row = best_mv->row * 8;
+  this_mv.col = best_mv->col * 8;
   if (bestsad == INT_MAX)
     return INT_MAX;
 
   return vfp->vf(what, what_stride, this_offset, in_what_stride,
                  (unsigned int *)&bestsad) +
-         use_mvcost ? mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+         use_mvcost ? mv_err_cost(&this_mv, center_mv,
                                   x->nmvjointcost, x->mvcost, x->errorperbit)
                     : 0;
 }
 
 
 int vp9_hex_search(MACROBLOCK *x,
-                   int_mv *ref_mv,
+                   MV *ref_mv,
                    int search_param,
                    int sad_per_bit,
                    int do_init_search,
                    const vp9_variance_fn_ptr_t *vfp,
                    int use_mvcost,
-                   int_mv *center_mv, int_mv *best_mv) {
+                   const MV *center_mv, MV *best_mv) {
   // First scale has 8-closest points, the rest have 6 points in hex shape
   // at increasing scales
   static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
@@ -989,14 +984,14 @@ int vp9_hex_search(MACROBLOCK *x,
 }
 
 int vp9_bigdia_search(MACROBLOCK *x,
-                      int_mv *ref_mv,
+                      MV *ref_mv,
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
-                      int_mv *center_mv,
-                      int_mv *best_mv) {
+                      const MV *center_mv,
+                      MV *best_mv) {
   // First scale has 4-closest points, the rest have 8 points in diamond
   // shape at increasing scales
   static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1023,22 +1018,21 @@ int vp9_bigdia_search(MACROBLOCK *x,
     {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
       {-512, 512}, {-1024, 0}},
   };
-  return
-      vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                         do_init_search, 0, vfp, use_mvcost,
-                         center_mv, best_mv,
-                         bigdia_num_candidates, bigdia_candidates);
+  return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+                            do_init_search, 0, vfp, use_mvcost,
+                            center_mv, best_mv,
+                            bigdia_num_candidates, bigdia_candidates);
 }
 
 int vp9_square_search(MACROBLOCK *x,
-                      int_mv *ref_mv,
+                      MV *ref_mv,
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
-                      int_mv *center_mv,
-                      int_mv *best_mv) {
+                      const MV *center_mv,
+                      MV *best_mv) {
   // All scales have 8 closest points in square shape
   static const int square_num_candidates[MAX_PATTERN_SCALES] = {
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
@@ -1065,11 +1059,10 @@ int vp9_square_search(MACROBLOCK *x,
     {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
       {0, 1024}, {-1024, 1024}, {-1024, 0}},
   };
-  return
-      vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                         do_init_search, 0, vfp, use_mvcost,
-                         center_mv, best_mv,
-                         square_num_candidates, square_candidates);
+  return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+                            do_init_search, 0, vfp, use_mvcost,
+                            center_mv, best_mv,
+                            square_num_candidates, square_candidates);
 };
 
 #undef CHECK_BOUNDS
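
The refactor above threads plain MV through the pattern search instead of the int_mv wrapper. For reference, the two types as this change assumes them; the union's as_int view lets callers compare or assign a whole vector in one 32-bit operation:

    typedef struct mv {
      int16_t row;
      int16_t col;
    } MV;

    typedef union int_mv {
      uint32_t as_int;  /* both components at once: cheap compare/zero/copy */
      MV as_mv;
    } int_mv;

Inside the search only row/col arithmetic is needed, so passing MV directly removes the constant .as_mv indirection visible on the removed lines.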
index 835d8e4d76dd25bae7b64482995ebfad7b378d95..77c157c5bdf803b85dd3ba7f031974e516da4c59 100644 (file)
@@ -44,32 +44,32 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
                            int_mv *ref_mv, int_mv *dst_mv);
 
 int vp9_hex_search(MACROBLOCK *x,
-                   int_mv *ref_mv,
+                   MV *ref_mv,
                    int search_param,
                    int error_per_bit,
                    int do_init_search,
                    const vp9_variance_fn_ptr_t *vf,
                    int use_mvcost,
-                   int_mv *center_mv,
-                   int_mv *best_mv);
+                   const MV *center_mv,
+                   MV *best_mv);
 int vp9_bigdia_search(MACROBLOCK *x,
-                      int_mv *ref_mv,
+                      MV *ref_mv,
                       int search_param,
                       int error_per_bit,
                       int do_init_search,
                       const vp9_variance_fn_ptr_t *vf,
                       int use_mvcost,
-                      int_mv *center_mv,
-                      int_mv *best_mv);
+                      const MV *center_mv,
+                      MV *best_mv);
 int vp9_square_search(MACROBLOCK *x,
-                      int_mv *ref_mv,
+                      MV *ref_mv,
                       int search_param,
                       int error_per_bit,
                       int do_init_search,
                       const vp9_variance_fn_ptr_t *vf,
                       int use_mvcost,
-                      int_mv *center_mv,
-                      int_mv *best_mv);
+                      const MV *center_mv,
+                      MV *best_mv);
 
 typedef int (fractional_mv_step_fp) (
     MACROBLOCK *x,
index d815dc96a5579984a6b65694c649f54ea5dfe246..2bed64c42736ef830d50880721544787a98ab5b7 100644 (file)
@@ -62,6 +62,12 @@ static void set_default_lf_deltas(struct loopfilter *lf);
                                            now so that HIGH_PRECISION is always
                                            chosen */
 
+// Masks for partially or completely disabling split mode
+#define DISABLE_ALL_SPLIT         0x3F
+#define DISABLE_ALL_INTER_SPLIT   0x1F
+#define DISABLE_COMPOUND_SPLIT    0x18
+#define LAST_AND_INTRA_SPLIT_ONLY 0x1E
+
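
These masks index the sub-8x8 reference list one bit per entry; the consumer is the disable_split_mask & (1 << i) check added below in set_rd_speed_thresholds_sub8x8(). Assuming the LAST, GOLD, ALTR, COMP_LA, COMP_GA, INTRA order of that table, the values decode as:

    /* Assumed bit layout (bit i disables split for reference combination i):
     *   bit 0 LAST, bit 1 GOLD, bit 2 ALTR,
     *   bit 3 LAST+ALT, bit 4 GOLD+ALT, bit 5 INTRA.
     * DISABLE_ALL_SPLIT         0x3F = 111111b  everything disabled
     * DISABLE_ALL_INTER_SPLIT   0x1F = 011111b  only intra split remains
     * DISABLE_COMPOUND_SPLIT    0x18 = 011000b  compound refs disabled
     * LAST_AND_INTRA_SPLIT_ONLY 0x1E = 011110b  only LAST and intra remain */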
 #if CONFIG_INTERNAL_STATS
 #include "math.h"
 
@@ -195,17 +201,17 @@ static void init_minq_luts(void) {
     gf_low_motion_minq[i] = calculate_minq_index(maxq,
                                                  0.0000015,
                                                  -0.0009,
-                                                 0.33,
+                                                 0.32,
                                                  0.0);
     gf_high_motion_minq[i] = calculate_minq_index(maxq,
                                                   0.0000021,
                                                   -0.00125,
-                                                  0.45,
+                                                  0.50,
                                                   0.0);
     inter_minq[i] = calculate_minq_index(maxq,
                                          0.00000271,
                                          -0.00113,
-                                         0.697,
+                                         0.75,
                                          0.0);
     afq_low_motion_minq[i] = calculate_minq_index(maxq,
                                                   0.0000015,
@@ -220,6 +226,27 @@ static void init_minq_luts(void) {
   }
 }
 
+static int get_active_quality(int q,
+                              int gfu_boost,
+                              int low,
+                              int high,
+                              int *low_motion_minq,
+                              int *high_motion_minq) {
+  int active_best_quality;
+  if (gfu_boost > high) {
+    active_best_quality = low_motion_minq[q];
+  } else if (gfu_boost < low) {
+    active_best_quality = high_motion_minq[q];
+  } else {
+    const int gap = high - low;
+    const int offset = high - gfu_boost;
+    const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+    const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+    active_best_quality = low_motion_minq[q] + adjustment;
+  }
+  return active_best_quality;
+}
+
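
get_active_quality() interpolates linearly between the two minq tables as the boost falls from high to low, with (gap >> 1) giving round-to-nearest under integer division. A self-contained check of the arithmetic, with all numbers made up for illustration:

    #include <assert.h>

    int main(void) {
      const int low = 400, high = 2000, gfu_boost = 1200;     /* hypothetical */
      const int low_minq = 10, high_minq = 20;  /* *_motion_minq[q] stand-ins */
      const int gap = high - low;                             /* 1600 */
      const int offset = high - gfu_boost;                    /* 800  */
      const int qdiff = high_minq - low_minq;                 /* 10   */
      const int adjustment = (offset * qdiff + (gap >> 1)) / gap;
      /* The boost sits halfway through [low, high], so the result lands
       * halfway between the two tables: 10 + 5. */
      assert(low_minq + adjustment == 15);
      return 0;
    }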
 static void set_mvcost(MACROBLOCK *mb) {
   if (mb->e_mbd.allow_high_precision_mv) {
     mb->mvcost = mb->nmvcost_hp;
@@ -595,19 +622,12 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
   sf->thresh_mult[THR_COMP_NEARGA] += 1500;
   sf->thresh_mult[THR_COMP_NEWGA] += 2000;
 
-  sf->thresh_mult[THR_SPLITMV] += 2500;
-  sf->thresh_mult[THR_SPLITG] += 2500;
-  sf->thresh_mult[THR_SPLITA] += 2500;
-  sf->thresh_mult[THR_COMP_SPLITLA] += 4500;
-  sf->thresh_mult[THR_COMP_SPLITGA] += 4500;
-
   sf->thresh_mult[THR_ZEROMV] += 2000;
   sf->thresh_mult[THR_ZEROG] += 2000;
   sf->thresh_mult[THR_ZEROA] += 2000;
   sf->thresh_mult[THR_COMP_ZEROLA] += 2500;
   sf->thresh_mult[THR_COMP_ZEROGA] += 2500;
 
-  sf->thresh_mult[THR_B_PRED] += 2500;
   sf->thresh_mult[THR_H_PRED] += 2000;
   sf->thresh_mult[THR_V_PRED] += 2000;
   sf->thresh_mult[THR_D45_PRED ] += 2500;
@@ -623,21 +643,18 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
     sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
     sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
     sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
   }
   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
     sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
     sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
     sf->thresh_mult[THR_NEARG    ] = INT_MAX;
     sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
   }
   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
     sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
     sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
     sf->thresh_mult[THR_NEARA    ] = INT_MAX;
     sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
   }
 
   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
@@ -646,7 +663,6 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
     sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
     sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
     sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
   }
   if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
       (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
@@ -654,17 +670,42 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
     sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
     sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
     sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
   }
+}
 
-  if (sf->disable_splitmv == 1) {
-    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
+  SPEED_FEATURES *sf = &cpi->sf;
+  int i;
 
-    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
+  for (i = 0; i < MAX_REFS; ++i)
+    sf->thresh_mult_sub8x8[i] = mode == 0 ? -500 : 0;
+
+  sf->thresh_mult_sub8x8[THR_LAST] += 2500;
+  sf->thresh_mult_sub8x8[THR_GOLD] += 2500;
+  sf->thresh_mult_sub8x8[THR_ALTR] += 2500;
+  sf->thresh_mult_sub8x8[THR_INTRA] += 2500;
+  sf->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
+  sf->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
+
+  // Check for masked out split cases.
+  for (i = 0; i < MAX_REFS; i++) {
+    if (sf->disable_split_mask & (1 << i))
+      sf->thresh_mult_sub8x8[i] = INT_MAX;
   }
+
+  // disable mode test if frame flag is not set
+  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
+    sf->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
+  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
+    sf->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
+  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
+    sf->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
+      (VP9_LAST_FLAG | VP9_ALT_FLAG))
+    sf->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
+  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
+      (VP9_GOLD_FLAG | VP9_ALT_FLAG))
+    sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
 }
 
 void vp9_set_speed_features(VP9_COMP *cpi) {
@@ -677,12 +718,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   if (mode > 1)
     mode = 1;
 
-  // Initialise default mode frequency sampling variables
-  for (i = 0; i < MAX_MODES; i ++) {
-    cpi->mode_check_freq[i] = 0;
-    cpi->mode_test_hit_counts[i] = 0;
+  for (i = 0; i < MAX_MODES; ++i)
     cpi->mode_chosen_counts[i] = 0;
-  }
 
   // best quality defaults
   sf->RD = 1;
@@ -697,24 +734,21 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
-  sf->use_lastframe_partitioning = 0;
+  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
   sf->use_avoid_tested_higherror = 0;
   sf->reference_masking = 0;
-  sf->partition_by_variance = 0;
   sf->use_one_partition_size_always = 0;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = 0;
-  sf->auto_min_max_partition_interval = 0;
-  sf->auto_min_max_partition_count = 0;
   sf->max_partition_size = BLOCK_64X64;
   sf->min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
   sf->last_partitioning_redo_frequency = 4;
-  sf->disable_splitmv = 0;
+  sf->disable_split_mask = 0;
   sf->mode_search_skip_flags = 0;
   sf->disable_split_var_thresh = 0;
   sf->disable_filter_search_var_thresh = 0;
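
use_lastframe_partitioning is now a tri-state rather than a flag. From the values this patch assigns (OFF as the default here, LOW_MOTION at speed 2, ALL at speeds 3 and 4) and the sb_has_motion() check added in encode_sb_row(), the presumed definition is:

    /* Presumed enum behind use_lastframe_partitioning (name and ordering
     * assumed from the constants used in this patch). */
    typedef enum {
      LAST_FRAME_PARTITION_OFF = 0,
      LAST_FRAME_PARTITION_LOW_MOTION,  /* reuse unless the SB64 has motion */
      LAST_FRAME_PARTITION_ALL          /* always reuse last frame's choice */
    } LAST_FRAME_PARTITION_METHOD;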
@@ -747,113 +781,132 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
       sf->static_segmentation = 0;
 #endif
       sf->use_avoid_tested_higherror = 1;
-      sf->adaptive_rd_thresh = MIN((speed + 1), 4);
+      sf->adaptive_rd_thresh = 1;
 
       if (speed == 1) {
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-        sf->less_rectangular_check  = 1;
+        sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
+                                          cpi->common.intra_only);
         sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
-                                      cpi->common.intra_only ||
-                                      cpi->common.show_frame == 0) ?
-                                     USE_FULL_RD :
-                                     USE_LARGESTALL);
+                                      cpi->common.intra_only)
+                                     ? USE_FULL_RD : USE_LARGESTALL);
+
+        if (MIN(cpi->common.width, cpi->common.height) >= 720)
+          sf->disable_split_mask = DISABLE_ALL_SPLIT;
+        else
+          sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+        sf->use_rd_breakout = 1;
+        sf->adaptive_motion_search = 1;
+        sf->auto_mv_step_size = 1;
+        sf->adaptive_rd_thresh = 2;
+      }
+      if (speed == 2) {
         sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
-                                   cpi->common.intra_only ||
-                                   cpi->common.show_frame == 0);
-        sf->disable_splitmv =
-            (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
+                                          cpi->common.intra_only);
+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+                                      cpi->common.intra_only)
+                                     ? USE_FULL_RD : USE_LARGESTALL);
+
+        if (MIN(cpi->common.width, cpi->common.height) >= 720)
+          sf->disable_split_mask = DISABLE_ALL_SPLIT;
+        else
+          sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
         sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
                                      FLAG_SKIP_INTRA_LOWVAR;
-        sf->use_uv_intra_rd_estimate = 1;
+
         sf->use_rd_breakout = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
         sf->adaptive_motion_search = 1;
         sf->auto_mv_step_size = 1;
 
-        sf->auto_min_max_partition_size = 1;
-        sf->auto_min_max_partition_interval = 1;
-        // FIXME(jingning): temporarily turn off disable_split_var_thresh
-        // during refactoring process. will get this back after finishing
-        // the main framework of partition search type.
-        sf->disable_split_var_thresh = 0;
         sf->disable_filter_search_var_thresh = 16;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-        sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
-        sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
-        sf->use_fast_coef_updates = 1;
+        sf->auto_min_max_partition_size = 1;
+        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+        sf->adjust_partitioning_from_last_frame = 1;
+        sf->last_partitioning_redo_frequency = 3;
+
+        sf->adaptive_rd_thresh = 2;
         sf->mode_skip_start = 11;
       }
-      if (speed == 2) {
-        sf->less_rectangular_check  = 1;
+      if (speed == 3) {
         sf->use_square_partition_only = 1;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-        sf->use_lastframe_partitioning = 1;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
-                                      cpi->common.intra_only ||
-                                      cpi->common.show_frame == 0) ?
-                                     USE_FULL_RD :
-                                     USE_LARGESTALL);
+        sf->tx_size_search_method = USE_LARGESTALL;
+
+        if (MIN(cpi->common.width, cpi->common.height) >= 720)
+          sf->disable_split_mask = DISABLE_ALL_SPLIT;
+        else
+          sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
         sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_COMP_REFMISMATCH |
-                                     FLAG_SKIP_INTRA_LOWVAR |
-                                     FLAG_EARLY_TERMINATE;
-        sf->intra_y_mode_mask = INTRA_DC_TM;
-        sf->intra_uv_mode_mask = INTRA_DC_TM;
-        sf->use_uv_intra_rd_estimate = 1;
+                                     FLAG_SKIP_INTRA_LOWVAR;
+
         sf->use_rd_breakout = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
         sf->adaptive_motion_search = 1;
-        sf->using_small_partition_info = 0;
-        sf->disable_splitmv =
-            (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
         sf->auto_mv_step_size = 1;
-        sf->search_method = SQUARE;
-        sf->subpel_iters_per_step = 1;
-        sf->use_fast_lpf_pick = 1;
+
+        sf->disable_filter_search_var_thresh = 16;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
         sf->auto_min_max_partition_size = 1;
-        sf->auto_min_max_partition_interval = 2;
-        sf->disable_split_var_thresh = 32;
-        sf->disable_filter_search_var_thresh = 32;
+        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+        sf->adjust_partitioning_from_last_frame = 1;
+        sf->last_partitioning_redo_frequency = 3;
+
+        sf->use_uv_intra_rd_estimate = 1;
+        sf->skip_encode_sb = 1;
+        sf->use_lp32x32fdct = 1;
+        sf->subpel_iters_per_step = 1;
         sf->use_fast_coef_updates = 2;
+
+        sf->adaptive_rd_thresh = 4;
         sf->mode_skip_start = 6;
       }
-      if (speed == 3) {
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-        sf->partition_by_variance = 1;
-        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
-                                      cpi->common.intra_only ||
-                                      cpi->common.show_frame == 0) ?
-                                     USE_FULL_RD :
-                                     USE_LARGESTALL);
+      if (speed == 4) {
+        sf->use_square_partition_only = 1;
+        sf->tx_size_search_method = USE_LARGESTALL;
+        sf->disable_split_mask = DISABLE_ALL_SPLIT;
+
         sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
                                      FLAG_SKIP_COMP_REFMISMATCH |
                                      FLAG_SKIP_INTRA_LOWVAR |
                                      FLAG_EARLY_TERMINATE;
+
         sf->use_rd_breakout = 1;
+        sf->adaptive_motion_search = 1;
+        sf->auto_mv_step_size = 1;
+
+        sf->disable_filter_search_var_thresh = 16;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+        sf->auto_min_max_partition_size = 1;
+        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+        sf->adjust_partitioning_from_last_frame = 1;
+        sf->last_partitioning_redo_frequency = 3;
+
+        sf->use_uv_intra_rd_estimate = 1;
         sf->skip_encode_sb = 1;
         sf->use_lp32x32fdct = 1;
-        sf->disable_splitmv = 1;
-        sf->auto_mv_step_size = 1;
-        sf->search_method = BIGDIA;
         sf->subpel_iters_per_step = 1;
-        sf->disable_split_var_thresh = 64;
-        sf->disable_filter_search_var_thresh = 64;
-        sf->intra_y_mode_mask = INTRA_DC_ONLY;
-        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
         sf->use_fast_coef_updates = 2;
+
+        sf->adaptive_rd_thresh = 4;
         sf->mode_skip_start = 6;
+
+        /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
+        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+        sf->search_method = BIGDIA;
+        sf->disable_split_var_thresh = 64;
+        sf->disable_filter_search_var_thresh = 64; */
       }
-      if (speed == 4) {
+      if (speed == 5) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
         sf->use_one_partition_size_always = 1;
         sf->always_this_block_size = BLOCK_16X16;
@@ -875,7 +928,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         // sf->reduce_first_step_size = 1;
         // sf->reference_masking = 1;
 
-        sf->disable_splitmv = 1;
+        sf->disable_split_mask = DISABLE_ALL_SPLIT;
         sf->search_method = HEX;
         sf->subpel_iters_per_step = 1;
         sf->disable_split_var_thresh = 64;
@@ -883,6 +936,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->intra_y_mode_mask = INTRA_DC_ONLY;
         sf->intra_uv_mode_mask = INTRA_DC_ONLY;
         sf->use_fast_coef_updates = 2;
+        sf->adaptive_rd_thresh = 4;
         sf->mode_skip_start = 6;
       }
       break;
@@ -891,6 +945,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 
   // Set rd thresholds based on mode and speed setting
   set_rd_speed_thresholds(cpi, mode);
+  set_rd_speed_thresholds_sub8x8(cpi, mode);
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
@@ -1206,11 +1261,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_iwht4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_iwht4x4_16_add;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_idct4x4_16_add;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1587,9 +1642,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   vp9_set_speed_features(cpi);
 
   // Default rd threshold factors for mode selection
-  for (i = 0; i < BLOCK_SIZES; ++i)
+  for (i = 0; i < BLOCK_SIZES; ++i) {
     for (j = 0; j < MAX_MODES; ++j)
       cpi->rd_thresh_freq_fact[i][j] = 32;
+    for (j = 0; j < MAX_REFS; ++j)
+      cpi->rd_thresh_freq_sub8x8[i][j] = 32;
+  }
 
 #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
             SDX3F, SDX8F, SDX4DF)\
@@ -2702,18 +2760,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       double q_val;
 
       // Baseline value derived from cpi->active_worst_quality and kf boost
-      if (cpi->kf_boost > high) {
-        cpi->active_best_quality = kf_low_motion_minq[q];
-      } else if (cpi->kf_boost < low) {
-        cpi->active_best_quality = kf_high_motion_minq[q];
-      } else {
-        const int gap = high - low;
-        const int offset = high - cpi->kf_boost;
-        const int qdiff = kf_high_motion_minq[q] - kf_low_motion_minq[q];
-        const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-        cpi->active_best_quality = kf_low_motion_minq[q] + adjustment;
-      }
+      cpi->active_best_quality = get_active_quality(q, cpi->kf_boost,
+                                                    low, high,
+                                                    kf_low_motion_minq,
+                                                    kf_high_motion_minq);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -2748,47 +2798,48 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       q = cpi->avg_frame_qindex;
     }
     // For constrained quality don't allow Q less than the cq level
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
-        q < cpi->cq_target_quality) {
-      q = cpi->cq_target_quality;
-    }
-    if (cpi->gfu_boost > high) {
-      cpi->active_best_quality = gf_low_motion_minq[q];
-    } else if (cpi->gfu_boost < low) {
-      cpi->active_best_quality = gf_high_motion_minq[q];
-    } else {
-      const int gap = high - low;
-      const int offset = high - cpi->gfu_boost;
-      const int qdiff = gf_high_motion_minq[q] - gf_low_motion_minq[q];
-      const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-      cpi->active_best_quality = gf_low_motion_minq[q] + adjustment;
-    }
-
-    // Constrained quality use slightly lower active best.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      if (q < cpi->cq_target_quality)
+        q = cpi->cq_target_quality;
+      if (cpi->frames_since_key > 1) {
+        cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+                                                      low, high,
+                                                      afq_low_motion_minq,
+                                                      afq_high_motion_minq);
+      } else {
+        cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+                                                      low, high,
+                                                      gf_low_motion_minq,
+                                                      gf_high_motion_minq);
+      }
+      // Constrained quality uses a slightly lower active best.
       cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
 
-    // TODO(debargha): Refine the logic below
-    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+    } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
       if (!cpi->refresh_alt_ref_frame) {
         cpi->active_best_quality = cpi->cq_target_quality;
       } else {
         if (cpi->frames_since_key > 1) {
-          if (cpi->gfu_boost > high) {
-            cpi->active_best_quality = afq_low_motion_minq[q];
-          } else if (cpi->gfu_boost < low) {
-            cpi->active_best_quality = afq_high_motion_minq[q];
-          } else {
-            const int gap = high - low;
-            const int offset = high - cpi->gfu_boost;
-            const int qdiff = afq_high_motion_minq[q] - afq_low_motion_minq[q];
-            const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-            cpi->active_best_quality = afq_low_motion_minq[q] + adjustment;
-          }
+          cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+                                                        low, high,
+                                                        afq_low_motion_minq,
+                                                        afq_high_motion_minq);
+        } else {
+          cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+                                                        low, high,
+                                                        gf_low_motion_minq,
+                                                        gf_high_motion_minq);
         }
       }
+    } else {
+      if (!cpi->refresh_alt_ref_frame) {
+        cpi->active_best_quality = inter_minq[q];
+      } else {
+        cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+                                                      low, high,
+                                                      gf_low_motion_minq,
+                                                      gf_high_motion_minq);
+      }
     }
   } else {
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
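
Note that the constrained-quality path above now clamps q to
cq_target_quality and applies the 15/16 scaling inside the same branch; that
scaling trims active_best_quality by roughly 6% (e.g. 48 becomes 45).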
@@ -3352,8 +3403,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
               cm->frame_type, cpi->refresh_golden_frame,
               cpi->refresh_alt_ref_frame);
 
-      for (i = 0; i < MAX_MODES; i++)
+      for (i = 0; i < MAX_MODES; ++i)
         fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+      for (i = 0; i < MAX_REFS; ++i)
+        fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
 
       fprintf(fmodes, "\n");
 
index 8d371bbadf920e3bc8f73f44efd0c4b3c5b2a3fa..2652929ce9678e93e70f17b494e91439bc72a300 100644 (file)
@@ -36,7 +36,7 @@
 #define DISABLE_RC_LONG_TERM_MEM 0
 #endif
 
-// #define MODE_TEST_HIT_STATS
+#define MODE_TEST_HIT_STATS
 
 // #define SPEEDSTATS 1
 #if CONFIG_MULTIPLE_ARF
@@ -49,7 +49,8 @@
 
 #define KEY_FRAME_CONTEXT 5
 
-#define MAX_MODES 36
+#define MAX_MODES 30
+#define MAX_REFS  6
 
 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
 #define INTRA_ZBIN_BOOST     0
 
 typedef struct {
-  nmv_context nmvc;
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
   vp9_prob segment_pred_probs[PREDICTION_PROBS];
-  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
-  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  vp9_prob single_ref_prob[REF_CONTEXTS][2];
-  vp9_prob comp_ref_prob[REF_CONTEXTS];
 
   unsigned char *last_frame_seg_map_copy;
 
@@ -79,20 +75,8 @@ typedef struct {
   // 0 = ZERO_MV, MV
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
-  vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
-
-  vp9_prob y_mode_prob[4][INTRA_MODES - 1];
-  vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-  vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
-
-  vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
-                                 [SWITCHABLE_FILTERS - 1];
-
   int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
-  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-
-  struct tx_probs tx_probs;
-  vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
+  FRAME_CONTEXT fc;
 } CODING_CONTEXT;
 
 typedef struct {
@@ -169,19 +153,12 @@ typedef enum {
   THR_COMP_NEARGA,
   THR_COMP_NEWGA,
 
-  THR_SPLITMV,
-  THR_SPLITG,
-  THR_SPLITA,
-  THR_COMP_SPLITLA,
-  THR_COMP_SPLITGA,
-
   THR_ZEROMV,
   THR_ZEROG,
   THR_ZEROA,
   THR_COMP_ZEROLA,
   THR_COMP_ZEROGA,
 
-  THR_B_PRED,
   THR_H_PRED,
   THR_V_PRED,
   THR_D135_PRED,
@@ -192,6 +169,15 @@ typedef enum {
   THR_D45_PRED,
 } THR_MODES;
 
+typedef enum {
+  THR_LAST,
+  THR_GOLD,
+  THR_ALTR,
+  THR_COMP_LA,
+  THR_COMP_GA,
+  THR_INTRA,
+} THR_MODES_SUB8X8;
+
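
The six THR_MODES entries removed above (five SPLITMV variants plus
THR_B_PRED) are what takes MAX_MODES from 36 to 30, and this six-entry
THR_MODES_SUB8X8 enum mirrors MAX_REFS: sub-8x8 partitions are now searched
per reference combination instead of as entries in the main mode list.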
 typedef enum {
   DIAMOND = 0,
   NSTEP = 1,
@@ -246,6 +232,12 @@ typedef enum {
 #define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
 #define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
 
+typedef enum {
+  LAST_FRAME_PARTITION_OFF = 0,
+  LAST_FRAME_PARTITION_LOW_MOTION = 1,
+  LAST_FRAME_PARTITION_ALL = 2
+} LAST_FRAME_PARTITION_METHOD;
+
 typedef struct {
   int RD;
   SEARCH_METHODS search_method;
@@ -254,6 +246,7 @@ typedef struct {
   SUBPEL_SEARCH_METHODS subpel_search_method;
   int subpel_iters_per_step;
   int thresh_mult[MAX_MODES];
+  int thresh_mult_sub8x8[MAX_REFS];
   int max_step_search_steps;
   int reduce_first_step_size;
   int auto_mv_step_size;
@@ -263,11 +256,10 @@ typedef struct {
   int adaptive_rd_thresh;
   int skip_encode_sb;
   int skip_encode_frame;
-  int use_lastframe_partitioning;
+  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
   int use_lp32x32fdct;
   int use_avoid_tested_higherror;
-  int partition_by_variance;
   int use_one_partition_size_always;
   int less_rectangular_check;
   int use_square_partition_only;
@@ -275,13 +267,11 @@ typedef struct {
   int reference_masking;
   BLOCK_SIZE always_this_block_size;
   int auto_min_max_partition_size;
-  int auto_min_max_partition_interval;
-  int auto_min_max_partition_count;
   BLOCK_SIZE min_partition_size;
   BLOCK_SIZE max_partition_size;
   int adjust_partitioning_from_last_frame;
   int last_partitioning_redo_frequency;
-  int disable_splitmv;
+  int disable_split_mask;
   int using_small_partition_info;
   // TODO(jingning): combine the related motion search speed features
   int adaptive_motion_search;
@@ -338,13 +328,13 @@ typedef struct VP9_COMP {
   YV12_BUFFER_CONFIG scaled_source;
 
   unsigned int frames_till_alt_ref_frame;
-  int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
-  int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
 
-  int is_src_frame_alt_ref;   // source of frame to encode is an exact copy of an alt ref frame
+  int is_src_frame_alt_ref;
 
-  int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
-  int alt_is_last;  // Alt reference frame same as last ( short circuit altref search)
+  int gold_is_last;  // Gold same as last frame (short-circuits gold searches)
+  int alt_is_last;   // Alt same as last (short-circuits altref search)
   int gold_is_alt;  // don't do both alt and gold search ( just do gold).
 
   int scaled_ref_idx[3];
@@ -381,15 +371,16 @@ typedef struct VP9_COMP {
   // Ambient reconstruction err target for force key frames
   int ambient_err;
 
-  unsigned int mode_check_freq[MAX_MODES];
-  unsigned int mode_test_hit_counts[MAX_MODES];
   unsigned int mode_chosen_counts[MAX_MODES];
+  unsigned int sub8x8_mode_chosen_counts[MAX_REFS];
   int64_t mode_skip_mask;
   int ref_frame_mask;
   int set_ref_frame_mask;
 
   int rd_threshes[BLOCK_SIZES][MAX_MODES];
   int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+  int rd_thresh_sub8x8[BLOCK_SIZES][MAX_REFS];
+  int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS];
 
   int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
   int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
@@ -422,14 +413,14 @@ typedef struct VP9_COMP {
   double gf_rate_correction_factor;
 
   unsigned int frames_since_golden;
-  int frames_till_gf_update_due;      // Count down till next GF
+  int frames_till_gf_update_due;  // Count down till next GF
 
-  int gf_overspend_bits;            // Total bits overspent becasue of GF boost (cumulative)
+  int gf_overspend_bits;  // Cumulative bits overspent because of GF boost
 
-  int non_gf_bitrate_adjustment;     // Used in the few frames following a GF to recover the extra bits spent in that GF
+  int non_gf_bitrate_adjustment;  // Used after a GF to recover its extra bits
 
-  int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
-  int kf_bitrate_adjustment;        // Current number of bit s to try and recover on each inter frame.
+  int kf_overspend_bits;  // Bits spent on key frames to be recovered on inters
+  int kf_bitrate_adjustment;  // Number of bits to recover on each inter frame.
   int max_gf_interval;
   int baseline_gf_interval;
   int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
@@ -437,9 +428,9 @@ typedef struct VP9_COMP {
 
   int64_t key_frame_count;
   int prior_key_frame_distance[KEY_FRAME_CONTEXT];
-  int per_frame_bandwidth;          // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;        // Average frame size target for clip
-  int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
+  int per_frame_bandwidth;  // Current section per frame bandwidth target
+  int av_per_frame_bandwidth;  // Average frame size target for clip
+  int min_frame_bandwidth;  // Minimum allocation used for any frame
   int inter_frame_target;
   double output_framerate;
   int64_t last_time_stamp_seen;
@@ -535,7 +526,8 @@ typedef struct VP9_COMP {
   unsigned int max_mv_magnitude;
   int mv_step_param;
 
-  // Data used for real time conferencing mode to help determine if it would be good to update the gf
+  // Data used for real time conferencing mode to help determine if it
+  // would be good to update the gf
   int inter_zz_count;
   int gf_bad_count;
   int gf_update_recommended;
@@ -716,4 +708,8 @@ int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
 
 void vp9_alloc_compressor_data(VP9_COMP *cpi);
 
+static int get_token_alloc(int mb_rows, int mb_cols) {
+  return mb_rows * mb_cols * (48 * 16 + 4);
+}
+
 #endif  // VP9_ENCODER_VP9_ONYX_INT_H_
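
get_token_alloc() centralizes the tokenizer buffer sizing that callers
previously computed inline. A plausible call site, assuming the existing
cpi->tok buffer and vpx_calloc (the surrounding lines are illustrative, not
part of this commit):

    // Hypothetical caller, e.g. in vp9_alloc_compressor_data():
    int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
    cpi->tok = vpx_calloc(tokens, sizeof(*cpi->tok));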
index 239fd6b77c57c443a027f856c95b22fe96964828..476ecaaa254aa64acf67213be1ac3d0151d21e09 100644 (file)
@@ -54,7 +54,8 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
   src += srcoffset;
   dst += dstoffset;
 
-  // Loop through the Y plane raw and reconstruction data summing (square differences)
+  // Loop through the raw Y plane and reconstruction data summing the square
+  // differences.
   for (i = 0; i < linestocopy; i += 16) {
     for (j = 0; j < source->y_width; j += 16) {
       unsigned int sse;
@@ -72,20 +73,6 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
 // Enforce a minimum filter level based upon baseline Q
 static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
   int min_filter_level;
-  /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
-  if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
-      min_filter_level = 0;
-  else
-  {
-      if (q <= 10)
-          min_filter_level = 0;
-      else if (q <= 64)
-          min_filter_level = 1;
-      else
-          min_filter_level = (q >> 6);
-  }
-  */
   min_filter_level = 0;
 
   return min_filter_level;
@@ -93,11 +80,7 @@ static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
 
 // Enforce a maximum filter level based upon baseline Q
 static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
-  // PGW August 2006: Highest filter values almost always a bad idea
-
-  // jbb chg: 20100118 - not so any more with this overquant stuff allow high values
-  // with lots of intra coming in.
-  int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+  int max_filter_level = MAX_LOOP_FILTER;
   (void)base_qindex;
 
   if (cpi->twopass.section_intra_rating > 8)
@@ -128,7 +111,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
   int filt_best;
   int filt_direction = 0;
 
-  int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
+  int Bias = 0;  // Bias against raising loop filter in favor of lowering it.
 
   //  Make a copy of the unfiltered / processed recon buffer
   vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -136,7 +119,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
   lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
                                                     : cpi->oxcf.Sharpness;
 
-  // Start the search at the previous frame filter level unless it is now out of range.
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
   filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
 
   // Define the initial step size
@@ -153,9 +137,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
   while (filter_step > 0) {
-    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
+    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
-    // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value
     if (cpi->twopass.section_intra_rating < 20)
       Bias = Bias * cpi->twopass.section_intra_rating / 20;
 
@@ -163,8 +146,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
     if (cpi->common.tx_mode != ONLY_4X4)
       Bias >>= 1;
 
-    filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
-    filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+    filt_high = ((filt_mid + filter_step) > max_filter_level)
+                    ? max_filter_level
+                    : (filt_mid + filter_step);
+    filt_low = ((filt_mid - filter_step) < min_filter_level)
+                   ? min_filter_level
+                   : (filt_mid - filter_step);
 
     if ((filt_direction <= 0) && (filt_low != filt_mid)) {
       // Get Low filter error score
@@ -176,7 +163,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
       //  Re-instate the unfiltered frame
       vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
-      // If value is close to the best so far then bias towards a lower loop filter value.
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
       if ((filt_err - Bias) < best_err) {
         // Was it actually better than the previous best?
         if (filt_err < best_err)
@@ -215,4 +203,3 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
 
   lf->filter_level = filt_best;
 }
-
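
With the dead comments gone, the surviving search is easier to see: clamp to
the previous frame's level, then repeatedly probe one step below and above,
preferring lower levels, and halve the step when neither probe wins. A
schematic of that loop (the Bias term is omitted, the initial step value is
an assumption, and error_fn stands in for "filter the frame and measure
SSE"):

    #include <stdint.h>

    static int pick_level_sketch(int start, int min_lvl, int max_lvl,
                                 int64_t (*error_fn)(int)) {
      int mid = start < min_lvl ? min_lvl
                                : (start > max_lvl ? max_lvl : start);
      int best = mid;
      int64_t best_err = error_fn(mid);
      int step = (mid < 16) ? 4 : mid / 4;  // assumed initial step size
      while (step > 0) {
        const int lo = (mid - step < min_lvl) ? min_lvl : mid - step;
        const int hi = (mid + step > max_lvl) ? max_lvl : mid + step;
        if (lo != mid) {  // try the lower level first (bias downward)
          const int64_t e = error_fn(lo);
          if (e < best_err) { best_err = e; best = lo; }
        }
        if (hi != mid && best == mid) {  // only look up if down failed
          const int64_t e = error_fn(hi);
          if (e < best_err) { best_err = e; best = hi; }
        }
        if (best == mid)
          step /= 2;   // no improvement: refine around the current level
        else
          mid = best;  // improvement: recenter and keep the same step
      }
      return best;
    }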
index 94394341d8ea38cc51f6f9853395faae081b7518..58294e15a38f8df7d14cbf93d8004d4c5a4fce1f 100644 (file)
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
 
 #include "vpx_scale/yv12config.h"
-#include "math.h"
 
 #define MAX_PSNR 100
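
Moving math.h into a standard angle-bracket include is the whole change
here; MAX_PSNR caps the score when the error term vanishes. For reference,
the conversion such a file implements is along these lines (a sketch, not
the verbatim routine):

    #include <math.h>

    #define MAX_PSNR 100

    // PSNR = 10 * log10(peak^2 * samples / sse), clamped to MAX_PSNR for a
    // zero (or vanishing) error. For 8-bit video, peak is 255.
    static double mse_to_psnr(double samples, double peak, double mse) {
      double psnr = MAX_PSNR;
      if (mse > 0.0)
        psnr = 10.0 * log10(peak * peak * samples / mse);
      return psnr > MAX_PSNR ? MAX_PSNR : psnr;
    }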
 
index 2d12ba94ff79f67e133f5bd1063cef44a01d202c..bbcad172d029f7f47c412bf5190a51002d48b90f 100644 (file)
@@ -76,35 +76,19 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
   // restored with a call to vp9_restore_coding_context. These functions are
   // intended for use in a re-code loop in vp9_compress_frame where the
   // quantizer value is adjusted between loop iterations.
-
-  cc->nmvc = cm->fc.nmvc;
   vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
   vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
   vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
 
-  vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs);
-
-  vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob);
-  vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-  vp9_copy(cc->partition_prob, cm->fc.partition_prob);
-
   vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
 
-  vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob);
-  vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob);
-  vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob);
-  vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob);
-
   vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
              cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
 
   vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
   vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
 
-  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
-  vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
-  cc->tx_probs = cm->fc.tx_probs;
-  vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs);
+  cc->fc = cm->fc;
 }
 
 void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -113,25 +97,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
 
   // Restore key state variables to the snapshot state stored in the
   // previous call to vp9_save_coding_context.
-
-  cm->fc.nmvc = cc->nmvc;
   vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
   vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
   vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
 
-  vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs);
-
-  vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob);
-  vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
-  vp9_copy(cm->fc.partition_prob, cc->partition_prob);
-
   vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
 
-  vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob);
-  vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob);
-  vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob);
-  vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob);
-
   vpx_memcpy(cm->last_frame_seg_map,
              cpi->coding_context.last_frame_seg_map_copy,
              (cm->mi_rows * cm->mi_cols));
@@ -139,10 +110,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
   vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
   vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
 
-  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
-  vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
-  cm->fc.tx_probs = cc->tx_probs;
-  vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs);
+  cm->fc = cc->fc;
 }
 
 void vp9_setup_key_frame(VP9_COMP *cpi) {
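
Collapsing the field-by-field vp9_copy() calls into cc->fc = cm->fc (and the
mirror image on restore) works because FRAME_CONTEXT holds only scalars and
fixed-size arrays, so plain struct assignment already deep-copies it. The
pattern in miniature, with a type invented for the demo:

    typedef struct { int probs[4]; } demo_context_t;

    static void recode_demo(demo_context_t *live) {
      demo_context_t saved = *live;  // vp9_save_coding_context equivalent
      live->probs[0]++;              // trial encode mutates the live context
      *live = saved;                 // vp9_restore_coding_context equivalent
    }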
index 8b911e669fa42b2f1686fae6efecb72a3e47968b..26bbc825e984595fbec87eb11cc96e3f8c2164bd 100644 (file)
 DECLARE_ALIGNED(16, extern const uint8_t,
                 vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
-#define LAST_FRAME_MODE_MASK    0xFFDADCD60
-#define GOLDEN_FRAME_MODE_MASK  0xFFB5A3BB0
-#define ALT_REF_MODE_MASK       0xFF8C648D0
+#define LAST_FRAME_MODE_MASK    0xFFEDCD60
+#define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
+#define ALT_REF_MODE_MASK       0xFFC648D0
+
+#define MIN_EARLY_TERM_INDEX    3
 
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {RD_NEARESTMV, LAST_FRAME,   NONE},
-  {RD_NEARESTMV, ALTREF_FRAME, NONE},
-  {RD_NEARESTMV, GOLDEN_FRAME, NONE},
-
-  {RD_DC_PRED,   INTRA_FRAME,  NONE},
-
-  {RD_NEWMV,     LAST_FRAME,   NONE},
-  {RD_NEWMV,     ALTREF_FRAME, NONE},
-  {RD_NEWMV,     GOLDEN_FRAME, NONE},
-
-  {RD_NEARMV,    LAST_FRAME,   NONE},
-  {RD_NEARMV,    ALTREF_FRAME, NONE},
-  {RD_NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
-  {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-
-  {RD_TM_PRED,   INTRA_FRAME,  NONE},
-
-  {RD_NEARMV,    LAST_FRAME,   ALTREF_FRAME},
-  {RD_NEWMV,     LAST_FRAME,   ALTREF_FRAME},
-  {RD_NEARMV,    GOLDEN_FRAME, NONE},
-  {RD_NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
-  {RD_NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
-
-  {RD_SPLITMV,   LAST_FRAME,   NONE},
-  {RD_SPLITMV,   GOLDEN_FRAME, NONE},
-  {RD_SPLITMV,   ALTREF_FRAME, NONE},
-  {RD_SPLITMV,   LAST_FRAME,   ALTREF_FRAME},
-  {RD_SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
-
-  {RD_ZEROMV,    LAST_FRAME,   NONE},
-  {RD_ZEROMV,    GOLDEN_FRAME, NONE},
-  {RD_ZEROMV,    ALTREF_FRAME, NONE},
-  {RD_ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
-  {RD_ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
-
-  {RD_I4X4_PRED, INTRA_FRAME,  NONE},
-  {RD_H_PRED,    INTRA_FRAME,  NONE},
-  {RD_V_PRED,    INTRA_FRAME,  NONE},
-  {RD_D135_PRED, INTRA_FRAME,  NONE},
-  {RD_D207_PRED, INTRA_FRAME,  NONE},
-  {RD_D153_PRED, INTRA_FRAME,  NONE},
-  {RD_D63_PRED,  INTRA_FRAME,  NONE},
-  {RD_D117_PRED, INTRA_FRAME,  NONE},
-  {RD_D45_PRED,  INTRA_FRAME,  NONE},
+  {NEARESTMV, LAST_FRAME,   NONE},
+  {NEARESTMV, ALTREF_FRAME, NONE},
+  {NEARESTMV, GOLDEN_FRAME, NONE},
+
+  {DC_PRED,   INTRA_FRAME,  NONE},
+
+  {NEWMV,     LAST_FRAME,   NONE},
+  {NEWMV,     ALTREF_FRAME, NONE},
+  {NEWMV,     GOLDEN_FRAME, NONE},
+
+  {NEARMV,    LAST_FRAME,   NONE},
+  {NEARMV,    ALTREF_FRAME, NONE},
+  {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
+  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+  {TM_PRED,   INTRA_FRAME,  NONE},
+
+  {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
+  {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
+  {NEARMV,    GOLDEN_FRAME, NONE},
+  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
+  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
+
+  {ZEROMV,    LAST_FRAME,   NONE},
+  {ZEROMV,    GOLDEN_FRAME, NONE},
+  {ZEROMV,    ALTREF_FRAME, NONE},
+  {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
+  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
+
+  {H_PRED,    INTRA_FRAME,  NONE},
+  {V_PRED,    INTRA_FRAME,  NONE},
+  {D135_PRED, INTRA_FRAME,  NONE},
+  {D207_PRED, INTRA_FRAME,  NONE},
+  {D153_PRED, INTRA_FRAME,  NONE},
+  {D63_PRED,  INTRA_FRAME,  NONE},
+  {D117_PRED, INTRA_FRAME,  NONE},
+  {D45_PRED,  INTRA_FRAME,  NONE},
+};
+
+const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
+  {LAST_FRAME,   NONE},
+  {GOLDEN_FRAME, NONE},
+  {ALTREF_FRAME, NONE},
+  {LAST_FRAME,   ALTREF_FRAME},
+  {GOLDEN_FRAME, ALTREF_FRAME},
+  {INTRA_FRAME,  NONE},
 };
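
With the SPLITMV and I4X4 entries gone, vp9_mode_order drops to the 30
entries MAX_MODES now promises, and the per-reference mode masks shrink from
36 significant bits (the old nine-hex-digit constants) to 32, one bit per
surviving vp9_mode_order index. The new six-entry vp9_ref_order table gives
the sub-8x8 search its own iteration order, matching THR_MODES_SUB8X8 and
MAX_REFS.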
 
 // The baseline rd thresholds for breaking out of the rd loop for
@@ -160,21 +164,11 @@ static int compute_rd_mult(int qindex) {
   return (11 * q * q) >> 2;
 }
 
-static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) {
-  if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) {
-    assert(!"Invalid rd_mode");
-    return MB_MODE_COUNT;
-  }
-  assert((int)rd_mode < (int)MB_MODE_COUNT);
-  return (MB_PREDICTION_MODE)rd_mode;
-}
-
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
   cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 }
 
-
 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   int q, i, bsize;
 
@@ -186,6 +180,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   //     cpi->common.refresh_alt_ref_frame)
   qindex = clamp(qindex, 0, MAXQ);
 
+  cpi->RDDIV = 100;
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     if (cpi->twopass.next_iiratio > 31)
@@ -204,42 +199,30 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   if (q < 8)
     q = 8;
 
-  if (cpi->RDMULT > 1000) {
-    cpi->RDDIV = 1;
-    cpi->RDMULT /= 100;
-
-    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
-      for (i = 0; i < MAX_MODES; ++i) {
-        // Threshold here seem unecessarily harsh but fine given actual
-        // range of values used for cpi->sf.thresh_mult[]
-        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+  for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
+    for (i = 0; i < MAX_MODES; ++i) {
+      // Thresholds here seem unnecessarily harsh but fine given the actual
+      // range of values used for cpi->sf.thresh_mult[]
+      int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
 
-        // *4 relates to the scaling of rd_thresh_block_size_factor[]
-        if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) {
-          cpi->rd_threshes[bsize][i] =
+      if (cpi->sf.thresh_mult[i] < thresh_max) {
+        cpi->rd_threshes[bsize][i] =
             cpi->sf.thresh_mult[i] * q *
-            rd_thresh_block_size_factor[bsize] / (4 * 100);
-        } else {
-          cpi->rd_threshes[bsize][i] = INT_MAX;
-        }
+            rd_thresh_block_size_factor[bsize] / 4;
+      } else {
+        cpi->rd_threshes[bsize][i] = INT_MAX;
       }
     }
-  } else {
-    cpi->RDDIV = 100;
 
-    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
-      for (i = 0; i < MAX_MODES; i++) {
-        // Threshold here seem unecessarily harsh but fine given actual
-        // range of values used for cpi->sf.thresh_mult[]
-        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+    for (i = 0; i < MAX_REFS; ++i) {
+      int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
 
-        if (cpi->sf.thresh_mult[i] < thresh_max) {
-          cpi->rd_threshes[bsize][i] =
-            cpi->sf.thresh_mult[i] * q *
+      if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
+        cpi->rd_thresh_sub8x8[bsize][i] =
+            cpi->sf.thresh_mult_sub8x8[i] * q *
             rd_thresh_block_size_factor[bsize] / 4;
-        } else {
-          cpi->rd_threshes[bsize][i] = INT_MAX;
-        }
+      } else {
+        cpi->rd_thresh_sub8x8[bsize][i] = INT_MAX;
       }
     }
   }
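
With RDDIV pinned at 100 up front, the old RDMULT > 1000 rescaling branch
disappears and both tables use one formula: thresh_mult * q *
rd_thresh_block_size_factor[bsize] / 4, guarded against overflow via
thresh_max. (compute_rd_mult above is simply (11 * q * q) >> 2, i.e. about
2.75 * q^2, or 704 at q = 16.) A worked example with made-up numbers, since
the tuning tables are not shown in this hunk:

    #include <limits.h>

    // Hypothetical values: thresh_mult = 2500, q = 16, size factor = 4.
    int q = 16, thresh_mult = 2500, bsize_factor = 4;
    int thresh_max = INT_MAX / (q * bsize_factor);  // overflow guard
    int rd_thresh = (thresh_mult < thresh_max)
                        ? thresh_mult * q * bsize_factor / 4  // = 40000
                        : INT_MAX;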
@@ -684,6 +667,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
   const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
   const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
   const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
+  const uint8_t *band_translate;  // just for the get_scan_and_band call
 
   struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size,
                                     num_4x4_w, num_4x4_h,
@@ -695,26 +679,9 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
   vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left,
                            pd->above_context, pd->left_context,
                            num_4x4_w, num_4x4_h);
-  switch (tx_size) {
-    case TX_4X4:
-      get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0),
-                      &args.scan, &args.nb);
-      break;
-    case TX_8X8:
-      get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd),
-                      &args.scan, &args.nb);
-      break;
-    case TX_16X16:
-      get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd),
-                        &args.scan, &args.nb);
-      break;
-    case TX_32X32:
-      args.scan = vp9_default_scan_32x32;
-      args.nb = vp9_default_scan_32x32_neighbors;
-      break;
-    default:
-      assert(0);
-  }
+
+  get_scan_and_band(xd, tx_size, pd->plane_type, 0, &args.scan, &args.nb,
+                    &band_translate);
 
   foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args);
   if (args.skip) {
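
The per-tx-size switch collapses into one get_scan_and_band() call that
hands back the scan order, neighbor table and band translation together.
Inferring from this call site alone, its shape is presumably:

    static void get_scan_and_band(const MACROBLOCKD *xd, TX_SIZE tx_size,
                                  PLANE_TYPE type, int block_idx,
                                  const int16_t **scan, const int16_t **nb,
                                  const uint8_t **band_translate);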
@@ -1097,6 +1064,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
         int64_t ssz;
         const int16_t *scan;
+        const int16_t *nb;
         uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
 
@@ -1122,10 +1090,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           x->quantize_b_4x4(x, block, tx_type, 16);
         }
 
-        scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block));
+        get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block),
+                        &scan, &nb);
         ratey += cost_coeffs(x, 0, block,
-                             tempa + idx, templ + idy, TX_4X4, scan,
-                             vp9_get_coef_neighbors_handle(scan));
+                             tempa + idx, templ + idy, TX_4X4, scan, nb);
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                       16, &ssz) >> 2;
         if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -1836,20 +1804,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           // adjust src pointer for this block
           mi_buf_shift(x, i);
           if (cpi->sf.search_method == HEX) {
-            bestsme = vp9_hex_search(x, &mvp_full,
+            bestsme = vp9_hex_search(x, &mvp_full.as_mv,
                                      step_param,
                                      sadpb, 1, v_fn_ptr, 1,
-                                     bsi->ref_mv, &mode_mv[NEWMV]);
+                                     &bsi->ref_mv->as_mv,
+                                     &mode_mv[NEWMV].as_mv);
           } else if (cpi->sf.search_method == SQUARE) {
-            bestsme = vp9_square_search(x, &mvp_full,
+            bestsme = vp9_square_search(x, &mvp_full.as_mv,
                                         step_param,
                                         sadpb, 1, v_fn_ptr, 1,
-                                        bsi->ref_mv, &mode_mv[NEWMV]);
+                                        &bsi->ref_mv->as_mv,
+                                        &mode_mv[NEWMV].as_mv);
           } else if (cpi->sf.search_method == BIGDIA) {
-            bestsme = vp9_bigdia_search(x, &mvp_full,
+            bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
                                         step_param,
                                         sadpb, 1, v_fn_ptr, 1,
-                                        bsi->ref_mv, &mode_mv[NEWMV]);
+                                        &bsi->ref_mv->as_mv,
+                                        &mode_mv[NEWMV].as_mv);
           } else {
             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                              sadpb, further_steps, 0, v_fn_ptr,
@@ -2446,23 +2417,23 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
   if (cpi->sf.search_method == HEX) {
-    bestsme = vp9_hex_search(x, &mvp_full,
+    bestsme = vp9_hex_search(x, &mvp_full.as_mv,
                              step_param,
                              sadpb, 1,
                              &cpi->fn_ptr[block_size], 1,
-                             &ref_mv, tmp_mv);
+                             &ref_mv.as_mv, &tmp_mv->as_mv);
   } else if (cpi->sf.search_method == SQUARE) {
-    bestsme = vp9_square_search(x, &mvp_full,
+    bestsme = vp9_square_search(x, &mvp_full.as_mv,
                                 step_param,
                                 sadpb, 1,
                                 &cpi->fn_ptr[block_size], 1,
-                                &ref_mv, tmp_mv);
+                                &ref_mv.as_mv, &tmp_mv->as_mv);
   } else if (cpi->sf.search_method == BIGDIA) {
-    bestsme = vp9_bigdia_search(x, &mvp_full,
+    bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
                                 step_param,
                                 sadpb, 1,
                                 &cpi->fn_ptr[block_size], 1,
-                                &ref_mv, tmp_mv);
+                                &ref_mv.as_mv, &tmp_mv->as_mv);
   } else {
     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                      sadpb, further_steps, 1,
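
All three pattern searches now take bare MV pointers, so callers pass
&v.as_mv instead of the int_mv wrapper. The relationship between the two
types, essentially as vp9/common/vp9_mv.h defines them:

    typedef struct mv {
      int16_t row;
      int16_t col;
    } MV;

    typedef union int_mv {
      uint32_t as_int;  // whole-vector ops, e.g. compares against INVALID_MV
      MV as_mv;         // per-component access, now the searches' currency
    } int_mv;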
@@ -2672,7 +2643,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
-  const int is_comp_pred = (mbmi->ref_frame[1] > 0);
+  const int is_comp_pred = has_second_ref(mbmi);
   const int num_refs = is_comp_pred ? 2 : 1;
   const int this_mode = mbmi->mode;
   int_mv *frame_mv = mode_mv[this_mode];
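
has_second_ref() replaces the open-coded test; going by the expression it
displaces, the inline helper is presumably just:

    static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[1] > 0;
    }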
@@ -3136,7 +3107,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
   const struct segmentation *seg = &cm->seg;
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  RD_PREDICTION_MODE this_mode;
+  MB_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i;
@@ -3150,7 +3121,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                      cpi->gld_fb_idx,
                      cpi->alt_fb_idx};
   int64_t best_rd = best_rd_so_far;
-  int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_tx_rd[TX_MODES];
   int64_t best_tx_diff[TX_MODES];
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
@@ -3165,7 +3135,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t best_intra_rd = INT64_MAX;
   int64_t best_inter_rd = INT64_MAX;
   MB_PREDICTION_MODE best_intra_mode = DC_PRED;
-  // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
@@ -3179,23 +3148,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y_dc_delta_q);
-  int_mv seg_mvs[4][MAX_REF_FRAMES];
-  union b_mode_info best_bmodes[4];
-  PARTITION_INFO best_partition;
   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   int best_skip2 = 0;
   unsigned char best_zcoeff_blk[256] = { 0 };
 
   x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
-  vpx_memset(x->zcoeff_blk, 0, sizeof(x->zcoeff_blk));
-  vpx_memset(ctx->zcoeff_blk, 0, sizeof(ctx->zcoeff_blk));
+  vp9_zero(x->zcoeff_blk);
+  vp9_zero(ctx->zcoeff_blk);
 
-  for (i = 0; i < 4; i++) {
-    int j;
-    for (j = 0; j < MAX_REF_FRAMES; j++)
-      seg_mvs[i][j].as_int = INVALID_MV;
-  }
   // Everywhere the flag is set the error is much higher than its neighbors.
   ctx->frames_with_high_error = 0;
   ctx->modes_with_high_error = 0;
@@ -3363,25 +3324,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
             second_ref_frame != best_inter_ref_frame)
           continue;
     }
-    // TODO(jingning, jkoleszar): scaling reference frame not supported for
-    // SPLITMV.
-    if (ref_frame > 0 &&
-        vp9_is_scaled(&scale_factor[ref_frame]) &&
-        this_mode == RD_SPLITMV)
-      continue;
-
-    if (second_ref_frame > 0 &&
-        vp9_is_scaled(&scale_factor[second_ref_frame]) &&
-        this_mode == RD_SPLITMV)
-      continue;
-
-    if (bsize >= BLOCK_8X8 &&
-        (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
-      continue;
-
-    if (bsize < BLOCK_8X8 &&
-        !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
-      continue;
 
     set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
     mbmi->uv_mode = DC_PRED;
@@ -3423,7 +3365,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     // If the segment skip feature is enabled....
     // then do nothing if the current mode is not allowed..
     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) {
+               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
@@ -3435,11 +3377,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       // an unfiltered alternative. We allow near/nearest as well
       // because they may result in zero-zero MVs but be cheaper.
       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if ((this_mode != RD_ZEROMV &&
-             !(this_mode == RD_NEARMV &&
-               frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) &&
-             !(this_mode == RD_NEARESTMV &&
-               frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
+        if ((this_mode != ZEROMV &&
+             !(this_mode == NEARMV &&
+               frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
+             !(this_mode == NEARESTMV &&
+               frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
             ref_frame != ALTREF_FRAME) {
           continue;
         }
@@ -3451,7 +3393,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     // a representative block in the boundary ( first ) and then implement a
     // function that does sads when inside the border..
     if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
-        this_mode == RD_NEWMV) {
+        this_mode == NEWMV) {
       continue;
     }
 
@@ -3461,39 +3403,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     cpi->mode_test_hits[bsize]++;
 #endif
 
-    if (this_mode == RD_I4X4_PRED) {
-      int rate;
-
-      /*
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME))
-        continue;
-        */
-
-      // RD_I4X4_PRED is only considered for block sizes less than 8x8.
-      mbmi->tx_size = TX_4X4;
-      if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
-                                       &distortion_y, best_rd) >= best_rd)
-        continue;
-      rate2 += rate;
-      rate2 += intra_cost_penalty;
-      distortion2 += distortion_y;
 
-      if (rate_uv_intra[TX_4X4] == INT_MAX) {
-        choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
-                             &rate_uv_tokenonly[TX_4X4],
-                             &dist_uv[TX_4X4], &skip_uv[TX_4X4],
-                             &mode_uv[TX_4X4]);
-      }
-      rate2 += rate_uv_intra[TX_4X4];
-      rate_uv = rate_uv_tokenonly[TX_4X4];
-      distortion2 += dist_uv[TX_4X4];
-      distortion_uv = dist_uv[TX_4X4];
-      mbmi->uv_mode = mode_uv[TX_4X4];
-      tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-      for (i = 0; i < TX_MODES; ++i)
-        tx_cache[i] = tx_cache[ONLY_4X4];
-    } else if (ref_frame == INTRA_FRAME) {
+    if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       // Disable intra modes other than DC_PRED for blocks with low variance
       // Threshold for intra skipping based on source variance
@@ -3502,17 +3413,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
       };
       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
-          this_mode != RD_DC_PRED &&
+          this_mode != DC_PRED &&
           x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
         continue;
       // Only search the oblique modes if the best so far is
       // one of the neighboring directional modes
       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) {
+          (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
         if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
           continue;
       }
-      mbmi->mode = rd_mode_to_mode(this_mode);
+      mbmi->mode = this_mode;
       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
         if (conditional_skipintra(mbmi->mode, best_intra_mode))
             continue;
@@ -3538,191 +3449,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       mbmi->uv_mode = mode_uv[uv_tx];
 
       rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
-      if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED)
+      if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-    } else if (this_mode == RD_SPLITMV) {
-      const int is_comp_pred = second_ref_frame > 0;
-      int rate;
-      int64_t distortion;
-      int64_t this_rd_thresh;
-      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
-      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
-      int tmp_best_skippable = 0;
-      int switchable_filter_index;
-      int_mv *second_ref = is_comp_pred ?
-          &mbmi->ref_mvs[second_ref_frame][0] : NULL;
-      union b_mode_info tmp_best_bmodes[16];
-      MB_MODE_INFO tmp_best_mbmode;
-      PARTITION_INFO tmp_best_partition;
-      BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
-      int pred_exists = 0;
-      int uv_skippable;
-      if (is_comp_pred) {
-        if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
-          if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
-            continue;
-        if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
-          if (ref_frame != best_inter_ref_frame &&
-              second_ref_frame != best_inter_ref_frame)
-            continue;
-      }
-
-      this_rd_thresh = (ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[bsize][THR_NEWMV] :
-          cpi->rd_threshes[bsize][THR_NEWA];
-      this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
-      xd->this_mi->mbmi.tx_size = TX_4X4;
-
-      cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
-      if (cm->mcomp_filter_type != BILINEAR) {
-        tmp_best_filter = EIGHTTAP;
-        if (x->source_variance <
-            cpi->sf.disable_filter_search_var_thresh) {
-          tmp_best_filter = EIGHTTAP;
-          vp9_zero(cpi->rd_filter_cache);
-        } else {
-          for (switchable_filter_index = 0;
-               switchable_filter_index < SWITCHABLE_FILTERS;
-               ++switchable_filter_index) {
-            int newbest, rs;
-            int64_t rs_rd;
-            mbmi->interp_filter = switchable_filter_index;
-            vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-            tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
-                                                 &mbmi->ref_mvs[ref_frame][0],
-                                                 second_ref,
-                                                 best_yrd,
-                                                 &rate, &rate_y, &distortion,
-                                                 &skippable, &total_sse,
-                                                 (int)this_rd_thresh, seg_mvs,
-                                                 bsi, switchable_filter_index,
-                                                 mi_row, mi_col);
-
-            if (tmp_rd == INT64_MAX)
-              continue;
-            cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
-            rs = get_switchable_rate(x);
-            rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
-            cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
-                MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
-                    tmp_rd + rs_rd);
-            if (cm->mcomp_filter_type == SWITCHABLE)
-              tmp_rd += rs_rd;
-
-            newbest = (tmp_rd < tmp_best_rd);
-            if (newbest) {
-              tmp_best_filter = mbmi->interp_filter;
-              tmp_best_rd = tmp_rd;
-            }
-            if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
-                (mbmi->interp_filter == cm->mcomp_filter_type &&
-                 cm->mcomp_filter_type != SWITCHABLE)) {
-              tmp_best_rdu = tmp_rd;
-              tmp_best_rate = rate;
-              tmp_best_ratey = rate_y;
-              tmp_best_distortion = distortion;
-              tmp_best_sse = total_sse;
-              tmp_best_skippable = skippable;
-              tmp_best_mbmode = *mbmi;
-              tmp_best_partition = *x->partition_info;
-              for (i = 0; i < 4; i++)
-                tmp_best_bmodes[i] = xd->this_mi->bmi[i];
-              pred_exists = 1;
-              if (switchable_filter_index == 0 &&
-                  cpi->sf.use_rd_breakout &&
-                  best_rd < INT64_MAX) {
-                if (tmp_best_rdu / 2 > best_rd) {
-                  // skip searching the other filters if the first is
-                  // already substantially larger than the best so far
-                  tmp_best_filter = mbmi->interp_filter;
-                  tmp_best_rdu = INT64_MAX;
-                  break;
-                }
-              }
-            }
-          }  // switchable_filter_index loop
-        }
-      }
-
-      if (tmp_best_rdu == INT64_MAX)
-        continue;
-
-      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
-                             tmp_best_filter : cm->mcomp_filter_type);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (!pred_exists) {
-        // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
-                     &mbmi->ref_mvs[ref_frame][0],
-                     second_ref,
-                     best_yrd,
-                     &rate, &rate_y, &distortion,
-                     &skippable, &total_sse,
-                     (int)this_rd_thresh, seg_mvs,
-                     bsi, 0,
-                     mi_row, mi_col);
-        if (tmp_rd == INT64_MAX)
-          continue;
-      } else {
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = get_switchable_rate(x);
-          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        tmp_rd = tmp_best_rdu;
-        total_sse = tmp_best_sse;
-        rate = tmp_best_rate;
-        rate_y = tmp_best_ratey;
-        distortion = tmp_best_distortion;
-        skippable = tmp_best_skippable;
-        *mbmi = tmp_best_mbmode;
-        *x->partition_info = tmp_best_partition;
-        for (i = 0; i < 4; i++)
-          xd->this_mi->bmi[i] = tmp_best_bmodes[i];
-      }
-
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += get_switchable_rate(x);
-
-      if (!mode_excluded) {
-        if (is_comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-        else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-      compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
-
-      tmp_best_rdu = best_rd -
-          MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
-              RDCOST(x->rdmult, x->rddiv, 0, total_sse));
-
-      if (tmp_best_rdu > 0) {
-        // If even the 'Y' rd value of split is higher than best so far
-        // then dont bother looking at UV
-        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                        BLOCK_8X8);
-        super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
-                         &uv_sse, BLOCK_8X8, tmp_best_rdu);
-        if (rate_uv == INT_MAX)
-          continue;
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-        total_sse += uv_sse;
-
-        tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-        for (i = 0; i < TX_MODES; ++i)
-          tx_cache[i] = tx_cache[ONLY_4X4];
-      }
     } else {
-      mbmi->mode = rd_mode_to_mode(this_mode);
+      mbmi->mode = this_mode;
       compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
       this_rd = handle_inter_mode(cpi, x, bsize,
                                   tx_cache,
@@ -3758,7 +3489,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
                                                          SEG_LVL_SKIP);
 
-      if (skippable && bsize >= BLOCK_8X8) {
+      if (skippable) {
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
         // for best yrd calculation
@@ -3820,7 +3551,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         this_rd < best_inter_rd) {
       best_inter_rd = this_rd;
       best_inter_ref_frame = ref_frame;
-      // best_inter_mode = xd->this_mi->mbmi.mode;
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
@@ -3830,16 +3560,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
-    if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) {
-      // Store the respective mode distortions for later use.
-      if (mode_distortions[this_mode] == -1
-          || distortion2 < mode_distortions[this_mode]) {
-        mode_distortions[this_mode] = distortion2;
-      }
-      if (frame_distortions[ref_frame] == -1
-          || distortion2 < frame_distortions[ref_frame]) {
-        frame_distortions[ref_frame] = distortion2;
-      }
+    // Store the respective mode distortions for later use.
+    if (mode_distortions[this_mode] == -1
+        || distortion2 < mode_distortions[this_mode]) {
+      mode_distortions[this_mode] = distortion2;
+    }
+    if (frame_distortions[ref_frame] == -1
+        || distortion2 < frame_distortions[ref_frame]) {
+      frame_distortions[ref_frame] = distortion2;
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -3856,21 +3584,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         *returnrate = rate2;
         *returndistortion = distortion2;
         best_rd = this_rd;
-        best_yrd = best_rd -
-                   RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
-        best_partition = *x->partition_info;
         vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                    sizeof(best_zcoeff_blk));
 
-        if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)
-          for (i = 0; i < 4; i++)
-            best_bmodes[i] = xd->this_mi->bmi[i];
-
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
-        if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) {
+        if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+            (mode_index > MIN_EARLY_TERM_INDEX)) {
           const int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
@@ -3933,21 +3655,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     /* keep record of best txfm size */
     if (bsize < BLOCK_32X32) {
-      if (bsize < BLOCK_16X16) {
-        if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED)
-          tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
+      if (bsize < BLOCK_16X16)
         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
-      }
+
       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
     }
     if (!mode_excluded && this_rd != INT64_MAX) {
       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
         int64_t adj_rd = INT64_MAX;
-        if (this_mode != RD_I4X4_PRED) {
-          adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
-        } else {
-          adj_rd = this_rd;
-        }
+        adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
 
         if (adj_rd < best_tx_rd[i])
           best_tx_rd[i] = adj_rd;
@@ -4001,10 +3717,715 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
-    *returnrate = INT_MAX;
-    *returndistortion = INT_MAX;
-    return best_rd;
+  assert((cm->mcomp_filter_type == SWITCHABLE) ||
+         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
+         (best_mbmode.ref_frame[0] == INTRA_FRAME));
+
+  // Updating rd_thresh_freq_fact[] here means that the different
+  // partition/block sizes are handled independently based on the best
+  // choice for the current partition. It may well be better to keep a scaled
+  // best rd so far value and update rd_thresh_freq_fact based on the mode/size
+  // combination that wins out.
+  if (cpi->sf.adaptive_rd_thresh) {
+    for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+      if (mode_index == best_mode_index) {
+        cpi->rd_thresh_freq_fact[bsize][mode_index] -=
+          (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
+      } else {
+        cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
+        if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
+            (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
+          cpi->rd_thresh_freq_fact[bsize][mode_index] =
+            cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
+        }
+      }
+    }
+  }
+
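
The update rule is multiplicative-decrease, additive-increase: the winning
mode's frequency factor loses an eighth of itself each time (64 becomes 56,
lowering that mode's effective rd threshold so it is retried sooner), while
every losing mode gains RD_THRESH_INC up to the adaptive_rd_thresh *
MAX_RD_THRESH_FACT ceiling, so rarely chosen modes are pruned progressively
earlier.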
+  // macroblock modes
+  *mbmi = best_mbmode;
+  x->skip |= best_skip2;
+
+  vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], best_zcoeff_blk,
+             sizeof(best_zcoeff_blk));
+
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+      if (best_filter_rd[i] == INT64_MAX)
+        best_filter_diff[i] = 0;
+      else
+        best_filter_diff[i] = best_rd - best_filter_rd[i];
+    }
+    if (cm->mcomp_filter_type == SWITCHABLE)
+      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+  } else {
+    vp9_zero(best_filter_diff);
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < TX_MODES; i++) {
+      if (best_tx_rd[i] == INT64_MAX)
+        best_tx_diff[i] = 0;
+      else
+        best_tx_diff[i] = best_rd - best_tx_rd[i];
+    }
+  } else {
+    vp9_zero(best_tx_diff);
+  }
+
+  set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
+                    scale_factor);
+  store_coding_context(x, ctx, best_mode_index,
+                       NULL,
+                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+                       &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
+                                      mbmi->ref_frame[1]][0],
+                       best_pred_diff, best_tx_diff, best_filter_diff);
+
+  return best_rd;
+}
+
+
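
Everything below is the new sub-8x8 counterpart split out of
vp9_rd_pick_inter_mode_sb: it keeps the seg_mvs / best_bmodes /
best_partition state the full-size picker just shed, and its outer loop
walks the six vp9_ref_order reference combinations (MAX_REFS) rather than
the 30-entry mode list, with the per-segment mode search handled inside.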
+int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int mi_row, int mi_col,
+                                      int *returnrate,
+                                      int64_t *returndistortion,
+                                      BLOCK_SIZE bsize,
+                                      PICK_MODE_CONTEXT *ctx,
+                                      int64_t best_rd_so_far) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+  const struct segmentation *seg = &cm->seg;
+  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  unsigned char segment_id = mbmi->segment_id;
+  int comp_pred, i;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int idx_list[4] = {0,
+                     cpi->lst_fb_idx,
+                     cpi->gld_fb_idx,
+                     cpi->alt_fb_idx};
+  int64_t best_rd = best_rd_so_far;
+  int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
+  int64_t best_tx_rd[TX_MODES];
+  int64_t best_tx_diff[TX_MODES];
+  int64_t best_pred_diff[NB_PREDICTION_TYPES];
+  int64_t best_pred_rd[NB_PREDICTION_TYPES];
+  int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  MB_MODE_INFO best_mbmode = { 0 };
+  int mode_index, best_mode_index = 0;
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vp9_prob comp_mode_p;
+  int64_t best_inter_rd = INT64_MAX;
+  MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
+  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
+  int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
+  int64_t dist_uv[TX_SIZES];
+  int skip_uv[TX_SIZES];
+  MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
+  struct scale_factors scale_factor[4];
+  unsigned int ref_frame_mask = 0;
+  unsigned int mode_mask = 0;
+  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y_dc_delta_q);
+  int_mv seg_mvs[4][MAX_REF_FRAMES];
+  b_mode_info best_bmodes[4];
+  PARTITION_INFO best_partition;
+  int best_skip2 = 0;
+  unsigned char best_zcoeff_blk[256] = { 0 };
+
+  x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+  vp9_zero(x->zcoeff_blk);
+  vp9_zero(ctx->zcoeff_blk);
+
+  for (i = 0; i < 4; i++) {
+    int j;
+    for (j = 0; j < MAX_REF_FRAMES; j++)
+      seg_mvs[i][j].as_int = INVALID_MV;
+  }
+
+  estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < TX_MODES; i++)
+    best_tx_rd[i] = INT64_MAX;
+  for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+    best_filter_rd[i] = INT64_MAX;
+  for (i = 0; i < TX_SIZES; i++)
+    rate_uv_intra[i] = INT_MAX;
+
+  *returnrate = INT_MAX;
+
+  // If the use_avoid_tested_higherror speed feature is enabled, start with
+  // the reference frame and mode masks fully set (all bits one).
+  if (cpi->sf.use_avoid_tested_higherror) {
+    ref_frame_mask = 0;
+    mode_mask = 0;
+    ref_frame_mask = ~ref_frame_mask;
+    mode_mask = ~mode_mask;
+  }
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
+                         mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         yv12_mb, scale_factor);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
+    int mode_excluded = 0;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int compmode_cost = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int skippable = 0;
+    int64_t tx_cache[TX_MODES];
+    int i;
+    int this_skip2 = 0;
+    int64_t total_sse = INT_MAX;
+    int early_term = 0;
+
+    for (i = 0; i < TX_MODES; ++i)
+      tx_cache[i] = INT64_MAX;
+
+    x->skip = 0;
+    ref_frame = vp9_ref_order[mode_index].ref_frame;
+    second_ref_frame = vp9_ref_order[mode_index].second_ref_frame;
+
+    // Skip if the current reference frame has been masked off
+    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
+        (cpi->ref_frame_mask & (1 << ref_frame)))
+      continue;
+
+    // Test best rd so far against threshold for trying this mode.
+    if ((best_rd < ((int64_t)cpi->rd_thresh_sub8x8[bsize][mode_index] *
+                     cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
+        cpi->rd_thresh_sub8x8[bsize][mode_index] == INT_MAX)
+      continue;
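
For reference, the prune above is a scaled comparison: each (bsize, mode) pair keeps a base rd threshold plus an adaptive frequency factor, and the factor's base value of 32 is divided back out by the >> 5. A minimal sketch of the same test, with illustrative values rather than the encoder's actual state:

    #include <limits.h>
    #include <stdint.h>

    /* Returns 1 when the mode should be skipped. freq_fact starts at 32
     * (so >> 5 cancels it) and grows for modes that keep losing, widening
     * the prune window over time. */
    static int prune_mode(int64_t best_rd, int rd_thresh, int freq_fact) {
      if (rd_thresh == INT_MAX)  /* mode marked as never searched */
        return 1;
      return best_rd < ((int64_t)rd_thresh * freq_fact >> 5);
    }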
+
+    // Do not allow compound prediction if the segment level reference frame
+    // feature is in use, since in that case there can only be one reference.
+    if ((second_ref_frame > INTRA_FRAME) &&
+         vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      continue;
+
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = second_ref_frame;
+
+    if (!(ref_frame == INTRA_FRAME
+        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+      continue;
+    }
+    if (!(second_ref_frame == NONE
+        || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
+      continue;
+    }
+
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (comp_pred) {
+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
+        if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME)
+          continue;
+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
+        if (ref_frame != best_inter_ref_frame &&
+            second_ref_frame != best_inter_ref_frame)
+          continue;
+    }
+
+    // TODO(jingning, jkoleszar): scaling reference frame not supported for
+    // sub8x8 blocks.
+    if (ref_frame > 0 &&
+        vp9_is_scaled(&scale_factor[ref_frame]))
+      continue;
+
+    if (second_ref_frame > 0 &&
+        vp9_is_scaled(&scale_factor[second_ref_frame]))
+      continue;
+
+    set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
+    mbmi->uv_mode = DC_PRED;
+
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    mbmi->interp_filter = cm->mcomp_filter_type;
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+    if (comp_pred) {
+      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+        continue;
+      set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
+
+      mode_excluded = mode_excluded
+                         ? mode_excluded
+                         : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+    } else {
+      if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
+        mode_excluded =
+            mode_excluded ?
+                mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+      }
+    }
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
+
+    // If the segment reference frame feature is enabled, then do nothing
+    // if the current ref frame is not allowed.
+    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
+            (int)ref_frame) {
+      continue;
+    // If the segment skip feature is enabled, then do nothing if the
+    // current mode is not allowed.
+    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
+               ref_frame != INTRA_FRAME) {
+      continue;
+    // Disable this drop out case if the ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    } else if (!vp9_segfeature_active(seg, segment_id,
+                                      SEG_LVL_REF_FRAME)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+      // unless ARNR filtering is enabled in which case we want
+      // an unfiltered alternative. We allow near/nearest as well
+      // because they may result in zero-zero MVs but be cheaper.
+      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+        continue;
+    }
+
+#ifdef MODE_TEST_HIT_STATS
+    // TEST/DEBUG CODE
+    // Keep a record of the number of test hits at each size.
+    cpi->mode_test_hits[bsize]++;
+#endif
+
+    if (ref_frame == INTRA_FRAME) {
+      int rate;
+      mbmi->tx_size = TX_4X4;
+      if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
+                                       &distortion_y, best_rd) >= best_rd)
+        continue;
+      rate2 += rate;
+      rate2 += intra_cost_penalty;
+      distortion2 += distortion_y;
+
+      if (rate_uv_intra[TX_4X4] == INT_MAX) {
+        choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
+                             &rate_uv_tokenonly[TX_4X4],
+                             &dist_uv[TX_4X4], &skip_uv[TX_4X4],
+                             &mode_uv[TX_4X4]);
+      }
+      rate2 += rate_uv_intra[TX_4X4];
+      rate_uv = rate_uv_tokenonly[TX_4X4];
+      distortion2 += dist_uv[TX_4X4];
+      distortion_uv = dist_uv[TX_4X4];
+      mbmi->uv_mode = mode_uv[TX_4X4];
+      tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      for (i = 0; i < TX_MODES; ++i)
+        tx_cache[i] = tx_cache[ONLY_4X4];
+    } else {
+      int rate;
+      int64_t distortion;
+      int64_t this_rd_thresh;
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
+      int tmp_best_skippable = 0;
+      int switchable_filter_index;
+      int_mv *second_ref = comp_pred ?
+                             &mbmi->ref_mvs[second_ref_frame][0] : NULL;
+      b_mode_info tmp_best_bmodes[16];
+      MB_MODE_INFO tmp_best_mbmode;
+      PARTITION_INFO tmp_best_partition;
+      BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+      int pred_exists = 0;
+      int uv_skippable;
+
+      this_rd_thresh = (ref_frame == LAST_FRAME) ?
+          cpi->rd_thresh_sub8x8[bsize][THR_LAST] :
+          cpi->rd_thresh_sub8x8[bsize][THR_ALTR];
+      this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
+          cpi->rd_thresh_sub8x8[bsize][THR_GOLD] : this_rd_thresh;
+      xd->this_mi->mbmi.tx_size = TX_4X4;
+
+      cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
+      if (cm->mcomp_filter_type != BILINEAR) {
+        tmp_best_filter = EIGHTTAP;
+        if (x->source_variance <
+            cpi->sf.disable_filter_search_var_thresh) {
+          tmp_best_filter = EIGHTTAP;
+          vp9_zero(cpi->rd_filter_cache);
+        } else {
+          for (switchable_filter_index = 0;
+               switchable_filter_index < SWITCHABLE_FILTERS;
+               ++switchable_filter_index) {
+            int newbest, rs;
+            int64_t rs_rd;
+            mbmi->interp_filter = switchable_filter_index;
+            vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+            tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                                                 &mbmi->ref_mvs[ref_frame][0],
+                                                 second_ref,
+                                                 best_yrd,
+                                                 &rate, &rate_y, &distortion,
+                                                 &skippable, &total_sse,
+                                                 (int)this_rd_thresh, seg_mvs,
+                                                 bsi, switchable_filter_index,
+                                                 mi_row, mi_col);
+
+            if (tmp_rd == INT64_MAX)
+              continue;
+            cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
+            rs = get_switchable_rate(x);
+            rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+            cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
+                MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
+                    tmp_rd + rs_rd);
+            if (cm->mcomp_filter_type == SWITCHABLE)
+              tmp_rd += rs_rd;
+
+            newbest = (tmp_rd < tmp_best_rd);
+            if (newbest) {
+              tmp_best_filter = mbmi->interp_filter;
+              tmp_best_rd = tmp_rd;
+            }
+            if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
+                (mbmi->interp_filter == cm->mcomp_filter_type &&
+                 cm->mcomp_filter_type != SWITCHABLE)) {
+              tmp_best_rdu = tmp_rd;
+              tmp_best_rate = rate;
+              tmp_best_ratey = rate_y;
+              tmp_best_distortion = distortion;
+              tmp_best_sse = total_sse;
+              tmp_best_skippable = skippable;
+              tmp_best_mbmode = *mbmi;
+              tmp_best_partition = *x->partition_info;
+              for (i = 0; i < 4; i++)
+                tmp_best_bmodes[i] = xd->this_mi->bmi[i];
+              pred_exists = 1;
+              if (switchable_filter_index == 0 &&
+                  cpi->sf.use_rd_breakout &&
+                  best_rd < INT64_MAX) {
+                if (tmp_best_rdu / 2 > best_rd) {
+                  // Skip searching the other filters if the first one's rd
+                  // cost is already substantially larger than the best so far.
+                  tmp_best_filter = mbmi->interp_filter;
+                  tmp_best_rdu = INT64_MAX;
+                  break;
+                }
+              }
+            }
+          }  // switchable_filter_index loop
+        }
+      }
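
The filter loop above evaluates each switchable interpolation filter with a full segmentation search, charging the filter-signaling rate only when the frame-level filter type is SWITCHABLE. A condensed sketch of that selection logic; rd_for_filter() and signal_cost() are stand-ins for rd_pick_best_mbsegmentation() and get_switchable_rate(), not real API:

    #include <stdint.h>

    static int pick_filter(int num_filters, int filter_is_switchable,
                           int64_t (*rd_for_filter)(int f),
                           int64_t (*signal_cost)(void)) {
      int f, best_f = 0;
      int64_t best = INT64_MAX;
      for (f = 0; f < num_filters; ++f) {
        int64_t rd = rd_for_filter(f);
        if (rd == INT64_MAX)
          continue;               /* no valid segmentation found */
        if (filter_is_switchable)
          rd += signal_cost();    /* rate to code the filter choice */
        if (rd < best) {
          best = rd;
          best_f = f;
        }
      }
      return best_f;
    }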
+
+      if (tmp_best_rdu == INT64_MAX)
+        continue;
+
+      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
+                             tmp_best_filter : cm->mcomp_filter_type);
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (!pred_exists) {
+        // Handles the special case when a filter that is not in the
+        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                     &mbmi->ref_mvs[ref_frame][0],
+                     second_ref,
+                     best_yrd,
+                     &rate, &rate_y, &distortion,
+                     &skippable, &total_sse,
+                     (int)this_rd_thresh, seg_mvs,
+                     bsi, 0,
+                     mi_row, mi_col);
+        if (tmp_rd == INT64_MAX)
+          continue;
+      } else {
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = get_switchable_rate(x);
+          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        tmp_rd = tmp_best_rdu;
+        total_sse = tmp_best_sse;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        *mbmi = tmp_best_mbmode;
+        *x->partition_info = tmp_best_partition;
+        for (i = 0; i < 4; i++)
+          xd->this_mi->bmi[i] = tmp_best_bmodes[i];
+      }
+
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE)
+        rate2 += get_switchable_rate(x);
+
+      if (!mode_excluded) {
+        if (comp_pred)
+          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+        else
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+      }
+      compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
+
+      tmp_best_rdu = best_rd -
+          MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+              RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+
+      if (tmp_best_rdu > 0) {
+        // If even the 'Y' rd value of split is higher than the best so far,
+        // then don't bother looking at UV.
+        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                        BLOCK_8X8);
+        super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
+                         &uv_sse, BLOCK_8X8, tmp_best_rdu);
+        if (rate_uv == INT_MAX)
+          continue;
+        rate2 += rate_uv;
+        distortion2 += distortion_uv;
+        skippable = skippable && uv_skippable;
+        total_sse += uv_sse;
+
+        tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+        for (i = 0; i < TX_MODES; ++i)
+          tx_cache[i] = tx_cache[ONLY_4X4];
+      }
+    }
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      rate2 += compmode_cost;
+    }
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    if (second_ref_frame > INTRA_FRAME) {
+      rate2 += ref_costs_comp[ref_frame];
+    } else {
+      rate2 += ref_costs_single[ref_frame];
+    }
+
+    if (!disable_skip) {
+      // Test for the condition where the skip block will be activated
+      // because there are no non-zero coefficients, and make any necessary
+      // adjustment for rate. Ignore if skip is coded at the segment level,
+      // as the cost won't have been added in.
+      // Is MB-level skip allowed (i.e. not coded at segment level)?
+      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
+                                                         SEG_LVL_SKIP);
+
+      if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+          // Add in the cost of the no skip flag.
+          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+                                            0);
+          rate2 += prob_skip_cost;
+        } else {
+          // FIXME(rbultje) make this work for splitmv also
+          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+                                            1);
+          rate2 += prob_skip_cost;
+          distortion2 = total_sse;
+          assert(total_sse >= 0);
+          rate2 -= (rate_y + rate_uv);
+          rate_y = 0;
+          rate_uv = 0;
+          this_skip2 = 1;
+        }
+      } else if (mb_skip_allowed) {
+        // Add in the cost of the no skip flag.
+        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+                                          0);
+        rate2 += prob_skip_cost;
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
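
The skip test above is a straight rate-distortion comparison: coding the residual costs rate_y + rate_uv with distortion2, while skipping costs no coefficient rate but leaves the full prediction error (total_sse) as distortion. A self-contained sketch of that comparison, reusing the RDCOST definition from vp9_rdopt.h:

    #include <stdint.h>

    #define RDCOST(RM, DM, R, D) \
      (((128 + ((int64_t)R) * (RM)) >> 8) + ((int64_t)DM) * (D))

    /* Returns 1 when signaling "skip" is cheaper than coding coefficients. */
    static int should_skip(int rdmult, int rddiv, int coef_rate,
                           int64_t coded_dist, int64_t sse) {
      return RDCOST(rdmult, rddiv, coef_rate, coded_dist) >=
             RDCOST(rdmult, rddiv, 0, sse);
    }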
+
+    // Keep record of best inter rd with single reference
+    if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME &&
+        xd->this_mi->mbmi.ref_frame[1] == NONE &&
+        !mode_excluded &&
+        this_rd < best_inter_rd) {
+      best_inter_rd = this_rd;
+      best_inter_ref_frame = ref_frame;
+    }
+
+    if (!disable_skip && ref_frame == INTRA_FRAME) {
+      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+      for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+        best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
+    }
+
+    // Did this mode help, i.e. is it the new best mode so far?
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+        // Note index of best mode so far
+        best_mode_index = mode_index;
+
+        if (ref_frame == INTRA_FRAME) {
+          /* required for left and above block mv */
+          mbmi->mv[0].as_int = 0;
+        }
+
+        *returnrate = rate2;
+        *returndistortion = distortion2;
+        best_rd = this_rd;
+        best_yrd = best_rd -
+                   RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
+        best_mbmode = *mbmi;
+        best_skip2 = this_skip2;
+        best_partition = *x->partition_info;
+        vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+                   sizeof(best_zcoeff_blk));
+
+        for (i = 0; i < 4; i++)
+          best_bmodes[i] = xd->this_mi->bmi[i];
+
+        // TODO(debargha): enhance this test with a better distortion prediction
+        // based on qp, activity mask and history
+        if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+            (mode_index > MIN_EARLY_TERM_INDEX)) {
+          const int qstep = xd->plane[0].dequant[1];
+          // TODO(debargha): Enhance this by specializing for each mode_index
+          int scale = 4;
+          if (x->source_variance < UINT_MAX) {
+            const int var_adjust = (x->source_variance < 16);
+            scale -= var_adjust;
+          }
+          if (ref_frame > INTRA_FRAME &&
+              distortion2 * scale < qstep * qstep) {
+            early_term = 1;
+          }
+        }
+      }
+    }
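
The early-termination heuristic above stops the mode loop once an inter mode's distortion is small relative to the square of the quantizer step; scale drops from 4 to 3 for very flat (low-variance) sources, demanding a tighter fit. For example, with qstep = 40 and scale = 4, any distortion2 below 400 terminates the search. A simplified sketch (the source_variance < UINT_MAX guard is dropped here):

    #include <stdint.h>

    static int early_terminate(int64_t distortion, unsigned int src_variance,
                               int qstep, int is_inter) {
      int scale = 4;
      if (src_variance < 16)  /* very flat source */
        scale -= 1;
      return is_inter && distortion * scale < (int64_t)qstep * qstep;
    }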
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && ref_frame != INTRA_FRAME) {
+      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (second_ref_frame <= INTRA_FRAME &&
+          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+      } else if (second_ref_frame > INTRA_FRAME &&
+                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+      }
+      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+    }
+
+    /* keep record of best filter type */
+    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
+        cm->mcomp_filter_type != BILINEAR) {
+      int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
+                              SWITCHABLE_FILTERS : cm->mcomp_filter_type];
+      for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+        int64_t adj_rd;
+        // In cases of poor prediction, filter_cache[] can contain really big
+        // values, which actually are bigger than this_rd itself. This can
+        // cause negative best_filter_rd[] values, which is obviously silly.
+        // Therefore, if filter_cache < ref, we do an adjusted calculation.
+        if (cpi->rd_filter_cache[i] >= ref)
+          adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
+        else  // FIXME(rbultje) do this for comppred also
+          adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
+        best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+      }
+    }
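
The adjustment above keeps best_filter_rd[] from going negative: when a filter's cached rd sits above the reference, the absolute gap is added; when it sits below, the gap is scaled by this_rd / ref so the subtraction can never exceed this_rd. A sketch with an illustrative example in the comment:

    #include <stdint.h>

    static int64_t adjust_filter_rd(int64_t this_rd, int64_t cache,
                                    int64_t ref) {
      if (cache >= ref)
        return this_rd + cache - ref;
      /* cache < ref: e.g. this_rd = 1000, ref = 800, cache = 600 gives
       * 1000 - 200 * 1000 / 800 = 750, and the result stays >= 0 because
       * (ref - cache) <= ref. */
      return this_rd - (ref - cache) * this_rd / ref;
    }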
+
+    /* keep record of best txfm size */
+    if (bsize < BLOCK_32X32) {
+      if (bsize < BLOCK_16X16) {
+        tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
+        tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
+      }
+      tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
+    }
+    if (!mode_excluded && this_rd != INT64_MAX) {
+      for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
+        int64_t adj_rd = INT64_MAX;
+        if (ref_frame > INTRA_FRAME)
+          adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
+        else
+          adj_rd = this_rd;
+
+        if (adj_rd < best_tx_rd[i])
+          best_tx_rd[i] = adj_rd;
+      }
+    }
+
+    if (early_term)
+      break;
+
+    if (x->skip && !comp_pred)
+      break;
+  }
+
+  if (best_rd >= best_rd_so_far)
+    return INT64_MAX;
+
+  // If we used an estimate for the uv intra rd in the loop above...
+  if (cpi->sf.use_uv_intra_rd_estimate) {
+    // Do Intra UV best rd mode selection if best mode choice above was intra.
+    if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
+      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+                              &rate_uv_tokenonly[uv_tx_size],
+                              &dist_uv[uv_tx_size],
+                              &skip_uv[uv_tx_size],
+                              BLOCK_8X8);
+    }
+  }
+
+  // If we are using reference masking and the set mask flag is set, then
+  // create the reference frame mask.
+  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
+    cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame);
+
+  if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
+    *returnrate = INT_MAX;
+    *returndistortion = INT_MAX;
+    return best_rd;
   }
 
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
@@ -4017,15 +4438,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
   // combination that wins out.
   if (cpi->sf.adaptive_rd_thresh) {
-    for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+    for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
       if (mode_index == best_mode_index) {
-        cpi->rd_thresh_freq_fact[bsize][mode_index] -=
-          (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
+        cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -=
+          (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3);
       } else {
-        cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
-        if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
+        cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
+        if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
             (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
-          cpi->rd_thresh_freq_fact[bsize][mode_index] =
+          cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
             cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
         }
       }
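
For context, the update above is a decay/boost scheme on the sub8x8 frequency factors: the winning mode's factor shrinks by 1/8 of itself (making that mode easier to search next time), while every other factor grows by RD_THRESH_INC until it reaches the adaptive cap. A sketch of the same update, with the constants passed in rather than read from cpi:

    static void update_freq_facts(int *fact, int n, int best_index,
                                  int inc, int cap) {
      int i;
      for (i = 0; i < n; ++i) {
        if (i == best_index)
          fact[i] -= fact[i] >> 3;   /* decay the winner by 1/8 */
        else if ((fact[i] += inc) > cap)
          fact[i] = cap;             /* boost losers, clamped at the cap */
      }
    }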
@@ -4035,22 +4456,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   // macroblock modes
   *mbmi = best_mbmode;
   x->skip |= best_skip2;
-  if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
-      best_mbmode.sb_type < BLOCK_8X8) {
+  if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
     for (i = 0; i < 4; i++)
       xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
-  }
-
-  if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
-      best_mbmode.sb_type < BLOCK_8X8) {
+  } else {
     for (i = 0; i < 4; i++)
       xd->this_mi->bmi[i].as_mv[0].as_int =
           best_bmodes[i].as_mv[0].as_int;
 
-    if (mbmi->ref_frame[1] > 0)
+    if (has_second_ref(mbmi))
       for (i = 0; i < 4; i++)
-        xd->this_mi->bmi[i].as_mv[1].as_int =
-            best_bmodes[i].as_mv[1].as_int;
+        xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int;
 
     *x->partition_info = best_partition;
 
@@ -4078,7 +4494,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     if (cm->mcomp_filter_type == SWITCHABLE)
       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
   } else {
-    vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
+    vp9_zero(best_filter_diff);
   }
 
   if (!x->skip) {
@@ -4089,7 +4505,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         best_tx_diff[i] = best_rd - best_tx_rd[i];
     }
   } else {
-    vpx_memset(best_tx_diff, 0, sizeof(best_tx_diff));
+    vp9_zero(best_tx_diff);
   }
 
   set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
@@ -4103,3 +4519,4 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   return best_rd;
 }
+
index 9796c0d7c9ba989c2cd03f3213af29befaea8af9..c86ea27231e2a8c59ced9f2582db920cb26098e8 100644 (file)
@@ -12,7 +12,8 @@
 #ifndef VP9_ENCODER_VP9_RDOPT_H_
 #define VP9_ENCODER_VP9_RDOPT_H_
 
-#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
+#define RDCOST(RM, DM, R, D) \
+  (((128 + ((int64_t)R) * (RM)) >> 8) + ((int64_t)DM) * (D))
 #define QIDX_SKIP_THRESH     115
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
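
The reformatted RDCOST macro computes lambda * rate + distortion in fixed point: rate R is weighted by RM with 8 fractional bits (the + 128 rounds the shift), and distortion D is weighted by DM. A quick sanity check of the arithmetic:

    #include <assert.h>
    #include <stdint.h>

    #define RDCOST(RM, DM, R, D) \
      (((128 + ((int64_t)R) * (RM)) >> 8) + ((int64_t)DM) * (D))

    int main(void) {
      /* 100 * 256 / 256 = 100 rate units, plus 50 distortion units. */
      assert(RDCOST(256, 1, 100, 50) == 150);
      /* 64 * 512 = 32768; (32768 + 128) >> 8 = 128. */
      assert(RDCOST(512, 1, 64, 0) == 128);
      return 0;
    }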
@@ -28,6 +29,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int *r, int64_t *d, BLOCK_SIZE bsize,
                                   PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
+int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int mi_row, int mi_col,
+                                      int *r, int64_t *d, BLOCK_SIZE bsize,
+                                      PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
 void vp9_init_me_luts();
 
 void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
index c1555166376d1fc47772ea9096126966074f3979..a5f18e6313b7c6f9c029b7e51427f4726ddee88d 100644 (file)
@@ -42,8 +42,8 @@ void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
   }
 }
 
-const static int64_t cc1 =  26634; // (64^2*(.01*255)^2
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2
+static const int64_t cc1 =  26634;  // (64^2)*(0.01*255)^2
+static const int64_t cc2 = 239708;  // (64^2)*(0.03*255)^2
 
 static double similarity(unsigned long sum_s, unsigned long sum_r,
                          unsigned long sum_sq_s, unsigned long sum_sq_r,
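
The two constants are the usual SSIM stabilizers C1 = (K1*L)^2 and C2 = (K2*L)^2 with K1 = 0.01, K2 = 0.03, L = 255, pre-scaled by 64^2 to match the 8x8 window sums: 4096 * 2.55^2 = 26634.24 and 4096 * 7.65^2 = 239708.16, truncated to the integers above. A quick check:

    #include <stdio.h>

    int main(void) {
      const double l = 255.0;
      printf("cc1 = %.2f\n", 4096.0 * (0.01 * l) * (0.01 * l));  /* 26634.24 */
      printf("cc2 = %.2f\n", 4096.0 * (0.03 * l) * (0.03 * l));  /* 239708.16 */
      return 0;
    }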
index 0d0fd6d5f455c570648c2028d5b6930b9ca6d1c5..6ff0de4bf2262f1969025e9d84d3bf794e461c5d 100644 (file)
@@ -29,7 +29,7 @@
 #include "vpx_ports/vpx_timer.h"
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1  // dis/enable subpel in MC AltRef filtering
 
 static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             uint8_t *y_mb_ptr,
@@ -83,7 +83,6 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
 
   for (i = 0, k = 0; i < block_size; i++) {
     for (j = 0; j < block_size; j++, k++) {
-
       int src_byte = frame1[byte];
       int pixel_value = *frame2++;
 
@@ -151,13 +150,12 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
 
   /*cpi->sf.search_method == HEX*/
-  // TODO Check that the 16x16 vf & sdf are selected here
   // Ignore mv costing by sending NULL pointer instead of cost arrays
   ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0];
-  bestsme = vp9_hex_search(x, &best_ref_mv1_full,
+  bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv,
                            step_param, sadpb, 1,
                            &cpi->fn_ptr[BLOCK_16X16],
-                           0, &best_ref_mv1, ref_mv);
+                           0, &best_ref_mv1.as_mv, &ref_mv->as_mv);
 
 #if ALT_REF_SUBPEL_ENABLED
   // Try sub-pixel MC?
@@ -424,16 +422,12 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
 
 #ifdef DEBUGFWG
   // DEBUG FWG
-  printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
-, max_frames
-, num_frames_backward
-, num_frames_forward
-, frames_to_blur
-, frames_to_blur_backward
-, frames_to_blur_forward
-, cpi->source_encode_index
-, cpi->last_alt_ref_sei
-, start_frame);
+  printf(
+      "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d "
+      "start:%d",
+      max_frames, num_frames_backward, num_frames_forward, frames_to_blur,
+      frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index,
+      cpi->last_alt_ref_sei, start_frame);
 #endif
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
@@ -443,7 +437,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
       cm->width, cm->height);
 
   // Setup frame pointers, NULL indicates frame not included in filter
-  vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
+  vp9_zero(cpi->frames);
   for (frame = 0; frame < frames_to_blur; frame++) {
     int which_buffer = start_frame - frame;
     struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
index a59f6db885cca09d9e6fe57d63b636326114773d..4e095f2436d60d10ca78c60ae272af64bf8966c7 100644 (file)
@@ -38,7 +38,6 @@ static int dct_value_cost[DCT_MAX_VALUE * 2];
 const int *vp9_dct_value_cost_ptr;
 
 static void fill_value_tokens() {
-
   TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
   const vp9_extra_bit *const e = vp9_extra_bits;
 
@@ -60,9 +59,9 @@ static void fill_value_tokens() {
 
         t[i].token = --j;
         eb |= (a - e[j].base_val) << 1;
-      } else
+      } else {
         t[i].token = a;
-
+      }
       t[i].extra = eb;
     }
 
@@ -81,9 +80,7 @@ static void fill_value_tokens() {
         cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
         dct_value_cost[i + DCT_MAX_VALUE] = cost;
       }
-
     }
-
   } while (++i < DCT_MAX_VALUE);
 
   vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
@@ -137,8 +134,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
 
   pt = get_entropy_context(tx_size, pd->above_context + aoff,
                                     pd->left_context + loff);
-  get_scan_and_band(xd, tx_size, type, block, &scan, &band_translate);
-  nb = vp9_get_coef_neighbors_handle(scan);
+  get_scan_and_band(xd, tx_size, type, block, &scan, &nb, &band_translate);
   c = 0;
   do {
     const int band = get_coef_band(band_translate, c);
index 6e686d6f970e81ad098f079d05c990ee9816546d..61031e064c2ad9334c855e570661d73face5c905 100644 (file)
@@ -67,12 +67,6 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
                                                    unsigned int *sse,
                                                    const uint8_t *second_pred);
 
-typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
-                                int rp, unsigned long *sum_s,
-                                unsigned long *sum_r, unsigned long *sum_sq_s,
-                                unsigned long *sum_sq_r,
-                                unsigned long *sum_sxr);
-
 typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
 
 typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
index 155ba8a3ee48c548c8a3a204c05f433ba7e20d80..991ef4d29bff68c2df73743b327230227dd91d74 100644 (file)
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp9_rtcd.h"
 
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_subpelvar.h"
-#include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
-#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/encoder/vp9_variance.h"
+
+static void variance(const uint8_t *src_ptr,
+                     int  source_stride,
+                     const uint8_t *ref_ptr,
+                     int  recon_stride,
+                     int  w,
+                     int  h,
+                     unsigned int *sse,
+                     int *sum) {
+  int i, j;
+  int diff;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      diff = src_ptr[j] - ref_ptr[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    src_ptr += source_stride;
+    ref_ptr += recon_stride;
+  }
+}
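
variance() only accumulates the raw sum and sum of squared differences; the vp9_varianceWxH wrappers then form the population variance as sse - sum^2 / (w*h), which is what shifts like >> 6 (divide by 64 for an 8x8 block) do elsewhere in this file. A sketch of that final step for an 8x8 block:

    #include <stdint.h>

    static unsigned int variance8x8_from_sums(unsigned int sse, int sum) {
      /* sum^2 / 64, i.e. the squared mean times the pixel count */
      return sse - (unsigned int)(((int64_t)sum * sum) >> 6);
    }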
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
+ *                  uint32_t src_pixels_per_line : Stride of input block.
+ *                  uint32_t pixel_step        : Offset between filter input
+ *                                               samples (see notes).
+ *                  uint32_t output_height     : Input block height.
+ *                  uint32_t output_width      : Input block width.
+ *                  int16_t  *vp9_filter       : Array of 2 bi-linear filter
+ *                                               taps.
+ *
+ *  OUTPUTS       : uint16_t *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement first-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces uint16_t output to retain precision for next pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=
+ *                  stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
+                                              uint16_t *output_ptr,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const int16_t *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                          (int)src_ptr[pixel_step] * vp9_filter[1],
+                          FILTER_BITS);
+
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : uint16_t *src_ptr          : Pointer to source block.
+ *                  uint32_t src_pixels_per_line : Stride of input block.
+ *                  uint32_t pixel_step        : Offset between filter input
+ *                                               samples (see notes).
+ *                  uint32_t output_height     : Input block height.
+ *                  uint32_t output_width      : Input block width.
+ *                  int16_t  *vp9_filter       : Array of 2 bi-linear filter
+ *                                               taps.
+ *
+ *  OUTPUTS       : uint8_t  *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement second-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by
+ *                  var_filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=
+ *                  stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
+                                               uint8_t *output_ptr,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const int16_t *vp9_filter) {
+  unsigned int  i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                          (int)src_ptr[pixel_step] * vp9_filter[1],
+                          FILTER_BITS);
+      src_ptr++;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
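
Together the two passes form a separable 2-tap bilinear interpolator: the first pass filters horizontally (pixel_step = 1) into a 16-bit intermediate that is one row taller than the output, and the second pass filters that intermediate vertically (pixel_step = intermediate width) back down to 8 bits. A hedged driver sketch for an 8x8 block; the tap values are illustrative (any pair summing to 1 << FILTER_BITS = 128 works):

    static const int16_t kBilinearTaps[2] = { 96, 32 };  /* 1/4-pel offset */

    static void bilinear_8x8(const uint8_t *src, int src_stride,
                             uint8_t *dst) {
      uint16_t fdata[9 * 8];  /* one extra row for the vertical pass */
      var_filter_block2d_bil_first_pass(src, fdata, src_stride, 1, 9, 8,
                                        kBilinearTaps);   /* horizontal */
      var_filter_block2d_bil_second_pass(fdata, dst, 8, 8, 8, 8,
                                         kBilinearTaps);  /* vertical */
    }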
 
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
index 95ae2664390d7da5b5f5ce2dae1144ea06dab758..2d5c0b47ba2f9bb427682e5ac3f6e3e3af77a826 100644 (file)
@@ -2647,4 +2647,4 @@ void FDCT32x32_2D(int16_t *input,
       }
     }
   }
-}
+}  // NOLINT
index d1415606e431ca68bc2ebc743f699d9f627df3f3..a3d011401dd7ece354c7baf55802c9daab47ef02 100644 (file)
@@ -8,12 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr);
 extern unsigned int vp9_get8x8var_mmx
 (
   const unsigned char *src_ptr,
@@ -45,7 +45,6 @@ unsigned int vp9_variance4x4_mmx(
   vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 4));
-
 }
 
 unsigned int vp9_variance8x8_mmx(
@@ -61,7 +60,6 @@ unsigned int vp9_variance8x8_mmx(
   *sse = var;
 
   return (var - (((unsigned int)avg * avg) >> 6));
-
 }
 
 unsigned int vp9_mse16x16_mmx(
@@ -74,10 +72,14 @@ unsigned int vp9_mse16x16_mmx(
   int sum0, sum1, sum2, sum3;
 
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
 
   var = sse0 + sse1 + sse2 + sse3;
   *sse = var;
@@ -94,11 +96,14 @@ unsigned int vp9_variance16x16_mmx(
   unsigned int sse0, sse1, sse2, sse3, var;
   int sum0, sum1, sum2, sum3, avg;
 
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
 
   var = sse0 + sse1 + sse2 + sse3;
   avg = sum0 + sum1 + sum2 + sum3;
@@ -115,14 +120,15 @@ unsigned int vp9_variance16x8_mmx(
   unsigned int sse0, sse1, var;
   int sum0, sum1, avg;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
 
   var = sse0 + sse1;
   avg = sum0 + sum1;
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 7));
-
 }
 
 
@@ -135,13 +141,14 @@ unsigned int vp9_variance8x16_mmx(
   unsigned int sse0, sse1, var;
   int sum0, sum1, avg;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
 
   var = sse0 + sse1;
   avg = sum0 + sum1;
   *sse = var;
 
   return (var - (((unsigned int)avg * avg) >> 7));
-
 }
index 67784749c171896fba6182bae5b057821929463d..10fa461d83529eaf1c7a91d2dd6150c88e0aea00 100644 (file)
@@ -48,7 +48,6 @@ VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
 VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
 VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
 VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h
-VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.c
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
@@ -88,6 +87,14 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
 endif
 
+# common (c)
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
@@ -108,5 +115,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM)
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
index 08a1a845831add057ec1718e50435f9081a10336..e58debfd8158b9204809ae54e7f8b32844af6bf7 100644 (file)
@@ -8,30 +8,30 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdlib.h>
+#include <string.h>
 
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/common/vp9_onyx.h"
 #include "vp9/vp9_iface_common.h"
-#include <stdlib.h>
-#include <string.h>
 
 struct vp9_extracfg {
   struct vpx_codec_pkt_list *pkt_list;
-  int                         cpu_used;                    /** available cpu percentage in 1/16*/
-  unsigned int                enable_auto_alt_ref;           /** if encoder decides to uses alternate reference frame */
+  int                         cpu_used;  /* available cpu percentage in 1/16 */
+  unsigned int                enable_auto_alt_ref;
   unsigned int                noise_sensitivity;
   unsigned int                Sharpness;
   unsigned int                static_thresh;
   unsigned int                tile_columns;
   unsigned int                tile_rows;
-  unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
-  unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
-  unsigned int                arnr_type;        /* alt_ref filter type */
+  unsigned int                arnr_max_frames;
+  unsigned int                arnr_strength;
+  unsigned int                arnr_type;
   unsigned int                experimental;
   vp8e_tuning                 tuning;
   unsigned int                cq_level;         /* constrained quality level */
@@ -48,7 +48,7 @@ struct extraconfig_map {
 static const struct extraconfig_map extracfg_map[] = {
   {
     0,
-    {
+    { // NOLINT
       NULL,
       0,                          /* cpu_used      */
       1,                          /* enable_auto_alt_ref */
@@ -85,11 +85,11 @@ struct vpx_codec_alg_priv {
   uint32_t                pending_frame_magnitude;
   vpx_image_t             preview_img;
   vp8_postproc_cfg_t      preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list;              // changed to accomendate the maximum number of lagged frames allowed
+  vpx_codec_pkt_list_decl(64) pkt_list;
   unsigned int                fixed_kf_cntr;
 };
 
-static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
   switch (frame) {
     case VP8_LAST_FRAME:
       return VP9_LAST_FLAG;
@@ -120,26 +120,26 @@ update_error_state(vpx_codec_alg_priv_t                 *ctx,
 #define ERROR(str) do {\
     ctx->base.err_detail = str;\
     return VPX_CODEC_INVALID_PARAM;\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK(p,memb,lo,hi) do {\
-    if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+    if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
       ERROR(#memb " out of range ["#lo".."#hi"]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_HI(p,memb,hi) do {\
-    if(!((p)->memb <= (hi))) \
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+    if (!((p)->memb <= (hi))) \
       ERROR(#memb " out of range [.."#hi"]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_LO(p,memb,lo) do {\
-    if(!((p)->memb >= (lo))) \
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+    if (!((p)->memb >= (lo))) \
       ERROR(#memb " out of range ["#lo"..]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_BOOL(p,memb) do {\
-    if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
-  } while(0)
+#define RANGE_CHECK_BOOL(p, memb) do {\
+    if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+  } while (0)
 
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
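
Each RANGE_CHECK macro expands to an early return through ERROR() when a config member falls outside its bounds; the lo/hi form in RANGE_CHECK is written as two comparisons partly to sidestep tautological-compare warnings when lo is 0. A hedged usage sketch in the shape of validate_config(); the particular members and bounds here are illustrative, not the full list:

    static vpx_codec_err_t check_basics(vpx_codec_alg_priv_t *ctx,
                                        const vpx_codec_enc_cfg_t *cfg) {
      RANGE_CHECK(cfg, g_w, 1, 65535);   /* frame width in pixels */
      RANGE_CHECK(cfg, g_h, 1, 65535);   /* frame height in pixels */
      RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
      RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
      return VPX_CODEC_OK;
    }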
@@ -247,7 +247,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
   oxcf->width   = cfg.g_w;
   oxcf->height  = cfg.g_h;
   /* guess a frame rate if out of whack, use 30 */
-  oxcf->framerate             = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+  oxcf->framerate = (double)(cfg.g_timebase.den)
+                    / (double)(cfg.g_timebase.num);
 
   if (oxcf->framerate > 180) {
     oxcf->framerate = 30;
@@ -498,7 +499,7 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx,
      */
     for (i = 0;
          extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         i++);
+         i++) {}
 
     priv->vp8_cfg = extracfg_map[i].cfg;
     priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
@@ -553,7 +554,6 @@ static vpx_codec_err_t vp9e_exp_init(vpx_codec_ctx_t *ctx,
 
 
 static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
-
   free(ctx->cx_data);
   vp9_remove_compressor(&ctx->cpi);
   free(ctx);
@@ -712,8 +712,10 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t  *ctx,
     lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
     /* vp8 use 10,000,000 ticks/second as time stamp */
-    dst_time_stamp    = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+    dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num
+                     / ctx->cfg.g_timebase.den;
+    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num /
+                         ctx->cfg.g_timebase.den;
 
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
@@ -839,8 +841,6 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t  *ctx,
           cx_data += size;
           cx_data_sz -= size;
         }
-
-        // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
       }
     }
   }
@@ -867,15 +867,14 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx,
     vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
                           &sd);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
                                            int ctr_id,
                                            va_list args) {
-
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
   if (data) {
@@ -886,8 +885,9 @@ static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
     vp9_copy_reference_enc(ctx->cpi,
                            ref_frame_to_vp9_reframe(frame->frame_type), &sd);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -916,8 +916,9 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
   if (data) {
     ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 #else
   (void)ctx;
   (void)ctr_id;
@@ -928,7 +929,6 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
 
 
 static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
   YV12_BUFFER_CONFIG sd;
   vp9_ppflags_t flags = {0};
 
@@ -941,8 +941,9 @@ static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
   if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
     yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
-  } else
+  } else {
     return NULL;
+  }
 }
 
 static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
@@ -951,7 +952,6 @@ static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
   int update = va_arg(args, int);
   vp9_update_entropy(ctx->cpi, update);
   return VPX_CODEC_OK;
-
 }
 
 static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
@@ -983,8 +983,9 @@ static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx,
       return VPX_CODEC_OK;
     else
       return VPX_CODEC_INVALID_PARAM;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 
@@ -994,21 +995,20 @@ static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx,
   vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
 
   if (data) {
-
     vpx_active_map_t *map = (vpx_active_map_t *)data;
 
     if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
       return VPX_CODEC_OK;
     else
       return VPX_CODEC_INVALID_PARAM;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
-
   vpx_scaling_mode_t *data =  va_arg(args, vpx_scaling_mode_t *);
 
   if (data) {
@@ -1019,10 +1019,12 @@ static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
 
     if (!res) {
       return VPX_CODEC_OK;
-    } else
+    } else {
       return VPX_CODEC_INVALID_PARAM;
-  } else
+    }
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
@@ -1128,7 +1130,7 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
 static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
   {
     0,
-    {
+    {  // NOLINT
       0,                  /* g_usage */
       0,                  /* g_threads */
       0,                  /* g_profile */
@@ -1197,13 +1199,13 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
   vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
+  {  // NOLINT
     NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
-  {
+  {  // NOLINT
     vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
     vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
     vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
@@ -1226,13 +1228,13 @@ CODEC_INTERFACE(vpx_codec_vp9x_cx) = {
   vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
+  {  // NOLINT
     NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
-  {
+  {  // NOLINT
     vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
     vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
     vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
index 10b32385c2ced7324657fe93926733a3304e2f70..7a5b78634c91995499ecdee69eb9e413b1827c23 100644 (file)
@@ -14,7 +14,7 @@
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
 #include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
@@ -205,7 +205,6 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t         *data,
 
 static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t    *ctx,
                                   vpx_codec_stream_info_t *si) {
-
   unsigned int sz;
 
   if (si->sz >= sizeof(vp9_stream_info_t))
@@ -323,15 +322,20 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
     vp9_ppflags_t flags = {0};
 
     if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
-      flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+      flags.post_proc_flag =
 #if CONFIG_POSTPROC_VISUALIZER
-
-                             | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
-                             | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
+          ((ctx->dbg_color_ref_frame_flag != 0) ?
+              VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
+          | ((ctx->dbg_color_mb_modes_flag != 0) ?
+              VP9D_DEBUG_CLR_BLK_MODES : 0)
+          | ((ctx->dbg_color_b_modes_flag != 0) ?
+              VP9D_DEBUG_CLR_BLK_MODES : 0)
+          | ((ctx->dbg_display_mv_flag != 0) ?
+              VP9D_DEBUG_DRAW_MV : 0)
+          |
 #endif
-;
+          ctx->postproc_cfg.post_proc_flag;
+
       flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
       flags.noise_level           = ctx->postproc_cfg.noise_level;
 #if CONFIG_POSTPROC_VISUALIZER
@@ -496,8 +500,9 @@ static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t      *ctx,
         mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
 
       res = VPX_CODEC_OK;
-    } else
+    } else {
       res = VPX_CODEC_LIST_END;
+    }
   } while (!mmap->sz && res != VPX_CODEC_LIST_END);
 
   return res;
@@ -542,7 +547,6 @@ static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t         *ctx,
 static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
                                      int ctr_id,
                                      va_list args) {
-
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
   if (data) {
@@ -553,15 +557,14 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
 
     return vp9_set_reference_dec(ctx->pbi,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
                                       int ctr_id,
                                       va_list args) {
-
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
   if (data) {
@@ -572,9 +575,9 @@ static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
 
     return vp9_copy_reference_dec(ctx->pbi,
                                   (VP9_REFFRAME)frame->frame_type, &sd);
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -603,9 +606,9 @@ static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
     ctx->postproc_cfg_set = 1;
     ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 #else
   return VPX_CODEC_INCAPABLE;
 #endif
@@ -642,15 +645,15 @@ static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
     *update_info = pbi->refresh_frame_flags;
 
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 
 static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                            int ctrl_id,
                                            va_list args) {
-
   int *corrupted = va_arg(args, int *);
 
   if (corrupted) {
@@ -658,9 +661,9 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
     *corrupted = pbi->common.frame_to_show->corrupted;
 
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
@@ -699,13 +702,13 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
   ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   vp9_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
   vp9_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
+  { // NOLINT
     vp9_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
     vp9_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
     vp9_decode,       /* vpx_codec_decode_fn_t     decode; */
     vp9_get_frame,    /* vpx_codec_get_frame_fn_t  get_frame; */
   },
-  {
+  { // NOLINT
     /* encoder functions */
     NOT_IMPLEMENTED,
     NOT_IMPLEMENTED,
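
[Note] The decode_one() hunk above restructures the postproc flag expression so that the visualizer terms and their trailing `|` compile away as a unit; the old form stranded the terminating `;` on a line of its own below the #endif. A condensed, self-contained sketch of the pattern (compose_flags, dbg_a/dbg_b, FLAG_A/FLAG_B are hypothetical stand-ins):

enum { FLAG_A = 1, FLAG_B = 2 };  /* hypothetical debug flags */

static int compose_flags(int dbg_a, int dbg_b, int base) {
  return
#if CONFIG_POSTPROC_VISUALIZER
      (dbg_a ? FLAG_A : 0) |
      (dbg_b ? FLAG_B : 0) |
#endif
      base;  /* with the visualizer off, only this term survives */
}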
index be3afe835156a3ded98e3e1587c05f9de2ec86a4..3a27cdd04c3b76495a4e93d766e90c5c0bdd058d 100644 (file)
@@ -32,12 +32,7 @@ VP9_DX_SRCS-yes += decoder/vp9_thread.c
 VP9_DX_SRCS-yes += decoder/vp9_thread.h
 VP9_DX_SRCS-yes += decoder/vp9_treereader.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
-
-VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM)
index 0b4cb1b9e6d24b9d51a25e229c29d27273f4f932..ff71503284fdec6d7f66b8f1c78dda3f1226a345 100644 (file)
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
  */
 #ifndef VP8_H
 #define VP8_H
-#include "vpx_codec_impl_top.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*!\brief Control functions
  *
@@ -125,5 +128,8 @@ VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
 
 /*! @} - end defgroup vp8 */
 
-#include "vpx_codec_impl_bottom.h"
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif
index f3ea6d3a206b41eef805e828d6c0ad7c8aabb40d..92fdb004ba9899b298c4a6e4d309d0157c451a4a 100644 (file)
  */
 #ifndef VP8CX_H
 #define VP8CX_H
-#include "vpx_codec_impl_top.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*!\name Algorithm interface for VP8
  *
@@ -334,5 +337,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 VPX_CTRL_USE_TYPE(VP9E_SET_MAX_Q,      unsigned int)
 VPX_CTRL_USE_TYPE(VP9E_SET_MIN_Q,      unsigned int)
 /*! @} - end defgroup vp8_encoder */
-#include "vpx_codec_impl_bottom.h"
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif
index 7d250ccae8eb95403c1e6f52cbfdc7aa83d63dce..50a223f2ad403bdc27b7f0cf30539cc4c6a61431 100644 (file)
  */
 #ifndef VP8DX_H
 #define VP8DX_H
-#include "vpx_codec_impl_top.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*!\name Algorithm interface for VP8
  *
@@ -100,6 +103,8 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 
 /*! @} - end defgroup vp8_decoder */
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
-#include "vpx_codec_impl_bottom.h"
 #endif
index ffa123f127f5e609e0f490065f4cce77368e4b7a..3d5510f66d2a3211c8b53290fe948d4e7bd3b22c 100644 (file)
@@ -35,7 +35,5 @@ API_SRCS-yes                += src/vpx_codec.c
 API_SRCS-yes                += src/vpx_image.c
 API_SRCS-yes                += vpx_codec.h
 API_SRCS-yes                += vpx_codec.mk
-API_SRCS-yes                += vpx_codec_impl_bottom.h
-API_SRCS-yes                += vpx_codec_impl_top.h
 API_SRCS-yes                += vpx_image.h
 API_SRCS-$(BUILD_LIBVPX)    += vpx_integer.h
diff --git a/vpx/vpx_codec_impl_bottom.h b/vpx/vpx_codec_impl_bottom.h
deleted file mode 100644 (file)
index 6eb79a8..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
- * This file is to be included at the bottom of the header files defining the
- * interface to individual codecs and contains matching blocks to those defined
- * in vpx_codec_impl_top.h
- */
-#ifdef __cplusplus
-}
-#endif
diff --git a/vpx/vpx_codec_impl_top.h b/vpx/vpx_codec_impl_top.h
deleted file mode 100644 (file)
index c9b8cfa..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
- * This file is to be included at the top of the header files defining the
- * interface to individual codecs and contains various workarounds common
- * to all codec implementations.
- */
-#ifdef __cplusplus
-extern "C" {
-#endif
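
[Note] With vpx_codec_impl_top.h and vpx_codec_impl_bottom.h deleted, each public header now carries its own C++ linkage guards inline, as the vp8.h, vp8cx.h, and vp8dx.h hunks above show. The resulting header skeleton (FOO_H is a placeholder guard name):

#ifndef FOO_H
#define FOO_H

#ifdef __cplusplus
extern "C" {
#endif

/* declarations usable from both C and C++ */

#ifdef __cplusplus
}  /* extern "C" */
#endif

#endif  /* FOO_H */

Inlining the guards keeps every header self-contained: a consumer can no longer end up with mismatched linkage by picking up the top half of the old pair without the bottom.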
diff --git a/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/vpx_scale/mips/dspr2/yv12extend_dspr2.c
new file mode 100644 (file)
index 0000000..2c5cd1a
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+
+#if HAVE_DSPR2
+static void extend_plane(uint8_t *const src, int src_stride,
+                         int width, int height,
+                         int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  int       i, j;
+  uint8_t   *left_src, *right_src;
+  uint8_t   *left_dst_start, *right_dst_start;
+  uint8_t   *left_dst, *right_dst;
+  uint8_t   *top_src, *bot_src;
+  uint8_t   *top_dst, *bot_dst;
+  uint32_t  left_pix;
+  uint32_t  right_pix;
+  uint32_t  linesize;
+
+  /* copy the left and right most columns out */
+  left_src  = src;
+  right_src = src + width - 1;
+  left_dst_start = src - extend_left;
+  right_dst_start = src + width;
+
+  for (i = height; i--; ) {
+    left_dst  = left_dst_start;
+    right_dst = right_dst_start;
+
+    __asm__ __volatile__ (
+        "lb        %[left_pix],     0(%[left_src])      \n\t"
+        "lb        %[right_pix],    0(%[right_src])     \n\t"
+        "replv.qb  %[left_pix],     %[left_pix]         \n\t"
+        "replv.qb  %[right_pix],    %[right_pix]        \n\t"
+
+        : [left_pix] "=&r" (left_pix), [right_pix] "=&r" (right_pix)
+        : [left_src] "r" (left_src), [right_src] "r" (right_src)
+    );
+
+    for (j = extend_left/4; j--; ) {
+      __asm__ __volatile__ (
+        "sw     %[left_pix],    0(%[left_dst])     \n\t"
+        "sw     %[right_pix],   0(%[right_dst])    \n\t"
+
+        :
+        : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix),
+          [right_dst] "r" (right_dst), [right_pix] "r" (right_pix)
+      );
+
+      left_dst += 4;
+      right_dst += 4;
+    }
+
+    for (j = extend_left%4; j--; ) {
+      __asm__ __volatile__ (
+        "sb     %[left_pix],    0(%[left_dst])     \n\t"
+        "sb     %[right_pix],   0(%[right_dst])     \n\t"
+
+        :
+        : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix),
+          [right_dst] "r" (right_dst), [right_pix] "r" (right_pix)
+      );
+
+      left_dst += 1;
+      right_dst += 1;
+    }
+
+    left_src  += src_stride;
+    right_src += src_stride;
+    left_dst_start += src_stride;
+    right_dst_start += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  top_src = src - extend_left;
+  bot_src = src + src_stride * (height - 1) - extend_left;
+  top_dst = src + src_stride * (-extend_top) - extend_left;
+  bot_dst = src + src_stride * (height) - extend_left;
+  linesize = extend_left + extend_right + width;
+
+  for (i = 0; i < extend_top; i++) {
+    vpx_memcpy(top_dst, top_src, linesize);
+    top_dst += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    vpx_memcpy(bot_dst, bot_src, linesize);
+    bot_dst += src_stride;
+  }
+}
+
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf,
+                         int subsampling_x, int subsampling_y,
+                         int ext_size) {
+  const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x;
+  const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y;
+  const int c_et = ext_size >> subsampling_y;
+  const int c_el = ext_size >> subsampling_x;
+  const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height +
+                    subsampling_y) >> subsampling_y;
+  const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width +
+                    subsampling_x) >> subsampling_x;
+
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ext_size, ext_size,
+               ext_size + ybf->y_height - ybf->y_crop_height,
+               ext_size + ybf->y_width - ybf->y_crop_width);
+
+  extend_plane(ybf->u_buffer, ybf->uv_stride,
+               c_w, c_h, c_et, c_el, c_eb, c_er);
+
+  extend_plane(ybf->v_buffer, ybf->uv_stride,
+               c_w, c_h, c_et, c_el, c_eb, c_er);
+}
+
+void vp9_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                    int subsampling_x, int subsampling_y) {
+  extend_frame(ybf, subsampling_x, subsampling_y, ybf->border);
+}
+
+void vp9_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                          int subsampling_x,
+                                          int subsampling_y) {
+  const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ?
+                       VP9INNERBORDERINPIXELS : ybf->border;
+  extend_frame(ybf, subsampling_x, subsampling_y, inner_bw);
+}
+#endif
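
[Note] In extend_plane() above, replv.qb broadcasts the edge pixel into all four byte lanes of a register so that each sw stores four border pixels at once, and the sb loop mops up the extend_left % 4 remainder. A portable C sketch of the same word-fill idea (fill_row is a hypothetical helper, not part of the patch):

#include <stdint.h>
#include <string.h>

static void fill_row(uint8_t *dst, uint8_t pixel, int count) {
  const uint32_t pix4 = pixel * 0x01010101u;  /* replv.qb: byte -> 4 lanes */
  int j;
  for (j = count / 4; j--; dst += 4)
    memcpy(dst, &pix4, 4);                    /* sw: four pixels per store */
  for (j = count % 4; j--; )
    *dst++ = pixel;                           /* sb: leftover pixels */
}

The top and bottom borders need no such trick: fully extended rows already exist, so the code replicates them line by line with vpx_memcpy.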
index 76c11e79215c072a0976e7c8cbabcceba880ae6d..50d3e9d8ea7007b0ad86f0c2c3e42cd2b0c2e9a9 100644 (file)
@@ -16,6 +16,9 @@ SCALE_SRCS-$(HAVE_NEON)  += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
 SCALE_SRCS-$(HAVE_NEON)  += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
 SCALE_SRCS-$(HAVE_NEON)  += arm/neon/yv12extend_arm.c
 
+#mips(dspr2)
+SCALE_SRCS-$(HAVE_DSPR2)  += mips/dspr2/yv12extend_dspr2.c
+
 SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
 
 $(eval $(call asm_offsets_template,\
index ea7b0e2e8b99af97b4d390d51859e80eac11f571..a5faf1148e1daea6ef9ead6b386b77b877c6babc 100644 (file)
@@ -27,8 +27,8 @@ specialize vpx_yv12_copy_y neon
 
 if [ "$CONFIG_VP9" = "yes" ]; then
     prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
-    specialize vp9_extend_frame_borders
+    specialize vp9_extend_frame_borders dspr2
 
     prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
-    specialize vp9_extend_frame_inner_borders_c
+    specialize vp9_extend_frame_inner_borders dspr2
 fi