uint8_t *u_mb_ptr,
uint8_t *v_mb_ptr,
int stride,
- int uv_block_size,
+ int uv_block_width,
+ int uv_block_height,
int mv_row,
int mv_col,
uint8_t *pred,
enum mv_precision mv_precision_uv;
int uv_stride;
- if (uv_block_size == 8) {
+ if (uv_block_width == 8) {
uv_stride = (stride + 1) >> 1;
mv_precision_uv = MV_PRECISION_Q4;
} else {
kernel, MV_PRECISION_Q3, x, y);
vp9_build_inter_predictor(u_mb_ptr, uv_stride,
- &pred[256], uv_block_size,
+ &pred[256], uv_block_width,
&mv,
scale,
- uv_block_size, uv_block_size,
+ uv_block_width, uv_block_height,
which_mv,
kernel, mv_precision_uv, x, y);
vp9_build_inter_predictor(v_mb_ptr, uv_stride,
- &pred[512], uv_block_size,
+ &pred[512], uv_block_width,
&mv,
scale,
- uv_block_size, uv_block_size,
+ uv_block_width, uv_block_height,
which_mv,
kernel, mv_precision_uv, x, y);
}
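/* Reviewer note: uv_block_width == 8 stands in for "chroma is subsampled
 * horizontally" (4:2:0 or 4:2:2). In that case the chroma plane stride is
 * half the luma stride, rounded up, and chroma motion vectors gain one
 * fractional bit (Q4 instead of Q3), since one luma pixel spans half a
 * chroma pixel. A minimal sketch of the selection above, factored into a
 * hypothetical helper purely for illustration; the else branch follows the
 * elided upstream code: */
static void uv_stride_and_precision(int stride, int uv_block_width,
                                    int *uv_stride,
                                    enum mv_precision *precision) {
  if (uv_block_width == 8) {
    *uv_stride = (stride + 1) >> 1;  /* half-width chroma plane */
    *precision = MV_PRECISION_Q4;    /* 1/16 pel in chroma units */
  } else {
    *uv_stride = stride;             /* 4:4:4: full-width chroma */
    *precision = MV_PRECISION_Q3;    /* 1/8 pel, same as luma */
  }
}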
void vp9_temporal_filter_apply_c(uint8_t *frame1,
unsigned int stride,
uint8_t *frame2,
- unsigned int block_size,
+ unsigned int block_width,
+ unsigned int block_height,
int strength,
int filter_weight,
unsigned int *accumulator,
int byte = 0;
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
- for (i = 0, k = 0; i < block_size; i++) {
- for (j = 0; j < block_size; j++, k++) {
+ for (i = 0, k = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++, k++) {
int src_byte = frame1[byte];
int pixel_value = *frame2++;
byte++;
}
- byte += stride - block_size;
+ byte += stride - block_width;
}
}
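/* For context, a sketch of the per-pixel body elided above (unchanged by
 * this patch, following the upstream C kernel): the weight decays with the
 * squared source/predictor difference, so similar pixels keep the full
 * filter_weight and outliers are attenuated. */
static void filter_pixel(int src_byte, int pixel_value, int strength,
                         int rounding, int filter_weight,
                         unsigned int *accumulator, unsigned short *count) {
  int diff = src_byte - pixel_value;
  int modifier = diff * diff * 3;    /* scaled squared difference */
  modifier += rounding;              /* rounding == 1 << (strength - 1) */
  modifier >>= strength;
  if (modifier > 16) modifier = 16;  /* clamp... */
  modifier = 16 - modifier;          /* ...and invert: 16 == full weight */
  modifier *= filter_weight;
  *count += modifier;
  *accumulator += modifier * pixel_value;
}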
uint8_t *dst1, *dst2;
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
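/* Splitting mb_uv_width out of mb_uv_height is what makes non-square
 * chroma representable. For a 16x16 luma macroblock:
 *   subsampling_x = 1, subsampling_y = 1  ->  8x8    (4:2:0)
 *   subsampling_x = 1, subsampling_y = 0  ->  8x16   (4:2:2)
 *   subsampling_x = 0, subsampling_y = 0  ->  16x16  (4:4:4) */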
// Save input state
uint8_t* input_buffer[MAX_MB_PLANE];
int i;
- // TODO(aconverse): Add 4:2:2 support
- assert(mbd->plane[1].subsampling_x == mbd->plane[1].subsampling_y);
-
for (i = 0; i < MAX_MB_PLANE; i++)
input_buffer[i] = mbd->plane[i].pre[0].buf;
cpi->frames[frame]->u_buffer + mb_uv_offset,
cpi->frames[frame]->v_buffer + mb_uv_offset,
cpi->frames[frame]->y_stride,
- mb_uv_height,
+ mb_uv_width, mb_uv_height,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
predictor, scale,
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
- predictor, 16, strength, filter_weight,
+ predictor, 16, 16,
+ strength, filter_weight,
accumulator, count);
-
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
- predictor + 256, mb_uv_height, strength,
+ predictor + 256,
+ mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 256,
count + 256);
-
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
- predictor + 512, mb_uv_height, strength,
+ predictor + 512,
+ mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 512,
count + 512);
}
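/* The 256/512 offsets come from the fixed 16*16*3 predictor layout
 * declared above: Y in [0, 256), U in [256, 512), V in [512, 768).
 * Subsampled chroma simply leaves the tail of each 256-byte slot unused
 * (8x8 uses 64 bytes, 8x16 uses 128), so the slot offsets need not change
 * when the block shape does. */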
stride = cpi->alt_ref_buffer.uv_stride;
byte = mb_uv_offset;
for (i = 0, k = 256; i < mb_uv_height; i++) {
- for (j = 0; j < mb_uv_height; j++, k++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
int m = k + 256;
// U
// move to next pixel
byte++;
}
- byte += stride - mb_uv_height;
+ byte += stride - mb_uv_width;
}
mb_y_offset += 16;
- mb_uv_offset += mb_uv_height;
+ mb_uv_offset += mb_uv_width;
}
mb_y_offset += 16 * (f->y_stride - mb_cols);
- mb_uv_offset += mb_uv_height * (f->uv_stride - mb_cols);
+ mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
}
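/* The reworked end-of-row advance is the subtle part of this hunk. A
 * worked check, assuming 4:2:2 and a 1280-wide frame (mb_cols = 80,
 * mb_uv_width = 8, mb_uv_height = 16, uv_stride = 640):
 *   advance inside the row:  80 MBs * 8 bytes    =   640
 *   end-of-row step:         16 * 640 - 8 * 80   =  9600
 *   total per MB row:        640 + 9600 = 10240  =  16 rows * 640
 * The old form, mb_uv_height * (uv_stride - mb_cols), rewinds the row by
 * mb_uv_height * mb_cols instead of mb_uv_width * mb_cols and so lands
 * (16 - 8) * 80 = 640 bytes short here; it only holds when the chroma
 * block is square. */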
// Restore input state
; (unsigned char *frame1, | 0
; unsigned int stride, | 1
; unsigned char *frame2, | 2
-; unsigned int block_size, | 3
-; int strength, | 4
-; int filter_weight, | 5
-; unsigned int *accumulator, | 6
-; unsigned short *count) | 7
+; unsigned int block_width, | 3
+; unsigned int block_height, | 4
+; int strength, | 5
+; int filter_weight, | 6
+; unsigned int *accumulator, | 7
+; unsigned short *count) | 8
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
sym(vp9_temporal_filter_apply_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
+ SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
ALIGN_STACK 16, rax
- %define block_size 0
- %define strength 16
- %define filter_weight 32
- %define rounding_bit 48
- %define rbp_backup 64
- %define stack_size 80
+ %define block_width 0
+ %define block_height 16
+ %define strength 32
+ %define filter_weight 48
+ %define rounding_bit 64
+ %define rbp_backup 80
+ %define stack_size 96
sub rsp, stack_size
mov [rsp + rbp_backup], rbp
; end prolog
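; Reviewer note: every local slot moved down 16 bytes to make room for
; block_height, hence SHADOW_ARGS_TO_STACK 9 for the ninth argument. Slots
; stay 16 bytes apart because the splatted filter_weight, strength and the
; rounding constant are stored and reloaded with aligned movdqa;
; block_width and block_height only occupy the first qword of their slots.
; Layout after this patch:
;   [rsp +  0] block_width   [rsp + 16] block_height
;   [rsp + 32] strength      [rsp + 48] filter_weight
;   [rsp + 64] rounding_bit  [rsp + 80] rbp backup
; for a total of 96 bytes, matching stack_size.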
mov rdx, arg(3)
- mov [rsp + block_size], rdx
- movd xmm6, arg(4)
+ mov [rsp + block_width], rdx
+ mov rdx, arg(4)
+ mov [rsp + block_height], rdx
+ movd xmm6, arg(5)
movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
; calculate the rounding bit outside the loop
; 0x8000 >> (16 - strength)
mov rdx, 16
- sub rdx, arg(4) ; 16 - strength
+ sub rdx, arg(5) ; 16 - strength
movq xmm4, rdx ; can't use rdx w/ shift
movdqa xmm5, [GLOBAL(_const_top_bit)]
psrlw xmm5, xmm4
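; sanity check: 0x8000 >> (16 - strength) == 1 << (strength - 1) for
; strength in [1, 15], and psrlw with a count of 16 or more clears the
; lanes, matching the C fallback `strength > 0 ? 1 << (strength - 1) : 0`.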
mov rsi, arg(0) ; src/frame1
mov rdx, arg(2) ; predictor frame
- mov rdi, arg(6) ; accumulator
- mov rax, arg(7) ; count
+ mov rdi, arg(7) ; accumulator
+ mov rax, arg(8) ; count
; dup the filter weight and store for later
- movd xmm0, arg(5) ; filter_weight
+ movd xmm0, arg(6) ; filter_weight
pshuflw xmm0, xmm0, 0
punpcklwd xmm0, xmm0
movdqa [rsp + filter_weight], xmm0
mov rbp, arg(1) ; stride
pxor xmm7, xmm7 ; zero for extraction
- lea rcx, [rdx + 16*16*1]
- cmp dword ptr [rsp + block_size], 8
+ mov rcx, [rsp + block_width]
+ imul rcx, [rsp + block_height]
+ add rcx, rdx
+ cmp dword ptr [rsp + block_width], 8
jne .temporal_filter_apply_load_16
- lea rcx, [rdx + 8*8*1]
.temporal_filter_apply_load_8:
movq xmm0, [rsi] ; first row
cmp rdx, rcx
je .temporal_filter_apply_epilog
pxor xmm7, xmm7 ; zero for extraction
- cmp dword ptr [rsp + block_size], 16
+ cmp dword ptr [rsp + block_width], 16
je .temporal_filter_apply_load_16
jmp .temporal_filter_apply_load_8
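; Reviewer note: the loop bound is now rcx = predictor + width * height
; instead of the hardcoded square lea (rdx + 16*16 or rdx + 8*8), so an
; 8x16 block (4:2:2 chroma) terminates correctly. The bottom-of-loop
; dispatch keys on block_width alone, sending 8-wide rows back through
; .temporal_filter_apply_load_8 until rdx reaches rcx.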