granicus.if.org Git - libvpx/commitdiff
Add palette coding mode for inter frames
author: hui su <huisu@google.com>
Wed, 25 Feb 2015 18:00:40 +0000 (10:00 -0800)
committer: hui su <huisu@google.com>
Mon, 23 Mar 2015 15:41:51 +0000 (08:41 -0700)
on screen_content
--enable-palette                                    +6.74%

on derflr
with all other experiments                          +6.02%
(--enable-supertx --enable-copy-mode
 --enable-ext-tx --enable-filterintra
 --enable-tx64x64 --enable-tx-skip
 --enable-interintra --enable-wedge-partition
 --enable-compound-modes --enable-new-quant
 --enable-palette)

Change-Id: Ib85049b4c3fcf52bf95efbc9d6aecf53d53ca1a3

vp9/common/vp9_blockd.h
vp9/common/vp9_palette.c
vp9/common/vp9_palette.h
vp9/decoder/vp9_decodeframe.c
vp9/decoder/vp9_decodemv.c
vp9/decoder/vp9_decoder.c
vp9/encoder/vp9_bitstream.c
vp9/encoder/vp9_block.h
vp9/encoder/vp9_encodeframe.c
vp9/encoder/vp9_rdopt.c

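diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 0f647ceeaf283d18813c94c0fcff14996bd6bb09..463998e36424c8269de9791cbc55c3055543486d 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h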
@@ -113,7 +113,7 @@ typedef enum {
 typedef enum {
   H_SCAN,
   V_SCAN,
-  SPIN_SCAN,
+  SPIRAL_SCAN,
   ZZ_SCAN,
   PALETTE_SCAN_ORDERS
 } PALETTE_SCAN_ORDER;
@@ -365,6 +365,8 @@ typedef struct macroblockd {
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
 #if CONFIG_PALETTE
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
+  DECLARE_ALIGNED(16, int, palette_scan_buffer[64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, palette_map_buffer[64 * 64]);
 #endif
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
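diff --git a/vp9/common/vp9_palette.c b/vp9/common/vp9_palette.c
index e303619ab3e6bda250a9f4cea29446126e437486..a70e86218fbfe9298909f75f3638b13c33aa0662 100644
--- a/vp9/common/vp9_palette.c
+++ b/vp9/common/vp9_palette.c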
@@ -41,7 +41,7 @@ void insertion_sort(double *data, int n) {
 int count_colors(const uint8_t *src, int stride, int rows, int cols) {
   int n = 0, r, c, i, val_count[256];
   uint8_t val;
-  memset(val_count, 0, sizeof(val_count));
+  vpx_memset(val_count, 0, sizeof(val_count));
 
   for (r = 0; r < rows; r++) {
       for (c = 0; c < cols; c++) {
@@ -85,7 +85,7 @@ int run_lengh_decoding(uint16_t *runs, int l, uint8_t *seq) {
   int i, j = 0;
 
   for (i = 0; i < l; i += 2) {
-    memset(seq + j, runs[i], runs[i + 1]);
+    vpx_memset(seq + j, runs[i], runs[i + 1]);
     j += runs[i + 1];
   }
 
@@ -94,14 +94,10 @@ int run_lengh_decoding(uint16_t *runs, int l, uint8_t *seq) {
 
 void transpose_block(uint8_t *seq_in, uint8_t *seq_out, int rows, int cols) {
   int r, c;
-  uint8_t seq_dup[4096];
-  memcpy(seq_dup, seq_in, rows * cols);
 
-  for (r = 0; r < cols; r++) {
-    for (c = 0; c < rows; c++) {
-      seq_out[r * rows + c] = seq_dup[c * cols + r];
-    }
-  }
+  for (r = 0; r < cols; r++)
+    for (c = 0; c < rows; c++)
+      seq_out[r * rows + c] = seq_in[c * cols + r];
 }
 
 void palette_color_insertion(uint8_t *old_colors, int *m, int *count,
@@ -230,8 +226,8 @@ void calc_centroids(double *data, double *centroids, int *indices,
   int i, j, index;
   int count[256];
   unsigned int seed = time(NULL);
-  memset(count, 0, sizeof(count[0]) * k);
-  memset(centroids, 0, sizeof(centroids[0]) * k * dim);
+  vpx_memset(count, 0, sizeof(count[0]) * k);
+  vpx_memset(centroids, 0, sizeof(centroids[0]) * k * dim);
 
   for (i = 0; i < n; i++) {
     index = indices[i];
@@ -243,8 +239,8 @@ void calc_centroids(double *data, double *centroids, int *indices,
 
   for (i = 0; i < k; i++) {
     if (!count[i])
-      memcpy(centroids + i * dim, data + (rand_r(&seed) % n) * dim,
-             sizeof(centroids[0]) * dim);
+      vpx_memcpy(centroids + i * dim, data + (rand_r(&seed) % n) * dim,
+                 sizeof(centroids[0]) * dim);
     else
       for (j = 0; j < dim; j++)
         centroids[i * dim + j] /= count[i];
@@ -267,32 +263,35 @@ double calc_total_dist(double *data, double *centroids, int *indices,
 int k_means(double *data, double *centroids, int *indices,
              int n, int k, int dim, int max_itr) {
   int i = 0;
-  int pre_indices[4096];
+  int *pre_indices;
   double pre_total_dist, cur_total_dist;
   double pre_centroids[256];
 
+  pre_indices = vpx_memalign(16, n * sizeof(indices[0]));
   calc_indices(data, centroids, indices, n, k, dim);
   pre_total_dist = calc_total_dist(data, centroids, indices, n, k, dim);
-  memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
-  memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+  vpx_memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
+  vpx_memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
   while (i < max_itr) {
     calc_centroids(data, centroids, indices, n, k, dim);
     calc_indices(data, centroids, indices, n, k, dim);
     cur_total_dist = calc_total_dist(data, centroids, indices, n, k, dim);
 
     if (cur_total_dist > pre_total_dist && 0) {
-      memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
-      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+      vpx_memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
+      vpx_memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
       break;
     }
     if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim))
       break;
 
-    memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
-    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+    vpx_memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
+    vpx_memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
     pre_total_dist = cur_total_dist;
     i++;
   }
+
+  vpx_free(pre_indices);
   return i;
 }
 
@@ -305,7 +304,7 @@ int is_in_boundary(int rows, int cols, int r, int c) {
 void zz_scan_order(int *order, int rows, int cols) {
   int r, c, dir, idx;
 
-  memset(order, 0, sizeof(order[0]) * rows * cols);
+  vpx_memset(order, 0, sizeof(order[0]) * rows * cols);
   r = 0;
   c = 0;
   dir = 1;
@@ -339,7 +338,7 @@ void zz_scan_order(int *order, int rows, int cols) {
   order[idx] = (rows - 1) * cols + cols - 1;
 }
 
-void spin_order(int *order, int cols, int r_start, int c_start,
+void spiral_order(int *order, int cols, int r_start, int c_start,
                 int h, int w, int idx) {
   int r, c;
 
@@ -367,10 +366,62 @@ void spin_order(int *order, int cols, int r_start, int c_start,
   for (c = 0; c < w; c++)
     order[idx++] = r_start * cols + w - c + c_start;
 
-  spin_order(order, cols, r_start + 1, c_start + 1, h - 2, w - 2, idx);
+  spiral_order(order, cols, r_start + 1, c_start + 1, h - 2, w - 2, idx);
 }
 
-void spin_scan_order(int *order, int rows, int cols) {
-  spin_order(order, cols, 0, 0, rows - 1, cols - 1, 0);
+void spiral_scan_order(int *order, int rows, int cols) {
+  spiral_order(order, cols, 0, 0, rows - 1, cols - 1, 0);
+}
+
+void palette_scan(uint8_t *color_index_map, uint8_t *sequence,
+                  int rows, int cols, PALETTE_SCAN_ORDER ps, int *scan_order) {
+  int i;
+
+  switch (ps) {
+    case H_SCAN:
+      vpx_memcpy(sequence, color_index_map, rows * cols);
+      break;
+    case V_SCAN:
+      transpose_block(color_index_map, sequence, rows, cols);
+      break;
+    case SPIRAL_SCAN:
+      spiral_scan_order(scan_order, rows, cols);
+      for (i = 0; i < rows * cols; i++)
+        sequence[i] = color_index_map[scan_order[i]];
+      break;
+    case ZZ_SCAN:
+      zz_scan_order(scan_order, rows, cols);
+      for (i = 0; i < rows * cols; i++)
+        sequence[i] = color_index_map[scan_order[i]];
+      break;
+    default:
+      break;
+  }
+}
+
+void palette_iscan(uint8_t *color_index_map, uint8_t *sequence,
+                   int rows, int cols, PALETTE_SCAN_ORDER ps, int *scan_order) {
+  int i;
+
+  switch (ps) {
+    case H_SCAN:
+      vpx_memcpy(color_index_map, sequence, rows * cols);
+      break;
+    case V_SCAN:
+      transpose_block(sequence, color_index_map, cols, rows);
+      break;
+    case SPIRAL_SCAN:
+      spiral_scan_order(scan_order, rows, cols);
+      for (i = 0; i < rows * cols; i++)
+        color_index_map[scan_order[i]] = sequence[i];
+      break;
+    case ZZ_SCAN:
+      zz_scan_order(scan_order, rows, cols);
+      for (i = 0; i < rows * cols; i++)
+        color_index_map[scan_order[i]] = sequence[i];
+      break;
+    default:
+      break;
+  }
 }
 #endif
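diff --git a/vp9/common/vp9_palette.h b/vp9/common/vp9_palette.h
index 40464b1c8908f070afa62e9929f62ce8397a299e..5b00ff29df6b320444c955a789894b5524f22e6a 100644
--- a/vp9/common/vp9_palette.h
+++ b/vp9/common/vp9_palette.h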
@@ -29,7 +29,11 @@ int k_means(double *data, double *centroids, int *indices,
 void calc_indices(double *data, double *centroids, int *indices,
                   int n, int k, int dim);
 void zz_scan_order(int *order, int rows, int cols);
-void spin_scan_order(int *order, int rows, int cols);
+void spiral_scan_order(int *order, int rows, int cols);
+void palette_scan(uint8_t *color_index_map, uint8_t *sequence,
+                  int rows, int cols, PALETTE_SCAN_ORDER ps, int *scan_order);
+void palette_iscan(uint8_t *color_index_map, uint8_t *sequence,
+                   int rows, int cols, PALETTE_SCAN_ORDER ps, int *scan_order);
 #endif
 
 #endif  // VP9_COMMON_VP9_PALETTE_H_
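diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 715f7025f2e0698296dd0e6fe8b2bff61fe4e8b8..fa9afebadbf62414bb96d3ad5793667654aa8299 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c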
@@ -271,11 +271,11 @@ static void vp9_intra_dpcm_add_nocoeff(uint8_t *dst, int stride,
   switch (mode) {
     case H_PRED:
       for (r = 0; r < bs; r++)
-        memset(dst + r * stride + 1, dst[r * stride], bs - 1);
+        vpx_memset(dst + r * stride + 1, dst[r * stride], bs - 1);
       break;
     case V_PRED:
       for (r = 1; r < bs; r++)
-        memcpy(dst + r * stride, dst, bs * sizeof(*dst));
+        vpx_memcpy(dst + r * stride, dst, bs * sizeof(*dst));
       break;
     case TM_PRED:
       for (r = 1; r < bs; r++)
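diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 7c9e39953c585231c8cc7509c2c54ba86fa11932..c5cf2ab4559e53b6828a807dedb4b06df35962f1 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c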
@@ -235,8 +235,6 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
     int i, m1, m2, d, val;
     int rows = 4 * num_4x4_blocks_high_lookup[bsize];
     int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-    int scan_order[4096];
-    uint8_t map[4096];
     PALETTE_RUN_LENGTH bits;
 
     mbmi->mode = DC_PRED;
@@ -279,8 +277,8 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
               s * vp9_read_literal(r, mbmi->palette_delta_bitdepth);
         }
       } else {
-        memset(mbmi->palette_color_delta, 0,
-               m1 * sizeof(mbmi->palette_color_delta[0]));
+        vpx_memset(mbmi->palette_color_delta, 0,
+                   m1 * sizeof(mbmi->palette_color_delta[0]));
       }
       for (i = 0; i < m1; i++) {
         val = cm->current_palette_colors[mbmi->palette_indexed_colors[i]];
@@ -314,27 +312,11 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
                             &cm ->current_palette_size,
                             cm->current_palette_count, mbmi);
 
-    run_lengh_decoding(mbmi->palette_runs, mbmi->palette_run_length[0], map);
-    switch (mbmi->palette_scan_order[0]) {
-      case H_SCAN:
-        memcpy(xd->plane[0].color_index_map, map, rows * cols);
-        break;
-      case V_SCAN:
-        transpose_block(map, xd->plane[0].color_index_map, cols, rows);
-        break;
-      case SPIN_SCAN:
-        spin_scan_order(scan_order, rows, cols);
-        for (i = 0; i < rows * cols; i++)
-          xd->plane[0].color_index_map[scan_order[i]] = map[i];
-        break;
-      case ZZ_SCAN:
-        zz_scan_order(scan_order, rows, cols);
-        for (i = 0; i < rows * cols; i++)
-          xd->plane[0].color_index_map[scan_order[i]] = map[i];
-        break;
-      default:
-        break;
-    }
+    run_lengh_decoding(mbmi->palette_runs, mbmi->palette_run_length[0],
+                       xd->palette_map_buffer);
+    palette_iscan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                  rows, cols, mbmi->palette_scan_order[0],
+                  xd->palette_scan_buffer);
     mbmi->tx_size = MIN(max_txsize_lookup[bsize],
                         tx_mode_to_biggest_tx_size[cm->tx_mode]);
   }
@@ -345,8 +327,6 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
         xd->plane[1].subsampling_y;
     int cols = 4 * num_4x4_blocks_wide_lookup[bsize] >>
         xd->plane[1].subsampling_x;
-    int scan_order[4096];
-    uint8_t map[4096];
     PALETTE_RUN_LENGTH bits;
     BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
 
@@ -387,27 +367,9 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
       }
 
       run_lengh_decoding(mbmi->palette_runs + PALETTE_MAX_RUNS,
-                         mbmi->palette_run_length[1], map);
-      switch (mbmi->palette_scan_order[1]) {
-        case H_SCAN:
-          memcpy(xd->plane[1].color_index_map, map, rows * cols);
-          break;
-        case V_SCAN:
-          transpose_block(map, xd->plane[1].color_index_map, cols, rows);
-          break;
-        case SPIN_SCAN:
-          spin_scan_order(scan_order, rows, cols);
-          for (i = 0; i < rows * cols; i++)
-            xd->plane[1].color_index_map[scan_order[i]] = map[i];
-          break;
-        case ZZ_SCAN:
-          zz_scan_order(scan_order, rows, cols);
-          for (i = 0; i < rows * cols; i++)
-            xd->plane[1].color_index_map[scan_order[i]] = map[i];
-          break;
-        default:
-          break;
-      }
+                         mbmi->palette_run_length[1], xd->palette_map_buffer);
+      palette_iscan(xd->plane[1].color_index_map, xd->palette_map_buffer, rows,
+                    cols, mbmi->palette_scan_order[1], xd->palette_scan_buffer);
     }
   }
 
@@ -757,36 +719,67 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
 #endif
       break;
     default:
+#if CONFIG_PALETTE
+      if (!mbmi->palette_enabled[0]) {
+        mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
+      } else {
+        mbmi->mode = DC_PRED;
+        if (!cm->frame_parallel_decoding_mode)
+            ++cm->counts.y_mode[size_group_lookup[bsize]][DC_PRED];
+      }
+#else
       mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
+#endif  // CONFIG_PALETTE
 #if CONFIG_FILTERINTRA
-      if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size)) {
+      if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size)
+#if CONFIG_PALETTE
+          && !mbmi->palette_enabled[0]
+#endif  // CONFIG_PALETTE
+      ) {
         mbmi->filterbit = vp9_read(r,
             cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode]);
         cm->counts.filterintra[mbmi->tx_size][mbmi->mode][mbmi->filterbit]++;
       } else {
         mbmi->filterbit = 0;
+#if CONFIG_PALETTE
+        if (mbmi->palette_enabled[0])
+          cm->counts.filterintra[mbmi->tx_size][mbmi->mode][mbmi->filterbit]++;
+#endif  // CONFIG_PALETTE
       }
-#endif
+#endif  // CONFIG_FILTERINTRA
   }
 
+#if CONFIG_PALETTE
+  if (!mbmi->palette_enabled[1]) {
+    mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
+  } else {
+    mbmi->uv_mode = DC_PRED;
+    if (!cm->frame_parallel_decoding_mode)
+      ++cm->counts.uv_mode[mbmi->mode][DC_PRED];
+  }
+#else
   mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
+#endif  // CONFIG_PALETTE
 #if CONFIG_FILTERINTRA
   if (is_filter_allowed(mbmi->uv_mode) &&
-      is_filter_enabled(get_uv_tx_size(mbmi, &xd->plane[1]))) {
+      is_filter_enabled(get_uv_tx_size(mbmi, &xd->plane[1]))
+#if CONFIG_PALETTE
+      && !mbmi->palette_enabled[1]
+#endif  // CONFIG_PALETTE
+  ) {
     mbmi->uv_filterbit = vp9_read(r,
         cm->fc.filterintra_prob[get_uv_tx_size(mbmi, &xd->plane[1])][mbmi->uv_mode]);
     cm->counts.filterintra[get_uv_tx_size(mbmi, &xd->plane[1])]
                            [mbmi->uv_mode][mbmi->uv_filterbit]++;
   } else {
     mbmi->uv_filterbit = 0;
-  }
-#endif  // CONFIG_FILTERINTRA
-
 #if CONFIG_PALETTE
-  for (i = 0; i < 2; ++i) {
-    mbmi->palette_enabled[i] = 0;
-  }
+    if (mbmi->palette_enabled[1])
+      cm->counts.filterintra[get_uv_tx_size(mbmi, &xd->plane[1])]
+                             [mbmi->uv_mode][mbmi->uv_filterbit]++;
 #endif  // CONFIG_PALETTE
+  }
+#endif  // CONFIG_FILTERINTRA
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -1254,8 +1247,124 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
     if (mbmi->copy_mode == NOREF)
 #endif
       inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+#if CONFIG_PALETTE
+    mbmi->palette_enabled[0] = 0;
+    mbmi->palette_enabled[1] = 0;
+
+    if (!inter_block && mbmi->sb_type >= BLOCK_8X8 && cm->allow_palette_mode) {
+      mbmi->palette_enabled[0] = vp9_read_bit(r);
+      mbmi->palette_enabled[1] = vp9_read_bit(r);
+    }
+
+    if (mbmi->palette_enabled[0]) {
+      BLOCK_SIZE bsize = mbmi->sb_type;
+      int i, d;
+      int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+      int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+      PALETTE_RUN_LENGTH bits;
+
+      mbmi->mode = DC_PRED;
+      mbmi->palette_size[0] =
+          vp9_read_tree(r, vp9_palette_size_tree,
+                        cm->fc.palette_size_prob[bsize - BLOCK_8X8]);
+      mbmi->palette_size[0] += 2;
+      mbmi->palette_run_length[0] =
+          vp9_read_literal(r, get_bit_depth(palette_max_run(bsize)));
+      mbmi->palette_run_length[0] = (mbmi->palette_run_length[0]) << 1;
+      mbmi->palette_scan_order[0] = vp9_read_literal(r, 2);
+
+      for (i = 0; i < mbmi->palette_size[0]; i++) {
+        mbmi->palette_colors[i] = vp9_read_literal(r, 8);
+      }
+
+      d = get_bit_depth(rows * cols);
+      for (i = 0; i < mbmi->palette_run_length[0]; i += 2) {
+        mbmi->palette_runs[i] =
+            vp9_read_literal(r, get_bit_depth(mbmi->palette_size[0]));
+
+        bits = vp9_read_tree(r, vp9_palette_run_length_tree,
+                             cm->fc.palette_run_length_prob[bsize - BLOCK_8X8]);
+        if (bits == MAX_BITS)
+          mbmi->palette_runs[i + 1] = vp9_read_literal(r, d);
+        else
+          mbmi->palette_runs[i + 1] = vp9_read_literal(r, bits - ONE_BITS + 1);
+        mbmi->palette_runs[i + 1] += 1;
+      }
+
+      run_lengh_decoding(mbmi->palette_runs, mbmi->palette_run_length[0],
+                         xd->palette_map_buffer);
+      palette_iscan(xd->plane[0].color_index_map, xd->palette_map_buffer, rows,
+                    cols, mbmi->palette_scan_order[0], xd->palette_scan_buffer);
+      mbmi->tx_size = MIN(max_txsize_lookup[bsize],
+                          tx_mode_to_biggest_tx_size[cm->tx_mode]);
+      if (!cm->frame_parallel_decoding_mode)
+        ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
+                        &cm->counts.tx)[mbmi->tx_size];
+    }
+
+    if (mbmi->palette_enabled[1]) {
+      int i, d;
+      BLOCK_SIZE bsize = mbmi->sb_type;
+      int rows = 4 * num_4x4_blocks_high_lookup[bsize] >>
+          xd->plane[1].subsampling_y;
+      int cols = 4 * num_4x4_blocks_wide_lookup[bsize] >>
+          xd->plane[1].subsampling_y;
+      PALETTE_RUN_LENGTH bits;
+      BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
+
+      mbmi->uv_mode = DC_PRED;
+      if (xd->plane[1].subsampling_x && xd->plane[1].subsampling_y) {
+        mbmi->palette_size[1] =
+            vp9_read_tree(r, vp9_palette_size_tree,
+                          cm->fc.palette_uv_size_prob[uv_bsize - BLOCK_4X4]);
+        mbmi->palette_size[1] += 2;
+        mbmi->palette_run_length[1] =
+            vp9_read_literal(r, get_bit_depth(palette_max_run(uv_bsize)));
+        mbmi->palette_run_length[1] = (mbmi->palette_run_length[1]) << 1;
+        mbmi->palette_scan_order[1] = vp9_read_literal(r, 2);
+      } else {
+        mbmi->palette_size[1] = mbmi->palette_size[0];
+      }
+
+      for (i = 0; i < mbmi->palette_size[1]; i++)
+        mbmi->palette_colors[PALETTE_MAX_SIZE + i] = vp9_read_literal(r, 8);
+      for (i = 0; i < mbmi->palette_size[1]; i++)
+        mbmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = vp9_read_literal(r, 8);
+
+      if (xd->plane[1].subsampling_x && xd->plane[1].subsampling_y) {
+        d = get_bit_depth(rows * cols);
+        for (i = 0; i < mbmi->palette_run_length[1]; i += 2) {
+          mbmi->palette_runs[PALETTE_MAX_RUNS + i] =
+              vp9_read_literal(r, get_bit_depth(mbmi->palette_size[1]));
+          bits = vp9_read_tree(r, vp9_palette_run_length_tree,
+                               cm->fc.palette_uv_run_length_prob[uv_bsize -
+                                                                 BLOCK_4X4]);
+          if (bits == MAX_BITS)
+            mbmi->palette_runs[PALETTE_MAX_RUNS + i + 1] =
+                vp9_read_literal(r, d);
+          else
+            mbmi->palette_runs[PALETTE_MAX_RUNS + i + 1] =
+                vp9_read_literal(r, bits - ONE_BITS + 1);
+          mbmi->palette_runs[ PALETTE_MAX_RUNS + i + 1] += 1;
+        }
+
+        run_lengh_decoding(mbmi->palette_runs + PALETTE_MAX_RUNS,
+                           mbmi->palette_run_length[1], xd->palette_map_buffer);
+        palette_iscan(xd->plane[1].color_index_map, xd->palette_map_buffer,
+                      rows, cols, mbmi->palette_scan_order[1],
+                      xd->palette_scan_buffer);
+      }
+    }
+
+    if (!mbmi->palette_enabled[0]) {
+      mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
+                                   !mbmi->skip || !inter_block, r);
+    }
+#else
     mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
-                                 !mbmi->skip || !inter_block, r);
+                                     !mbmi->skip || !inter_block, r);
+#endif
+
 #if CONFIG_EXT_TX
     if (inter_block &&
         mbmi->tx_size <= TX_16X16 &&
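diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 930babbdead4bb9aec16f1709deb2384635beb46..5f4391efb9fde793a4d0b5338f7408b11d73992a 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c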
@@ -266,8 +266,8 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
 #if CONFIG_PALETTE
   if (frame_is_intra_only(cm)) {
     cm->current_palette_size = 0;
-    memset(cm->current_palette_count, 0,
-           PALETTE_BUF_SIZE * sizeof(cm->current_palette_count[0]));
+    vpx_memset(cm->current_palette_count, 0,
+               PALETTE_BUF_SIZE * sizeof(cm->current_palette_count[0]));
   }
 #endif
 
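diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 4d0152a9153882b07b70d3af89039ef9d042c988..3bd125162aa5bbccae4b857d0ceb9b6561534c84 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c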
@@ -462,10 +462,92 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
   }
 #endif
 
+#if CONFIG_PALETTE
+  if (!is_inter && bsize >= BLOCK_8X8 && cm->allow_palette_mode) {
+    int l, n, i, d, bits;
+
+    vp9_write_bit(w, mbmi->palette_enabled[0]);
+    vp9_write_bit(w, mbmi->palette_enabled[1]);
+
+    if (mbmi->palette_enabled[0]) {
+      int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+      int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+      n = mbmi->palette_size[0];
+      l = mbmi->palette_run_length[0];
+
+      vp9_write_token(w, vp9_palette_size_tree,
+                      cm->fc.palette_size_prob[bsize - BLOCK_8X8],
+                      &palette_size_encodings[n - 2]);
+      vp9_write_literal(w, (l >> 1),
+                        get_bit_depth(palette_max_run(bsize)));
+      vp9_write_literal(w, mbmi->palette_scan_order[0], 2);
+
+      for (i = 0; i < n; i++)
+        vp9_write_literal(w, mbmi->palette_colors[i], 8);
+
+      d = get_bit_depth(rows * cols);
+      for (i = 0; i < l; i += 2) {
+        vp9_write_literal(w, mbmi->palette_runs[i],
+                          get_bit_depth(n));
+        bits = get_bit_depth(mbmi->palette_runs[i + 1]);
+        vp9_write_token(w, vp9_palette_run_length_tree,
+                        cm->fc.palette_run_length_prob[bsize - BLOCK_8X8],
+                        &palette_run_length_encodings[bits > 6 ?
+                            6 : bits - 1]);
+        vp9_write_literal(w, mbmi->palette_runs[i + 1] - 1,
+                          bits > 6 ? d : bits);
+      }
+    }
+
+    if (mbmi->palette_enabled[1]) {
+      const uint16_t *runs = mbmi->palette_runs + PALETTE_MAX_RUNS;
+      int rows = 4 * num_4x4_blocks_high_lookup[bsize] >>
+          xd->plane[1].subsampling_y;
+      int cols = 4 * num_4x4_blocks_wide_lookup[bsize] >>
+          xd->plane[1].subsampling_x;
+      BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
+      n = mbmi->palette_size[1];
+      l = mbmi->palette_run_length[1];
+
+      if (xd->plane[1].subsampling_x && xd->plane[1].subsampling_y) {
+        vp9_write_token(w, vp9_palette_size_tree,
+                        cm->fc.palette_uv_size_prob[uv_bsize - BLOCK_4X4],
+                        &palette_size_encodings[n - 2]);
+        vp9_write_literal(w, (l >> 1),
+                          get_bit_depth(palette_max_run(uv_bsize)));
+        vp9_write_literal(w, mbmi->palette_scan_order[1], 2);
+      }
+
+      for (i = 0; i < n; i++)
+        vp9_write_literal(w, mbmi->palette_colors[PALETTE_MAX_SIZE + i], 8);
+      for (i = 0; i < n; i++)
+        vp9_write_literal(w, mbmi->palette_colors[2 * PALETTE_MAX_SIZE + i], 8);
+
+      if (xd->plane[1].subsampling_x && xd->plane[1].subsampling_y) {
+        d = get_bit_depth(rows * cols);
+        for (i = 0; i < l; i += 2) {
+          vp9_write_literal(w, runs[i],
+                            get_bit_depth(mbmi->palette_size[1]));
+          bits = get_bit_depth(runs[i + 1]);
+          vp9_write_token(w, vp9_palette_run_length_tree,
+                          cm->fc.palette_uv_run_length_prob[uv_bsize -
+                                                            BLOCK_4X4],
+                                      &palette_run_length_encodings[bits > 6 ?
+                                                                6 : bits - 1]);
+          vp9_write_literal(w, runs[i + 1] - 1,
+                            bits > 6 ? d : bits);
+        }
+      }
+    }
+  }
+#endif
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
 #if CONFIG_SUPERTX
       !supertx_enabled &&
-#endif
+#endif  // CONFIG_SUPERTX
+#if CONFIG_PALETTE
+      !mbmi->palette_enabled[0] &&
+#endif  // CONFIG_PALETTE
       !(is_inter &&
         (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
     write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w);
@@ -527,13 +609,22 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
 
   if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
+#if CONFIG_PALETTE
+      if (!mbmi->palette_enabled[0])
+        write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+#else
       write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+#endif  // CONFIG_PALETTE
 #if CONFIG_FILTERINTRA
-      if (is_filter_allowed(mode) && is_filter_enabled(mbmi->tx_size)) {
+      if (is_filter_allowed(mode) && is_filter_enabled(mbmi->tx_size)
+#if CONFIG_PALETTE
+          && !mbmi->palette_enabled[0]
+#endif  // CONFIG_PALETTE
+      ) {
         vp9_write(w, mbmi->filterbit,
                   cm->fc.filterintra_prob[mbmi->tx_size][mode]);
       }
-#endif
+#endif  // CONFIG_FILTERINTRA
     } else {
       int idx, idy;
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -547,29 +638,38 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
             vp9_write(w, mi->b_filter_info[idy * 2 + idx],
                       cm->fc.filterintra_prob[0][b_mode]);
           }
-#endif
+#endif  // CONFIG_FILTERINTRA
         }
       }
     }
+#if CONFIG_PALETTE
+    if (!mbmi->palette_enabled[1])
+      write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
+#else
     write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
+#endif  // CONFIG_PALETTE
 #if CONFIG_FILTERINTRA
     if (is_filter_allowed(mbmi->uv_mode) &&
-        is_filter_enabled(get_uv_tx_size(mbmi, &xd->plane[1]))) {
+        is_filter_enabled(get_uv_tx_size(mbmi, &xd->plane[1]))
+#if CONFIG_PALETTE
+        && !mbmi->palette_enabled[1]
+#endif  // CONFIG_PALETTE
+    ) {
       vp9_write(w, mbmi->uv_filterbit,
                 cm->fc.filterintra_prob[get_uv_tx_size(mbmi, &xd->plane[1])][mbmi->uv_mode]);
     }
-#endif
+#endif  // CONFIG_FILTERINTRA
 #if CONFIG_COPY_MODE
   } else if (mbmi->copy_mode == NOREF) {
 #else
   } else {
-#endif
+#endif  // CONFIG_COPY_MODE
     const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
     const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
 #if CONFIG_COMPOUND_MODES
     const vp9_prob *const inter_compound_probs =
         cm->fc.inter_compound_mode_probs[mode_ctx];
-#endif
+#endif  // CONFIG_COMPOUND_MODES
     write_ref_frames(cm, xd, w);
 
     // If segment skip is not enabled code the mode.
@@ -602,7 +702,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
         is_inter_mode(mode) &&
 #if CONFIG_SUPERTX
         !supertx_enabled &&
-#endif
+#endif  // CONFIG_SUPERTX
         mbmi->ref_frame[1] <= INTRA_FRAME) {
       vp9_write(w, mbmi->ref_frame[1] == INTRA_FRAME,
                 cm->fc.interintra_prob[bsize]);
@@ -645,7 +745,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
           if (b_mode == NEWMV || b_mode == NEW_NEWMV) {
 #else
           if (b_mode == NEWMV) {
-#endif
+#endif  // CONFIG_COMPOUND_MODES
             for (ref = 0; ref < 1 + is_compound; ++ref) {
               vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
 #if CONFIG_NEWMVREF_SUB8X8
@@ -682,7 +782,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
       if (mode == NEWMV || mode == NEW_NEWMV) {
 #else
       if (mode == NEWMV) {
-#endif
+#endif  // CONFIG_COMPOUND_MODES
         for (ref = 0; ref < 1 + is_compound; ++ref)
           vp9_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
                         &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
@@ -698,7 +798,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc,
                       allow_hp);
       }
-#endif
+#endif  // CONFIG_COMPOUND_MODES
     }
 #if CONFIG_WEDGE_PARTITION
     if (cm->reference_mode != SINGLE_REFERENCE &&
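diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index f931cab568abe29de6ca3e87b62a8e88063acd14..b7189a210a39c5302e72f18f571efa451cb79678 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h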
@@ -124,7 +124,11 @@ struct macroblock {
 #if CONFIG_VP9_HIGHBITDEPTH
   void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, int bd);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_PALETTE
+  DECLARE_ALIGNED(16, double, kmeans_data_buffer[MAX_MB_PLANE * 64 * 64]);
+  DECLARE_ALIGNED(16, int, kmeans_indices_buffer[64 * 64]);
+#endif  // CONFIG_PALETTE
 };
 
 #ifdef __cplusplus
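diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 1eac7f2d9ddc465b89203b13626fc67448d8b51c..61ca13528250734567638a3edbb512a3fafaf8d5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c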
@@ -369,10 +369,10 @@ static void set_offsets_extend(VP9_COMP *cpi, const TileInfo *const tile,
 #if CONFIG_PALETTE
 void copy_palette_info(PICK_MODE_CONTEXT *c, PICK_MODE_CONTEXT *p) {
   c->palette_buf_size = p->palette_buf_size;
-  memcpy(c->palette_colors_buf, p->palette_colors_buf,
-         c->palette_buf_size * sizeof(c->palette_colors_buf[0]));
-  memcpy(c->palette_count_buf, p->palette_count_buf,
-         c->palette_buf_size * sizeof(c->palette_count_buf[0]));
+  vpx_memcpy(c->palette_colors_buf, p->palette_colors_buf,
+             c->palette_buf_size * sizeof(c->palette_colors_buf[0]));
+  vpx_memcpy(c->palette_count_buf, p->palette_count_buf,
+             c->palette_buf_size * sizeof(c->palette_count_buf[0]));
 }
 #endif
 
@@ -1363,19 +1363,23 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
     uint8_t palette[PALETTE_BUF_SIZE];
     int count[PALETTE_BUF_SIZE];
 
-    memcpy(palette, cpi->common.current_palette_colors, n * sizeof(palette[0]));
-    memcpy(count, cpi->common.current_palette_count, n * sizeof(count[0]));
+    vpx_memcpy(palette, cpi->common.current_palette_colors,
+               n * sizeof(palette[0]));
+    vpx_memcpy(count, cpi->common.current_palette_count,
+               n * sizeof(count[0]));
     cpi->common.current_palette_size = ctx->palette_buf_size;
-    memcpy(cpi->common.current_palette_colors, ctx->palette_colors_buf,
-           ctx->palette_buf_size * sizeof(ctx->palette_colors_buf[0]));
-    memcpy(cpi->common.current_palette_count, ctx->palette_count_buf,
-           ctx->palette_buf_size * sizeof(ctx->palette_count_buf[0]));
+    vpx_memcpy(cpi->common.current_palette_colors, ctx->palette_colors_buf,
+               ctx->palette_buf_size * sizeof(ctx->palette_colors_buf[0]));
+    vpx_memcpy(cpi->common.current_palette_count, ctx->palette_count_buf,
+               ctx->palette_buf_size * sizeof(ctx->palette_count_buf[0]));
 #endif
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
 #if CONFIG_PALETTE
     cpi->common.current_palette_size = n;
-    memcpy(cpi->common.current_palette_colors, palette, n * sizeof(palette[0]));
-    memcpy(cpi->common.current_palette_count, count, n * sizeof(count[0]));
+    vpx_memcpy(cpi->common.current_palette_colors,
+               palette, n * sizeof(palette[0]));
+    vpx_memcpy(cpi->common.current_palette_count,
+               count, n * sizeof(count[0]));
 #endif
 #if CONFIG_SUPERTX
     *totalrate_nocoef = 0;
@@ -2831,18 +2835,18 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   if (bsize == BLOCK_64X64) {
     c = &pc_tree->current;
     c->palette_buf_size = cm->current_palette_size;
-    memcpy(c->palette_colors_buf, cm->current_palette_colors,
-           c->palette_buf_size * sizeof(cm->current_palette_colors[0]));
-    memcpy(c->palette_count_buf, cm->current_palette_count,
-           c->palette_buf_size * sizeof(cm->current_palette_count[0]));
+    vpx_memcpy(c->palette_colors_buf, cm->current_palette_colors,
+               c->palette_buf_size * sizeof(cm->current_palette_colors[0]));
+    vpx_memcpy(c->palette_count_buf, cm->current_palette_count,
+               c->palette_buf_size * sizeof(cm->current_palette_count[0]));
   }
 
   c = &pc_tree->current;
   previous_size = c->palette_buf_size;
-  memcpy(previous_colors, c->palette_colors_buf,
-         previous_size * sizeof(previous_colors[0]));
-  memcpy(previous_count, c->palette_count_buf,
-         previous_size * sizeof(previous_count[0]));
+  vpx_memcpy(previous_colors, c->palette_colors_buf,
+             previous_size * sizeof(previous_colors[0]));
+  vpx_memcpy(previous_count, c->palette_count_buf,
+             previous_size * sizeof(previous_count[0]));
 
   c = &pc_tree->none;
   p = &pc_tree->current;
@@ -3141,10 +3145,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         c = &pc_tree->split[i]->current;
         if (last < 0) {
           c->palette_buf_size = previous_size;
-          memcpy(c->palette_colors_buf, previous_colors,
-                 previous_size * sizeof(previous_colors[0]));
-          memcpy(c->palette_count_buf, previous_count,
-                 previous_size * sizeof(previous_count[0]));
+          vpx_memcpy(c->palette_colors_buf, previous_colors,
+                     previous_size * sizeof(previous_colors[0]));
+          vpx_memcpy(c->palette_count_buf, previous_count,
+                     previous_size * sizeof(previous_count[0]));
         } else {
           p = &pc_tree->split[last]->current;
           copy_palette_info(c, p);
@@ -3250,10 +3254,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         } else {
           c = &pc_tree->current;
           c->palette_buf_size = previous_size;
-          memcpy(c->palette_colors_buf, previous_colors,
-                 previous_size * sizeof(previous_colors[0]));
-          memcpy(c->palette_count_buf, previous_count,
-                 previous_size * sizeof(previous_count[0]));
+          vpx_memcpy(c->palette_colors_buf, previous_colors,
+                     previous_size * sizeof(previous_colors[0]));
+          vpx_memcpy(c->palette_count_buf, previous_count,
+                     previous_size * sizeof(previous_count[0]));
         }
 #endif
       }
@@ -3282,10 +3286,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 #if CONFIG_PALETTE
     c = &pc_tree->horizontal[0];
     c->palette_buf_size = previous_size;
-    memcpy(c->palette_colors_buf, previous_colors,
-           previous_size * sizeof(previous_colors[0]));
-    memcpy(c->palette_count_buf, previous_count,
-           previous_size * sizeof(previous_count[0]));
+    vpx_memcpy(c->palette_colors_buf, previous_colors,
+               previous_size * sizeof(previous_colors[0]));
+    vpx_memcpy(c->palette_count_buf, previous_count,
+               previous_size * sizeof(previous_count[0]));
     last = 0;
 #endif
 
@@ -3410,10 +3414,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         } else {
           c = &pc_tree->current;
           c->palette_buf_size = previous_size;
-          memcpy(c->palette_colors_buf, previous_colors,
-                 previous_size * sizeof(previous_colors[0]));
-          memcpy(c->palette_count_buf, previous_count,
-                 previous_size * sizeof(previous_count[0]));
+          vpx_memcpy(c->palette_colors_buf, previous_colors,
+                     previous_size * sizeof(previous_colors[0]));
+          vpx_memcpy(c->palette_count_buf, previous_count,
+                     previous_size * sizeof(previous_count[0]));
         }
 #endif
       }
@@ -3437,10 +3441,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 #if CONFIG_PALETTE
     c = &pc_tree->vertical[0];
     c->palette_buf_size = previous_size;
-    memcpy(c->palette_colors_buf, previous_colors,
-           previous_size * sizeof(previous_colors[0]));
-    memcpy(c->palette_count_buf, previous_count,
-           previous_size * sizeof(previous_count[0]));
+    vpx_memcpy(c->palette_colors_buf, previous_colors,
+               previous_size * sizeof(previous_colors[0]));
+    vpx_memcpy(c->palette_count_buf, previous_count,
+               previous_size * sizeof(previous_count[0]));
     last = 0;
 #endif
 
@@ -3562,10 +3566,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         } else {
           c = &pc_tree->current;
           c->palette_buf_size = previous_size;
-          memcpy(c->palette_colors_buf, previous_colors,
-                 previous_size * sizeof(previous_colors[0]));
-          memcpy(c->palette_count_buf, previous_count,
-                 previous_size * sizeof(previous_count[0]));
+          vpx_memcpy(c->palette_colors_buf, previous_colors,
+                     previous_size * sizeof(previous_colors[0]));
+          vpx_memcpy(c->palette_count_buf, previous_count,
+                     previous_size * sizeof(previous_count[0]));
         }
 #endif
       }
@@ -4633,8 +4637,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 #if CONFIG_PALETTE
   if (frame_is_intra_only(cm)) {
     cm->current_palette_size = 0;
-    memset(cm->current_palette_count, 0,
-           PALETTE_BUF_SIZE * sizeof(cm->current_palette_count[0]));
+    vpx_memset(cm->current_palette_count, 0,
+               PALETTE_BUF_SIZE * sizeof(cm->current_palette_count[0]));
     cm->palette_counter = 0;
     cm->block_counter = 0;
   }
index f8b2d889c1df46a596963f7f3bffa1ee67eaf594..01e391bbcf4ce8b535e740d7544376c5202b6f4b 100644 (file)
@@ -1262,7 +1262,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 #if CONFIG_PALETTE
   mic->mbmi.palette_enabled[0] = 0;
-#endif
+#endif  // CONFIG_PALETTE
 
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
@@ -1362,7 +1362,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int src_stride = x->plane[0].src.stride;
   uint8_t *src = x->plane[0].src.buf;
   uint16_t best_runs[PALETTE_MAX_RUNS];
-  uint8_t best_palette[PALETTE_MAX_SIZE], best_map[4096];
+  uint8_t best_palette[PALETTE_MAX_SIZE];
   uint8_t best_index[PALETTE_MAX_SIZE], best_literal[PALETTE_MAX_SIZE];
   int8_t palette_color_delta[PALETTE_MAX_SIZE];
   PALETTE_SCAN_ORDER best_ps = H_SCAN;
@@ -1500,24 +1500,23 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   colors = count_colors(src, src_stride, rows, cols);
   if (colors > 1 && colors <= 64 && cpi->common.allow_palette_mode) {
     int n, r, c, i, j, temp, max_itr = 200, k;
-    int indices[4096];
     int l, m1, m2, d = get_bit_depth(rows * cols);
     int bits, best_bits = 0, total_bits, best_total_bits;
     int palette_size_cost[PALETTE_SIZES];
     int palette_run_length_cost[PALETTE_RUN_LENGTHS];
-    double data[4096];
     double centroids[PALETTE_MAX_SIZE];
     double lb = src[0], ub = src[0], val;
     int64_t local_tx_cache[TX_MODES];
     PALETTE_SCAN_ORDER ps;
-    uint8_t map[4096];
 #if CONFIG_TX_SKIP
     int this_rate_tokenonly_s, s_s;
     int64_t this_distortion_s;
 #endif  // CONFIG_TX_SKIP
 
-    memset(data, 0, sizeof(data[0] * 4096));
-    memset(indices, 0, sizeof(indices[0] * 4096));
+    vpx_memset(x->kmeans_data_buffer, 0,
+               sizeof(x->kmeans_data_buffer[0]) * 4096);  // size * count
+    vpx_memset(x->kmeans_indices_buffer, 0,
+               sizeof(x->kmeans_indices_buffer[0]) * 4096);  // size * count
     mic->mbmi.palette_enabled[0] = 1;
     vp9_cost_tokens(palette_size_cost,
                     cpi->common.fc.palette_size_prob[bsize - BLOCK_8X8],
@@ -1532,7 +1531,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     for (r = 0; r < rows; r++) {
       for (c = 0; c < cols; c++) {
         val = src[r * src_stride + c];
-        data[r * cols + c] = val;
+        x->kmeans_data_buffer[r * cols + c] = val;
         if (val < lb)
           lb = val;
         else if (val > ub)
@@ -1544,7 +1543,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
         n >= 2; n--) {
       for (i = 0; i < n; i++)
         centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-      r = k_means(data, centroids, indices, rows * cols, n, 1, max_itr);
+      r = k_means(x->kmeans_data_buffer, centroids, x->kmeans_indices_buffer,
+                  rows * cols, n, 1, max_itr);
       insertion_sort(centroids, n);
       i = 1;
       k = n;
@@ -1620,15 +1620,18 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
               [mic->mbmi.palette_indexed_colors[i]];
         }
       if (m2 > 0)
-        memcpy(mic->mbmi.palette_colors + m1, mic->mbmi.palette_literal_colors,
-               m2 * sizeof(mic->mbmi.palette_colors[0]));
+        vpx_memcpy(mic->mbmi.palette_colors + m1,
+                   mic->mbmi.palette_literal_colors,
+                   m2 * sizeof(mic->mbmi.palette_colors[0]));
       for (i = 0; i < k; i++) {
         centroids[i] = (double) mic->mbmi.palette_colors[i];
       }
-      calc_indices(data, centroids, indices, rows * cols, k, 1);
+      calc_indices(x->kmeans_data_buffer, centroids, x->kmeans_indices_buffer,
+                   rows * cols, k, 1);
       for (r = 0; r < rows; r++) {
         for (c = 0; c < cols; c++) {
-          xd->plane[0].color_index_map[r * cols + c] = indices[r * cols + c];
+          xd->plane[0].color_index_map[r * cols + c] =
+              x->kmeans_indices_buffer[r * cols + c];
         }
       }
 
@@ -1669,34 +1672,12 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
       }
 
       for (ps = H_SCAN; ps < PALETTE_SCAN_ORDERS; ps++) {
-        int scan_order[4096];
-        switch (ps) {
-          case H_SCAN:
-            memcpy(map, xd->plane[0].color_index_map, rows * cols);
-            break;
-          case V_SCAN:
-            transpose_block(xd->plane[0].color_index_map,
-                            map, rows, cols);
-            break;
-          case SPIN_SCAN:
-            spin_scan_order(scan_order, rows, cols);
-            for (i = 0; i < rows * cols; i++)
-              map[i] = xd->plane[0].color_index_map[scan_order[i]];
-            break;
-          case ZZ_SCAN:
-            zz_scan_order(scan_order, rows, cols);
-            for (i = 0; i < rows * cols; i++)
-              map[i] = xd->plane[0].color_index_map[scan_order[i]];
-            break;
-          default:
-            break;
-        }
-
-        l = run_lengh_encoding(map, rows * cols, mic->mbmi.palette_runs,
-                               palette_max_run(bsize));
-        if (!l) {
+        palette_scan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                     rows, cols, ps, xd->palette_scan_buffer);
+        l = run_lengh_encoding(xd->palette_map_buffer, rows * cols,
+                               mic->mbmi.palette_runs, palette_max_run(bsize));
+        if (!l)
           continue;
-        }
 
         this_rate = this_rate_tokenonly +
             (1 + vp9_encode_uniform_cost(MIN(k + 1, 8), m1) + PALETTE_DELTA_BIT
@@ -1730,17 +1711,16 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
           best_m1 = m1;
           best_m2 = m2;
           palette_delta_bitdepth = best_bits;
-          memcpy(best_palette, mic->mbmi.palette_colors,
-                 k * sizeof(best_palette[0]));
-          memcpy(best_runs, mic->mbmi.palette_runs, l * sizeof(best_runs[0]));
-          memcpy(best_map, xd->plane[0].color_index_map,
-                 rows * cols * sizeof(best_map[0]));
-          memcpy(best_index, mic->mbmi.palette_indexed_colors,
-                 best_m1 * sizeof(best_index[0]));
-          memcpy(palette_color_delta, mic->mbmi.palette_color_delta,
-                 best_m1 * sizeof(palette_color_delta[0]));
-          memcpy(best_literal, mic->mbmi.palette_literal_colors,
-                 best_m2 * sizeof(best_literal[0]));
+          vpx_memcpy(best_palette, mic->mbmi.palette_colors,
+                     k * sizeof(best_palette[0]));
+          vpx_memcpy(best_runs, mic->mbmi.palette_runs,
+                     l * sizeof(best_runs[0]));
+          vpx_memcpy(best_index, mic->mbmi.palette_indexed_colors,
+                     best_m1 * sizeof(best_index[0]));
+          vpx_memcpy(palette_color_delta, mic->mbmi.palette_color_delta,
+                     best_m1 * sizeof(palette_color_delta[0]));
+          vpx_memcpy(best_literal, mic->mbmi.palette_literal_colors,
+                     best_m2 * sizeof(best_literal[0]));
 #if CONFIG_TX_SKIP
           tx_skipped = mic->mbmi.tx_skip[0];
 #endif  // CONFIG_TX_SKIP
@@ -1770,18 +1750,21 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mic->mbmi.palette_indexed_size = best_m1;
     mic->mbmi.palette_literal_size = best_m2;
     mic->mbmi.palette_delta_bitdepth = palette_delta_bitdepth;
-    memcpy(mic->mbmi.palette_colors, best_palette,
-           best_n * sizeof(best_palette[0]));
-    memcpy(mic->mbmi.palette_runs, best_runs, best_l * sizeof(best_runs[0]));
-    memcpy(xd->plane[0].color_index_map, best_map,
-           4 * num_4x4_blocks_high_lookup[bsize] *
-           4 * num_4x4_blocks_wide_lookup[bsize] * sizeof(best_map[0]));
-    memcpy(mic->mbmi.palette_indexed_colors, best_index,
-           best_m1 * sizeof(best_index[0]));
-    memcpy(mic->mbmi.palette_color_delta, palette_color_delta,
-           best_m1 * sizeof(palette_color_delta[0]));
-    memcpy(mic->mbmi.palette_literal_colors, best_literal,
-           best_m2 * sizeof(best_literal[0]));
+    vpx_memcpy(mic->mbmi.palette_colors, best_palette,
+               best_n * sizeof(best_palette[0]));
+    vpx_memcpy(mic->mbmi.palette_runs, best_runs,
+               best_l * sizeof(best_runs[0]));
+    vpx_memcpy(mic->mbmi.palette_indexed_colors, best_index,
+               best_m1 * sizeof(best_index[0]));
+    vpx_memcpy(mic->mbmi.palette_color_delta, palette_color_delta,
+               best_m1 * sizeof(palette_color_delta[0]));
+    vpx_memcpy(mic->mbmi.palette_literal_colors, best_literal,
+               best_m2 * sizeof(best_literal[0]));
+    run_lengh_decoding(mic->mbmi.palette_runs, mic->mbmi.palette_run_length[0],
+                       xd->palette_map_buffer);
+    palette_iscan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                  rows, cols, mic->mbmi.palette_scan_order[0],
+                  xd->palette_scan_buffer);
 #if CONFIG_FILTERINTRA
     mic->mbmi.filterbit = 0;
 #endif  // CONFIG_FILTERINTRA
@@ -1873,7 +1856,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
       (xd->plane[1].subsampling_y);
   int src_stride = x->plane[1].src.stride;
   uint16_t best_runs[PALETTE_MAX_RUNS];
-  uint8_t best_palette[2 * PALETTE_MAX_SIZE], best_map[4096];
+  uint8_t best_palette[2 * PALETTE_MAX_SIZE];
   uint8_t *src_u = x->plane[1].src.buf;
   uint8_t *src_v = x->plane[2].src.buf;
   PALETTE_SCAN_ORDER best_ps;
@@ -1923,8 +1906,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
                    [get_uv_tx_size(&(x->e_mbd.mi[0].mbmi), &xd->plane[1])][mode], fbit);
 #endif  // CONFIG_FILTERINTRA
 #if CONFIG_PALETTE
-    if (frame_is_intra_only(&cpi->common) &&
-        xd->mi[0].src_mi->mbmi.sb_type >= BLOCK_8X8 &&
+    if (xd->mi[0].src_mi->mbmi.sb_type >= BLOCK_8X8 &&
         xd->plane[1].subsampling_x && xd->plane[1].subsampling_y &&
         cpi->common.allow_palette_mode)
       this_rate += vp9_cost_bit(128, 0);
@@ -2009,8 +1991,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
 
 #if CONFIG_PALETTE
-  if (frame_is_intra_only(&cpi->common) &&
-      xd->mi[0].src_mi->mbmi.sb_type >= BLOCK_8X8 &&
+  if (xd->mi[0].src_mi->mbmi.sb_type >= BLOCK_8X8 &&
       xd->plane[1].subsampling_x && xd->plane[1].subsampling_y &&
       cpi->common.allow_palette_mode) {
     int colors_u = count_colors(src_u, src_stride, rows, cols);
@@ -2019,16 +2000,13 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (colors > 1 && colors <= 64) {
       int n, r, c, i, j, max_itr = 200;
-      int indices[4096];
       int l, d = get_bit_depth(rows * cols);
       int palette_run_length_cost[PALETTE_RUN_LENGTHS];
       int palette_size_cost[PALETTE_SIZES];
-      double data[2048];
       double centroids[2 * PALETTE_MAX_SIZE];
       double lb_u = src_u[0], ub_u = src_u[0];
       double lb_v = src_v[0], ub_v = src_v[0], val;
       PALETTE_SCAN_ORDER ps;
-      uint8_t map[4096];
       BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
 #if CONFIG_TX_SKIP
       int this_rate_tokenonly_s, s_s;
@@ -2045,16 +2023,15 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_FILTERINTRA
       mbmi->uv_filterbit = 0;
 #endif  // CONFIG_FILTERINTRA
-#if CONFIG_TX_SKIP
-      mbmi->tx_skip[1] = 0;
-#endif  // CONFIG_TX_SKIP
       mbmi->palette_enabled[1] = 1;
       mbmi->uv_mode = DC_PRED;
 
       for (r = 0; r < rows; r++) {
         for (c = 0; c < cols; c++) {
-          data[(r * cols + c) * 2 ] = src_u[r * src_stride + c];
-          data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+          x->kmeans_data_buffer[(r * cols + c) * 2 ] =
+              src_u[r * src_stride + c];
+          x->kmeans_data_buffer[(r * cols + c) * 2 + 1] =
+              src_v[r * src_stride + c];
           val = src_u[r * src_stride + c];
           if (val < lb_u)
             lb_u = val;
@@ -2075,7 +2052,8 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
           centroids[i * 2 + 1] =
               lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;;
         }
-        r = k_means(data, centroids, indices, rows * cols, n, 2, max_itr);
+        r = k_means(x->kmeans_data_buffer, centroids, x->kmeans_indices_buffer,
+                    rows * cols, n, 2, max_itr);
 
         mbmi->palette_size[1] = n;
         for (i = 1; i < 3; i++) {
@@ -2086,7 +2064,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
         for (r = 0; r < rows; r++)
           for (c = 0; c < cols; c++) {
             xd->plane[1].color_index_map[r * cols + c] =
-                indices[r * cols + c];
+                x->kmeans_indices_buffer[r * cols + c];
           }
 
 #if CONFIG_TX_SKIP
@@ -2127,36 +2105,13 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
         }
 
         for (ps = H_SCAN; ps < PALETTE_SCAN_ORDERS; ps++) {
-          int scan_order[4096];
-
-          switch (ps) {
-            case H_SCAN:
-              memcpy(map, xd->plane[1].color_index_map, rows * cols);
-              break;
-            case V_SCAN:
-              transpose_block(xd->plane[1].color_index_map,
-                              map, rows, cols);
-              break;
-            case SPIN_SCAN:
-              spin_scan_order(scan_order, rows, cols);
-              for (i = 0; i < rows * cols; i++)
-                map[i] = xd->plane[1].color_index_map[scan_order[i]];
-              break;
-            case ZZ_SCAN:
-              zz_scan_order(scan_order, rows, cols);
-              for (i = 0; i < rows * cols; i++)
-                map[i] = xd->plane[1].color_index_map[scan_order[i]];
-              break;
-            default:
-              break;
-          }
-
-          l = run_lengh_encoding(map, rows * cols,
+          palette_scan(xd->plane[1].color_index_map, xd->palette_map_buffer,
+                       rows, cols, ps, xd->palette_scan_buffer);
+          l = run_lengh_encoding(xd->palette_map_buffer, rows * cols,
                                  mbmi->palette_runs + PALETTE_MAX_RUNS,
                                  palette_max_run(uv_bsize));
-          if (!l) {
+          if (!l)
             continue;
-          }
 
           this_rate = this_rate_tokenonly +
               (1 + get_bit_depth(palette_max_run(uv_bsize)) + 2 + 2 * 8 * n +
@@ -2185,12 +2140,10 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
             best_l = l;
             palette_selected = 1;
             best_ps = ps;
-            memcpy(best_palette, mbmi->palette_colors + PALETTE_MAX_SIZE,
-                   2 * PALETTE_MAX_SIZE * sizeof(best_palette[0]));
-            memcpy(best_runs, mbmi->palette_runs + PALETTE_MAX_RUNS,
-                   l * sizeof(best_runs[0]));
-            memcpy(best_map, xd->plane[1].color_index_map,
-                   rows * cols * sizeof(best_map[0]));
+            vpx_memcpy(best_palette, mbmi->palette_colors + PALETTE_MAX_SIZE,
+                       2 * PALETTE_MAX_SIZE * sizeof(best_palette[0]));
+            vpx_memcpy(best_runs, mbmi->palette_runs + PALETTE_MAX_RUNS,
+                       PALETTE_MAX_RUNS * sizeof(best_runs[0]));
 #if CONFIG_TX_SKIP
             tx_skipped = mbmi->tx_skip[1];
 #endif  // CONFIG_TX_SKIP
@@ -2217,12 +2170,15 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->palette_size[1] = best_n;
     mbmi->palette_run_length[1] = best_l;
     mbmi->palette_scan_order[1] = best_ps;
-    memcpy(mbmi->palette_colors + PALETTE_MAX_SIZE, best_palette,
-           2 * PALETTE_MAX_SIZE * sizeof(best_palette[0]));
-    memcpy(mbmi->palette_runs + PALETTE_MAX_RUNS, best_runs,
-           best_l * sizeof(best_runs[0]));
-    memcpy(xd->plane[1].color_index_map, best_map,
-           rows * cols * sizeof(best_map[0]));
+    vpx_memcpy(mbmi->palette_colors + PALETTE_MAX_SIZE, best_palette,
+               2 * PALETTE_MAX_SIZE * sizeof(best_palette[0]));
+    vpx_memcpy(mbmi->palette_runs + PALETTE_MAX_RUNS, best_runs,
+               best_l * sizeof(best_runs[0]));
+    run_lengh_decoding(mbmi->palette_runs + PALETTE_MAX_RUNS,
+                       mbmi->palette_run_length[1], xd->palette_map_buffer);
+    palette_iscan(xd->plane[1].color_index_map, xd->palette_map_buffer,
+                  rows, cols, mbmi->palette_scan_order[1],
+                  xd->palette_scan_buffer);
   }
 #endif  // CONFIG_PALETTE
   return best_rd;
@@ -4883,9 +4839,8 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int colors = count_colors(src_y, src_stride_y, rows, cols);
 
     if (colors >= 2 && colors <= 64 && cm->allow_palette_mode) {
-      uint8_t color_index_map_copy[4096], map[4096];
       uint16_t best_runs[PALETTE_MAX_RUNS];
-      uint8_t best_palette[PALETTE_MAX_SIZE * 3], best_map[4096];
+      uint8_t best_palette[PALETTE_MAX_SIZE * 3];
       uint8_t best_index[PALETTE_MAX_SIZE], best_literal[PALETTE_MAX_SIZE];
       int8_t palette_color_delta[PALETTE_MAX_SIZE];
       int64_t local_tx_cache[TX_MODES], sse;
@@ -4893,9 +4848,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       int r, c, i, j;
       int d = get_bit_depth(rows * cols), max_itr = 200;
       int palette_run_length_cost[PALETTE_RUN_LENGTHS];
-      int palette_size_cost[PALETTE_SIZES], indices[4096];
+      int palette_size_cost[PALETTE_SIZES];
       int best_m1 = 0, best_m2 = 0, palette_delta_bitdepth = 0;
-      double data[3 * 4096], centroids[3 * PALETTE_MAX_SIZE];
+      double centroids[3 * PALETTE_MAX_SIZE];
       double lb = src_y[0], ub = src_y[0];
       MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
       MB_MODE_INFO mbmi_copy;
@@ -4914,11 +4869,11 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       palette_best_rd.dist = INT64_MAX;
       palette_best_rd.rdcost = INT64_MAX;
       mbmi_copy = *mbmi;
-      memcpy(color_index_map_copy, xd->plane[0].color_index_map,
-             rows * cols * sizeof(color_index_map_copy[0]));
-      memset(data, 0, sizeof(data[0]) * 3 * 4096);
-      memset(map, 0, sizeof(map[0]) * 4096);
-      memset(centroids, 0, sizeof(centroids[0]) * 3 * PALETTE_MAX_SIZE);
+      vpx_memset(x->kmeans_data_buffer, 0,
+                 sizeof(x->kmeans_data_buffer[0]) * 3 * 4096);
+      vpx_memset(xd->palette_map_buffer, 0,
+                 sizeof(xd->palette_map_buffer[0]) * 4096);
+      vpx_memset(centroids, 0, sizeof(centroids[0]) * 3 * PALETTE_MAX_SIZE);
       vp9_cost_tokens(palette_size_cost,
                       cpi->common.fc.palette_size_prob[bsize - BLOCK_8X8],
                                                        vp9_palette_size_tree);
@@ -4933,9 +4888,12 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #endif  // CONFIG_FILTERINTRA
       for (r = 0; r < rows; r++) {
         for (c = 0; c < cols; c++) {
-          data[(r * cols + c) * 3] = src_y[r * src_stride_y + c];
-          data[(r * cols + c) * 3 + 1] = src_u[r * src_stride_uv + c];
-          data[(r * cols + c) * 3 + 2] = src_v[r * src_stride_uv + c];
+          x->kmeans_data_buffer[(r * cols + c) * 3] =
+              src_y[r * src_stride_y + c];
+          x->kmeans_data_buffer[(r * cols + c) * 3 + 1] =
+              src_u[r * src_stride_uv + c];
+          x->kmeans_data_buffer[(r * cols + c) * 3 + 2] =
+              src_v[r * src_stride_uv + c];
         }
       }
 
@@ -4946,7 +4904,8 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           centroids[i * 3 + 1] = 128;
           centroids[i * 3 + 2] = 128;
         }
-        r = k_means(data, centroids, indices, rows * cols, n, 3, max_itr);
+        r = k_means(x->kmeans_data_buffer, centroids, x->kmeans_indices_buffer,
+                    rows * cols, n, 3, max_itr);
         for (i = 0; i < 3; i++) {
           for (j = 0; j < n; j++)
             mbmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
@@ -4955,12 +4914,12 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         for (r = 0; r < rows; r++)
           for (c = 0; c < cols; c++)
             xd->plane[0].color_index_map[r * cols + c] =
-                indices[r * cols + c];
+                x->kmeans_indices_buffer[r * cols + c];
         m1 = 0;
         m2 = n;
         best_bits = 0;
-        memcpy(mbmi->palette_literal_colors, mbmi->palette_colors,
-               m2 * sizeof(mbmi->palette_literal_colors[0]));
+        vpx_memcpy(mbmi->palette_literal_colors, mbmi->palette_colors,
+                   m2 * sizeof(mbmi->palette_literal_colors[0]));
 
 #if CONFIG_TX_SKIP
         mbmi->tx_skip[0] = 0;
@@ -5033,33 +4992,12 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           continue;
 
         for (ps = H_SCAN; ps < PALETTE_SCAN_ORDERS; ps++) {
-          int scan_order[4096];
-          switch (ps) {
-            case H_SCAN:
-              memcpy(map, xd->plane[0].color_index_map, rows * cols);
-              break;
-            case V_SCAN:
-              transpose_block(xd->plane[0].color_index_map,
-                              map, rows, cols);
-              break;
-            case SPIN_SCAN:
-              spin_scan_order(scan_order, rows, cols);
-              for (i = 0; i < rows * cols; i++)
-                map[i] = xd->plane[0].color_index_map[scan_order[i]];
-              break;
-            case ZZ_SCAN:
-              zz_scan_order(scan_order, rows, cols);
-              for (i = 0; i < rows * cols; i++)
-                map[i] = xd->plane[0].color_index_map[scan_order[i]];
-              break;
-            default:
-              break;
-          }
-          l = run_lengh_encoding(map, rows * cols, mbmi->palette_runs,
-                                 palette_max_run(bsize));
-          if (!l) {
+          palette_scan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                       rows, cols, ps, xd->palette_scan_buffer);
+          l = run_lengh_encoding(xd->palette_map_buffer, rows * cols,
+                                 mbmi->palette_runs, palette_max_run(bsize));
+          if (!l)
             continue;
-          }
 
           rate_y = rate_y_tokenonly +
               (1 + PALETTE_DELTA_BIT + get_bit_depth(palette_max_run(bsize)) +
@@ -5103,17 +5041,15 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
             tx_skipped_uv = mbmi->tx_skip[1];
 #endif  // CONFIG_TX_SKIP
 
-            memcpy(best_palette, mbmi->palette_colors,
-                   PALETTE_MAX_SIZE * 3 * sizeof(best_palette[0]));
-            memcpy(best_runs, mbmi->palette_runs, l * sizeof(best_runs[0]));
-            memcpy(best_map, xd->plane[0].color_index_map,
-                   rows * cols * sizeof(best_map[0]));
-            memcpy(best_index, mbmi->palette_indexed_colors,
-                   best_m1 * sizeof(best_index[0]));
-            memcpy(palette_color_delta, mbmi->palette_color_delta,
-                   best_m1 * sizeof(palette_color_delta[0]));
-            memcpy(best_literal, mbmi->palette_literal_colors,
-                   best_m2 * sizeof(best_literal[0]));
+            vpx_memcpy(best_palette, mbmi->palette_colors,
+                       PALETTE_MAX_SIZE * 3 * sizeof(best_palette[0]));
+            vpx_memcpy(best_runs, mbmi->palette_runs, l * sizeof(best_runs[0]));
+            vpx_memcpy(best_index, mbmi->palette_indexed_colors,
+                       best_m1 * sizeof(best_index[0]));
+            vpx_memcpy(palette_color_delta, mbmi->palette_color_delta,
+                       best_m1 * sizeof(palette_color_delta[0]));
+            vpx_memcpy(best_literal, mbmi->palette_literal_colors,
+                       best_m2 * sizeof(best_literal[0]));
           }
         }
       }
@@ -5131,17 +5067,16 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         mbmi->palette_indexed_size = best_m1;
         mbmi->palette_literal_size = best_m2;
         mbmi->palette_delta_bitdepth = palette_delta_bitdepth;
-        memcpy(mbmi->palette_colors, best_palette,
-               PALETTE_MAX_SIZE * 3 * sizeof(best_palette[0]));
-        memcpy(mbmi->palette_runs, best_runs, best_l * sizeof(best_runs[0]));
-        memcpy(xd->plane[0].color_index_map, best_map,
-               rows * cols * sizeof(best_map[0]));
-        memcpy(mbmi->palette_indexed_colors, best_index,
-               best_m1 * sizeof(best_index[0]));
-        memcpy(mbmi->palette_color_delta, palette_color_delta,
-               best_m1 * sizeof(palette_color_delta[0]));
-        memcpy(mbmi->palette_literal_colors, best_literal,
-               best_m2 * sizeof(best_literal[0]));
+        vpx_memcpy(mbmi->palette_colors, best_palette,
+                   PALETTE_MAX_SIZE * 3 * sizeof(best_palette[0]));
+        vpx_memcpy(mbmi->palette_runs, best_runs,
+                   best_l * sizeof(best_runs[0]));
+        vpx_memcpy(mbmi->palette_indexed_colors, best_index,
+                   best_m1 * sizeof(best_index[0]));
+        vpx_memcpy(mbmi->palette_color_delta, palette_color_delta,
+                   best_m1 * sizeof(palette_color_delta[0]));
+        vpx_memcpy(mbmi->palette_literal_colors, best_literal,
+                   best_m2 * sizeof(best_literal[0]));
 #if CONFIG_FILTERINTRA
         mbmi->filterbit = 0;
         mbmi->uv_filterbit = 0;
@@ -5152,8 +5087,13 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #endif  // CONFIG_TX_SKIP
       } else {
         *mbmi = mbmi_copy;
-        memcpy(xd->plane[0].color_index_map, color_index_map_copy,
-               rows * cols * sizeof(color_index_map_copy[0]));
+        if (mbmi->palette_enabled[0]) {
+          run_lengh_decoding(mbmi->palette_runs, mbmi->palette_run_length[0],
+                             xd->palette_map_buffer);
+          palette_iscan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                        rows, cols, mbmi->palette_scan_order[0],
+                        xd->palette_scan_buffer);
+        }
       }
       ctx->mic = *xd->mi[0].src_mi;
     }
@@ -5238,7 +5178,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   PREDICTION_MODE mode_uv[TX_SIZES];
 #if CONFIG_FILTERINTRA
   int fbit_uv[TX_SIZES];
-#endif
+#endif  // CONFIG_FILTERINTRA
 #if CONFIG_INTERINTRA
   int single_newmv_rate[MAX_REF_FRAMES] = { 0 };
 #endif  // CONFIG_INTERINTRA
@@ -5264,6 +5204,20 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   MB_MODE_INFO *inter_ref_list[18];
   int copy_mode_context = vp9_get_copy_mode_context(xd);
 #endif  // CONFIG_COPY_MODE
+#if CONFIG_PALETTE
+  int best_n = 0, best_l = 0, colors;
+  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  int src_stride = x->plane[0].src.stride;
+  int palette_enabled_uv[TX_SIZES], palette_run_length_uv[TX_SIZES];
+  int palette_size_uv[TX_SIZES];
+  uint8_t *src = x->plane[0].src.buf;
+  uint8_t best_palette[PALETTE_MAX_SIZE];
+  uint8_t palette_colors_uv[TX_SIZES][2 * PALETTE_MAX_SIZE];
+  uint16_t best_runs[PALETTE_MAX_RUNS];
+  uint16_t palette_runs_uv[TX_SIZES][PALETTE_MAX_RUNS];
+  PALETTE_SCAN_ORDER best_ps = H_SCAN, ps_uv[TX_SIZES];
+#endif  // CONFIG_PALETTE
   vp9_zero(best_mbmode);
 
   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -5296,7 +5250,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   rd_cost->rate = INT_MAX;
 #if CONFIG_SUPERTX
   *returnrate_nocoef = INT_MAX;
-#endif
+#endif  // CONFIG_SUPERTX
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
@@ -5309,7 +5263,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_COMPOUND_MODES
     frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
-#endif
+#endif  // CONFIG_COMPOUND_MODES
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -5360,7 +5314,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARMV);
       if (frame_mv[NEAR_NEARESTMV][ALTREF_FRAME].as_int != 0)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARESTMV);
-#endif
+#endif  // CONFIG_COMPOUND_MODES
     }
   }
 
@@ -5409,12 +5363,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     midx = end_pos;
   }
 
-#if CONFIG_PALETTE
-  for (i = 0; i < 2; ++i) {
-    mbmi->palette_enabled[i] = 0;
-  }
-#endif  // CONFIG_PALETTE
-
   for (midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index = mode_map[midx];
     int mode_excluded = 0;
@@ -5423,10 +5371,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int compmode_cost = 0;
 #if CONFIG_INTERINTRA
     int compmode_interintra_cost = 0;
-#endif
+#endif  // CONFIG_INTERINTRA
 #if CONFIG_WEDGE_PARTITION
     int compmode_wedge_cost = 0;
-#endif
+#endif  // CONFIG_WEDGE_PARTITION
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable = 0;
@@ -5481,7 +5429,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       frame_mv[NEW_NEWMV][second_ref_frame].as_int =
           frame_mv[NEWMV][second_ref_frame].as_int;
     }
-#endif
+#endif  // CONFIG_COMPOUND_MODES
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (midx == mode_skip_start && best_mode_index >= 0) {
@@ -5568,7 +5516,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           this_mode != NEW_NEARMV)
 #else
       if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
-#endif
+#endif  // CONFIG_COMPOUND_MODES
         if (rf > INTRA_FRAME)
           if (ref_frame != rf)
             continue;
@@ -5579,7 +5527,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
             this_mode == ZERO_ZEROMV)
 #else
         if (this_mode == NEARMV || this_mode == ZEROMV)
-#endif
+#endif  // CONFIG_COMPOUND_MODES
           continue;
     }
 
@@ -5643,7 +5591,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME &&
         !is_interintra_allowed(bsize))
       continue;
-#endif
+#endif  // CONFIG_INTERINTRA
 
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
@@ -5656,6 +5604,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_COPY_MODE
     mbmi->copy_mode = NOREF;
 #endif  // CONFIG_COPY_MODE
+#if CONFIG_PALETTE
+    mbmi->palette_enabled[0] = 0;
+    mbmi->palette_enabled[1] = 0;
+#endif  // CONFIG_PALETTE
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
@@ -5676,10 +5628,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_INTERINTRA
     mbmi->interintra_mode = (PREDICTION_MODE)(DC_PRED - 1);
     mbmi->interintra_uv_mode = (PREDICTION_MODE)(DC_PRED - 1);
-#endif
+#endif  // CONFIG_INTERINTRA
 #if CONFIG_WEDGE_PARTITION
     mbmi->use_wedge_interinter = 0;
-#endif
+#endif  // CONFIG_WEDGE_PARTITION
 
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
@@ -5688,13 +5640,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       int rate_y_s, skippable_s;
       int64_t distortion_y_s;
       int64_t tx_cache_s[TX_MODES];
-#endif
+#endif  // CONFIG_TX_SKIP
 #if CONFIG_FILTERINTRA
       mbmi->filterbit = 0;
-#endif
+#endif  // CONFIG_FILTERINTRA
 #if CONFIG_EXT_TX
       mbmi->ext_txfrm = NORM;
-#endif
+#endif  // CONFIG_EXT_TX
       vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                       NULL, bsize, tx_cache, best_rd);
@@ -5729,7 +5681,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           vpx_memcpy(tx_cache, tx_cache_tmp, TX_MODES * sizeof(int64_t));
         }
       }
-#endif
+#endif  // CONFIG_FILTERINTRA
 #if CONFIG_TX_SKIP
       if (try_tx_skip) {
         mbmi->tx_skip[0] = 1;
@@ -5755,7 +5707,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           rate_y += vp9_cost_bit(cpi->common.fc.y_tx_skip_prob[0],
                                  mbmi->tx_skip[0]);
       }
-#endif
+#endif  // CONFIG_TX_SKIP
 
       if (rate_y == INT_MAX)
         continue;
@@ -5768,31 +5720,62 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                              &dist_uv[uv_tx], &skip_uv[uv_tx],
 #if CONFIG_FILTERINTRA
                              &fbit_uv[uv_tx],
-#endif
+#endif  // CONFIG_FILTERINTRA
                              &mode_uv[uv_tx]);
 #if CONFIG_TX_SKIP
         tx_skipped_uv[uv_tx] = mbmi->tx_skip[1];
-#endif
-
+#endif  // CONFIG_TX_SKIP
+#if CONFIG_PALETTE
+        palette_enabled_uv[uv_tx] = mbmi->palette_enabled[1];
+        if (palette_enabled_uv[uv_tx]) {
+          palette_size_uv[uv_tx] = mbmi->palette_size[1];
+          palette_run_length_uv[uv_tx] = mbmi->palette_run_length[1];
+          ps_uv[uv_tx] = mbmi->palette_scan_order[1];
+          vpx_memcpy(&palette_colors_uv[uv_tx][0],
+                     mbmi->palette_colors + PALETTE_MAX_SIZE,
+                     2 * PALETTE_MAX_SIZE *
+                     sizeof(palette_colors_uv[uv_tx][0]));
+          vpx_memcpy(&palette_runs_uv[uv_tx][0],
+                     mbmi->palette_runs + PALETTE_MAX_RUNS,
+                     PALETTE_MAX_RUNS * sizeof(palette_runs_uv[uv_tx][0]));
+        }
+#endif  // CONFIG_PALETTE
       }
-
       rate_uv = rate_uv_tokenonly[uv_tx];
       distortion_uv = dist_uv[uv_tx];
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
 #if CONFIG_FILTERINTRA
       mbmi->uv_filterbit = fbit_uv[uv_tx];
-#endif
+#endif  // CONFIG_FILTERINTRA
 #if CONFIG_TX_SKIP
       mbmi->tx_skip[1] = tx_skipped_uv[uv_tx];
-#endif
+#endif  // CONFIG_TX_SKIP
+#if CONFIG_PALETTE
+      mbmi->palette_enabled[1] = palette_enabled_uv[uv_tx];
+      if (mbmi->palette_enabled[1]) {
+        mbmi->palette_size[1] = palette_size_uv[uv_tx];
+        mbmi->palette_run_length[1] = palette_run_length_uv[uv_tx];
+        mbmi->palette_scan_order[1] = ps_uv[uv_tx];
+        vpx_memcpy(mbmi->palette_colors + PALETTE_MAX_SIZE,
+                   &palette_colors_uv[uv_tx][0],
+                   2 * PALETTE_MAX_SIZE * sizeof(palette_colors_uv[uv_tx][0]));
+        vpx_memcpy(mbmi->palette_runs + PALETTE_MAX_RUNS,
+                   &palette_runs_uv[uv_tx][0],
+                   PALETTE_MAX_RUNS * sizeof(palette_runs_uv[uv_tx][0]));
+      }
+#endif  // CONFIG_PALETTE
 
       rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
 #if CONFIG_FILTERINTRA
       if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size))
         rate2 += vp9_cost_bit(
            cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode], mbmi->filterbit);
-#endif
+#endif  // CONFIG_FILTERINTRA
+#if CONFIG_PALETTE
+    if (cpi->common.allow_palette_mode && bsize >= BLOCK_8X8)
+      rate2 += vp9_cost_bit(128, 0);
+#endif  // CONFIG_PALETTE
       if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
@@ -6202,7 +6185,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_TX_SKIP
     mbmi->tx_skip[0] = 0;
     mbmi->tx_skip[1] = 0;
-#endif
+#endif  // CONFIG_TX_SKIP
     x->skip = 0;
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
     for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -6301,7 +6284,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       rd_cost->rdcost = this_rd;
 #if CONFIG_SUPERTX
       *returnrate_nocoef = rate_copy_mode;
-#endif
+#endif  // CONFIG_SUPERTX
       best_rd = this_rd;
       best_mbmode = *mbmi;
       best_skip2 = this_skip2;
@@ -6354,7 +6337,320 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     vp9_zero(best_filter_diff);
     vp9_zero(best_tx_diff);
   }
-#endif
+#endif  // CONFIG_COPY_MODE
+#if CONFIG_PALETTE
+  if (bsize >= BLOCK_8X8 && cpi->common.allow_palette_mode &&
+      !is_inter_block(mbmi)) {
+    MB_MODE_INFO mbmi_copy = *mbmi;
+    colors = count_colors(src, src_stride, rows, cols);
+    x->skip = 0;
+    if (colors > 1 && colors <= 64) {
+      int n, r, c, i, j, max_itr = 200, k;
+      int l, d = get_bit_depth(rows * cols);
+      int palette_size_cost[PALETTE_SIZES];
+      int palette_run_length_cost[PALETTE_RUN_LENGTHS];
+      double centroids[PALETTE_MAX_SIZE];
+      double lb = src[0], ub = src[0], val;
+      PALETTE_SCAN_ORDER ps;
+
+      int64_t this_rd = INT64_MAX, this_rd_y, best_rd_y;
+      int rate2, rate_y , rate_uv, best_token_rate_y = INT_MAX;
+      int total_rate_y, current_best_total_rate_y, best_total_rate_y = INT_MAX;
+      int64_t distortion2, distortion_y, distortion_uv;
+      int64_t best_distortion_y = INT64_MAX;
+      int skippable, skip_y = 0;
+      int64_t tx_cache[TX_MODES];
+      TX_SIZE uv_tx;
+#if CONFIG_TX_SKIP
+      int rate_y_s, skippable_s;
+      int64_t distortion_y_s;
+      int64_t tx_cache_s[TX_MODES];
+      int tx_skipped_y = 0;
+#endif  // CONFIG_TX_SKIP
+
+      // Clear 4096 entries of each k-means scratch buffer; the element size
+      // is taken first and then scaled by the entry count.
+      vpx_memset(x->kmeans_data_buffer, 0,
+                 sizeof(x->kmeans_data_buffer[0]) * 4096);
+      vpx_memset(x->kmeans_indices_buffer, 0,
+                 sizeof(x->kmeans_indices_buffer[0]) * 4096);
+      mbmi->palette_enabled[0] = 1;
+      vp9_cost_tokens(palette_size_cost,
+                      cpi->common.fc.palette_size_prob[bsize - BLOCK_8X8],
+                      vp9_palette_size_tree);
+      vp9_cost_tokens(palette_run_length_cost,
+                      cpi->common.fc.palette_run_length_prob[bsize - BLOCK_8X8],
+                      vp9_palette_run_length_tree);
+      mbmi->ref_frame[0] = INTRA_FRAME;
+      mbmi->mode = DC_PRED;
+      for (r = 0; r < rows; r++) {
+        for (c = 0; c < cols; c++) {
+          val = src[r * src_stride + c];
+          x->kmeans_data_buffer[r * cols + c] = val;
+          if (val < lb)
+            lb = val;
+          else if (val > ub)
+            ub = val;
+        }
+      }
+
+#if CONFIG_FILTERINTRA
+      mbmi->filterbit = 0;
+#endif  // CONFIG_FILTERINTRA
+#if CONFIG_COPY_MODE
+      mbmi->copy_mode = NOREF;
+#endif  // CONFIG_COPY_MODE
+      best_rd_y = INT64_MAX;
+      for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+          n >= 2; n--) {
+        for (i = 0; i < n; i++)
+          centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+        r = k_means(x->kmeans_data_buffer, centroids, x->kmeans_indices_buffer,
+                    rows * cols, n, 1, max_itr);
+        insertion_sort(centroids, n);
+        i = 1;
+        k = n;
+        while (i < k) {
+          if (centroids[i] == centroids[i - 1]) {
+            j = i;
+            while (j < k - 1) {
+              centroids[j] = centroids[j + 1];
+              j++;
+            }
+            k--;
+          } else {
+            i++;
+          }
+        }
+
+        mbmi->palette_size[0] = k;
+        for (i = 0; i < k; i++) {
+          mbmi->palette_colors[i] = clip_pixel(round(centroids[i]));
+          centroids[i] = (double) mbmi->palette_colors[i];
+        }
+        calc_indices(x->kmeans_data_buffer, centroids, x->kmeans_indices_buffer,
+                     rows * cols, k, 1);
+        for (r = 0; r < rows; r++) {
+          for (c = 0; c < cols; c++) {
+            xd->plane[0].color_index_map[r * cols + c] =
+                x->kmeans_indices_buffer[r * cols + c];
+          }
+        }
+        vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+#if CONFIG_TX_SKIP
+        mbmi->tx_skip[0] = 0;
+#endif  // CONFIG_TX_SKIP
+        super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                        NULL, bsize, tx_cache, best_rd);
+#if CONFIG_TX_SKIP
+        if (try_tx_skip) {
+          mbmi->tx_skip[0] = 1;
+          super_block_yrd(cpi, x, &rate_y_s, &distortion_y_s, &skippable_s,
+                          NULL, bsize, tx_cache_s, best_rd);
+
+          if (rate_y != INT_MAX)
+            rate_y += vp9_cost_bit(cpi->common.fc.y_tx_skip_prob[0], 0);
+          if (rate_y_s != INT_MAX)
+            rate_y_s += vp9_cost_bit(cpi->common.fc.y_tx_skip_prob[0], 1);
+
+          if (rate_y_s != INT_MAX &&
+              (rate_y == INT_MAX ||
+                  RDCOST(x->rdmult, x->rddiv, rate_y, distortion_y) >
+          RDCOST(x->rdmult, x->rddiv, rate_y_s, distortion_y_s)))
+            mbmi->tx_skip[0] = 1;
+          else
+            mbmi->tx_skip[0] = 0;
+
+          super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                          NULL, bsize, tx_cache, best_rd);
+        }
+#endif  // CONFIG_TX_SKIP
+        if (rate_y == INT_MAX) {
+          continue;
+        }
+
+        current_best_total_rate_y = INT_MAX;
+        for (ps = H_SCAN; ps < PALETTE_SCAN_ORDERS; ps++) {
+          palette_scan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                       rows, cols, ps, xd->palette_scan_buffer);
+          l = run_lengh_encoding(xd->palette_map_buffer, rows * cols,
+                                 mbmi->palette_runs, palette_max_run(bsize));
+          if (!l)
+            continue;
+
+          total_rate_y = rate_y +
+              (1 + get_bit_depth(palette_max_run(bsize)) + 2 +
+                  8 * k + get_bit_depth(k) * (l >> 1)) * vp9_cost_bit(128, 0) +
+                  palette_size_cost[k - 2];
+          for (i = 0; i < l; i += 2) {
+            int bits = get_bit_depth(mbmi->palette_runs[i + 1]);
+            total_rate_y += palette_run_length_cost[bits > 6 ? 6 : bits - 1];
+            total_rate_y += (bits > 6 ? d : bits) * vp9_cost_bit(128, 0);
+          }
+#if CONFIG_TX_SKIP
+          total_rate_y += vp9_cost_bit(cpi->common.fc.y_tx_skip_prob[0],
+                                 mbmi->tx_skip[0]);
+#endif  // CONFIG_TX_SKIP
+          if (total_rate_y < current_best_total_rate_y) {
+            mbmi->palette_scan_order[0] = ps;
+            current_best_total_rate_y = total_rate_y;
+          }
+        }
+
+        palette_scan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                     rows, cols, mbmi->palette_scan_order[0],
+                     xd->palette_scan_buffer);
+        l = run_lengh_encoding(xd->palette_map_buffer, rows * cols,
+                               mbmi->palette_runs, palette_max_run(bsize));
+        if (!l)
+          continue;
+
+        this_rd_y = RDCOST(x->rdmult, x->rddiv,
+                           current_best_total_rate_y, distortion_y);
+        if (this_rd_y < best_rd_y) {
+          best_rd_y = this_rd_y;
+          skip_y = skippable;
+          best_distortion_y = distortion_y;
+          best_total_rate_y = current_best_total_rate_y;
+          best_token_rate_y = rate_y;
+          best_n = k;
+          best_l = l;
+          best_ps = mbmi->palette_scan_order[0];
+          vpx_memcpy(best_palette, mbmi->palette_colors,
+                     k * sizeof(best_palette[0]));
+          vpx_memcpy(best_runs, mbmi->palette_runs, l * sizeof(best_runs[0]));
+#if CONFIG_TX_SKIP
+          tx_skipped_y = mbmi->tx_skip[0];
+#endif  // CONFIG_TX_SKIP
+        }
+      }
+
+      if (best_rd_y < best_rd) {
+        rate_y = best_token_rate_y;
+        uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize,
+                                    xd->plane[1].subsampling_x,
+                                    xd->plane[1].subsampling_y);
+        if (rate_uv_intra[uv_tx] == INT_MAX) {
+          choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
+                               &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                               &dist_uv[uv_tx], &skip_uv[uv_tx],
+#if CONFIG_FILTERINTRA
+                               &fbit_uv[uv_tx],
+#endif  // CONFIG_FILTERINTRA
+                               &mode_uv[uv_tx]);
+          palette_enabled_uv[uv_tx] = mbmi->palette_enabled[1];
+          if (palette_enabled_uv[uv_tx]) {
+            palette_size_uv[uv_tx] = mbmi->palette_size[1];
+            palette_run_length_uv[uv_tx] = mbmi->palette_run_length[1];
+            ps_uv[uv_tx] = mbmi->palette_scan_order[1];
+            vpx_memcpy(&palette_colors_uv[uv_tx][0],
+                       mbmi->palette_colors + PALETTE_MAX_SIZE,
+                       2 * PALETTE_MAX_SIZE *
+                       sizeof(palette_colors_uv[uv_tx][0]));
+            vpx_memcpy(&palette_runs_uv[uv_tx][0],
+                       mbmi->palette_runs + PALETTE_MAX_RUNS,
+                       PALETTE_MAX_RUNS * sizeof(palette_runs_uv[uv_tx][0]));
+          }
+#if CONFIG_TX_SKIP
+          tx_skipped_uv[uv_tx] = mbmi->tx_skip[1];
+#endif  // CONFIG_TX_SKIP
+        }
+
+        rate_uv = rate_uv_tokenonly[uv_tx];
+        distortion_uv = dist_uv[uv_tx];
+        skippable = skip_y && skip_uv[uv_tx];
+        mbmi->uv_mode = mode_uv[uv_tx];
+
+        mbmi->palette_enabled[1] = palette_enabled_uv[uv_tx];
+        if (mbmi->palette_enabled[1]) {
+          mbmi->palette_size[1] = palette_size_uv[uv_tx];
+          mbmi->palette_run_length[1] = palette_run_length_uv[uv_tx];
+          mbmi->palette_scan_order[1] = ps_uv[uv_tx];
+          vpx_memcpy(mbmi->palette_colors + PALETTE_MAX_SIZE,
+                     &palette_colors_uv[uv_tx][0],
+                     2 * PALETTE_MAX_SIZE *
+                     sizeof(palette_colors_uv[uv_tx][0]));
+          vpx_memcpy(mbmi->palette_runs + PALETTE_MAX_RUNS,
+                     &palette_runs_uv[uv_tx][0],
+                     PALETTE_MAX_RUNS * sizeof(palette_runs_uv[uv_tx][0]));
+        }
+#if CONFIG_FILTERINTRA
+        mbmi->uv_filterbit = fbit_uv[uv_tx];
+#endif  // CONFIG_FILTERINTRA
+#if CONFIG_TX_SKIP
+        mbmi->tx_skip[1] = tx_skipped_uv[uv_tx];
+#endif  // CONFIG_TX_SKIP
+
+        rate2 = best_total_rate_y + rate_uv_intra[uv_tx];
+        distortion2 = best_distortion_y + distortion_uv;
+        x->skip = skippable;
+        if (skippable) {
+          // Back out the coefficient coding costs
+          rate2 -= (rate_y + rate_uv);
+          rate_y = 0;
+          rate_uv = 0;
+          // Cost the skip mb case
+          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+        } else {
+          // Add in the cost of the no skip flag.
+          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+        }
+
+        // Calculate the final RD estimate for this mode.
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      }
+
+      if (this_rd < best_rd) {
+        ctx->skip = x->skip;
+        ctx->skippable = skippable;
+        ctx->best_mode_index = 3;
+        mbmi->skip = skippable;
+        mbmi->mode = DC_PRED;
+        mbmi->ref_frame[0] = INTRA_FRAME;
+        mbmi->ref_frame[1] = NONE;
+        mbmi->palette_enabled[0] = 1;
+        mbmi->palette_size[0] = best_n;
+        mbmi->palette_run_length[0] = best_l;
+        mbmi->palette_scan_order[0] = best_ps;
+        vpx_memcpy(mbmi->palette_colors, best_palette,
+                   best_n * sizeof(best_palette[0]));
+        vpx_memcpy(mbmi->palette_runs, best_runs,
+                   best_l * sizeof(best_runs[0]));
+#if CONFIG_FILTERINTRA
+        mbmi->filterbit = 0;
+#endif  // CONFIG_FILTERINTRA
+#if CONFIG_TX_SKIP
+        mbmi->tx_skip[0] = tx_skipped_y;
+#endif  // CONFIG_TX_SKIP
+      } else {
+        *mbmi = mbmi_copy;
+      }
+      ctx->mic = *xd->mi[0].src_mi;
+    }
+  }
+
+  if (mbmi->palette_enabled[0]) {
+    int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+    int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+
+    run_lengh_decoding(mbmi->palette_runs, mbmi->palette_run_length[0],
+                       xd->palette_map_buffer);
+    palette_iscan(xd->plane[0].color_index_map, xd->palette_map_buffer,
+                  rows, cols, mbmi->palette_scan_order[0],
+                  xd->palette_scan_buffer);
+  }
+
+  if (mbmi->palette_enabled[1]) {
+    // Chroma map dimensions: the luma block size shifted down by the chroma
+    // plane's vertical (rows) and horizontal (cols) subsampling.
+    int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+        xd->plane[1].subsampling_y;
+    int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+        xd->plane[1].subsampling_x;
+
+    // Expand the chroma runs into palette_map_buffer, then (presumably)
+    // inverse-scan that buffer into the chroma color_index_map.
+    run_lengh_decoding(mbmi->palette_runs + PALETTE_MAX_RUNS,
+                       mbmi->palette_run_length[1], xd->palette_map_buffer);
+    palette_iscan(xd->plane[1].color_index_map, xd->palette_map_buffer,
+                  rows, cols, mbmi->palette_scan_order[1],
+                  xd->palette_scan_buffer);
+  }
+#endif  // CONFIG_PALETTE
 }
 
 void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,