granicus.if.org Git - libvpx/blob - vpx_dsp/x86/highbd_variance_sse2.c

   1 /*
   2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10 #include "./vpx_config.h"
  11
  12 #include "vpx_ports/mem.h"
  13
  14 typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
  15                                         const uint16_t *ref, int ref_stride,
  16                                         uint32_t *sse, int *sum);
  17
  18 uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
  19                                     const uint16_t *ref, int ref_stride,
  20                                     uint32_t *sse, int *sum);
  21
  22 uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
  23                                       const uint16_t *ref, int ref_stride,
  24                                       uint32_t *sse, int *sum);
  25
  26 static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
  27                                    const uint16_t *ref, int ref_stride,
  28                                    int w, int h, uint32_t *sse, int *sum,
  29                                    high_variance_fn_t var_fn, int block_size) {
  30   int i, j;
  31
  32   *sse = 0;
  33   *sum = 0;
  34
  35   for (i = 0; i < h; i += block_size) {
  36     for (j = 0; j < w; j += block_size) {
  37       unsigned int sse0;
  38       int sum0;
  39       var_fn(src + src_stride * i + j, src_stride,
  40              ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
  41       *sse += sse0;
  42       *sum += sum0;
  43     }
  44   }
  45 }
  46
  47 static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
  48                                     const uint16_t *ref, int ref_stride,
  49                                     int w, int h, uint32_t *sse, int *sum,
  50                                     high_variance_fn_t var_fn, int block_size) {
  51   int i, j;
  52   uint64_t sse_long = 0;
  53   int32_t sum_long = 0;
  54
  55   for (i = 0; i < h; i += block_size) {
  56     for (j = 0; j < w; j += block_size) {
  57       unsigned int sse0;
  58       int sum0;
  59       var_fn(src + src_stride * i + j, src_stride,
  60              ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
  61       sse_long += sse0;
  62       sum_long += sum0;
  63     }
  64   }
  65   *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  66   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  67 }
  68
  69 static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
  70                                     const uint16_t *ref, int ref_stride,
  71                                     int w, int h, uint32_t *sse, int *sum,
  72                                     high_variance_fn_t var_fn, int block_size) {
  73   int i, j;
  74   uint64_t sse_long = 0;
  75   int32_t sum_long = 0;
  76
  77   for (i = 0; i < h; i += block_size) {
  78     for (j = 0; j < w; j += block_size) {
  79       unsigned int sse0;
  80       int sum0;
  81       var_fn(src + src_stride * i + j, src_stride,
  82              ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
  83       sse_long += sse0;
  84       sum_long += sum0;
  85     }
  86   }
  87   *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  88   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  89 }
  90
  91
  92 #define HIGH_GET_VAR(S) \
  93 void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
  94                                        const uint8_t *ref8, int ref_stride, \
  95                                        uint32_t *sse, int *sum) { \
  96   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  97   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  98   vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
  99                                      sse, sum); \
 100 } \
 101 \
 102 void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
 103                                           const uint8_t *ref8, int ref_stride, \
 104                                           uint32_t *sse, int *sum) { \
 105   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 106   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
 107   vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
 108                                      sse, sum); \
 109   *sum = ROUND_POWER_OF_TWO(*sum, 2); \
 110   *sse = ROUND_POWER_OF_TWO(*sse, 4); \
 111 } \
 112 \
 113 void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
 114                                           const uint8_t *ref8, int ref_stride, \
 115                                           uint32_t *sse, int *sum) { \
 116   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 117   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
 118   vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
 119                                      sse, sum); \
 120   *sum = ROUND_POWER_OF_TWO(*sum, 4); \
 121   *sse = ROUND_POWER_OF_TWO(*sse, 8); \
 122 }
 123
 124 HIGH_GET_VAR(16);
 125 HIGH_GET_VAR(8);
 126
 127 #undef HIGH_GET_VAR
 128
 129 #define VAR_FN(w, h, block_size, shift) \
 130 uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
 131     const uint8_t *src8, int src_stride, \
 132     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
 133   int sum; \
 134   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 135   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
 136   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
 137                          vpx_highbd_calc##block_size##x##block_size##var_sse2, \
 138                          block_size); \
 139   return *sse - (((int64_t)sum * sum) >> shift); \
 140 } \
 141 \
 142 uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
 143     const uint8_t *src8, int src_stride, \
 144     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
 145   int sum; \
 146   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 147   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
 148   highbd_10_variance_sse2( \
 149       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
 150       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
 151   return *sse - (((int64_t)sum * sum) >> shift); \
 152 } \
 153 \
 154 uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
 155     const uint8_t *src8, int src_stride, \
 156     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
 157   int sum; \
 158   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 159   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
 160   highbd_12_variance_sse2( \
 161       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
 162       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
 163   return *sse - (((int64_t)sum * sum) >> shift); \
 164 }
 165
 166 VAR_FN(64, 64, 16, 12);
 167 VAR_FN(64, 32, 16, 11);
 168 VAR_FN(32, 64, 16, 11);
 169 VAR_FN(32, 32, 16, 10);
 170 VAR_FN(32, 16, 16, 9);
 171 VAR_FN(16, 32, 16, 9);
 172 VAR_FN(16, 16, 16, 8);
 173 VAR_FN(16, 8, 8, 7);
 174 VAR_FN(8, 16, 8, 7);
 175 VAR_FN(8, 8, 8, 6);
 176
 177 #undef VAR_FN
 178
 179 unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
 180                                       const uint8_t *ref8, int ref_stride,
 181                                       unsigned int *sse) {
 182   int sum;
 183   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 184   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 185   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
 186                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
 187   return *sse;
 188 }
 189
 190 unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
 191                                          const uint8_t *ref8, int ref_stride,
 192                                          unsigned int *sse) {
 193   int sum;
 194   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 195   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 196   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
 197                           sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
 198   return *sse;
 199 }
 200
 201 unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
 202                                          const uint8_t *ref8, int ref_stride,
 203                                          unsigned int *sse) {
 204   int sum;
 205   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 206   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 207   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
 208                           sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
 209   return *sse;
 210 }
 211
 212 unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
 213                                     const uint8_t *ref8, int ref_stride,
 214                                     unsigned int *sse) {
 215   int sum;
 216   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 217   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 218   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
 219                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
 220   return *sse;
 221 }
 222
 223 unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
 224                                        const uint8_t *ref8, int ref_stride,
 225                                        unsigned int *sse) {
 226   int sum;
 227   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 228   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 229   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
 230                           sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
 231   return *sse;
 232 }
 233
 234 unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
 235                                        const uint8_t *ref8, int ref_stride,
 236                                        unsigned int *sse) {
 237   int sum;
 238   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 239   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 240   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
 241                           sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
 242   return *sse;
 243 }
 244
 245 #if CONFIG_USE_X86INC
 246 #define DECL(w, opt) \
 247   int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
 248                                                  ptrdiff_t src_stride, \
 249                                                  int x_offset, int y_offset, \
 250                                                  const uint16_t *dst, \
 251                                                  ptrdiff_t dst_stride, \
 252                                                  int height, unsigned int *sse);
 253 #define DECLS(opt1, opt2) \
 254   DECL(8, opt1); \
 255   DECL(16, opt1)
 256
 257 DECLS(sse2, sse);
 258 // TODO(johannkoenig): enable the ssse3 or delete
 259 // DECLS(ssse3, ssse3);
 260 #undef DECLS
 261 #undef DECL
 262
 263 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
 264 uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
 265                                                           int src_stride, \
 266                                                           int x_offset, \
 267                                                           int y_offset, \
 268                                                           const uint8_t *dst8, \
 269                                                           int dst_stride, \
 270                                                           uint32_t *sse_ptr) { \
 271   uint32_t sse; \
 272   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 273   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
 274   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
 275                                                        x_offset, y_offset, \
 276                                                        dst, dst_stride, h, \
 277                                                        &sse); \
 278   if (w > wf) { \
 279     unsigned int sse2; \
 280     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
 281                                                           src_stride, \
 282                                                           x_offset, y_offset, \
 283                                                           dst + 16, \
 284                                                           dst_stride, \
 285                                                           h, &sse2); \
 286     se += se2; \
 287     sse += sse2; \
 288     if (w > wf * 2) { \
 289       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
 290                                                         x_offset, y_offset, \
 291                                                         dst + 32, dst_stride, \
 292                                                         h, &sse2); \
 293       se += se2; \
 294       sse += sse2; \
 295       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
 296           src + 48, src_stride, x_offset, y_offset, \
 297           dst + 48, dst_stride, h, &sse2); \
 298       se += se2; \
 299       sse += sse2; \
 300     } \
 301   } \
 302   *sse_ptr = sse; \
 303   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 304 } \
 305 \
 306 uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
 307     const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
 308     const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
 309   uint32_t sse; \
 310   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 311   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
 312   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
 313                                                        x_offset, y_offset, \
 314                                                        dst, dst_stride, \
 315                                                        h, &sse); \
 316   if (w > wf) { \
 317     uint32_t sse2; \
 318     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
 319                                                           src_stride, \
 320                                                           x_offset, y_offset, \
 321                                                           dst + 16, \
 322                                                           dst_stride, \
 323                                                           h, &sse2); \
 324     se += se2; \
 325     sse += sse2; \
 326     if (w > wf * 2) { \
 327       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
 328                                                         x_offset, y_offset, \
 329                                                         dst + 32, dst_stride, \
 330                                                         h, &sse2); \
 331       se += se2; \
 332       sse += sse2; \
 333       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
 334                                                         x_offset, y_offset, \
 335                                                         dst + 48, dst_stride, \
 336                                                         h, &sse2); \
 337       se += se2; \
 338       sse += sse2; \
 339     } \
 340   } \
 341   se = ROUND_POWER_OF_TWO(se, 2); \
 342   sse = ROUND_POWER_OF_TWO(sse, 4); \
 343   *sse_ptr = sse; \
 344   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 345 } \
 346 \
 347 uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
 348     const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
 349     const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
 350   int start_row; \
 351   uint32_t sse; \
 352   int se = 0; \
 353   uint64_t long_sse = 0; \
 354   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 355   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
 356   for (start_row = 0; start_row < h; start_row +=16) { \
 357     uint32_t sse2; \
 358     int height = h - start_row < 16 ? h - start_row : 16; \
 359     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
 360         src + (start_row * src_stride), src_stride, \
 361         x_offset, y_offset, dst + (start_row * dst_stride), \
 362         dst_stride, height, &sse2); \
 363     se += se2; \
 364     long_sse += sse2; \
 365     if (w > wf) { \
 366       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
 367           src + 16 + (start_row * src_stride), src_stride, \
 368           x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
 369           dst_stride, height, &sse2); \
 370       se += se2; \
 371       long_sse += sse2; \
 372       if (w > wf * 2) { \
 373         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
 374             src + 32 + (start_row * src_stride), src_stride, \
 375             x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
 376             dst_stride, height, &sse2); \
 377         se += se2; \
 378         long_sse += sse2; \
 379         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
 380             src + 48 + (start_row * src_stride), src_stride, \
 381             x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
 382             dst_stride, height, &sse2); \
 383         se += se2; \
 384         long_sse += sse2; \
 385       }\
 386     } \
 387   } \
 388   se = ROUND_POWER_OF_TWO(se, 4); \
 389   sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
 390   *sse_ptr = sse; \
 391   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 392 }
 393
 394 #define FNS(opt1, opt2) \
 395 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
 396 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
 397 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
 398 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
 399 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
 400 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
 401 FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
 402 FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
 403 FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
 404 FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
 405 FN(8, 4, 8, 3, 2, opt1, (int64_t));
 406
 407
 408 FNS(sse2, sse);
 409
 410 #undef FNS
 411 #undef FN
 412
 413 #define DECL(w, opt) \
 414 int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
 415                                                    ptrdiff_t src_stride, \
 416                                                    int x_offset, int y_offset, \
 417                                                    const uint16_t *dst, \
 418                                                    ptrdiff_t dst_stride, \
 419                                                    const uint16_t *sec, \
 420                                                    ptrdiff_t sec_stride, \
 421                                                    int height, \
 422                                                    unsigned int *sse);
 423 #define DECLS(opt1) \
 424 DECL(16, opt1) \
 425 DECL(8, opt1)
 426
 427 DECLS(sse2);
 428 #undef DECL
 429 #undef DECLS
 430
 431 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
 432 uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
 433     const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
 434     const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
 435     const uint8_t *sec8) { \
 436   uint32_t sse; \
 437   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 438   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
 439   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
 440   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 441                src, src_stride, x_offset, \
 442                y_offset, dst, dst_stride, sec, w, h, &sse); \
 443   if (w > wf) { \
 444     uint32_t sse2; \
 445     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 446                   src + 16, src_stride, x_offset, y_offset, \
 447                   dst + 16, dst_stride, sec + 16, w, h, &sse2); \
 448     se += se2; \
 449     sse += sse2; \
 450     if (w > wf * 2) { \
 451       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 452                 src + 32, src_stride, x_offset, y_offset, \
 453                 dst + 32, dst_stride, sec + 32, w, h, &sse2); \
 454       se += se2; \
 455       sse += sse2; \
 456       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 457                 src + 48, src_stride, x_offset, y_offset, \
 458                 dst + 48, dst_stride, sec + 48, w, h, &sse2); \
 459       se += se2; \
 460       sse += sse2; \
 461     } \
 462   } \
 463   *sse_ptr = sse; \
 464   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 465 } \
 466 \
 467 uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
 468     const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
 469     const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
 470     const uint8_t *sec8) { \
 471   uint32_t sse; \
 472   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 473   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
 474   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
 475   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 476                                             src, src_stride, x_offset, \
 477                                             y_offset, dst, dst_stride, \
 478                                             sec, w, h, &sse); \
 479   if (w > wf) { \
 480     uint32_t sse2; \
 481     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 482                                             src + 16, src_stride, \
 483                                             x_offset, y_offset, \
 484                                             dst + 16, dst_stride, \
 485                                             sec + 16, w, h, &sse2); \
 486     se += se2; \
 487     sse += sse2; \
 488     if (w > wf * 2) { \
 489       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 490                                             src + 32, src_stride, \
 491                                             x_offset, y_offset, \
 492                                             dst + 32, dst_stride, \
 493                                             sec + 32, w, h, &sse2); \
 494       se += se2; \
 495       sse += sse2; \
 496       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 497                                             src + 48, src_stride, \
 498                                             x_offset, y_offset, \
 499                                             dst + 48, dst_stride, \
 500                                             sec + 48, w, h, &sse2); \
 501       se += se2; \
 502       sse += sse2; \
 503     } \
 504   } \
 505   se = ROUND_POWER_OF_TWO(se, 2); \
 506   sse = ROUND_POWER_OF_TWO(sse, 4); \
 507   *sse_ptr = sse; \
 508   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 509 } \
 510 \
 511 uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
 512     const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
 513     const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
 514     const uint8_t *sec8) { \
 515   int start_row; \
 516   uint32_t sse; \
 517   int se = 0; \
 518   uint64_t long_sse = 0; \
 519   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
 520   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
 521   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
 522   for (start_row = 0; start_row < h; start_row +=16) { \
 523     uint32_t sse2; \
 524     int height = h - start_row < 16 ? h - start_row : 16; \
 525     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 526                 src + (start_row * src_stride), src_stride, x_offset, \
 527                 y_offset, dst + (start_row * dst_stride), dst_stride, \
 528                 sec + (start_row * w), w, height, &sse2); \
 529     se += se2; \
 530     long_sse += sse2; \
 531     if (w > wf) { \
 532       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 533                 src + 16 + (start_row * src_stride), src_stride, \
 534                 x_offset, y_offset, \
 535                 dst + 16 + (start_row * dst_stride), dst_stride, \
 536                 sec + 16 + (start_row * w), w, height, &sse2); \
 537       se += se2; \
 538       long_sse += sse2; \
 539       if (w > wf * 2) { \
 540         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 541                 src + 32 + (start_row * src_stride), src_stride, \
 542                 x_offset, y_offset, \
 543                 dst + 32 + (start_row * dst_stride), dst_stride, \
 544                 sec + 32 + (start_row * w), w, height, &sse2); \
 545         se += se2; \
 546         long_sse += sse2; \
 547         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
 548                 src + 48 + (start_row * src_stride), src_stride, \
 549                 x_offset, y_offset, \
 550                 dst + 48 + (start_row * dst_stride), dst_stride, \
 551                 sec + 48 + (start_row * w), w, height, &sse2); \
 552         se += se2; \
 553         long_sse += sse2; \
 554       } \
 555     } \
 556   } \
 557   se = ROUND_POWER_OF_TWO(se, 4); \
 558   sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
 559   *sse_ptr = sse; \
 560   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 561 }
 562
 563
 564 #define FNS(opt1) \
 565 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
 566 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
 567 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
 568 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
 569 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
 570 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
 571 FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
 572 FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
 573 FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
 574 FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
 575 FN(8, 4, 8, 3, 2, opt1, (int64_t));
 576
 577 FNS(sse2);
 578
 579 #undef FNS
 580 #undef FN
 581 #endif  // CONFIG_USE_X86INC