]> granicus.if.org Git - libvpx/blob - vpx_dsp/x86/highbd_variance_sse2.c
Clean out more MSVC warnings
[libvpx] / vpx_dsp / x86 / highbd_variance_sse2.c
1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 #include "./vpx_config.h"
11
12 #include "vpx_ports/mem.h"
13
/*
 * Common signature of the fixed-size block kernels used in this file.
 * A kernel receives source and reference sample pointers with their strides
 * and reports its results through *sse and *sum (presumably the sum of
 * squared differences and the sum of differences for the block). The callers
 * in this file ignore the uint32_t return value.
 */
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);

/* 8x8 kernel; defined outside this file. */
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

/* 16x16 kernel; defined outside this file. */
uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);
25
26 static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
27                                    const uint16_t *ref, int ref_stride,
28                                    int w, int h, uint32_t *sse, int *sum,
29                                    high_variance_fn_t var_fn, int block_size) {
30   int i, j;
31
32   *sse = 0;
33   *sum = 0;
34
35   for (i = 0; i < h; i += block_size) {
36     for (j = 0; j < w; j += block_size) {
37       unsigned int sse0;
38       int sum0;
39       var_fn(src + src_stride * i + j, src_stride,
40              ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
41       *sse += sse0;
42       *sum += sum0;
43     }
44   }
45 }
46
47 static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
48                                     const uint16_t *ref, int ref_stride,
49                                     int w, int h, uint32_t *sse, int *sum,
50                                     high_variance_fn_t var_fn, int block_size) {
51   int i, j;
52   uint64_t sse_long = 0;
53   int32_t sum_long = 0;
54
55   for (i = 0; i < h; i += block_size) {
56     for (j = 0; j < w; j += block_size) {
57       unsigned int sse0;
58       int sum0;
59       var_fn(src + src_stride * i + j, src_stride,
60              ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
61       sse_long += sse0;
62       sum_long += sum0;
63     }
64   }
65   *sum = ROUND_POWER_OF_TWO(sum_long, 2);
66   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
67 }
68
69 static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
70                                     const uint16_t *ref, int ref_stride,
71                                     int w, int h, uint32_t *sse, int *sum,
72                                     high_variance_fn_t var_fn, int block_size) {
73   int i, j;
74   uint64_t sse_long = 0;
75   int32_t sum_long = 0;
76
77   for (i = 0; i < h; i += block_size) {
78     for (j = 0; j < w; j += block_size) {
79       unsigned int sse0;
80       int sum0;
81       var_fn(src + src_stride * i + j, src_stride,
82              ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
83       sse_long += sse0;
84       sum_long += sum0;
85     }
86   }
87   *sum = ROUND_POWER_OF_TWO(sum_long, 4);
88   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
89 }
90
91
/*
 * HIGH_GET_VAR(S) defines three exported S x S wrappers around
 * vpx_highbd_calc<S>x<S>var_sse2():
 *
 *   vpx_highbd_get<S>x<S>var_sse2     kernel results passed through as-is
 *   vpx_highbd_10_get<S>x<S>var_sse2  sum via ROUND_POWER_OF_TWO(., 2),
 *                                     sse via ROUND_POWER_OF_TWO(., 4)
 *   vpx_highbd_12_get<S>x<S>var_sse2  sum via ROUND_POWER_OF_TWO(., 4),
 *                                     sse via ROUND_POWER_OF_TWO(., 8)
 *
 * Each wrapper turns its uint8_t-typed src8/ref8 arguments into uint16_t
 * sample pointers with CONVERT_TO_SHORTPTR before calling the kernel.
 */
#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                       const uint8_t *ref8, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  vpx_highbd_calc##S##x##S##var_sse2(CONVERT_TO_SHORTPTR(src8), src_stride, \
                                     CONVERT_TO_SHORTPTR(ref8), ref_stride, \
                                     sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
  const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src16, src_stride, ref16, ref_stride, \
                                     sse, sum); \
  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
  const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src16, src_stride, ref16, ref_stride, \
                                     sse, sum); \
  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
}

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR
128
129 #define VAR_FN(w, h, block_size, shift) \
130 uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
131     const uint8_t *src8, int src_stride, \
132     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
133   int sum; \
134   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
135   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
136   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
137                          vpx_highbd_calc##block_size##x##block_size##var_sse2, \
138                          block_size); \
139   return *sse - (((int64_t)sum * sum) >> shift); \
140 } \
141 \
142 uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
143     const uint8_t *src8, int src_stride, \
144     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
145   int sum; \
146   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
147   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
148   highbd_10_variance_sse2( \
149       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
150       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
151   return *sse - (((int64_t)sum * sum) >> shift); \
152 } \
153 \
154 uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
155     const uint8_t *src8, int src_stride, \
156     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
157   int sum; \
158   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
159   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
160   highbd_12_variance_sse2( \
161       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
162       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
163   return *sse - (((int64_t)sum * sum) >> shift); \
164 }
165
166 VAR_FN(64, 64, 16, 12);
167 VAR_FN(64, 32, 16, 11);
168 VAR_FN(32, 64, 16, 11);
169 VAR_FN(32, 32, 16, 10);
170 VAR_FN(32, 16, 16, 9);
171 VAR_FN(16, 32, 16, 9);
172 VAR_FN(16, 16, 16, 8);
173 VAR_FN(16, 8, 8, 7);
174 VAR_FN(8, 16, 8, 7);
175 VAR_FN(8, 8, 8, 6);
176
177 #undef VAR_FN
178
179 unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
180                                       const uint8_t *ref8, int ref_stride,
181                                       unsigned int *sse) {
182   int sum;
183   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
184   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
185   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
186                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
187   return *sse;
188 }
189
190 unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
191                                          const uint8_t *ref8, int ref_stride,
192                                          unsigned int *sse) {
193   int sum;
194   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
195   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
196   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
197                           sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
198   return *sse;
199 }
200
201 unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
202                                          const uint8_t *ref8, int ref_stride,
203                                          unsigned int *sse) {
204   int sum;
205   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
206   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
207   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
208                           sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
209   return *sse;
210 }
211
212 unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
213                                     const uint8_t *ref8, int ref_stride,
214                                     unsigned int *sse) {
215   int sum;
216   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
217   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
218   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
219                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
220   return *sse;
221 }
222
223 unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
224                                        const uint8_t *ref8, int ref_stride,
225                                        unsigned int *sse) {
226   int sum;
227   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
228   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
229   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
230                           sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
231   return *sse;
232 }
233
234 unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
235                                        const uint8_t *ref8, int ref_stride,
236                                        unsigned int *sse) {
237   int sum;
238   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
239   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
240   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
241                           sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
242   return *sse;
243 }
244
245 #if CONFIG_USE_X86INC
/*
 * Prototypes for the column-strip sub-pixel variance kernels, which are
 * defined outside this file. DECL(w, opt) declares
 * vpx_highbd_sub_pixel_variance<w>xh_<opt>; the kernel takes the strip
 * height as an argument, writes through *sse and returns an int that the
 * wrappers below accumulate as the error sum.
 * DECLS declares the 8- and 16-wide kernels; its second argument is unused.
 */
#define DECL(w, opt) \
  int vpx_highbd_sub_pixel_variance##w##xh_##opt( \
      const uint16_t *src, ptrdiff_t src_stride, \
      int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, \
      int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
  DECL(8, opt1); \
  DECL(16, opt1)

DECLS(sse2, sse);
// TODO(johannkoenig): enable the ssse3 or delete
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
262
/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast) defines the exported w x h sub-pixel
 * variance functions for 8-, 10- and 12-bit data:
 *
 *   vpx_highbd_8_sub_pixel_variance<w>x<h>_<opt>
 *   vpx_highbd_10_sub_pixel_variance<w>x<h>_<opt>
 *   vpx_highbd_12_sub_pixel_variance<w>x<h>_<opt>
 *
 * w, h          block dimensions in samples
 * wf            width of the column-strip kernel used (8 or 16)
 * wlog2, hlog2  log2 of w and h; only their sum is used, as the shift that
 *               divides se * se by the sample count
 * opt           instruction-set suffix of the kernel to call
 * cast          cast applied to se before squaring (widens the product)
 *
 * The block is covered by calling the wf-wide kernel once per column strip,
 * at sample offsets 0, wf, 2 * wf, ... below w. For the sizes instantiated
 * here this is the same set of calls as offsets 0/16/32/48 with wf == 16 and
 * a single call with wf == 8.
 *
 * The 12-bit variant additionally walks the block in bands of at most 16
 * rows and accumulates sse in 64 bits before scaling it down.
 *
 * Each function stores the (bit-depth scaled) sse in *sse_ptr and returns
 * sse - ((se * se) >> (wlog2 + hlog2)). In the 10- and 12-bit variants se
 * and sse are rounded independently, so that difference can come out
 * slightly negative; it is computed in 64 bits and clamped at 0 rather than
 * being allowed to wrap to a value near UINT32_MAX.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  int col; \
  int se = 0; \
  uint32_t sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  for (col = 0; col < (w); col += (wf)) { \
    uint32_t sse2; \
    se += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, h, &sse2); \
    sse += sse2; \
  } \
  *sse_ptr = sse; \
  return (uint32_t)(sse - ((cast se * se) >> ((wlog2) + (hlog2)))); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  int col; \
  int se = 0; \
  uint32_t sse = 0; \
  int64_t var; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  for (col = 0; col < (w); col += (wf)) { \
    uint32_t sse2; \
    se += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, h, &sse2); \
    sse += sse2; \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> ((wlog2) + (hlog2))); \
  return (var >= 0) ? (uint32_t)var : 0; \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  int start_row; \
  int col; \
  uint32_t sse; \
  int64_t var; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  for (start_row = 0; start_row < (h); start_row += 16) { \
    const int height = (h) - start_row < 16 ? (h) - start_row : 16; \
    for (col = 0; col < (w); col += (wf)) { \
      uint32_t sse2; \
      se += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
          src + col + (start_row * src_stride), src_stride, \
          x_offset, y_offset, dst + col + (start_row * dst_stride), \
          dst_stride, height, &sse2); \
      long_sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> ((wlog2) + (hlog2))); \
  return (var >= 0) ? (uint32_t)var : 0; \
}

/* Instantiates FN for every supported block size; opt2 is unused. */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));


FNS(sse2, sse);

#undef FNS
#undef FN
412
/*
 * Prototypes for the column-strip sub-pixel averaging variance kernels,
 * which are defined outside this file. DECL(w, opt) declares
 * vpx_highbd_sub_pixel_avg_variance<w>xh_<opt>; compared with the plain
 * sub-pixel kernel it takes an extra sample buffer (sec) with its own
 * stride. DECLS declares the 16- and 8-wide kernels.
 */
#define DECL(w, opt) \
  int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \
      const uint16_t *src, ptrdiff_t src_stride, \
      int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, \
      const uint16_t *sec, ptrdiff_t sec_stride, \
      int height, unsigned int *sse)
#define DECLS(opt1) \
  DECL(16, opt1); \
  DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS
430
/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast) defines the exported w x h sub-pixel
 * averaging variance functions for 8-, 10- and 12-bit data:
 *
 *   vpx_highbd_8_sub_pixel_avg_variance<w>x<h>_<opt>
 *   vpx_highbd_10_sub_pixel_avg_variance<w>x<h>_<opt>
 *   vpx_highbd_12_sub_pixel_avg_variance<w>x<h>_<opt>
 *
 * w, h          block dimensions in samples
 * wf            width of the column-strip kernel used (8 or 16)
 * wlog2, hlog2  log2 of w and h; only their sum is used, as the shift that
 *               divides se * se by the sample count
 * opt           instruction-set suffix of the kernel to call
 * cast          cast applied to se before squaring (widens the product)
 *
 * sec8 is passed to the kernel with a stride of w samples.
 *
 * The block is covered by calling the wf-wide kernel once per column strip,
 * at sample offsets 0, wf, 2 * wf, ... below w. For the sizes instantiated
 * here this is the same set of calls as offsets 0/16/32/48 with wf == 16 and
 * a single call with wf == 8.
 *
 * The 12-bit variant additionally walks the block in bands of at most 16
 * rows and accumulates sse in 64 bits before scaling it down.
 *
 * Each function stores the (bit-depth scaled) sse in *sse_ptr and returns
 * sse - ((se * se) >> (wlog2 + hlog2)). In the 10- and 12-bit variants se
 * and sse are rounded independently, so that difference can come out
 * slightly negative; it is computed in 64 bits and clamped at 0 rather than
 * being allowed to wrap to a value near UINT32_MAX.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  int col; \
  int se = 0; \
  uint32_t sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  for (col = 0; col < (w); col += (wf)) { \
    uint32_t sse2; \
    se += vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, sec + col, w, h, &sse2); \
    sse += sse2; \
  } \
  *sse_ptr = sse; \
  return (uint32_t)(sse - ((cast se * se) >> ((wlog2) + (hlog2)))); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  int col; \
  int se = 0; \
  uint32_t sse = 0; \
  int64_t var; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  for (col = 0; col < (w); col += (wf)) { \
    uint32_t sse2; \
    se += vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, sec + col, w, h, &sse2); \
    sse += sse2; \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> ((wlog2) + (hlog2))); \
  return (var >= 0) ? (uint32_t)var : 0; \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  int start_row; \
  int col; \
  uint32_t sse; \
  int64_t var; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  for (start_row = 0; start_row < (h); start_row += 16) { \
    const int height = (h) - start_row < 16 ? (h) - start_row : 16; \
    for (col = 0; col < (w); col += (wf)) { \
      uint32_t sse2; \
      se += vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + col + (start_row * src_stride), src_stride, \
          x_offset, y_offset, \
          dst + col + (start_row * dst_stride), dst_stride, \
          sec + col + (start_row * w), w, height, &sse2); \
      long_sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> ((wlog2) + (hlog2))); \
  return (var >= 0) ? (uint32_t)var : 0; \
}


/* Instantiates FN for every supported block size. */
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));

FNS(sse2);

#undef FNS
#undef FN
581 #endif  // CONFIG_USE_X86INC