2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "./vpx_dsp_rtcd.h"
12 #include "./macros_msa.h"
14 uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
22 uint64_t src0, src1, src2, src3;
26 LD4(src, src_stride, src0, src1, src2, src3);
27 INSERT_D2_SH(src0, src1, diff0);
28 INSERT_D2_SH(src2, src3, diff1);
29 DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1);
31 res0 = __msa_hadd_s_d(mul0, mul0);
32 res0 += __msa_splati_d(res0, 1);
33 ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
34 } else if (8 == size) {
35 v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
37 LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
38 DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
39 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
40 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
41 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
43 res0 = __msa_hadd_s_d(mul0, mul0);
44 res0 += __msa_splati_d(res0, 1);
45 ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
46 } else if (16 == size) {
47 v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
49 LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
50 DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
51 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
52 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
53 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
54 LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
55 src += 8 * src_stride;
56 DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
57 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
58 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
59 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
60 LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
61 DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
62 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
63 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
64 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
65 LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
66 DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
67 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
68 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
69 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
71 res0 += __msa_hadd_s_d(mul0, mul0);
73 res0 += __msa_splati_d(res0, 1);
74 ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
75 } else if (0 == (size % 16)) {
76 v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
78 for (row = 0; row < (size >> 4); row++) {
79 for (col = 0; col < size; col += 16) {
80 const int16_t *src_ptr = src + col;
81 LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
83 DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
84 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
85 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
86 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
87 LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
89 src_ptr += 8 * src_stride;
90 DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
91 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
92 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
93 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
94 LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
96 DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
97 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
98 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
99 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
100 LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
102 DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
103 DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
104 DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
105 DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
107 res0 += __msa_hadd_s_d(mul0, mul0);
110 src += 16 * src_stride;
113 res0 += __msa_splati_d(res0, 1);
114 ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
118 for (row = 0; row < size; row++) {
119 for (col = 0; col < size; col++) {