; libvpx — vp8/encoder/x86/sad_ssse3.asm
; (captured from a gitweb view; commit subject: "safety check to avoid divide by 0s")
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

14 %idefine QWORD
15
16 %macro PROCESS_16X2X3 1
17 %if %1
18         movdqa          xmm0,       [rsi]
19         lddqu           xmm5,       [rdi]
20         lddqu           xmm6,       [rdi+1]
21         lddqu           xmm7,       [rdi+2]
22
23         psadbw          xmm5,       xmm0
24         psadbw          xmm6,       xmm0
25         psadbw          xmm7,       xmm0
26 %else
27         movdqa          xmm0,       [rsi]
28         lddqu           xmm1,       [rdi]
29         lddqu           xmm2,       [rdi+1]
30         lddqu           xmm3,       [rdi+2]
31
32         psadbw          xmm1,       xmm0
33         psadbw          xmm2,       xmm0
34         psadbw          xmm3,       xmm0
35
36         paddw           xmm5,       xmm1
37         paddw           xmm6,       xmm2
38         paddw           xmm7,       xmm3
39 %endif
40         movdqa          xmm0,       QWORD PTR [rsi+rax]
41         lddqu           xmm1,       QWORD PTR [rdi+rdx]
42         lddqu           xmm2,       QWORD PTR [rdi+rdx+1]
43         lddqu           xmm3,       QWORD PTR [rdi+rdx+2]
44
45         lea             rsi,        [rsi+rax*2]
46         lea             rdi,        [rdi+rdx*2]
47
48         psadbw          xmm1,       xmm0
49         psadbw          xmm2,       xmm0
50         psadbw          xmm3,       xmm0
51
52         paddw           xmm5,       xmm1
53         paddw           xmm6,       xmm2
54         paddw           xmm7,       xmm3
55 %endmacro
56
57 %macro PROCESS_16X2X3_OFFSET 2
58 %if %1
59         movdqa          xmm0,       [rsi]
60         movdqa          xmm4,       [rdi]
61         movdqa          xmm7,       [rdi+16]
62
63         movdqa          xmm5,       xmm7
64         palignr         xmm5,       xmm4,       %2
65
66         movdqa          xmm6,       xmm7
67         palignr         xmm6,       xmm4,       (%2+1)
68
69         palignr         xmm7,       xmm4,       (%2+2)
70
71         psadbw          xmm5,       xmm0
72         psadbw          xmm6,       xmm0
73         psadbw          xmm7,       xmm0
74 %else
75         movdqa          xmm0,       [rsi]
76         movdqa          xmm4,       [rdi]
77         movdqa          xmm3,       [rdi+16]
78
79         movdqa          xmm1,       xmm3
80         palignr         xmm1,       xmm4,       %2
81
82         movdqa          xmm2,       xmm3
83         palignr         xmm2,       xmm4,       (%2+1)
84
85         palignr         xmm3,       xmm4,       (%2+2)
86
87         psadbw          xmm1,       xmm0
88         psadbw          xmm2,       xmm0
89         psadbw          xmm3,       xmm0
90
91         paddw           xmm5,       xmm1
92         paddw           xmm6,       xmm2
93         paddw           xmm7,       xmm3
94 %endif
95         movdqa          xmm0,       QWORD PTR [rsi+rax]
96         movdqa          xmm4,       QWORD PTR [rdi+rdx]
97         movdqa          xmm3,       QWORD PTR [rdi+rdx+16]
98
99         movdqa          xmm1,       xmm3
100         palignr         xmm1,       xmm4,       %2
101
102         movdqa          xmm2,       xmm3
103         palignr         xmm2,       xmm4,       (%2+1)
104
105         palignr         xmm3,       xmm4,       (%2+2)
106
107         lea             rsi,        [rsi+rax*2]
108         lea             rdi,        [rdi+rdx*2]
109
110         psadbw          xmm1,       xmm0
111         psadbw          xmm2,       xmm0
112         psadbw          xmm3,       xmm0
113
114         paddw           xmm5,       xmm1
115         paddw           xmm6,       xmm2
116         paddw           xmm7,       xmm3
117 %endmacro
118
119 %macro PROCESS_16X16X3_OFFSET 2
120 %2_aligned_by_%1:
121
122         sub             rdi,        %1
123
124         PROCESS_16X2X3_OFFSET 1, %1
125         PROCESS_16X2X3_OFFSET 0, %1
126         PROCESS_16X2X3_OFFSET 0, %1
127         PROCESS_16X2X3_OFFSET 0, %1
128         PROCESS_16X2X3_OFFSET 0, %1
129         PROCESS_16X2X3_OFFSET 0, %1
130         PROCESS_16X2X3_OFFSET 0, %1
131         PROCESS_16X2X3_OFFSET 0, %1
132
133         jmp             %2_store_off
134
135 %endmacro
136
137 %macro PROCESS_16X8X3_OFFSET 2
138 %2_aligned_by_%1:
139
140         sub             rdi,        %1
141
142         PROCESS_16X2X3_OFFSET 1, %1
143         PROCESS_16X2X3_OFFSET 0, %1
144         PROCESS_16X2X3_OFFSET 0, %1
145         PROCESS_16X2X3_OFFSET 0, %1
146
147         jmp             %2_store_off
148
149 %endmacro
150
151 ;void int vp8_sad16x16x3_ssse3(
152 ;    unsigned char *src_ptr,
153 ;    int  src_stride,
154 ;    unsigned char *ref_ptr,
155 ;    int  ref_stride,
156 ;    int  *results)
157 global sym(vp8_sad16x16x3_ssse3)
158 sym(vp8_sad16x16x3_ssse3):
159     push        rbp
160     mov         rbp, rsp
161     SHADOW_ARGS_TO_STACK 5
162     push        rsi
163     push        rdi
164     push        rcx
165     ; end prolog
166
167         mov             rsi,        arg(0) ;src_ptr
168         mov             rdi,        arg(2) ;ref_ptr
169
170         mov             rdx,        0xf
171         and             rdx,        rdi
172
173         jmp vp8_sad16x16x3_ssse3_skiptable
174 vp8_sad16x16x3_ssse3_jumptable:
175         dd vp8_sad16x16x3_ssse3_aligned_by_0  - vp8_sad16x16x3_ssse3_do_jump
176         dd vp8_sad16x16x3_ssse3_aligned_by_1  - vp8_sad16x16x3_ssse3_do_jump
177         dd vp8_sad16x16x3_ssse3_aligned_by_2  - vp8_sad16x16x3_ssse3_do_jump
178         dd vp8_sad16x16x3_ssse3_aligned_by_3  - vp8_sad16x16x3_ssse3_do_jump
179         dd vp8_sad16x16x3_ssse3_aligned_by_4  - vp8_sad16x16x3_ssse3_do_jump
180         dd vp8_sad16x16x3_ssse3_aligned_by_5  - vp8_sad16x16x3_ssse3_do_jump
181         dd vp8_sad16x16x3_ssse3_aligned_by_6  - vp8_sad16x16x3_ssse3_do_jump
182         dd vp8_sad16x16x3_ssse3_aligned_by_7  - vp8_sad16x16x3_ssse3_do_jump
183         dd vp8_sad16x16x3_ssse3_aligned_by_8  - vp8_sad16x16x3_ssse3_do_jump
184         dd vp8_sad16x16x3_ssse3_aligned_by_9  - vp8_sad16x16x3_ssse3_do_jump
185         dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
186         dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
187         dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
188         dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
189         dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
190         dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
191 vp8_sad16x16x3_ssse3_skiptable:
192
193         call vp8_sad16x16x3_ssse3_do_jump
194 vp8_sad16x16x3_ssse3_do_jump:
195         pop             rcx                         ; get the address of do_jump
196         mov             rax,  vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
197         add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
198
199         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
200         add             rcx,        rax
201
202         movsxd          rax,        dword ptr arg(1) ;src_stride
203         movsxd          rdx,        dword ptr arg(3) ;ref_stride
204
205         jmp             rcx
206
207         PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
208         PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
209         PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
210         PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
211         PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
212         PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
213         PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
214         PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
215         PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
216         PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
217         PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
218         PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
219         PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
220         PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
221         PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
222
223 vp8_sad16x16x3_ssse3_aligned_by_15:
224         PROCESS_16X2X3 1
225         PROCESS_16X2X3 0
226         PROCESS_16X2X3 0
227         PROCESS_16X2X3 0
228         PROCESS_16X2X3 0
229         PROCESS_16X2X3 0
230         PROCESS_16X2X3 0
231         PROCESS_16X2X3 0
232
233 vp8_sad16x16x3_ssse3_store_off:
234         mov             rdi,        arg(4) ;Results
235
236         movq            xmm0,       xmm5
237         psrldq          xmm5,       8
238
239         paddw           xmm0,       xmm5
240         movd            [rdi],      xmm0
241 ;-
242         movq            xmm0,       xmm6
243         psrldq          xmm6,       8
244
245         paddw           xmm0,       xmm6
246         movd            [rdi+4],    xmm0
247 ;-
248         movq            xmm0,       xmm7
249         psrldq          xmm7,       8
250
251         paddw           xmm0,       xmm7
252         movd            [rdi+8],    xmm0
253
254     ; begin epilog
255     pop         rcx
256     pop         rdi
257     pop         rsi
258     UNSHADOW_ARGS
259     pop         rbp
260     ret
261
262 ;void int vp8_sad16x8x3_ssse3(
263 ;    unsigned char *src_ptr,
264 ;    int  src_stride,
265 ;    unsigned char *ref_ptr,
266 ;    int  ref_stride,
267 ;    int  *results)
268 global sym(vp8_sad16x8x3_ssse3)
269 sym(vp8_sad16x8x3_ssse3):
270     push        rbp
271     mov         rbp, rsp
272     SHADOW_ARGS_TO_STACK 5
273     push        rsi
274     push        rdi
275     push        rcx
276     ; end prolog
277
278         mov             rsi,        arg(0) ;src_ptr
279         mov             rdi,        arg(2) ;ref_ptr
280
281         mov             rdx,        0xf
282         and             rdx,        rdi
283
284         jmp vp8_sad16x8x3_ssse3_skiptable
285 vp8_sad16x8x3_ssse3_jumptable:
286         dd vp8_sad16x8x3_ssse3_aligned_by_0  - vp8_sad16x8x3_ssse3_do_jump
287         dd vp8_sad16x8x3_ssse3_aligned_by_1  - vp8_sad16x8x3_ssse3_do_jump
288         dd vp8_sad16x8x3_ssse3_aligned_by_2  - vp8_sad16x8x3_ssse3_do_jump
289         dd vp8_sad16x8x3_ssse3_aligned_by_3  - vp8_sad16x8x3_ssse3_do_jump
290         dd vp8_sad16x8x3_ssse3_aligned_by_4  - vp8_sad16x8x3_ssse3_do_jump
291         dd vp8_sad16x8x3_ssse3_aligned_by_5  - vp8_sad16x8x3_ssse3_do_jump
292         dd vp8_sad16x8x3_ssse3_aligned_by_6  - vp8_sad16x8x3_ssse3_do_jump
293         dd vp8_sad16x8x3_ssse3_aligned_by_7  - vp8_sad16x8x3_ssse3_do_jump
294         dd vp8_sad16x8x3_ssse3_aligned_by_8  - vp8_sad16x8x3_ssse3_do_jump
295         dd vp8_sad16x8x3_ssse3_aligned_by_9  - vp8_sad16x8x3_ssse3_do_jump
296         dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
297         dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
298         dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
299         dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
300         dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
301         dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
302 vp8_sad16x8x3_ssse3_skiptable:
303
304         call vp8_sad16x8x3_ssse3_do_jump
305 vp8_sad16x8x3_ssse3_do_jump:
306         pop             rcx                         ; get the address of do_jump
307         mov             rax,  vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
308         add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
309
310         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
311         add             rcx,        rax
312
313         movsxd          rax,        dword ptr arg(1) ;src_stride
314         movsxd          rdx,        dword ptr arg(3) ;ref_stride
315
316         jmp             rcx
317
318         PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
319         PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
320         PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
321         PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
322         PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
323         PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
324         PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
325         PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
326         PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
327         PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
328         PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
329         PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
330         PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
331         PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
332         PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
333
334 vp8_sad16x8x3_ssse3_aligned_by_15:
335
336         PROCESS_16X2X3 1
337         PROCESS_16X2X3 0
338         PROCESS_16X2X3 0
339         PROCESS_16X2X3 0
340
341 vp8_sad16x8x3_ssse3_store_off:
342         mov             rdi,        arg(4) ;Results
343
344         movq            xmm0,       xmm5
345         psrldq          xmm5,       8
346
347         paddw           xmm0,       xmm5
348         movd            [rdi],      xmm0
349 ;-
350         movq            xmm0,       xmm6
351         psrldq          xmm6,       8
352
353         paddw           xmm0,       xmm6
354         movd            [rdi+4],    xmm0
355 ;-
356         movq            xmm0,       xmm7
357         psrldq          xmm7,       8
358
359         paddw           xmm0,       xmm7
360         movd            [rdi+8],    xmm0
361
362     ; begin epilog
363     pop         rcx
364     pop         rdi
365     pop         rsi
366     UNSHADOW_ARGS
367     pop         rbp
368     ret