]> granicus.if.org Git - libvpx/blob - vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
Add AVX vectorized vp9_diamond_search_sad
[libvpx] / vp9 / encoder / x86 / vp9_temporal_filter_apply_sse2.asm
1 ;
2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ;  Use of this source code is governed by a BSD-style license
5 ;  that can be found in the LICENSE file in the root of the source
6 ;  tree. An additional intellectual property rights grant can be found
7 ;  in the file PATENTS.  All contributing project authors may
8 ;  be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 ; void vp9_temporal_filter_apply_sse2 | arg
15 ;  (unsigned char  *frame1,           |  0
16 ;   unsigned int    stride,           |  1
17 ;   unsigned char  *frame2,           |  2
18 ;   unsigned int    block_width,      |  3
19 ;   unsigned int    block_height,     |  4
20 ;   int             strength,         |  5
21 ;   int             filter_weight,    |  6
22 ;   unsigned int   *accumulator,      |  7
23 ;   unsigned short *count)            |  8
;
; Per pixel i, with all intermediate values held in unsigned 16-bit lanes:
;   d        = frame1[i] - frame2[i]
;   modifier = (3 * d * d + (0x8000 >> (16 - strength))) >> strength
;   modifier = (16 - modifier, saturating at 0) * filter_weight
;   count[i]       += modifier
;   accumulator[i] += modifier * frame2[i]   (product kept to 16 bits, then
;                                             zero-extended to 32 bits)
; Each loop iteration handles 16 pixels. frame1 is advanced by `stride` per
; row; frame2, count and accumulator are advanced by fixed amounts
; (16 bytes, 16 shorts, 16 ints), i.e. they are walked as contiguous arrays.
; The loop ends when the frame2 pointer reaches
; frame2 + block_width * block_height.
;
; Width handling: the first pass takes the 16-wide path for any width != 8,
; while later passes take the 8-wide path for any width != 16, so only
; block_width 8 and 16 are treated consistently.
;
; Alignment: frame2, count, accumulator and (on the 16-wide path) frame1 rows
; are accessed with movdqa, which requires 16-byte aligned addresses.
;
; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, arg(), GLOBAL() and
; the matching epilog macros come from vpx_ports/x86_abi_support.asm, which is
; not visible here; comments about them below are marked as assumptions.
24 global sym(vp9_temporal_filter_apply_sse2) PRIVATE
25 sym(vp9_temporal_filter_apply_sse2):
26
27     push        rbp
28     mov         rbp, rsp
29     SHADOW_ARGS_TO_STACK 9
30     SAVE_XMM 7
31     GET_GOT     rbx
32     push        rsi
33     push        rdi
34     ALIGN_STACK 16, rax
    ; Offsets from rsp of the local slots reserved by the `sub rsp` below.
    ; Slots are spaced 16 bytes apart; strength, filter_weight and rounding_bit
    ; are later used as full 128-bit memory operands.
35     %define block_width    0
36     %define block_height  16
37     %define strength      32
38     %define filter_weight 48
39     %define rounding_bit  64
40     %define rbp_backup    80
41     %define stack_size    96
42     sub         rsp,           stack_size
43     mov         [rsp + rbp_backup], rbp ; rbp is reused as `stride` further down
44     ; end prolog
45
46         mov         edx,            arg(3)
47         mov         [rsp + block_width], rdx
48         mov         edx,            arg(4)
49         mov         [rsp + block_height], rdx
        ; movd zero-extends into xmm6, so the 16 bytes stored below hold just
        ; `strength`; they are later used as the psrlw shift count.
50         movd        xmm6,           arg(5)
51         movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
52
53         ; calculate the rounding bit outside the loop
54         ; 0x8000 >> (16 - strength)
        ; NOTE(review): the `sub` below reads arg(5) at full register width even
        ; though `strength` is a 32-bit int. If the upper half of that argument
        ; slot is not guaranteed to be zero, the shift count moved into xmm4
        ; would be wrong - confirm against the definition of arg().
55         mov         rdx,            16
56         sub         rdx,            arg(5) ; 16 - strength
57         movq        xmm4,           rdx    ; can't use rdx w/ shift
58         movdqa      xmm5,           [GLOBAL(_const_top_bit)]
59         psrlw       xmm5,           xmm4   ; every word lane = 0x8000 >> (16 - strength)
60         movdqa      [rsp + rounding_bit], xmm5
61
        ; Register roles for the loop:
        ;   rsi = frame1 row pointer      rdx = frame2 (predictor) pointer
        ;   rdi = accumulator pointer     rax = count pointer
        ;   rbp = stride                  rcx = end value for rdx
        ;   xmm7 = zero while unpacking (overwritten late in each iteration)
62         mov         rsi,            arg(0) ; src/frame1
63         mov         rdx,            arg(2) ; predictor frame
64         mov         rdi,            arg(7) ; accumulator
65         mov         rax,            arg(8) ; count
66
67         ; dup the filter weight and store for later
68         movd        xmm0,           arg(6) ; filter_weight
69         pshuflw     xmm0,           xmm0, 0 ; copy word 0 into the low four words
70         punpcklwd   xmm0,           xmm0    ; interleave low words with themselves: all 8 words equal
71         movdqa      [rsp + filter_weight], xmm0
72
        ; rbp is overwritten here; no arg() is used after this point.
        ; NOTE(review): like arg(5) above, this is a full-register-width read
        ; of a 32-bit argument - confirm the upper half is zero.
73         mov         rbp,            arg(1) ; stride
74         pxor        xmm7,           xmm7   ; zero for extraction
75
        ; rcx = frame2 + block_width * block_height (one past the last predictor byte)
76         mov         rcx,            [rsp + block_width]
77         imul        rcx,            [rsp + block_height]
78         add         rcx,            rdx
        ; first pass: any width other than 8 goes to the 16-wide load
79         cmp         dword ptr [rsp + block_width], 8
80         jne         .temporal_filter_apply_load_16
81
        ; 8-wide blocks: two consecutive rows of 8 source bytes fill the 16 lanes
82 .temporal_filter_apply_load_8:
83         movq        xmm0,           [rsi]  ; first row
84         lea         rsi,            [rsi + rbp] ; += stride
85         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
86         movq        xmm1,           [rsi]  ; second row
87         lea         rsi,            [rsi + rbp] ; += stride
88         punpcklbw   xmm1,           xmm7   ; src[ 8-15]
89         jmp         .temporal_filter_apply_load_finished
90
        ; 16-wide blocks: one row of 16 source bytes per iteration
91 .temporal_filter_apply_load_16:
92         movdqa      xmm0,           [rsi]  ; src (frame1)
93         lea         rsi,            [rsi + rbp] ; += stride
94         movdqa      xmm1,           xmm0
95         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
96         punpckhbw   xmm1,           xmm7   ; src[ 8-15]
97
        ; here xmm0/xmm1 hold 16 source pixels zero-extended to words
98 .temporal_filter_apply_load_finished:
99         movdqa      xmm2,           [rdx]  ; predictor (frame2)
100         movdqa      xmm3,           xmm2
101         punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
102         punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
103
104         ; modifier = src_byte - pixel_value
105         psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
106         psubw       xmm1,           xmm3   ; src - pred[ 8-15]
107
108         ; modifier *= modifier
109         pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
110         pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
111
112         ; modifier *= 3
113         pmullw      xmm0,           [GLOBAL(_const_3w)]
114         pmullw      xmm1,           [GLOBAL(_const_3w)]
115
116         ; modifier += 0x8000 >> (16 - strength)
117         paddw       xmm0,           [rsp + rounding_bit]
118         paddw       xmm1,           [rsp + rounding_bit]
119
120         ; modifier >>= strength
121         psrlw       xmm0,           [rsp + strength]
122         psrlw       xmm1,           [rsp + strength]
123
124         ; modifier = 16 - modifier
125         ; saturation takes care of modifier > 16
126         movdqa      xmm3,           [GLOBAL(_const_16w)]
127         movdqa      xmm2,           [GLOBAL(_const_16w)]
128         psubusw     xmm3,           xmm1   ; 16 - modifier[ 8-15], clamped at 0
129         psubusw     xmm2,           xmm0   ; 16 - modifier[ 0- 7], clamped at 0
130
131         ; modifier *= filter_weight
132         pmullw      xmm2,           [rsp + filter_weight]
133         pmullw      xmm3,           [rsp + filter_weight]
134
135         ; count
136         movdqa      xmm4,           [rax]
137         movdqa      xmm5,           [rax+16]
138         ; += modifier
139         paddw       xmm4,           xmm2
140         paddw       xmm5,           xmm3
141         ; write back
142         movdqa      [rax],          xmm4
143         movdqa      [rax+16],       xmm5
144         lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
145
146         ; load and extract the predictor up to shorts
        ; (the predictor words from above were consumed by the subtraction and
        ; xmm2/xmm3 now hold the modifiers, so the 16 bytes are read again)
147         pxor        xmm7,           xmm7   ; xmm7 has not been written since it was last zeroed
148         movdqa      xmm0,           [rdx]
149         lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
150         movdqa      xmm1,           xmm0
151         punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
152         punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
153
154         ; modifier *= pixel_value
155         pmullw      xmm0,           xmm2   ; low 16 bits of each product are kept
156         pmullw      xmm1,           xmm3
157
158         ; expand to double words
        ; (interleaving with zero in xmm7 zero-extends each word)
159         movdqa      xmm2,           xmm0
160         punpcklwd   xmm0,           xmm7   ; [ 0- 3]
161         punpckhwd   xmm2,           xmm7   ; [ 4- 7]
162         movdqa      xmm3,           xmm1
163         punpcklwd   xmm1,           xmm7   ; [ 8-11]
164         punpckhwd   xmm3,           xmm7   ; [12-15]
165
166         ; accumulator
167         movdqa      xmm4,           [rdi]
168         movdqa      xmm5,           [rdi+16]
169         movdqa      xmm6,           [rdi+32]
170         movdqa      xmm7,           [rdi+48] ; xmm7 stops being zero here; re-zeroed before looping
171         ; += modifier
172         paddd       xmm4,           xmm0
173         paddd       xmm5,           xmm2
174         paddd       xmm6,           xmm1
175         paddd       xmm7,           xmm3
176         ; write back
177         movdqa      [rdi],          xmm4
178         movdqa      [rdi+16],       xmm5
179         movdqa      [rdi+32],       xmm6
180         movdqa      [rdi+48],       xmm7
181         lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
182
        ; done once the predictor pointer reaches the end value computed before the loop
183         cmp         rdx,            rcx
184         je          .temporal_filter_apply_epilog
185         pxor        xmm7,           xmm7   ; zero for extraction
        ; later passes: width 16 reloads one row, anything else takes the 8-wide path
186         cmp         dword ptr [rsp + block_width], 16
187         je          .temporal_filter_apply_load_16
188         jmp         .temporal_filter_apply_load_8
189
190 .temporal_filter_apply_epilog:
191     ; begin epilog
192     mov         rbp,            [rsp + rbp_backup] ; restore the frame pointer saved in the prolog
193     add         rsp,            stack_size         ; release the local slots
194     pop         rsp ; presumably the pre-alignment rsp pushed by ALIGN_STACK - confirm against the macro
195     pop         rdi
196     pop         rsi
197     RESTORE_GOT
198     RESTORE_XMM
199     UNSHADOW_ARGS
200     pop         rbp
201     ret
202
203 SECTION_RODATA
; Word-lane constants for vp9_temporal_filter_apply_sse2. Each one is kept on a
; 16-byte boundary because it is read as a full 128-bit memory operand.
204 align 16
205 _const_3w:
206     times 8 dw 3          ; multiplier applied to the squared difference
207 align 16
208 _const_top_bit:
209     times 8 dw 1<<15      ; 0x8000 per lane; shifted right to form the rounding term
210 align 16
211 _const_16w:
212     times 8 dw 16         ; upper bound the modifier is subtracted from