2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ; void vp9_temporal_filter_apply_sse2 | arg
15 ; (unsigned char *frame1, | 0
16 ; unsigned int stride, | 1
17 ; unsigned char *frame2, | 2
18 ; unsigned int block_width, | 3
19 ; unsigned int block_height, | 4
20 ; int strength, | 5
21 ; int filter_weight, | 6
22 ; unsigned int *accumulator, | 7
23 ; unsigned short *count) | 8
24 global sym(vp9_temporal_filter_apply_sse2) PRIVATE
25 sym(vp9_temporal_filter_apply_sse2):
29 SHADOW_ARGS_TO_STACK 9
36 %define block_height 16
38 %define filter_weight 48
39 %define rounding_bit 64
43 mov [rsp + rbp_backup], rbp
47 mov [rsp + block_width], rdx
49 mov [rsp + block_height], rdx
51 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
53 ; calculate the rounding bit outside the loop
54 ; 0x8000 >> (16 - strength)
56 sub rdx, arg(5) ; 16 - strength
57 movq xmm4, rdx ; can't use rdx w/ shift
58 movdqa xmm5, [GLOBAL(_const_top_bit)]
60 movdqa [rsp + rounding_bit], xmm5
62 mov rsi, arg(0) ; src/frame1
63 mov rdx, arg(2) ; predictor frame
64 mov rdi, arg(7) ; accumulator
65 mov rax, arg(8) ; count
67 ; dup the filter weight and store for later
68 movd xmm0, arg(6) ; filter_weight
71 movdqa [rsp + filter_weight], xmm0
73 mov rbp, arg(1) ; stride
74 pxor xmm7, xmm7 ; zero for extraction
76 mov rcx, [rsp + block_width]
77 imul rcx, [rsp + block_height]
79 cmp dword ptr [rsp + block_width], 8
80 jne .temporal_filter_apply_load_16
82 .temporal_filter_apply_load_8:
83 movq xmm0, [rsi] ; first row
84 lea rsi, [rsi + rbp] ; += stride
85 punpcklbw xmm0, xmm7 ; src[ 0- 7]
86 movq xmm1, [rsi] ; second row
87 lea rsi, [rsi + rbp] ; += stride
88 punpcklbw xmm1, xmm7 ; src[ 8-15]
89 jmp .temporal_filter_apply_load_finished
91 .temporal_filter_apply_load_16:
92 movdqa xmm0, [rsi] ; src (frame1)
93 lea rsi, [rsi + rbp] ; += stride
95 punpcklbw xmm0, xmm7 ; src[ 0- 7]
96 punpckhbw xmm1, xmm7 ; src[ 8-15]
98 .temporal_filter_apply_load_finished:
99 movdqa xmm2, [rdx] ; predictor (frame2)
101 punpcklbw xmm2, xmm7 ; pred[ 0- 7]
102 punpckhbw xmm3, xmm7 ; pred[ 8-15]
104 ; modifier = src_byte - pixel_value
105 psubw xmm0, xmm2 ; src - pred[ 0- 7]
106 psubw xmm1, xmm3 ; src - pred[ 8-15]
108 ; modifier *= modifier
109 pmullw xmm0, xmm0 ; modifer[ 0- 7]^2
110 pmullw xmm1, xmm1 ; modifer[ 8-15]^2
113 pmullw xmm0, [GLOBAL(_const_3w)]
114 pmullw xmm1, [GLOBAL(_const_3w)]
116 ; modifier += 0x8000 >> (16 - strength)
117 paddw xmm0, [rsp + rounding_bit]
118 paddw xmm1, [rsp + rounding_bit]
120 ; modifier >>= strength
121 psrlw xmm0, [rsp + strength]
122 psrlw xmm1, [rsp + strength]
124 ; modifier = 16 - modifier
125 ; saturation takes care of modifier > 16
126 movdqa xmm3, [GLOBAL(_const_16w)]
127 movdqa xmm2, [GLOBAL(_const_16w)]
131 ; modifier *= filter_weight
132 pmullw xmm2, [rsp + filter_weight]
133 pmullw xmm3, [rsp + filter_weight]
137 movdqa xmm5, [rax+16]
143 movdqa [rax+16], xmm5
144 lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
146 ; load and extract the predictor up to shorts
149 lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
151 punpcklbw xmm0, xmm7 ; pred[ 0- 7]
152 punpckhbw xmm1, xmm7 ; pred[ 8-15]
154 ; modifier *= pixel_value
158 ; expand to double words
160 punpcklwd xmm0, xmm7 ; [ 0- 3]
161 punpckhwd xmm2, xmm7 ; [ 4- 7]
163 punpcklwd xmm1, xmm7 ; [ 8-11]
164 punpckhwd xmm3, xmm7 ; [12-15]
168 movdqa xmm5, [rdi+16]
169 movdqa xmm6, [rdi+32]
170 movdqa xmm7, [rdi+48]
178 movdqa [rdi+16], xmm5
179 movdqa [rdi+32], xmm6
180 movdqa [rdi+48], xmm7
181 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
184 je .temporal_filter_apply_epilog
185 pxor xmm7, xmm7 ; zero for extraction
186 cmp dword ptr [rsp + block_width], 16
187 je .temporal_filter_apply_load_16
188 jmp .temporal_filter_apply_load_8
190 .temporal_filter_apply_epilog:
192 mov rbp, [rsp + rbp_backup]