lea r11, [r10+r10*2]
lea rax, [rdi-4]
lea r9, [rdi-4+r11]
- %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address
+ sub rsp, 0x68
+ %define pix_tmp rsp
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
movq mm3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+ add rsp, 0x68
ret
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_mmxext
-%define sums rsp-32 ; +24
-%define top_1d rsp-64 ; +32
-%define left_1d rsp-96 ; +32
+ sub rsp, 96 ; reserve 96 bytes of stack scratch holding left_1d, top_1d and sums (layout below)
+%define sums rsp+64 ; size 24: occupies [rsp+64, rsp+88)
+%define top_1d rsp+32 ; size 32: occupies [rsp+32, rsp+64)
+%define left_1d rsp ; size 32: occupies [rsp, rsp+32)
mov qword [sums+0], 0
mov qword [sums+8], 0
movd [parm3q+8], mm2 ; i16x16_dc satd
movd [parm3q+4], mm1 ; i16x16_h satd
movd [parm3q+0], mm0 ; i16x16_v satd
+ add rsp, 96 ; undo the sub rsp, 96 above before returning
ret
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_mmxext
-%define sums rsp-32 ; +24
-%define top_1d rsp-48 ; +16
-%define left_1d rsp-64 ; +16
+ sub rsp, 64 ; reserve 64 bytes of stack scratch holding left_1d, top_1d and sums (layout below)
+%define sums rsp+32 ; size 24: occupies [rsp+32, rsp+56)
+%define top_1d rsp+16 ; size 16: occupies [rsp+16, rsp+32)
+%define left_1d rsp ; size 16: occupies [rsp, rsp+16)
mov qword [sums+0], 0
mov qword [sums+8], 0
movd [parm3q+0], mm0 ; i8x8c_dc satd
movd [parm3q+4], mm1 ; i8x8c_h satd
movd [parm3q+8], mm2 ; i8x8c_v satd
+ add rsp, 64 ; undo the sub rsp, 64 above before returning
ret
uint8_t *ref = frame->plane[0] + y * stride - PADH;
uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
uint16_t v = line[0] = 0;
- for( x = 0; x < stride-1; x++ )
+ for( x = 1; x < stride-1; x++ )
line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
line -= 8*stride;
if( y >= 9-PADV )