From: Scott LaVarnway Date: Tue, 15 Nov 2011 17:53:01 +0000 (-0500) Subject: Added predictor stride argument(s) to subtract functions X-Git-Tag: v1.0.0~75^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=edd98b7310e0338708abfbd0826688222e1e6f57;p=libvpx Added predictor stride argument(s) to subtract functions Patch set 2: 64 bit build fix Patch set 3: 64 bit crash fix [Tero] Patch set 4: Updated ARMv6 and NEON assembly. Added also minor NEON optimizations to subtract functions. Patch set 5: x86 stride bug fix Change-Id: I1fcca93e90c89b89ddc204e1c18f208682675c15 --- diff --git a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm index 0ca74387b..f329f8f73 100644 --- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm @@ -72,22 +72,23 @@ loop_block ; r0 short *diff ; r1 unsigned char *usrc ; r2 unsigned char *vsrc -; r3 unsigned char *pred -; stack int stride +; r3 int src_stride +; sp unsigned char *upred +; sp unsigned char *vpred +; sp int pred_stride |vp8_subtract_mbuv_armv6| PROC - stmfd sp!, {r4-r12, lr} + stmfd sp!, {r4-r11} add r0, r0, #512 ; set *diff point to Cb - add r3, r3, #256 ; set *pred point to Cb - mov r4, #8 ; loop count - ldr r5, [sp, #40] ; stride + ldr r5, [sp, #32] ; upred + ldr r12, [sp, #40] ; pred_stride ; Subtract U block loop_u - ldr r6, [r1] ; src (A) - ldr r7, [r3], #4 ; pred (A) + ldr r6, [r1] ; usrc (A) + ldr r7, [r5] ; upred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -97,8 +98,8 @@ loop_u usub16 r6, r8, r9 ; [d2 | d0] (A) usub16 r7, r10, r11 ; [d3 | d1] (A) - ldr r10, [r1, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) + ldr r10, [r1, #4] ; usrc (B) + ldr r11, [r5, #4] ; upred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -114,7 +115,8 @@ loop_u usub16 r6, r8, r9 ; [d2 | d0] (B) usub16 r7, r10, r11 ; [d3 | d1] (B) - add r1, r1, r5 ; update usrc pointer + add r1, r1, r3 ; update usrc pointer + add r5, r5, r12 ; update upred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -125,12 +127,13 @@ loop_u bne loop_u + ldr r5, [sp, #36] ; vpred mov r4, #8 ; loop count ; Subtract V block loop_v - ldr r6, [r2] ; src (A) - ldr r7, [r3], #4 ; pred (A) + ldr r6, [r2] ; vsrc (A) + ldr r7, [r5] ; vpred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -140,8 +143,8 @@ loop_v usub16 r6, r8, r9 ; [d2 | d0] (A) usub16 r7, r10, r11 ; [d3 | d1] (A) - ldr r10, [r2, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) + ldr r10, [r2, #4] ; vsrc (B) + ldr r11, [r5, #4] ; vpred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -157,7 +160,8 @@ loop_v usub16 r6, r8, r9 ; [d2 | d0] (B) usub16 r7, r10, r11 ; [d3 | d1] (B) - add r2, r2, r5 ; update vsrc pointer + add r2, r2, r3 ; update vsrc pointer + add r5, r5, r12 ; update vpred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -168,23 +172,25 @@ loop_v bne loop_v - ldmfd sp!, {r4-r12, pc} + ldmfd sp!, {r4-r11} + bx lr ENDP ; r0 short *diff ; r1 unsigned char *src -; r2 unsigned char *pred -; r3 int stride +; r2 int src_stride +; r3 unsigned char *pred +; sp int pred_stride |vp8_subtract_mby_armv6| PROC stmfd sp!, {r4-r11} - + ldr r12, [sp, #32] ; pred_stride mov r4, #16 loop ldr r6, [r1] ; src (A) - ldr r7, [r2], #4 ; pred (A) + ldr r7, [r3] ; pred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -195,7 +201,7 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (A) ldr r10, [r1, #4] ; src (B) - ldr r11, [r2], #4 ; pred (B) + ldr r11, [r3, #4] ; pred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -212,7 +218,7 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (B) ldr r10, [r1, #8] ; src (C) - ldr r11, [r2], #4 ; pred (C) + ldr r11, [r3, #8] ; pred (C) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -229,10 +235,10 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (C) ldr r10, [r1, #12] ; src (D) - ldr r11, [r2], #4 ; pred (D) + ldr r11, [r3, #12] ; pred (D) - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) str r8, [r0], #4 ; diff (C) uxtb16 r8, r10 ; [s2 | s0] (D) @@ -245,7 +251,8 @@ loop usub16 r6, r8, r9 ; [d2 | d0] (D) usub16 r7, r10, r11 ; [d3 | d1] (D) - add r1, r1, r3 ; update src pointer + add r1, r1, r2 ; update src pointer + add r3, r3, r12 ; update pred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) @@ -257,7 +264,7 @@ loop bne loop ldmfd sp!, {r4-r11} - mov pc, lr + bx lr ENDP diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm index 68c295062..91a328c29 100644 --- a/vp8/encoder/arm/neon/subtract_neon.asm +++ b/vp8/encoder/arm/neon/subtract_neon.asm @@ -61,19 +61,24 @@ ;========================================== -;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride +; unsigned char *pred, int pred_stride) |vp8_subtract_mby_neon| PROC + push {r4-r7} mov r12, #4 + ldr r4, [sp, #16] ; pred_stride + mov r6, #32 ; "diff" stride x2 + add r5, r0, #16 ; second diff pointer subtract_mby_loop - vld1.8 {q0}, [r1], r3 ;load src - vld1.8 {q1}, [r2]! ;load pred - vld1.8 {q2}, [r1], r3 - vld1.8 {q3}, [r2]! - vld1.8 {q4}, [r1], r3 - vld1.8 {q5}, [r2]! - vld1.8 {q6}, [r1], r3 - vld1.8 {q7}, [r2]! + vld1.8 {q0}, [r1], r2 ;load src + vld1.8 {q1}, [r3], r4 ;load pred + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r3], r4 + vld1.8 {q4}, [r1], r2 + vld1.8 {q5}, [r3], r4 + vld1.8 {q6}, [r1], r2 + vld1.8 {q7}, [r3], r4 vsubl.u8 q8, d0, d2 vsubl.u8 q9, d1, d3 @@ -84,46 +89,53 @@ subtract_mby_loop vsubl.u8 q14, d12, d14 vsubl.u8 q15, d13, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! + vst1.16 {q8}, [r0], r6 ;store diff + vst1.16 {q9}, [r5], r6 + vst1.16 {q10}, [r0], r6 + vst1.16 {q11}, [r5], r6 + vst1.16 {q12}, [r0], r6 + vst1.16 {q13}, [r5], r6 + vst1.16 {q14}, [r0], r6 + vst1.16 {q15}, [r5], r6 subs r12, r12, #1 bne subtract_mby_loop + pop {r4-r7} bx lr ENDP ;================================= -;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) + |vp8_subtract_mbuv_neon| PROC - ldr r12, [sp] + push {r4-r7} + ldr r4, [sp, #16] ; upred + ldr r5, [sp, #20] ; vpred + ldr r6, [sp, #24] ; pred_stride + add r0, r0, #512 ; short *udiff = diff + 256; + mov r12, #32 ; "diff" stride x2 + add r7, r0, #16 ; second diff pointer ;u - add r0, r0, #512 ; short *udiff = diff + 256; - add r3, r3, #256 ; unsigned char *upred = pred + 256; - - vld1.8 {d0}, [r1], r12 ;load src - vld1.8 {d1}, [r3]! ;load pred - vld1.8 {d2}, [r1], r12 - vld1.8 {d3}, [r3]! - vld1.8 {d4}, [r1], r12 - vld1.8 {d5}, [r3]! - vld1.8 {d6}, [r1], r12 - vld1.8 {d7}, [r3]! - vld1.8 {d8}, [r1], r12 - vld1.8 {d9}, [r3]! - vld1.8 {d10}, [r1], r12 - vld1.8 {d11}, [r3]! - vld1.8 {d12}, [r1], r12 - vld1.8 {d13}, [r3]! - vld1.8 {d14}, [r1], r12 - vld1.8 {d15}, [r3]! + vld1.8 {d0}, [r1], r3 ;load usrc + vld1.8 {d1}, [r4], r6 ;load upred + vld1.8 {d2}, [r1], r3 + vld1.8 {d3}, [r4], r6 + vld1.8 {d4}, [r1], r3 + vld1.8 {d5}, [r4], r6 + vld1.8 {d6}, [r1], r3 + vld1.8 {d7}, [r4], r6 + vld1.8 {d8}, [r1], r3 + vld1.8 {d9}, [r4], r6 + vld1.8 {d10}, [r1], r3 + vld1.8 {d11}, [r4], r6 + vld1.8 {d12}, [r1], r3 + vld1.8 {d13}, [r4], r6 + vld1.8 {d14}, [r1], r3 + vld1.8 {d15}, [r4], r6 vsubl.u8 q8, d0, d1 vsubl.u8 q9, d2, d3 @@ -134,32 +146,32 @@ subtract_mby_loop vsubl.u8 q14, d12, d13 vsubl.u8 q15, d14, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! + vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 ;v - vld1.8 {d0}, [r2], r12 ;load src - vld1.8 {d1}, [r3]! ;load pred - vld1.8 {d2}, [r2], r12 - vld1.8 {d3}, [r3]! - vld1.8 {d4}, [r2], r12 - vld1.8 {d5}, [r3]! - vld1.8 {d6}, [r2], r12 - vld1.8 {d7}, [r3]! - vld1.8 {d8}, [r2], r12 - vld1.8 {d9}, [r3]! - vld1.8 {d10}, [r2], r12 - vld1.8 {d11}, [r3]! - vld1.8 {d12}, [r2], r12 - vld1.8 {d13}, [r3]! - vld1.8 {d14}, [r2], r12 - vld1.8 {d15}, [r3]! + vld1.8 {d0}, [r2], r3 ;load vsrc + vld1.8 {d1}, [r5], r6 ;load vpred + vld1.8 {d2}, [r2], r3 + vld1.8 {d3}, [r5], r6 + vld1.8 {d4}, [r2], r3 + vld1.8 {d5}, [r5], r6 + vld1.8 {d6}, [r2], r3 + vld1.8 {d7}, [r5], r6 + vld1.8 {d8}, [r2], r3 + vld1.8 {d9}, [r5], r6 + vld1.8 {d10}, [r2], r3 + vld1.8 {d11}, [r5], r6 + vld1.8 {d12}, [r2], r3 + vld1.8 {d13}, [r5], r6 + vld1.8 {d14}, [r2], r3 + vld1.8 {d15}, [r5], r6 vsubl.u8 q8, d0, d1 vsubl.u8 q9, d2, d3 @@ -170,16 +182,18 @@ subtract_mby_loop vsubl.u8 q14, d12, d13 vsubl.u8 q15, d14, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! + vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 + pop {r4-r7} bx lr + ENDP END diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index d89d74e5e..a3b800a2d 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -100,7 +100,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd); ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), - x->e_mbd.predictor, b->src_stride); + b->src_stride, x->e_mbd.predictor, 16); vp8_transform_intra_mby(x); @@ -115,7 +115,9 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd); - ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, + x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256], + &x->e_mbd.predictor[320], 8); vp8_transform_mbuv(x); diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 80c32df1b..e9042e151 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -48,12 +48,12 @@ void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) } } -void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, + int src_stride, unsigned char *upred, + unsigned char *vpred, int pred_stride) { short *udiff = diff + 256; short *vdiff = diff + 320; - unsigned char *upred = pred + 256; - unsigned char *vpred = pred + 320; int r, c; @@ -65,8 +65,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } udiff += 8; - upred += 8; - usrc += stride; + upred += pred_stride; + usrc += src_stride; } for (r = 0; r < 8; r++) @@ -77,12 +77,13 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } vdiff += 8; - vpred += 8; - vsrc += stride; + vpred += pred_stride; + vsrc += src_stride; } } -void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, + unsigned char *pred, int pred_stride) { int r, c; @@ -94,8 +95,8 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in } diff += 16; - pred += 16; - src += stride; + pred += pred_stride; + src += src_stride; } } @@ -103,8 +104,11 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride); - ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), + b->src_stride, x->e_mbd.predictor, 16); + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, + x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256], + &x->e_mbd.predictor[320], 8); } static void build_dcblock(MACROBLOCK *x) @@ -641,7 +645,8 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_build_inter16x16_predictors_mby(&x->e_mbd); - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), + b->src_stride, x->e_mbd.predictor, 16); transform_mby(x); diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h index 8fa457aa8..597a57b5d 100644 --- a/vp8/encoder/encodemb.h +++ b/vp8/encoder/encodemb.h @@ -28,11 +28,13 @@ void (sym)(BLOCK *be,BLOCKD *bd, int pitch) #define prototype_submby(sym) \ - void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride) + void (sym)(short *diff, unsigned char *src, int src_stride, \ + unsigned char *pred, int pred_stride) #define prototype_submbuv(sym) \ void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\ - unsigned char *pred, int stride) + int src_stride, unsigned char *upred, unsigned char *vpred,\ + int pred_stride) #if ARCH_X86 || ARCH_X86_64 #include "x86/encodemb_x86.h" diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index e8abf848c..cb16e34df 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -552,7 +552,7 @@ static void macro_block_yrd( MACROBLOCK *mb, int d; ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, *(mb->block[0].base_src), - mb->e_mbd.predictor, mb->block[0].src_stride ); + mb->block[0].src_stride, mb->e_mbd.predictor, 16); // Fdct and building the 2nd order block for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) @@ -800,7 +800,8 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, { vp8_build_inter16x16_predictors_mbuv(&x->e_mbd); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff, - x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + x->src.u_buffer, x->src.v_buffer, x->src.uv_stride, + &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8); vp8_transform_mbuv(x); vp8_quantize_mbuv(x); @@ -816,7 +817,8 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, { vp8_build_inter4x4_predictors_mbuv(&x->e_mbd); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff, - x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + x->src.u_buffer, x->src.v_buffer, x->src.uv_stride, + &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8); vp8_transform_mbuv(x); vp8_quantize_mbuv(x); @@ -845,8 +847,8 @@ static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv) (&x->e_mbd); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff, - x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, - x->src.uv_stride); + x->src.u_buffer, x->src.v_buffer, x->src.uv_stride, + &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8); vp8_transform_mbuv(x); vp8_quantize_mbuv(x); diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm index 4ce16ce90..75e8aa3c2 100644 --- a/vp8/encoder/x86/subtract_mmx.asm +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -73,74 +73,71 @@ sym(vp8_subtract_b_mmx_impl): pop rbp ret -;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) global sym(vp8_subtract_mby_mmx) sym(vp8_subtract_mby_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + push rbx + movsxd rbx, dword ptr arg(4);pred_stride - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride + pxor mm0, mm0 + mov rcx, 16 - mov rcx, 16 - pxor mm0, mm0 .submby_loop: + movq mm1, [rsi] + movq mm3, [rax] - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 + movq mm2, mm1 + movq mm4, mm3 - movq [rdi], mm1 - movq [rdi+8], mm2 + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 - movq mm1, [rsi+8] - movq mm3, [rax+8] + psubw mm1, mm3 + psubw mm2, mm4 - movq mm2, mm1 - movq mm4, mm3 + movq [rdi], mm1 + movq [rdi+8], mm2 - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 + movq mm1, [rsi+8] + movq mm3, [rax+8] - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 + movq mm2, mm1 + movq mm4, mm3 - psubw mm1, mm3 - psubw mm2, mm4 + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 - movq [rdi+16], mm1 - movq [rdi+24], mm2 + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + psubw mm1, mm3 + psubw mm2, mm4 - add rdi, 32 - add rax, 16 - - lea rsi, [rsi+rdx] - - sub rcx, 1 - jnz .submby_loop + movq [rdi+16], mm1 + movq [rdi+24], mm2 + add rdi, 32 + lea rax, [rax+rbx] + lea rsi, [rsi+rdx] + dec rcx + jnz .submby_loop + pop rbx pop rdi pop rsi ; begin epilog @@ -149,281 +146,75 @@ sym(vp8_subtract_mby_mmx): ret -;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) + global sym(vp8_subtract_mbuv_mmx) sym(vp8_subtract_mbuv_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 + SHADOW_ARGS_TO_STACK 7 push rsi push rdi ; end prolog - ;short *udiff = diff + 256; - ;short *vdiff = diff + 320; - ;unsigned char *upred = pred + 256; - ;unsigned char *vpred = pred + 320; - - ;unsigned char *z = usrc; - ;unsigned short *diff = udiff; - ;unsigned char *Predictor= upred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ;unsigned char *z = vsrc; - ;unsigned short *diff = vdiff; - ;unsigned char *Predictor= vpred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(2) ;z = usrc - add rdi, 320*2 ;diff = diff + 320 (shorts) - add rax, 320 ;Predictor = pred + 320 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 8 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + pxor mm7, mm7 + +.submbu_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 8 + +.submbv_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbv_loop + + pop rbx ; begin epilog pop rdi pop rsi diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm index 3bd1ff678..008e9c7d1 100644 --- a/vp8/encoder/x86/subtract_sse2.asm +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -71,277 +71,166 @@ sym(vp8_subtract_b_sse2_impl): ret -;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) global sym(vp8_subtract_mby_sse2) sym(vp8_subtract_mby_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi ; end prolog - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 8 ; do two lines at one time + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + movdqa xmm4, [GLOBAL(t80)] + push rbx + mov rcx, 8 ; do two lines at one time + movsxd rbx, dword ptr arg(4);pred_stride .submby_loop: - movdqa xmm0, XMMWORD PTR [rsi] ; src - movdqa xmm1, XMMWORD PTR [rax] ; pred + movdqa xmm0, [rsi] ; src + movdqa xmm1, [rax] ; pred - movdqa xmm2, xmm0 - psubb xmm0, xmm1 + movdqa xmm2, xmm0 + psubb xmm0, xmm1 - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm1 ; put sign back to subtraction - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 + movdqa xmm3, [rsi + rdx] + movdqa xmm5, [rax + rbx] - movdqa xmm4, XMMWORD PTR [rsi + rdx] - movdqa xmm5, XMMWORD PTR [rax + 16] + lea rsi, [rsi+rdx*2] + lea rax, [rax+rbx*2] - movdqa xmm6, xmm4 - psubb xmm4, xmm5 + movdqa [rdi], xmm0 + movdqa [rdi +16], xmm2 - pxor xmm5, [GLOBAL(t80)] ;convert to signed values - pxor xmm6, [GLOBAL(t80)] - pcmpgtb xmm5, xmm6 ; obtain sign information + movdqa xmm1, xmm3 + psubb xmm3, xmm5 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - punpcklbw xmm4, xmm5 ; put sign back to subtraction - punpckhbw xmm6, xmm7 ; put sign back to subtraction + pxor xmm5, xmm4 ;convert to signed values + pxor xmm1, xmm4 + pcmpgtb xmm5, xmm1 ; obtain sign information - movdqa XMMWORD PTR [rdi +32], xmm4 - movdqa XMMWORD PTR [rdi +48], xmm6 + movdqa xmm1, xmm3 + punpcklbw xmm3, xmm5 ; put sign back to subtraction + punpckhbw xmm1, xmm5 ; put sign back to subtraction - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] + movdqa [rdi +32], xmm3 + movdqa [rdi +48], xmm1 - sub rcx, 1 - jnz .submby_loop + add rdi, 64 + dec rcx + jnz .submby_loop + pop rbx pop rdi pop rsi ; begin epilog RESTORE_GOT - RESTORE_XMM UNSHADOW_ARGS pop rbp ret - -;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) global sym(vp8_subtract_mbuv_sse2) sym(vp8_subtract_mbuv_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 + SHADOW_ARGS_TO_STACK 7 GET_GOT rbx push rsi push rdi ; end prolog - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - lea rcx, [rdx + rdx*2] - - ;u - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ;v - mov rsi, arg(2) ;z = vsrc - add rdi, 64*2 ;diff = diff + 320 (shorts) - add rax, 64 ;Predictor = pred + 320 - - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - + movdqa xmm4, [GLOBAL(t80)] + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 4 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + ;u +.submbu_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 4 + + ;v +.submbv_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbv_loop + + pop rbx ; begin epilog pop rdi pop rsi