From: Attila Nagy
Date: Wed, 6 Jul 2011 10:35:33 +0000 (+0300)
Subject: Update armv7 loopfilter to new interface
X-Git-Tag: v0.9.7~46^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=283b0e25ac6e0a2bd3d5f0b8cd1d0a50bbda7318;p=libvpx

Update armv7 loopfilter to new interface

Change-Id: I65105a9c63832669237e6a6a7fcb4ea3ea683346
---

diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index 6d1caa485..1ec2b7484 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -9,30 +9,36 @@
  */
 
-#include "vpx_ports/config.h"
-#include
+#include "vpx_config.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/onyxc_int.h"
 
+#if HAVE_ARMV6
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
+#endif
 
-extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
-
-extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
-
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh,
+        unsigned char *v);
+
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
 
 #if HAVE_ARMV6
 /*ARMV6 loopfilter functions*/
@@ -40,13 +46,13 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                                int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr,
uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -55,20 +61,20 @@ void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi (void) u_ptr; (void) v_ptr; (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -77,22 +83,22 @@ void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi (void) u_ptr; (void) v_ptr; (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -101,24 +107,24 @@ void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig (void) u_ptr; (void) v_ptr; (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - 
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); } /* Vertical B Filtering */ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -127,9 +133,9 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig (void) u_ptr; (void) v_ptr; (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); } #endif @@ -139,83 +145,58 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - -void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, 
hev_thr, v_ptr); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); -void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride); -} + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); -void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); } /* Vertical B Filtering */ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4); -} + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, 
y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); -void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); } #endif diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h index cd62207d7..27159b59f 100644 --- a/vp8/common/arm/loopfilter_arm.h +++ b/vp8/common/arm/loopfilter_arm.h @@ -12,6 +12,8 @@ #ifndef LOOPFILTER_ARM_H #define LOOPFILTER_ARM_H +#include "vpx_config.h" + #if HAVE_ARMV6 extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); @@ -46,18 +48,19 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV6 */ #if HAVE_ARMV7 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon); extern prototype_loopfilter_block(vp8_loop_filter_bv_neon); extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon); extern prototype_loopfilter_block(vp8_loop_filter_bh_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -83,7 +86,8 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV7 */ -#endif +#endif /* LOOPFILTER_ARM_H */ diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm index e73dd6401..e44be0a1e 100644 --- a/vp8/common/arm/neon/loopfilter_neon.asm +++ b/vp8/common/arm/neon/loopfilter_neon.asm @@ -14,109 +14,97 @@ EXPORT |vp8_loop_filter_vertical_edge_y_neon| EXPORT |vp8_loop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. -; All 16 elements in these variables are equal. 
- -; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) ; r0 unsigned char *src ; r1 int pitch -; r2 const signed char *flimit -; r3 const signed char *limit -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_loop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {q3}, [r2], r1 ; p3 - vld1.u8 {q4}, [r2], r1 ; p2 - vld1.u8 {q5}, [r2], r1 ; p1 - vld1.u8 {q6}, [r2], r1 ; p0 - vld1.u8 {q7}, [r2], r1 ; q0 - vld1.u8 {q8}, [r2], r1 ; q1 - vld1.u8 {q9}, [r2], r1 ; q2 - vld1.u8 {q10}, [r2] ; q3 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - sub r0, r0, r1, lsl #1 + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1 + add r1, r1, r1 + + vdup.u8 q2, r3 ; duplicate thresh + + vld1.u8 {q3}, [r2@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r2@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r2@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r2@128] ; q2 + vld1.u8 {q10}, [r12@128] ; q3 + + sub r2, r2, r1, lsl #1 + sub r12, r12, r1, lsl #1 bl vp8_loop_filter_neon - vst1.u8 {q5}, [r0], r1 ; store op1 - vst1.u8 {q6}, [r0], r1 ; store op0 - vst1.u8 {q7}, [r0], r1 ; store oq0 - vst1.u8 {q8}, [r0], r1 ; store oq1 + vst1.u8 {q5}, [r2@128], r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r2@128], r1 ; store oq0 + vst1.u8 {q8}, [r12@128], r1 ; store oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| -; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) + ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + ldr r12, [sp, #4] ; load thresh ldr r2, [sp, #8] ; load v ptr + vdup.u8 q2, r12 ; duplicate thresh sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.u8 {d6}, [r3], r1 ; p3 - vld1.u8 {d8}, [r3], r1 ; p2 - vld1.u8 {d10}, [r3], r1 ; p1 - vld1.u8 {d12}, [r3], r1 ; p0 - vld1.u8 {d14}, [r3], r1 ; q0 - vld1.u8 {d16}, [r3], r1 ; q1 - vld1.u8 {d18}, [r3], r1 ; q2 - vld1.u8 {d20}, [r3] ; q3 - - ldr r3, [sp, #4] ; load thresh pointer - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - vld1.u8 {d7}, [r12], r1 ; p3 - vld1.u8 {d9}, [r12], r1 ; p2 - vld1.u8 {d11}, [r12], r1 ; p1 - vld1.u8 {d13}, [r12], r1 ; p0 - vld1.u8 {d15}, [r12], r1 ; q0 - vld1.u8 {d17}, [r12], r1 ; q1 - vld1.u8 {d19}, [r12], r1 ; q2 - vld1.u8 {d21}, [r12] ; q3 - vld1.s8 {d4[], d5[]}, [r3] ; thresh + vld1.u8 {d6}, [r3@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r3@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, 
[r3@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r3@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r3@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r3@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r3@64] ; q3 + vld1.u8 {d21}, [r12@64] ; q3 bl vp8_loop_filter_neon sub r0, r0, r1, lsl #1 sub r2, r2, r1, lsl #1 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r2], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r2], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r2], r1 ; store v oq0 - vst1.u8 {d16}, [r0] ; store u oq1 - vst1.u8 {d17}, [r2] ; store v oq1 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r2@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r2@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r2@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64] ; store u oq1 + vst1.u8 {d17}, [r2@64] ; store v oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| ; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, @@ -124,39 +112,38 @@ ; const signed char *limit, ; const signed char *thresh, ; int count) -; r0 unsigned char *src, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, + |vp8_loop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, #4 ; src ptr down by 4 columns - sub r0, r0, #2 ; dst ptr - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {d6}, [r2], r1 ; load first 8-line src data - vld1.u8 {d8}, [r2], r1 + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, #4 ; src ptr down by 4 columns + add r1, r1, r1 + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1, asr #1 + + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d8}, [r12], r1 vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r2], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r2], r1 + vld1.u8 {d16}, [r12], r1 vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r2], r1 - - vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {d20}, [r12], r1 vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r2], r1 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r2], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d19}, [r2], r1 - vld1.u8 {d21}, [r2] + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d19}, [r2] + vld1.u8 {d21}, [r12] ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -164,6 +151,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r3 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -178,28 +167,34 @@ vswp d12, d11 vswp d16, d13 + + sub r0, r0, #2 ; dst ptr + vswp d14, d12 vswp d16, d15 + add r12, r0, r1, asr #1 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1 - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0] + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_y_neon| ; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch @@ -209,38 +204,36 @@ ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r12, r0, #4 ; move u pointer down by 4 columns - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + sub r12, r0, #4 ; move u pointer down by 4 columns ldr r2, [sp, #8] ; load v ptr - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d20}, [r12] - + vdup.u8 q1, r3 ; duplicate limit sub r3, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r12], r1 ;load u data vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r12], r1 vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r12], r1 vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r12], r1 vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r12], r1 vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r12], r1 vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r12] vld1.u8 {d21}, [r3] - ldr r12, [sp, #4] ; load thresh pointer + ldr r12, [sp, #4] ; load thresh ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -248,6 +241,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r12 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -258,18 +253,16 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - bl vp8_loop_filter_neon - sub r0, r0, #2 - sub r2, r2, #2 - vswp d12, d11 vswp d16, d13 vswp d14, d12 vswp d16, d15 + sub r0, r0, #2 + sub r2, r2, #2 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 @@ -288,7 +281,7 @@ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| ; void vp8_loop_filter_neon(); @@ -316,42 +309,44 @@ vabd.u8 q14, q8, q7 ; abs(q1 - q0) vabd.u8 q3, q9, q8 ; abs(q2 - q1) vabd.u8 q4, q10, q9 ; abs(q3 - q2) - vabd.u8 q9, q6, q7 ; abs(p0 - q0) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 vmax.u8 q3, q3, q4 vmax.u8 q15, q11, q12 + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + ; 
vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 vmax.u8 q15, q15, q3 - vadd.u8 q0, q0, q0 ; flimit * 2 - vadd.u8 q0, q0, q1 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 + vmov.u8 q10, #0x80 ; 0x80 vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - vshr.u8 q2, q2, #1 ; a = a / 2 - vqadd.u8 q9, q9, q2 ; a = b + a - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - vmov.u8 q0, #0x80 ; 0x80 + vcge.u8 q15, q1, q15 ; vp8_filter() function ; convert to signed - veor q7, q7, q0 ; qs0 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - veor q8, q8, q0 ; qs1 + veor q7, q7, q10 ; qs0 + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 vmov.u8 q10, #3 ; #3 vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q11, d15, d13 + vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 + vmovl.u8 q4, d20 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) @@ -378,19 +373,20 @@ vshr.s8 q2, q2, #3 ; Filter2 >>= 3 vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) ; outer tap adjustments: ++vp8_filter >> 1 vrshr.s8 q1, q1, #1 vbic q1, q1, q14 ; vp8_filter &= ~hev - + vmov.u8 q0, #0x80 ; 0x80 vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter) vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter) - veor q5, q13, q0 ; *op1 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 veor q8, q12, q0 ; *oq1 = u^0x80 bx lr diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm index 7c5ea3644..adf848b9c 100644 --- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm @@ -9,99 +9,109 @@ ; - EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + EXPORT |vp8_loop_filter_bhs_neon| + EXPORT |vp8_loop_filter_mbhs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. 
-; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_horizontal_edge_neon| PROC - sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld1.u8 {q6}, [r0], r1 ; p0 - vmov.u8 q0, #0x80 ; 0x80 - vld1.u8 {q7}, [r0], r1 ; q0 - vmov.u8 q10, #0x03 ; 0x03 - vld1.u8 {q8}, [r0] ; q1 + sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines + + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q5}, [r3@128], r1 ; p0 + vld1.u8 {q8}, [r0@128] ; q1 + vld1.u8 {q6}, [r3@128] ; p1 - ;vp8_filter_mask() function vabd.u8 q15, q6, q7 ; abs(p0 - q0) vabd.u8 q14, q5, q8 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q13, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - ;vp8_filter() function veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 -;;;;;;;;;; - ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q3, d15, d13 vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) - ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) - vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q12, q3, q3 + vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) + vmul.s16 q3, q3, q13 + vmov.u8 q10, #0x03 ; 0x03 vmov.u8 q9, #0x04 ; 0x04 - vadd.s16 q2, q2, q11 - vadd.s16 q3, q3, q12 - vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) vaddw.s8 q3, q3, d9 - ;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d9, q3 -;;;;;;;;;;;;; - vand q4, q4, q15 ; vp8_filter &= mask + vand q14, q4, q15 ; vp8_filter &= mask - vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q4, #3 ; Filter1 >>= 3 + vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - sub r0, r0, r1, lsl #1 + sub r0, r0, r1 ;calculate output vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) - add r3, r0, r1 - veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 - vst1.u8 {q6}, [r0] ; store op0 - vst1.u8 {q7}, [r3] ; store oq0 + vst1.u8 {q6}, [r3@128] ; store op0 + vst1.u8 {q7}, [r0@128] ; store oq0 bx lr ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| -;----------------- +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bhs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate blim + + add r0, r0, r1, lsl #2 ; src = y_ptr + 
4 * y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 + add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride + pop {r4, lr} + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbhs_neon| PROC + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm index a7f7b690e..e690df2f7 100644 --- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -9,59 +9,54 @@ ; - EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + EXPORT |vp8_loop_filter_bvs_neon| + EXPORT |vp8_loop_filter_mbvs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. -; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_vertical_edge_neon| PROC sub r0, r0, #2 ; move src pointer down by 2 columns - - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 - - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vmov.u8 q0, #0x80 ; 0x80 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vmov.u8 q11, #0x03 ; 0x03 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vmov.u8 q12, #0x04 ; 0x04 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + add r12, r1, r1 + add r3, r0, r1 + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], 
r12 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] vswp d7, d10 vswp d12, d9 - ;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6 ;vp8_filter_mask() function ;vp8_hevmask() function sub r0, r0, r1, lsl #4 vabd.u8 q15, q5, q4 ; abs(p0 - q0) vabd.u8 q14, q3, q6 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q11, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value @@ -69,80 +64,91 @@ veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - ;vp8_filter() function -;;;;;;;;;; - ;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0) vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) vsubl.s8 q13, d9, d11 - vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vmul.s16 q13, q13, q11 - ;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q14, q13, q13 - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q14 + vmov.u8 q11, #0x03 ; 0x03 + vmov.u8 q12, #0x04 ; 0x04 - ;vqadd.s8 q1, q1, q2 - vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 + vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d29 - vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d3, q13 + vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d29, q13 add r0, r0, #1 - add r2, r0, r1 -;;;;;;;;;;; + add r3, r0, r1 - vand q1, q1, q15 ; vp8_filter &= mask + vand q14, q14, q15 ; vp8_filter &= mask - vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vshr.s8 q14, q3, #3 ; Filter1 >>= 3 ;calculate output - vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) - veor q7, q10, q0 ; *oq0 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 - - add r3, r2, r1 + veor q7, q10, q0 ; *oq0 = u^0x80 + add r12, r1, r1 vswp d13, d14 - add r12, r3, r1 ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0] - vst2.8 {d12[1], d13[1]}, [r2] - vst2.8 {d12[2], d13[2]}, [r3] - vst2.8 {d12[3], d13[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d12[4], d13[4]}, [r12] - vst2.8 {d12[5], d13[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d12[6], d13[6]}, [r0] - vst2.8 {d12[7], d13[7]}, [r2], r1 - add r3, r2, r1 - vst2.8 {d14[0], d15[0]}, [r2] - vst2.8 {d14[1], d15[1]}, [r3], r1 - add r12, r3, r1 - vst2.8 {d14[2], d15[2]}, [r3] - vst2.8 {d14[3], d15[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d14[4], d15[4]}, [r12] - vst2.8 {d14[5], d15[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d14[6], d15[6]}, [r0] - vst2.8 {d14[7], d15[7]}, [r2] + vst2.8 {d12[0], d13[0]}, [r0], r12 + vst2.8 {d12[1], d13[1]}, [r3], r12 + vst2.8 {d12[2], d13[2]}, [r0], r12 + vst2.8 {d12[3], d13[3]}, 
[r3], r12 + vst2.8 {d12[4], d13[4]}, [r0], r12 + vst2.8 {d12[5], d13[5]}, [r3], r12 + vst2.8 {d12[6], d13[6]}, [r0], r12 + vst2.8 {d12[7], d13[7]}, [r3], r12 + vst2.8 {d14[0], d15[0]}, [r0], r12 + vst2.8 {d14[1], d15[1]}, [r3], r12 + vst2.8 {d14[2], d15[2]}, [r0], r12 + vst2.8 {d14[3], d15[3]}, [r3], r12 + vst2.8 {d14[4], d15[4]}, [r0], r12 + vst2.8 {d14[5], d15[5]}, [r3], r12 + vst2.8 {d14[6], d15[6]}, [r0], r12 + vst2.8 {d14[7], d15[7]}, [r3] bx lr ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| -;----------------- - +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bvs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + mov r4, r0 + add r0, r0, #4 + vdup.s8 q1, r3 ; duplicate blim + bl vp8_loop_filter_simple_vertical_edge_neon + ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1 + add r0, r4, #8 + bl vp8_loop_filter_simple_vertical_edge_neon + add r0, r4, #12 + pop {r4, lr} + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbvs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| END diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm index 72f0f9271..f41c156df 100644 --- a/vp8/common/arm/neon/mbloopfilter_neon.asm +++ b/vp8/common/arm/neon/mbloopfilter_neon.asm @@ -14,155 +14,143 @@ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. -; All 16 elements in these variables are equal. 
- ; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {q3}, [r0], r1 ; p3 - vld1.s8 {d2[], d3[]}, [r3] ; limit - vld1.u8 {q4}, [r0], r1 ; p2 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.u8 {q6}, [r0], r1 ; p0 - vld1.u8 {q7}, [r0], r1 ; q0 - vld1.u8 {q8}, [r0], r1 ; q1 - vld1.u8 {q9}, [r0], r1 ; q2 - vld1.u8 {q10}, [r0], r1 ; q3 + push {lr} + add r1, r1, r1 ; double stride + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + add r12, r0, r1, lsr #1 ; move src pointer up by 1 line + + vld1.u8 {q3}, [r0@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r0@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r0@128], r1 ; q2 + vld1.u8 {q10}, [r12@128], r1 ; q3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - add r0, r0, r1 - add r2, r0, r1 - add r3, r2, r1 - - vst1.u8 {q4}, [r0] ; store op2 - vst1.u8 {q5}, [r2] ; store op1 - vst1.u8 {q6}, [r3], r1 ; store op0 - add r12, r3, r1 - vst1.u8 {q7}, [r3] ; store oq0 - vst1.u8 {q8}, [r12], r1 ; store oq1 - vst1.u8 {q9}, [r12] ; store oq2 - - ldmia sp!, {pc} + sub r12, r12, r1, lsl #2 + add r0, r12, r1, lsr #1 + + vst1.u8 {q4}, [r12@128],r1 ; store op2 + vst1.u8 {q5}, [r0@128],r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r0@128],r1 ; store oq0 + vst1.u8 {q8}, [r12@128] ; store oq1 + vst1.u8 {q9}, [r0@128] ; store oq2 + + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| ; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v + |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r0], r1 ; p3 - vld1.u8 {d7}, [r3], r1 ; p3 - vld1.u8 {d8}, [r0], r1 ; p2 - vld1.u8 {d9}, [r3], r1 ; p2 - vld1.u8 {d10}, [r0], r1 ; p1 - vld1.u8 {d11}, [r3], r1 ; p1 - vld1.u8 {d12}, [r0], r1 ; p0 - vld1.u8 {d13}, [r3], r1 ; p0 - vld1.u8 {d14}, [r0], r1 ; q0 - vld1.u8 {d15}, [r3], r1 ; q0 - vld1.u8 {d16}, [r0], r1 ; q1 - vld1.u8 {d17}, [r3], r1 ; q1 - vld1.u8 {d18}, [r0], r1 ; q2 - vld1.u8 {d19}, [r3], r1 ; q2 - vld1.u8 {d20}, [r0], r1 ; q3 - vld1.u8 {d21}, [r3], r1 ; q3 - - vld1.s8 {d4[], 
d5[]}, [r12] ; thresh + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r0@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r0@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r0@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r0@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r0@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r0@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r0@64], r1 ; q3 + vld1.u8 {d21}, [r12@64], r1 ; q3 bl vp8_mbloop_filter_neon sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 + sub r12, r12, r1, lsl #3 add r0, r0, r1 - add r3, r3, r1 - - vst1.u8 {d8}, [r0], r1 ; store u op2 - vst1.u8 {d9}, [r3], r1 ; store v op2 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r3], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r3], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r3], r1 ; store v oq0 - vst1.u8 {d16}, [r0], r1 ; store u oq1 - vst1.u8 {d17}, [r3], r1 ; store v oq1 - vst1.u8 {d18}, [r0], r1 ; store u oq2 - vst1.u8 {d19}, [r3], r1 ; store v oq2 - - ldmia sp!, {pc} + add r12, r12, r1 + + vst1.u8 {d8}, [r0@64], r1 ; store u op2 + vst1.u8 {d9}, [r12@64], r1 ; store v op2 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r12@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r12@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r12@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64], r1 ; store u oq1 + vst1.u8 {d17}, [r12@64], r1 ; store v oq1 + vst1.u8 {d18}, [r0@64], r1 ; store u oq2 + vst1.u8 {d19}, [r12@64], r1 ; store v oq2 + + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| ; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} + push {lr} + ldr r12, [sp, #4] ; load thresh sub r0, r0, #4 ; move src pointer down by 4 columns + vdup.s8 q2, r12 ; thresh + add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - ldr r12, [sp, #4] ; load thresh pointer + vld1.u8 {d7}, [r12], r1 ; load second 8-line src data vld1.u8 {d8}, [r0], r1 - sub sp, sp, #32 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 vld1.u8 {d20}, [r0], r1 - - vld1.u8 {d7}, [r0], r1 ; load second 8-line src data - vld1.u8 {d9}, [r0], r1 - vld1.u8 {d11}, [r0], r1 - vld1.u8 {d13}, [r0], r1 - vld1.u8 {d15}, [r0], r1 - vld1.u8 {d17}, [r0], r1 - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d21}, [r0], r1 + vld1.u8 
{d21}, [r12], r1 ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -180,29 +168,17 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.s8 {d2[], d3[]}, [r3] ; limit - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! + sub r0, r0, r1, lsl #3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #4 - - add r2, r0, r1 - - add r3, r2, r1 - - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! + sub r12, r12, r1, lsl #3 ;transpose to 16x8 matrix vtrn.32 q3, q7 vtrn.32 q4, q8 vtrn.32 q5, q9 vtrn.32 q6, q10 - add r12, r3, r1 vtrn.16 q3, q5 vtrn.16 q4, q6 @@ -215,36 +191,30 @@ vtrn.8 q9, q10 ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0] - vst1.8 {d8}, [r2] - vst1.8 {d10}, [r3] - vst1.8 {d12}, [r12], r1 - add r0, r12, r1 - vst1.8 {d14}, [r12] - vst1.8 {d16}, [r0], r1 - add r2, r0, r1 - vst1.8 {d18}, [r0] - vst1.8 {d20}, [r2], r1 - add r3, r2, r1 - vst1.8 {d7}, [r2] - vst1.8 {d9}, [r3], r1 - add r12, r3, r1 - vst1.8 {d11}, [r3] + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 vst1.8 {d13}, [r12], r1 - add r0, r12, r1 - vst1.8 {d15}, [r12] - vst1.8 {d17}, [r0], r1 - add r2, r0, r1 - vst1.8 {d19}, [r0] - vst1.8 {d21}, [r2] - - ldmia sp!, {pc} + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| ; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, @@ -253,30 +223,29 @@ ; sp const signed char *thresh, ; sp+4 unsigned char *v |vp8_mbloop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, #4 ; move src pointer down by 4 columns - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - - sub r3, r3, #4 ; move v pointer down by 4 columns + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move u pointer down by 4 columns + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, #4 ; move v pointer down by 4 columns vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d7}, [r12], r1 ;load v data vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r3], r1 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r3], r1 + vld1.u8 {d11}, [r12], r1 vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r3], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r3], r1 + vld1.u8 {d15}, [r12], r1 vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r3], r1 + vld1.u8 {d17}, [r12], r1 vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r3], r1 + vld1.u8 {d19}, [r12], r1 vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r3], r1 + vld1.u8 {d21}, [r12], r1 ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -294,19 +263,11 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - sub sp, sp, #32 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! + sub r0, r0, r1, lsl #3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 - - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! 
+ sub r12, r12, r1, lsl #3 ;transpose to 16x8 matrix vtrn.32 q3, q7 @@ -326,23 +287,23 @@ ;store op2, op1, op0, oq0, oq1, oq2 vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r3], r1 + vst1.8 {d7}, [r12], r1 vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r3], r1 + vst1.8 {d9}, [r12], r1 vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r3], r1 + vst1.8 {d11}, [r12], r1 vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r3], r1 + vst1.8 {d13}, [r12], r1 vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r3], r1 + vst1.8 {d15}, [r12], r1 vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r3], r1 + vst1.8 {d17}, [r12], r1 vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r3], r1 - vst1.8 {d20}, [r0], r1 - vst1.8 {d21}, [r3], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| ; void vp8_mbloop_filter_neon() @@ -350,26 +311,19 @@ ; functions do the necessary load, transpose (if necessary), preserve (if ; necessary) and store. -; TODO: -; The vertical filter writes p3/q3 back out because two 4 element writes are -; much simpler than ordering and writing two 3 element sets (or three 2 elements -; sets, or whichever other combinations are possible). -; If we can preserve q3 and q10, the vertical filter will be able to avoid -; storing those values on the stack and reading them back after the filter. - ; r0,r1 PRESERVE -; r2 flimit -; r3 PRESERVE -; q1 limit +; r2 mblimit +; r3 limit + ; q2 thresh -; q3 p3 +; q3 p3 PRESERVE ; q4 p2 ; q5 p1 ; q6 p0 ; q7 q0 ; q8 q1 ; q9 q2 -; q10 q3 +; q10 q3 PRESERVE |vp8_mbloop_filter_neon| PROC @@ -378,12 +332,12 @@ vabd.u8 q12, q4, q5 ; abs(p2 - p1) vabd.u8 q13, q5, q6 ; abs(p1 - p0) vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q1, q9, q8 ; abs(q2 - q1) vabd.u8 q0, q10, q9 ; abs(q3 - q2) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q0 + vmax.u8 q1, q1, q0 vmax.u8 q15, q11, q12 vabd.u8 q12, q6, q7 ; abs(p0 - q0) @@ -391,44 +345,46 @@ ; vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q3 + vmax.u8 q15, q15, q1 - vld1.s8 {d4[], d5[]}, [r2] ; flimit + vdup.u8 q1, r3 ; limit + vdup.u8 q2, r2 ; mblimit vmov.u8 q0, #0x80 ; 0x80 - vadd.u8 q2, q2, q2 ; flimit * 2 - vadd.u8 q2, q2, q1 ; flimit * 2 + limit vcge.u8 q15, q1, q15 vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vshr.u8 q1, q1, #1 ; a = a / 2 - vqadd.u8 q12, q12, q1 ; a = b + a - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vmov.u16 q11, #3 ; #3 ; vp8_filter ; convert to signed veor q7, q7, q0 ; qs0 + vshr.u8 q1, q1, #1 ; a = a / 2 veor q6, q6, q0 ; ps0 veor q5, q5, q0 ; ps1 + + vqadd.u8 q12, q12, q1 ; a = b + a + veor q8, q8, q0 ; qs1 veor q4, q4, q0 ; ps2 veor q9, q9, q0 ; qs2 vorr q14, q13, q14 ; vp8_hevmask + vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vsubl.s8 q2, d14, d12 ; qs0 - ps0 vsubl.s8 q13, d15, d13 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0) - vadd.s16 q11, q13, q13 + vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vand q15, q15, q12 ; vp8_filter_mask - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q11 + vmul.i16 q13, q13, q11 vmov.u8 q12, #3 ; #3 @@ -447,23 +403,19 @@ vand q13, q1, q14 ; Filter2 &= hev - vmov.u8 d7, #9 ; #9 - vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - vmov.u8 d6, #18 ; #18 + vmov q0, q15 vshr.s8 q2, q2, #3 ; Filter1 >>= 3 vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - 
vmov q10, q15 + vmov q11, q15 vmov q12, q15 vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - vmov.u8 d5, #27 ; #27 - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) vbic q1, q1, q14 ; vp8_filter &= ~hev @@ -471,35 +423,43 @@ ; roughly 1/7th difference across boundary ; roughly 2/7th difference across boundary ; roughly 3/7th difference across boundary - vmov q11, q15 + + vmov.u8 d5, #9 ; #9 + vmov.u8 d4, #18 ; #18 + vmov q13, q15 vmov q14, q15 - vmlal.s8 q10, d2, d7 ; Filter2 * 9 - vmlal.s8 q11, d3, d7 - vmlal.s8 q12, d2, d6 ; Filter2 * 18 - vmlal.s8 q13, d3, d6 - vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 + vmlal.s8 q11, d3, d5 + vmov.u8 d5, #27 ; #27 + vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 + vmlal.s8 q13, d3, d4 + vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 vmlal.s8 q15, d3, d5 - vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d21, q11, #7 + + vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d1, q11, #7 vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) vqshrn.s16 d25, q13, #7 vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) vqshrn.s16 d29, q15, #7 - vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u) - vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u) + vmov.u8 q1, #0x80 ; 0x80 + + vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) + vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - veor q9, q11, q0 ; *oq2 = s^0x80 - veor q4, q10, q0 ; *op2 = s^0x80 - veor q8, q13, q0 ; *oq1 = s^0x80 - veor q5, q12, q0 ; *op2 = s^0x80 - veor q7, q15, q0 ; *oq0 = s^0x80 - veor q6, q14, q0 ; *op0 = s^0x80 + + veor q9, q11, q1 ; *oq2 = s^0x80 + veor q4, q0, q1 ; *op2 = s^0x80 + veor q8, q13, q1 ; *oq1 = s^0x80 + veor q5, q12, q1 ; *op2 = s^0x80 + veor q7, q15, q1 ; *oq0 = s^0x80 + veor q6, q14, q1 ; *op0 = s^0x80 bx lr ENDP ; |vp8_mbloop_filter_neon|
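
For reference, below is a minimal C sketch (not part of the patch) of the calling convention this change introduces: the NEON edge filters now take single blimit/limit/thresh byte values, which the C wrappers read out of loop_filter_info and which the assembly duplicates across a q register with vdup, instead of pointers to 16-element arrays. The typedefs, the NEON function names, and the loop_filter_info fields are taken from the diff above; the wrapper name filter_mb_h_edges_sketch and any concrete pointer or stride values a caller would pass are illustrative assumptions.

/* Sketch only -- not part of the patch. */
#include "vp8/common/loopfilter.h"        /* loop_filter_info */

/* Prototypes mirroring the typedefs added to loopfilter_arm.c. */
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
        unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
        unsigned char blimit, unsigned char limit, unsigned char thresh,
        unsigned char *v);

extern loopfilter_y_neon  vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;

/* Hypothetical caller, shaped like the updated vp8_loop_filter_mbh_neon. */
static void filter_mb_h_edges_sketch(unsigned char *y_ptr, unsigned char *u_ptr,
                                     unsigned char *v_ptr, int y_stride,
                                     int uv_stride, loop_filter_info *lfi)
{
    /* Dereference once; every NEON call below takes plain byte values. */
    unsigned char mblim   = *lfi->mblim;    /* macroblock-edge limit     */
    unsigned char lim     = *lfi->lim;      /* interior difference limit */
    unsigned char hev_thr = *lfi->hev_thr;  /* high-edge-variance thresh */

    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride,
                                             mblim, lim, hev_thr);
    if (u_ptr)  /* one UV call filters both chroma planes */
        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride,
                                                  mblim, lim, hev_thr, v_ptr);
}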