rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
- rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+ rtcd->loopfilter.simple_mb_v =
+ vp8_loop_filter_simple_vertical_edge_armv6;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
- rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+ rtcd->loopfilter.simple_mb_h =
+ vp8_loop_filter_simple_horizontal_edge_armv6;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
-;r2 const char *flimit,
+;r2 const char *blimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count
-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r6], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|Hnext8|
; vp8_filter_mask() function
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r6], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|MBHnext8|
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
ldr r7, [src], pstep
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
ldr r8, [src], pstep
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r12], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|Vnext8|
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
pld [src, #23]
ldr r7, [src], pstep
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
pld [src, #23]
ldr r8, [src], pstep
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r12], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
pld [src, #23]
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|MBVnext8|
; vp8_filter_mask() function
str lr, [sp, #8]
ldr lr, [src], pstep
+
TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
ldr lr, [sp, #8] ; load back (f)limit accumulator
beq mbvskip_filter ; skip filtering
+
;vp8_hevmask() function
;calculate high edge variance
smlabb r8, r6, lr, r7
smlatb r6, r6, lr, r7
smlabb r9, r10, lr, r7
+
smlatb r10, r10, lr, r7
ssat r8, #8, r8, asr #7
ssat r6, #8, r6, asr #7
MEND
+
src RN r0
pstep RN r1
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
-;r2 const char *flimit,
-;r3 const char *limit,
-;stack const char *thresh,
-;stack int count
-
-; All 16 elements in flimit are equal. So, in the code, only one load is needed
-; for flimit. Same applies to limit. thresh is not used in simple looopfilter
+;r2 const char *blimit
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
- ldr r12, [r3] ; limit
+ ldrb r12, [r2] ; blimit
ldr r3, [src, -pstep, lsl #1] ; p1
ldr r4, [src, -pstep] ; p0
ldr r5, [src] ; q0
ldr r6, [src, pstep] ; q1
- ldr r7, [r2] ; flimit
+ orr r12, r12, r12, lsl #8 ; blimit
ldr r2, c0x80808080
- ldr r9, [sp, #40] ; count for 8-in-parallel
- uadd8 r7, r7, r7 ; flimit * 2
- mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
- uadd8 r12, r7, r12 ; flimit * 2 + limit
+ orr r12, r12, r12, lsl #16 ; blimit
+ mov r9, #4 ; double the count. we're doing 4 at a time
mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
- ldr r12, [r2] ; r12: flimit
+ ldrb r12, [r2] ; r12: blimit
ldr r2, c0x80808080
- ldr r7, [r3] ; limit
+ orr r12, r12, r12, lsl #8
; load soure data to r7, r8, r9, r10
ldrh r3, [src, #-2]
pld [src, #23] ; preload for next block
ldrh r4, [src], pstep
- uadd8 r12, r12, r12 ; flimit * 2
+ orr r12, r12, r12, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
- uadd8 r12, r12, r7 ; flimit * 2 + limit
pkhbt r7, r3, r4, lsl #16
ldrh r3, [src, #-2]
pld [src, #23]
ldrh r4, [src], pstep
- ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
pkhbt r8, r5, r6, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
- mov r11, r11, lsl #1 ; 4-in-parallel
+ mov r11, #4 ; double the count. we're doing 4 at a time
|simple_vnext8|
; vp8_simple_filter_mask() function
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#endif
#if HAVE_ARMV7
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-}
-
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-}
-
/* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
}
#endif
extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
#undef vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
#undef vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6