INSTANTIATE_TEST_CASE_P(
MMX, Loop8Test6Param,
::testing::Values(
- make_tuple(&vpx_lpf_horizontal_4_mmx, &vpx_lpf_horizontal_4_c, 8, 1),
+ make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_mmx>,
+ &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
make_tuple(&wrapper_nc<vpx_lpf_vertical_4_mmx>,
&wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
#endif // HAVE_MMX
&wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
make_tuple(&wrapper_nc<vpx_lpf_vertical_8_neon>,
&wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
- make_tuple(&vpx_lpf_horizontal_4_neon,
- &vpx_lpf_horizontal_4_c, 8, 1),
+ make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_neon>,
+ &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
make_tuple(&wrapper_nc<vpx_lpf_vertical_4_neon>,
&wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
DSPR2, Loop8Test6Param,
::testing::Values(
- make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8, 1),
+ make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_dspr2>,
+ &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>,
&wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
make_tuple(&vpx_lpf_horizontal_16_dspr2,
INSTANTIATE_TEST_CASE_P(
MSA, Loop8Test6Param,
::testing::Values(
- make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8, 1),
+ make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_msa>,
+ &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>,
&wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
} else {
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
else if (mask_4x4_int & 2)
vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
- lfin->lim, lfin->hev_thr, 1);
+ lfin->lim, lfin->hev_thr);
}
count = 2;
} else {
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
}
} else if (mask_4x4 & 1) {
if ((mask_4x4 & 3) == 3) {
} else {
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
else if (mask_4x4_int & 2)
vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
- lfin->lim, lfin->hev_thr, 1);
+ lfin->lim, lfin->hev_thr);
}
count = 2;
} else {
- vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+ vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
}
} else if (mask_4x4_int & 1) {
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
}
}
s += 8 * count;
} else {
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
else if (mask_4x4_int & 2)
vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
- lfin->lim, lfin->hev_thr, 1);
+ lfin->lim, lfin->hev_thr);
}
count = 2;
} else {
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
}
} else if (mask_4x4 & 1) {
if ((mask_4x4 & 3) == 3) {
} else {
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
else if (mask_4x4_int & 2)
vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
- lfin->lim, lfin->hev_thr, 1);
+ lfin->lim, lfin->hev_thr);
}
count = 2;
} else {
- vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+ vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
if (mask_4x4_int & 1)
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
}
} else if (mask_4x4_int & 1) {
vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ lfi->hev_thr);
}
}
s += 8 * count;
; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
;
; void vpx_lpf_horizontal_4_neon(uint8_t *s,
; int p /* pitch */,
; const uint8_t *blimit,
; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
+; const uint8_t *thresh)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh
-; sp+4 int count
|vpx_lpf_horizontal_4_neon| PROC
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
ldr r2, [sp, #4] ; load thresh
add r1, r1, r1 ; double pitch
- cmp r12, #0
- beq end_vpx_lf_h_edge
-
vld1.8 {d1[]}, [r3] ; duplicate *limit
vld1.8 {d2[]}, [r2] ; duplicate *thresh
-count_lf_h_loop
sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
add r3, r2, r1, lsr #1 ; set to 3 lines down
vst1.u8 {d6}, [r2@64], r1 ; store oq0
vst1.u8 {d7}, [r3@64], r1 ; store oq1
- add r0, r0, #8
- subs r12, r12, #1
- bne count_lf_h_loop
-
-end_vpx_lf_h_edge
pop {pc}
ENDP ; |vpx_lpf_horizontal_4_neon|
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh,
- int count) {
+ const uint8_t *thresh) {
int i;
uint8_t *s, *psrc;
uint8x8_t dblimit, dlimit, dthresh;
uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
- if (count == 0) // end_vpx_lf_h_edge
- return;
-
dblimit = vld1_u8(blimit);
dlimit = vld1_u8(limit);
dthresh = vld1_u8(thresh);
psrc = src - (pitch << 2);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < 1; i++) {
s = psrc + i * 8;
d3u8 = vld1_u8(s);
void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh, int count) {
+ const uint8_t *thresh) {
int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
- for (i = 0; i < 8 * count; ++i) {
+ for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit,
const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
- vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
- vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
+ vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
}
void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr,
- int32_t count) {
+ const uint8_t *thresh_ptr) {
uint64_t p1_d, p0_d, q0_d, q1_d;
v16u8 mask, hev, flat, thresh, b_limit, limit;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
- (void)count;
-
/* load vector elements */
LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh,
- int count) {
+ const uint8_t *thresh) {
uint8_t i;
uint32_t mask;
uint32_t hev;
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
- vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+ vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}
void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
-add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
; int src_pixel_step,
; const char *blimit,
; const char *limit,
-; const char *thresh,
-; int count
+; const char *thresh
;)
global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
sym(vpx_lpf_horizontal_4_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
mov rsi, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
- movsxd rcx, dword ptr arg(5) ;count
-.next8_h:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
pxor mm7, [GLOBAL(t80)] ; unoffset
movq [rdi], mm7 ; write back
- add rsi,8
- neg rax
- dec rcx
- jnz .next8_h
-
add rsp, 32
pop rsp
; begin epilog