*q9s16 = vcombine_s16(d18s16, d19s16);
}
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
uint8x8_t d26u8, d27u8;
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
uint32x2_t d26u32, d27u32;
switch (tx_type) {
case 0: // idct_idct is not supported. Fall back to C
- vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+ vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
return;
case 1: // iadst_idct
// generate constants
q9s16 = vrshrq_n_s16(q9s16, 4);
d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
- dest += dest_stride;
+ dest += stride;
d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
- dest += dest_stride;
+ dest += stride;
d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
- dest += dest_stride;
+ dest += stride;
d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
- dest -= dest_stride;
+ dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
- dest -= dest_stride;
+ dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
- dest -= dest_stride;
+ dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
}
*q15s16 = vsubq_s16(q5s16, q4s16);
}
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i;
uint8_t *d1, *d2;
uint8x8_t d0u8, d1u8, d2u8, d3u8;
switch (tx_type) {
case 0: // idct_idct is not supported. Fall back to C
- vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+ vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
return;
case 1: // iadst_idct
// generate IDCT constants
}
d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
+ d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
+ d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
+ d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
+ d2 += stride;
}
}
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
int16_t *outptr = out;
switch (tx_type) {
case DCT_DCT: // DCT in both horizontal and vertical
vpx_idct4_rows_dspr2(input, outptr);
- vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride);
break;
case ADST_DCT: // ADST in vertical, DCT in horizontal
vpx_idct4_rows_dspr2(input, outptr);
iadst4_dspr2(outptr, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) +
+ dest[j * stride + i]);
outptr += 4;
}
temp_in[i * 4 + j] = out[j * 4 + i];
}
}
- vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, stride);
break;
case ADST_ADST: // ADST in both directions
for (i = 0; i < 4; ++i) {
iadst4_dspr2(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) +
+ dest[j * stride + i]);
}
break;
default: printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break;
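For reference, every hunk in this patch renames the same per-pixel store pattern: the column-transform output is rounded, added to the prediction already in dest, clipped to 8 bits, and written back at dest[j * stride + i]. A minimal standalone C sketch of that pattern for the 4x4 case (round_power_of_two and clip_pixel_8 are local stand-ins for the libvpx ROUND_POWER_OF_TWO and clip_pixel macros; the shift of 4 matches the 4x4 transform, while 8x8 uses 5 and 16x16 uses 6 as seen in the hunks above):

#include <stdint.h>

/* Round-to-nearest right shift, stand-in for ROUND_POWER_OF_TWO. */
static int round_power_of_two(int value, int n) {
  return (value + (1 << (n - 1))) >> n;
}

/* Clamp to the 8-bit pixel range, stand-in for clip_pixel. */
static uint8_t clip_pixel_8(int value) {
  return (uint8_t)(value < 0 ? 0 : (value > 255 ? 255 : value));
}

/* Add one transformed 4-sample column into column i of the destination. */
static void add_column_4(const int16_t temp_out[4], uint8_t *dest, int stride,
                         int i) {
  int j;
  for (j = 0; j < 4; ++j)
    dest[j * stride + i] = clip_pixel_8(round_power_of_two(temp_out[j], 4) +
                                        dest[j * stride + i]);
}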
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out;
switch (tx_type) {
case DCT_DCT: // DCT in both horizontal and vertical
idct8_rows_dspr2(input, outptr, 8);
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
break;
case ADST_DCT: // ADST in vertical, DCT in horizontal
idct8_rows_dspr2(input, outptr, 8);
iadst8_dspr2(&out[i * 8], temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
+ dest[j * stride + i]);
}
break;
case DCT_ADST: // DCT in vertical, ADST in horizontal
temp_in[i * 8 + j] = out[j * 8 + i];
}
}
- idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&temp_in[0], dest, stride);
break;
case ADST_ADST: // ADST in both directions
for (i = 0; i < 8; ++i) {
iadst8_dspr2(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
+ dest[j * stride + i]);
}
break;
default: printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
} else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2/;
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht8x8_64_add sse2/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
} else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
#
# Note: as optimized versions of these functions are added, we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
- add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
}
#include "vpx_dsp/inv_txfm.h"
void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
+ int stride, int bd) {
int i;
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
const tran_low_t out0 = dct_const_round_shift(input[0] * cospi_16_64);
for (i = 0; i < 2; i++) {
d0 = vld1_u16(dest);
- d1 = vld1_u16(dest + dest_stride);
+ d1 = vld1_u16(dest + stride);
a = vreinterpretq_s16_u16(vcombine_u16(d0, d1));
a = vaddq_s16(dc, a);
a = vminq_s16(a, max);
b = vqshluq_n_s16(a, 0);
vst1_u16(dest, vget_low_u16(b));
- dest += dest_stride;
+ dest += stride;
vst1_u16(dest, vget_high_u16(b));
- dest += dest_stride;
+ dest += stride;
}
}
}
void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
+ int stride, int bd) {
DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585,
6270 };
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
}
d0 = vreinterpret_s16_u16(vld1_u16(dst));
- dst += dest_stride;
+ dst += stride;
d1 = vreinterpret_s16_u16(vld1_u16(dst));
- dst += dest_stride;
+ dst += stride;
d2 = vreinterpret_s16_u16(vld1_u16(dst));
- dst += dest_stride;
+ dst += stride;
d3 = vreinterpret_s16_u16(vld1_u16(dst));
d01 = vcombine_s16(d0, d1);
d32 = vcombine_s16(d3, d2);
d32_u16 = vqshluq_n_s16(d32, 0);
vst1_u16(dest, vget_low_u16(d01_u16));
- dest += dest_stride;
+ dest += stride;
vst1_u16(dest, vget_high_u16(d01_u16));
- dest += dest_stride;
+ dest += stride;
vst1_u16(dest, vget_high_u16(d32_u16));
- dest += dest_stride;
+ dest += stride;
vst1_u16(dest, vget_low_u16(d32_u16));
}
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
+;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
#include "vpx_ports/mem.h"
void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
uint8x8_t d2u8, d3u8, d30u8, d31u8;
uint64x1_t d2u64, d3u64, d4u64, d5u64;
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
for (j = 0; j < 2; j++) {
d2u64 = vld1_u64((const uint64_t *)d1);
d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
+ d1 += stride;
d4u64 = vld1_u64((const uint64_t *)d1);
d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
+ d1 += stride;
q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
+ d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
- d2 += dest_stride;
+ d2 += stride;
}
}
}
; int16_t *pass1_output,
; int16_t skip_adding,
; uint8_t *dest,
-; int dest_stride)
+; int stride)
;
; r0 const int16_t *src
; r1 int16_t *output
; r2 int16_t *pass1_output
; r3 int16_t skip_adding
; r4 uint8_t *dest
-; r5 int dest_stride
+; r5 int stride
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
ldr r7, [sp, #28] ; dest used to save element 0-7
mov r9, r7 ; save dest pointer for later use
- ldr r8, [sp, #32] ; load dest_stride
+ ldr r8, [sp, #32] ; load stride
; stage 7
; load the data in pass1
vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
; store the data output 8,9,10,11,12,13,14,15
vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q8 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q9, q9, #6
- vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q9, q9, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q9 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
vld1.64 {d13}, [r7], r8 ; load destination data
vrshr.s16 q2, q2, #6
- vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q2, q2, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q2 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q3, q3, #6
- vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q3, q3, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q3 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
vld1.64 {d13}, [r7], r8 ; load destination data
vrshr.s16 q4, q4, #6
- vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q4, q4, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q4 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q5, q5, #6
- vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q5, q5, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q5 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
vld1.64 {d13}, [r7], r8 ; load destination data
vrshr.s16 q14, q14, #6
- vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q14, q14, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q14 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q15, q15, #6
- vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q15, q15, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q15 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
b end_idct16x16_pass2
; int16_t *pass1_output,
; int16_t skip_adding,
; uint8_t *dest,
-; int dest_stride)
+; int stride)
;
; r0 const tran_low_t *src
; r1 int16_t *output
; r2 int16_t *pass1_output
; r3 int16_t skip_adding
; r4 uint8_t *dest
-; r5 int dest_stride
+; r5 int stride
|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC
LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
const int16x8_t s6, const int16x8_t s7,
int16_t *out, int16_t *pass1_output,
int16_t skip_adding, uint8_t *dest,
- int dest_stride) {
+ int stride) {
uint8_t *d;
uint8x8_t d12u8, d13u8;
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
q1s16 = vld1q_s16(pass1_output);
pass1_output += 8;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q12s16 = vaddq_s16(q0s16, q15s16);
q13s16 = vaddq_s16(q1s16, q14s16);
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
+ d += stride;
q14s16 = vsubq_s16(q1s16, q14s16);
q15s16 = vsubq_s16(q0s16, q15s16);
q11s16 = vld1q_s16(pass1_output);
pass1_output += 8;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q12s16 = vaddq_s16(q10s16, q5s16);
q13s16 = vaddq_s16(q11s16, q4s16);
q12s16 = vrshrq_n_s16(q12s16, 6);
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
+ d += stride;
q4s16 = vsubq_s16(q11s16, q4s16);
q5s16 = vsubq_s16(q10s16, q5s16);
q1s16 = vld1q_s16(pass1_output);
pass1_output += 8;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q12s16 = vaddq_s16(q0s16, q3s16);
q13s16 = vaddq_s16(q1s16, q2s16);
q12s16 = vrshrq_n_s16(q12s16, 6);
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
+ d += stride;
q2s16 = vsubq_s16(q1s16, q2s16);
q3s16 = vsubq_s16(q0s16, q3s16);
pass1_output += 8;
q11s16 = vld1q_s16(pass1_output);
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q12s16 = vaddq_s16(q10s16, q9s16);
q13s16 = vaddq_s16(q11s16, q8s16);
q12s16 = vrshrq_n_s16(q12s16, 6);
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
+ d += stride;
q8s16 = vsubq_s16(q11s16, q8s16);
q9s16 = vsubq_s16(q10s16, q9s16);
// store the data out 8,9,10,11,12,13,14,15
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q8s16 = vrshrq_n_s16(q8s16, 6);
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q9s16 = vrshrq_n_s16(q9s16, 6);
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q2s16 = vrshrq_n_s16(q2s16, 6);
q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q3s16 = vrshrq_n_s16(q3s16, 6);
q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q4s16 = vrshrq_n_s16(q4s16, 6);
q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q5s16 = vrshrq_n_s16(q5s16, 6);
q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
+ dest += stride;
q14s16 = vrshrq_n_s16(q14s16, 6);
q14u16 =
vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
+ d += stride;
d12s64 = vld1_s64((int64_t *)dest);
q15s16 = vrshrq_n_s16(q15s16, 6);
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
int16_t *pass1_output,
int16_t skip_adding, uint8_t *dest,
- int dest_stride) {
+ int stride) {
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
int16x8x2_t q0x2s16;
idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
q14s16, q15s16, out, pass1_output, skip_adding,
- dest, dest_stride);
+ dest, stride);
}
#if CONFIG_VP9_HIGHBITDEPTH
int16_t *out,
int16_t *pass1_output,
int16_t skip_adding,
- uint8_t *dest, int dest_stride) {
+ uint8_t *dest, int stride) {
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
int16x8x2_t q0x2s16;
idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
q14s16, q15s16, out, pass1_output, skip_adding,
- dest, dest_stride);
+ dest, stride);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
int16_t *pass1_output,
int16_t skip_adding, uint8_t *dest,
- int dest_stride);
+ int stride);
#if CONFIG_VP9_HIGHBITDEPTH
void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input,
int16_t *output);
int16_t *output,
int16_t *pass1_output,
int16_t skip_adding,
- uint8_t *dest, int dest_stride);
+ uint8_t *dest, int stride);
#else
#define vpx_idct16x16_256_add_neon_pass1_tran_low \
vpx_idct16x16_256_add_neon_pass1
#endif // HAVE_NEON_ASM
void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output,
- pass1_output, 0, dest, dest_stride);
+ pass1_output, 0, dest, stride);
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2_tran_low(input + 8 * 16 + 1,
- row_idct_output + 8, pass1_output,
- 0, dest, dest_stride);
+ vpx_idct16x16_256_add_neon_pass2_tran_low(
+ input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride);
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
- pass1_output, 1, dest, dest_stride);
+ pass1_output, 1, dest, stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output + 8, pass1_output, 1,
- dest + 8, dest_stride);
+ dest + 8, stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
}
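The comments above describe a separable two-pass layout: a row pass writes a 16x16 intermediate buffer (row_idct_output), and a column pass re-runs the same 1-D transform and, with skip_adding set, adds the rounded result into dest. A hedged scalar sketch of that structure (idct16_1d is a placeholder standing in for the stage 1-7 butterfly, not the real kernel):

#include <stdint.h>
#include <string.h>

/* Placeholder 1-D transform: the real code runs the 16-point IDCT butterfly
 * (stages 1-7) split across the pass1/pass2 helpers shown above. */
static void idct16_1d(const int16_t in[16], int16_t out[16]) {
  memcpy(out, in, 16 * sizeof(*in));
}

static uint8_t clip_pixel_8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void idct16x16_add_sketch(const int16_t *input, uint8_t *dest,
                                 int stride) {
  int16_t intermediate[16 * 16], col[16], out[16];
  int i, j;
  /* Pass over rows: transform each row of coefficients. */
  for (i = 0; i < 16; ++i) idct16_1d(input + i * 16, intermediate + i * 16);
  /* Pass over columns: transform, round by 2^6, and add into dest. */
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) col[j] = intermediate[j * 16 + i];
    idct16_1d(col, out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] =
          clip_pixel_8(((out[j] + 32) >> 6) + dest[j * stride + i]);
  }
}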
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
- pass1_output, 1, dest, dest_stride);
+ pass1_output, 1, dest, stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output + 8, pass1_output, 1,
- dest + 8, dest_stride);
+ dest + 8, stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
}
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int i, j, dest_stride8;
uint8_t *d;
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
- dest_stride8 = dest_stride * 8;
+ dest_stride8 = stride * 8;
if (a1 >= 0) { // diff_positive_32_32
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
q0u8 = vdupq_n_u8((uint8_t)a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
+ LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
+ &q15u8);
ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
+ ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
+ &q15u8);
d += dest_stride8;
}
}
for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
+ LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
+ &q15u8);
SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
+ ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
+ &q15u8);
d += dest_stride8;
}
}
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
+;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
#include "vpx_dsp/inv_txfm.h"
void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
int i;
const int16_t out0 = dct_const_round_shift((int16_t)input[0] * cospi_16_64);
const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
uint8x8_t b;
assert(!((intptr_t)dest % sizeof(uint32_t)));
- assert(!(dest_stride % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
for (i = 0; i < 2; i++) {
d = vld1_lane_u32((const uint32_t *)dest, d, 0);
- d = vld1_lane_u32((const uint32_t *)(dest + dest_stride), d, 1);
+ d = vld1_lane_u32((const uint32_t *)(dest + stride), d, 1);
a = vaddw_u8(vreinterpretq_u16_s16(dc), vreinterpret_u8_u32(d));
b = vqmovun_s16(vreinterpretq_s16_u16(a));
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 0);
- dest += dest_stride;
+ dest += stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 1);
- dest += dest_stride;
+ dest += stride;
}
}
INCLUDE vpx_dsp/arm/idct_neon.asm.S
AREA Block, CODE, READONLY ; name this block of code
-;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct4x4_16_add_neon| PROC
vld1.32 {d27[1]}, [r1], r2
vld1.32 {d27[0]}, [r1] ; no post-increment
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
vaddw.u8 q8, q8, d26
vaddw.u8 q9, q9, d27
#include "vpx_dsp/txfm_common.h"
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
const uint8_t *dst = dest;
const int16x4_t cospis = vld1_s16(kCospi);
uint32x2_t dest01_u32 = vdup_n_u32(0);
uint16x8_t d01_u16, d32_u16;
assert(!((intptr_t)dest % sizeof(uint32_t)));
- assert(!(dest_stride % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
// Rows
a0 = load_tran_low_to_s16q(input);
a1 = vrshrq_n_s16(a1, 4);
dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
- dst += dest_stride;
+ dst += stride;
dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
- dst += dest_stride;
+ dst += stride;
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
- dst += dest_stride;
+ dst += stride;
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
d01_u16 =
d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
- dest += dest_stride;
+ dest += stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
- dest += dest_stride;
+ dest += stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
- dest += dest_stride;
+ dest += stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
}
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
+;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
#include "vpx_ports/mem.h"
void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
int i;
const int16_t out0 = dct_const_round_shift(input[0] * cospi_16_64);
const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
for (i = 0; i < 2; i++) {
d0 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d1 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d2 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d3 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d0_u16 = vaddw_u8(dc_u16, d0);
d1_u16 = vaddw_u8(dc_u16, d1);
d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16));
vst1_u8(dest, d0);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d1);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d2);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d3);
- dest += dest_stride;
+ dest += stride;
}
}
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct8x8_64_add_neon| PROC
push {r4-r9}
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1]
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
bx lr
ENDP ; |vpx_idct8x8_64_add_neon|
-;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct8x8_12_add_neon| PROC
push {r4-r9}
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1]
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
int16x8_t a3, int16x8_t a4, int16x8_t a5,
int16x8_t a6, int16x8_t a7, uint8_t *dest,
- const int dest_stride) {
+ const int stride) {
const uint8_t *dst = dest;
uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
a7 = vrshrq_n_s16(a7, 5);
d0 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d1 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d2 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d3 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d4 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d5 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d6 = vld1_u8(dst);
- dst += dest_stride;
+ dst += stride;
d7 = vld1_u8(dst);
d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0);
d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16));
vst1_u8(dest, d0);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d1);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d2);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d3);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d4);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d5);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d6);
- dest += dest_stride;
+ dest += stride;
vst1_u8(dest, d7);
}
void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
const int16x8_t cospis = vld1q_s16(kCospi);
const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
IDCT8x8_1D(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
IDCT8x8_1D(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
- add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, dest_stride);
+ add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride);
}
static INLINE void IDCT8x4_1D(const int16x4_t cospis0, const int16x4_t cospisd0,
}
void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
const int16x8_t cospis = vld1q_s16(kCospi);
const int16x8_t cospisd = vaddq_s16(cospis, cospis);
const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
transpose_s16_4x8(b8, b9, b10, b11, b4, b5, b6, b7, &a0, &a1, &a2, &a3);
IDCT8x4_1D(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, &a5, &a6,
&a7);
- add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, dest_stride);
+ add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride);
}
}
}
-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
+void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
int i;
tran_high_t a1, e1;
tran_low_t tmp[4];
for (i = 0; i < 4; i++) {
e1 = ip[0] >> 1;
a1 = ip[0] - e1;
- dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
- dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
- dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
- dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
ip++;
dest++;
}
}
}
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
dest[1] = clip_pixel_add(dest[1], a1);
dest[2] = clip_pixel_add(dest[2], a1);
dest[3] = clip_pixel_add(dest[3], a1);
- dest += dest_stride;
+ dest += stride;
}
}
}
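All of the *_1_add variants above share the same DC-only shortcut: the single nonzero coefficient goes through the constant rounding shift twice, is rounded by the block-size shift, and the resulting constant a1 is added to every destination pixel. A hedged scalar sketch for the 4x4 case (dct_const_round_shift is written out inline; the 14-bit constant shift and cospi_16_64 = 11585 are quoted from libvpx's txfm_common.h from memory, so treat them as assumptions):

#include <stdint.h>

#define DCT_CONST_BITS 14              /* assumed, from vpx_dsp/txfm_common.h */
static const int cospi_16_64 = 11585;  /* assumed, round(16384 * cos(pi/4)) */

/* Stand-in for dct_const_round_shift(). */
static int dc_round_shift(int64_t x) {
  return (int)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

static uint8_t clip_pixel_8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* DC-only 4x4 inverse transform: add one constant to all 16 pixels. */
static void idct4x4_dc_add_sketch(int16_t dc_coeff, uint8_t *dest, int stride) {
  const int out0 = dc_round_shift((int64_t)dc_coeff * cospi_16_64);
  const int out1 = dc_round_shift((int64_t)out0 * cospi_16_64);
  const int a1 = (out1 + 8) >> 4; /* ROUND_POWER_OF_TWO(out1, 4) */
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) dest[j] = clip_pixel_8(dest[j] + a1);
    dest += stride;
  }
}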
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
- int dest_stride, int bd) {
+ int stride, int bd) {
int i;
tran_high_t a1, e1;
tran_low_t tmp[4];
for (i = 0; i < 4; i++) {
e1 = ip[0] >> 1;
a1 = ip[0] - e1;
- dest[dest_stride * 0] =
- highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
- dest[dest_stride * 1] =
- highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
- dest[dest_stride * 2] =
- highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
- dest[dest_stride * 3] =
- highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
+ dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+ dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
+ dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
+ dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
ip++;
dest++;
}
}
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
+ int stride, int bd) {
int i;
tran_high_t a1;
tran_low_t out =
dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
- dest += dest_stride;
+ dest += stride;
}
}
out; \
})
-void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output);
-void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void iadst4_dspr2(const int16_t *input, int16_t *output);
void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void iadst8_dspr2(const int16_t *input, int16_t *output);
void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void iadst16_dspr2(const int16_t *input, int16_t *output);
#endif // #if HAVE_DSPR2
}
}
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
int i;
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
int step1_8, step1_9, step1_10, step1_11;
"add %[load6], %[step1_1], %[step1_6] \n\t"
"add %[load6], %[load6], %[step1_14] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_2], %[step1_5] \n\t"
"add %[load6], %[step1_3], %[step1_4] \n\t"
"add %[load6], %[load6], %[step1_12] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"sub %[load5], %[step1_3], %[step1_4] \n\t"
"sub %[load6], %[step1_2], %[step1_5] \n\t"
"add %[load6], %[load6], %[step1_10] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"sub %[load5], %[step1_1], %[step1_6] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"sub %[load6], %[step1_0], %[step1_7] \n\t"
"add %[load6], %[load6], %[step1_8] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"sub %[load5], %[step1_0], %[step1_7] \n\t"
"sub %[load6], %[step1_1], %[step1_6] \n\t"
"sub %[load6], %[load6], %[step1_9] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"sub %[load5], %[step1_2], %[step1_5] \n\t"
"sub %[load6], %[step1_3], %[step1_4] \n\t"
"sub %[load6], %[load6], %[step1_11] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_3], %[step1_4] \n\t"
"add %[load6], %[step1_2], %[step1_5] \n\t"
"sub %[load6], %[load6], %[step1_13] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_1], %[step1_6] \n\t"
"add %[load6], %[step1_0], %[step1_7] \n\t"
"sub %[load6], %[load6], %[step1_15] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
: [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
[load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
:
- [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
[step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
[step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
[step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
}
void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
uint32_t pos = 45;
idct16_rows_dspr2(input, out, 16);
// Then transform columns and add to dest
- idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+ idct16_cols_add_blk_dspr2(out, dest, stride);
}
void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
int16_t *outptr = out;
uint32_t i;
}
// Then transform columns
- idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+ idct16_cols_add_blk_dspr2(out, dest, stride);
}
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
uint32_t pos = 45;
int32_t out;
int32_t r;
"sw %[vector_2], 4(%[dest]) \n\t"
"sw %[vector_3], 8(%[dest]) \n\t"
"sw %[vector_4], 12(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
} else {
/* use quad-byte
"sw %[vector_2], 4(%[dest]) \n\t"
"sw %[vector_3], 8(%[dest]) \n\t"
"sw %[vector_4], 12(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
}
}
#include "vpx_dsp/txfm_common.h"
#if HAVE_DSPR2
-void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
for (i = 0; i < 32; ++i) {
dest_pix = dest + i;
- dest_pix1 = dest + i + 31 * dest_stride;
+ dest_pix1 = dest + i + 31 * stride;
__asm__ __volatile__(
"lh %[load1], 2(%[input]) \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_1], %[step2_30] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_2], %[step2_29] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_3], %[step2_28] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
[step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
[step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
[step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
__asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_5], %[step1_26] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_6], %[step1_25] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_7], %[step1_24] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4),
[step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
[step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
[step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
__asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_9], %[step1_22] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_10], %[step1_21] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_11], %[step1_20] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8),
[step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
[step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
[step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
__asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_13], %[step2_18] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_14], %[step2_17] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_15], %[step2_16] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
- [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
- [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
- [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12),
+ [step1_13] "r"(step1_13), [step1_14] "r"(step1_14),
+ [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
+ [step2_17] "r"(step2_17), [step2_18] "r"(step2_18),
+ [step2_19] "r"(step2_19));
step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
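The rounding used throughout these column passes is the same in both forms shown above: the asm pair "addi t, t, 32" / "sra t, t, 6" and the ROUND_POWER_OF_TWO(x, 6) statements both add half of 2^6 and then arithmetic-shift right by 6. A minimal, self-contained sketch of that step (the helper below is written out locally for illustration, not taken from the library headers):

#include <assert.h>

/* Local illustration of the round-then-shift step: add 2^(bits-1), then
 * shift right by bits.  Not the library macro itself. */
static int round_shift_sketch(int value, int bits) {
  return (value + (1 << (bits - 1))) >> bits;
}

static void round_shift_demo(void) {
  assert(round_shift_sketch(1000, 6) == (1000 + 32) >> 6); /* 32x32 column pass */
  assert(round_shift_sketch(100, 4) == (100 + 8) >> 4);    /* 4x4 column pass */
}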
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
input += 32;
}
}
void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
int16_t *outptr = out;
uint32_t pos = 45;
idct32_rows_dspr2(input, outptr, 32);
// Columns
- vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
+ vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
}
void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
}
void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
int16_t step_0, step_1, step_2, step_3;
int Temp0, Temp1, Temp2, Temp3;
const int const_2_power_13 = 8192;
"add %[Temp0], %[step_1], %[step_2] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"addi %[Temp0], %[Temp0], 8 \n\t"
"sra %[Temp0], %[Temp0], 4 \n\t"
"sub %[Temp0], %[step_1], %[step_2] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"addi %[Temp0], %[Temp0], 8 \n\t"
"sra %[Temp0], %[Temp0], 4 \n\t"
"sub %[Temp0], %[step_0], %[step_3] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"addi %[Temp0], %[Temp0], 8 \n\t"
"sra %[Temp0], %[Temp0], 4 \n\t"
: [const_2_power_13] "r"(const_2_power_13),
[cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
[cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
- [dest_stride] "r"(dest_stride));
+ [stride] "r"(stride));
input += 4;
}
}
-void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
int16_t *outptr = out;
uint32_t pos = 45;
vpx_idct4_rows_dspr2(input, outptr);
// Columns
- vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride);
}
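The two functions above follow the usual row-pass/column-pass split: rows are transformed into a temporary buffer, then each column result is rounded, added to the destination pixels, and clamped, with the destination pointer advancing by stride bytes per row. A rough scalar sketch of that column/add step, assuming the column's transformed values are already available:

#include <stdint.h>

/* Rough scalar sketch of the column/add pattern for a 4x4 block: each value
 * is rounded by 4 bits, added to the destination pixel, and clamped to
 * [0, 255].  The asm performs the clamp through the `cm` lookup table
 * (lbux); here it is an explicit range check.  `col` is assumed to already
 * hold one column's transformed values, which this sketch does not compute. */
static void add_column_sketch(const int16_t col[4], uint8_t *dest_col,
                              int stride) {
  int r;
  for (r = 0; r < 4; ++r) {
    int v = *dest_col + ((col[r] + 8) >> 4); /* round by 2^4, add to dest */
    if (v < 0) v = 0;
    if (v > 255) v = 255;                    /* clamp as the cm table does */
    *dest_col = (uint8_t)v;
    dest_col += stride;                      /* down one row of the frame */
  }
}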
-void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
int a1, absa1;
int r;
int32_t out;
"lw %[t2], 0(%[dest]) \n\t"
"subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
} else {
/* use quad-byte
"lw %[t2], 0(%[dest]) \n\t"
"addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
}
}
}
}
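The *_1_add path above handles a DC-only block: one rounded value a1 is applied to every pixel, which is why the asm branches into a saturating quad-byte subtract (subu_s.qb) of |a1| when a1 is negative and a saturating add (addu_s.qb) otherwise. A rough scalar sketch of the same step, assuming a1 has already been derived from input[0]:

#include <stdint.h>

/* Rough scalar sketch of the DC-only add: a single rounded DC value is
 * applied to every pixel of the block and the result is clamped to
 * [0, 255].  The scalar loop below folds the two saturating quad-byte
 * branches of the dspr2 code into one case.  `a1` is assumed to be the
 * already-rounded DC term, which this sketch does not compute. */
static void dc_only_add_sketch(int a1, uint8_t *dest, int stride, int size) {
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      int v = dest[c] + a1;
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      dest[c] = (uint8_t)v;
    }
    dest += stride; /* next destination row */
  }
}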
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
int Temp0, Temp1, Temp2, Temp3;
int i;
"add %[Temp0], %[step1_1], %[step1_6] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"add %[Temp0], %[step1_2], %[step1_5] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"add %[Temp0], %[step1_3], %[step1_4] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sub %[Temp0], %[step1_3], %[step1_4] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sub %[Temp0], %[step1_2], %[step1_5] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sub %[Temp0], %[step1_1], %[step1_6] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sub %[Temp0], %[step1_0], %[step1_7] \n\t"
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
[cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
[cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
- [dest_stride] "r"(dest_stride));
+ [stride] "r"(stride));
input += 8;
}
}
-void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out;
uint32_t pos = 45;
idct8_rows_dspr2(input, outptr, 8);
// Then transform columns and add to dest
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
}
-void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out;
uint32_t pos = 45;
: [outptr] "r"(outptr));
// Then transform columns and add to dest
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
}
-void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
uint32_t pos = 45;
int32_t out;
int32_t r;
"subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
"sw %[vector_1], 0(%[dest]) \n\t"
"sw %[vector_2], 4(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
[vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
} else {
/* use quad-byte
"addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
"sw %[vector_1], 0(%[dest]) \n\t"
"sw %[vector_2], 4(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
[vector_2] "=&r"(vector_2), [dest] "+r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
}
}
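All of these *_add functions take the renamed stride parameter as the distance in bytes between vertically adjacent destination pixels, so a block is reconstructed in place inside a wider frame. A hypothetical usage sketch (fake_idct4x4_add stands in for a real add function and only adds the residual without transforming it):

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for a 4x4 add function: adds the residual to the
 * destination block without performing any transform. */
static void fake_idct4x4_add(const int16_t *input, uint8_t *dest, int stride) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dest[r * stride + c] = (uint8_t)(dest[r * stride + c] + input[r * 4 + c]);
}

static void reconstruct_demo(void) {
  uint8_t frame[64 * 64];
  int16_t residual[16];
  memset(frame, 128, sizeof(frame));
  memset(residual, 0, sizeof(residual));
  /* Reconstruct the 4x4 block whose top-left corner is at row 8, column 12:
   * the stride is the frame width (64), not the block width (4). */
  fake_idct4x4_add(residual, &frame[8 * 64 + 12], 64);
}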
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note: as optimized versions of these functions are added, we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_iwht4x4_16_add sse2/;
- add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct4x4_1_add neon/;
- add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct32x32_1_add sse2/;
- add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
} else {
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_16_add neon sse2/;
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_1_add neon sse2/;
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_1_add neon sse2/;
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_256_add neon sse2/;
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_10_add neon sse2/;
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_1_add neon sse2/;
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64";
# Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1_add neon sse2/;
- add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
- add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct8x8_64_add sse2/;
- add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct8x8_12_add sse2/;
- add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_256_add sse2/;
- add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_10_add sse2/;
} # CONFIG_EMULATE_HARDWARE
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
} else {
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
$vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
$vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_iwht4x4_1_add msa/;
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_iwht4x4_16_add msa sse2/;
} # CONFIG_EMULATE_HARDWARE
} # CONFIG_VP9_HIGHBITDEPTH