orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; substract negative differences from sum
+ sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; substract negative differences from sum
+ sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
pxor mm7, [GLOBAL(t80)] ; unoffset
; mm7 = q1
- ; tranpose and write back
+ ; transpose and write back
; mm1 = 72 62 52 42 32 22 12 02
; mm6 = 73 63 53 43 33 23 13 03
; mm3 = 74 64 54 44 34 24 14 04
pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
- ; tranpose and write back
+ ; transpose and write back
movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
; start work on filters
B_FILTER 2
- ; tranpose and write back - only work on q1, q0, p0, p1
+ ; transpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
; store 16-line result
; start work on filters
B_FILTER 2
- ; tranpose and write back - only work on q1, q0, p0, p1
+ ; transpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
; reg1 = output[first_offset]
; reg2 = output[second_offset]
; for proper address calculation, the last offset used when manipulating
- ; output, wethere reading or storing) must be passed in. use 0 for first
+ ; output, whether reading or storing) must be passed in. use 0 for first
; use.
MACRO
LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
; output[first_offset] = reg1
; output[second_offset] = reg2
; for proper address calculation, the last offset used when manipulating
- ; output, wethere reading or storing) must be passed in. use 0 for first
+ ; output, whether reading or storing) must be passed in. use 0 for first
; use.
MACRO
STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
; TODO(cd): have special case to re-use constants when they are similar for
; consecutive butterflies
; TODO(cd): have special case when both constants are the same, do the
- ; additions/substractions before the multiplies.
+ ; additions/subtractions before the multiplies.
; generate the constants
; generate scalar constants
mov r8, #$first_constant & 0xFF00
vmull.s16 q11, $regB, d31
vmull.s16 q12, $regC, d31
; (used) five for intermediate (q8-q12), one for constants (q15)
- ; do some addition/substractions (to get back two register)
+ ; do some addition/subtractions (to get back two register)
vsub.s32 q8, q8, q10
vsub.s32 q9, q9, q11
; do more multiplications (ordered for maximum latency hiding)
vmull.s16 q11, $regA, d30
vmull.s16 q15, $regB, d30
; (used) six for intermediate (q8-q12, q15)
- ; do more addition/substractions
+ ; do more addition/subtractions
vadd.s32 q11, q12, q11
vadd.s32 q10, q10, q15
; (used) four for intermediate (q8-q11)
pxor mm7, [GLOBAL(t80)] ; unoffset
; mm7 = q1
- ; tranpose and write back
+ ; transpose and write back
; mm1 = 72 62 52 42 32 22 12 02
; mm6 = 73 63 53 43 33 23 13 03
; mm3 = 74 64 54 44 34 24 14 04
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
in3 = _mm_slli_epi16(in3, 4);
// if (i == 0 && input[0]) input[0] += 1;
{
- // The mask will only contain wether the first value is zero, all
+ // The mask will only contain whether the first value is zero, all
// other comparison will fail as something shifted by 4 (above << 4)
// can never be equal to one. To increment in the non-zero case, we
// add the mask and one for the first element:
}
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
- // Transform 1/2: Add/substract
+ // Transform 1/2: Add/subtract
const __m128i r0 = _mm_add_epi16(in0, in3);
const __m128i r1 = _mm_add_epi16(in1, in2);
const __m128i r2 = _mm_sub_epi16(in1, in2);
for (pass = 0; pass < 2; pass++) {
// To store results of each pass before the transpose.
__m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/substract
+ // Add/subtract
const __m128i q0 = _mm_add_epi16(in0, in7);
const __m128i q1 = _mm_add_epi16(in1, in6);
const __m128i q2 = _mm_add_epi16(in2, in5);
const __m128i q7 = _mm_sub_epi16(in0, in7);
// Work on first four results
{
- // Add/substract
+ // Add/subtract
const __m128i r0 = _mm_add_epi16(q0, q3);
const __m128i r1 = _mm_add_epi16(q1, q2);
const __m128i r2 = _mm_sub_epi16(q1, q2);
// Combine
const __m128i r0 = _mm_packs_epi32(s0, s1);
const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/substract
+ // Add/subtract
const __m128i x0 = _mm_add_epi16(q4, r0);
const __m128i x1 = _mm_sub_epi16(q4, r0);
const __m128i x2 = _mm_sub_epi16(q7, r1);
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
}
// Work on the first eight values; fdct8(input, even_results);
{
- // Add/substract
+ // Add/subtract
const __m128i q0 = _mm_add_epi16(input0, input7);
const __m128i q1 = _mm_add_epi16(input1, input6);
const __m128i q2 = _mm_add_epi16(input2, input5);
const __m128i q7 = _mm_sub_epi16(input0, input7);
// Work on first four results
{
- // Add/substract
+ // Add/subtract
const __m128i r0 = _mm_add_epi16(q0, q3);
const __m128i r1 = _mm_add_epi16(q1, q2);
const __m128i r2 = _mm_sub_epi16(q1, q2);
// Combine
const __m128i r0 = _mm_packs_epi32(s0, s1);
const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/substract
+ // Add/subtract
const __m128i x0 = _mm_add_epi16(q4, r0);
const __m128i x1 = _mm_sub_epi16(q4, r0);
const __m128i x2 = _mm_sub_epi16(q7, r1);
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
in1 = _mm_slli_epi16(in1, 4);
// if (i == 0 && input[0]) input[0] += 1;
{
- // The mask will only contain wether the first value is zero, all
+ // The mask will only contain whether the first value is zero, all
// other comparison will fail as something shifted by 4 (above << 4)
// can never be equal to one. To increment in the non-zero case, we
// add the mask and one for the first element:
}
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
- // Transform 1/2: Add/substract
+ // Transform 1/2: Add/subtract
const __m128i r0 = _mm_add_epi16(in0, in1);
const __m128i r1 = _mm_sub_epi16(in0, in1);
const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
for (pass = 0; pass < 2; pass++) {
// To store results of each pass before the transpose.
__m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/substract
+ // Add/subtract
const __m128i q0 = _mm_add_epi16(in0, in7);
const __m128i q1 = _mm_add_epi16(in1, in6);
const __m128i q2 = _mm_add_epi16(in2, in5);
const __m128i q7 = _mm_sub_epi16(in0, in7);
// Work on first four results
{
- // Add/substract
+ // Add/subtract
const __m128i r0 = _mm_add_epi16(q0, q3);
const __m128i r1 = _mm_add_epi16(q1, q2);
const __m128i r2 = _mm_sub_epi16(q1, q2);
// Combine
const __m128i r0 = _mm_packs_epi32(s0, s1);
const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/substract
+ // Add/subtract
const __m128i x0 = _mm_add_epi16(q4, r0);
const __m128i x1 = _mm_sub_epi16(q4, r0);
const __m128i x2 = _mm_sub_epi16(q7, r1);
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
}
// Work on the first eight values; fdct8(input, even_results);
{
- // Add/substract
+ // Add/subtract
const __m128i q0 = _mm_add_epi16(input0, input7);
const __m128i q1 = _mm_add_epi16(input1, input6);
const __m128i q2 = _mm_add_epi16(input2, input5);
const __m128i q7 = _mm_sub_epi16(input0, input7);
// Work on first four results
{
- // Add/substract
+ // Add/subtract
const __m128i r0 = _mm_add_epi16(q0, q3);
const __m128i r1 = _mm_add_epi16(q1, q2);
const __m128i r2 = _mm_sub_epi16(q1, q2);
// Combine
const __m128i r0 = _mm_packs_epi32(s0, s1);
const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/substract
+ // Add/subtract
const __m128i x0 = _mm_add_epi16(q4, r0);
const __m128i x1 = _mm_sub_epi16(q4, r0);
const __m128i x2 = _mm_sub_epi16(q7, r1);