s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
}
+ // The output_height is always a multiple of two, so the loop below can
+ // always process rows in pairs.
+ assert(!(output_height & 1));
+
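+ // Each iteration filters two 16-byte rows and decrements i by two, so with
+ // an even output_height the loop exits at i == 0, leaving no single
+ // 16-byte row to handle afterwards.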
for (i = output_height; i > 1; i -= 2) {
__m256i srcRegHead2, srcRegHead3;
s2[2] = s2[3];
srcRegHead1 = srcRegHead3;
}
-
- // if the number of strides is odd.
- // process only 16 bytes
- if (i > 0) {
- // load the last 16 bytes
- const __m128i srcRegHead2 =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- s1[0] = _mm256_castsi128_si256(
- _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
- s2[0] = _mm256_castsi128_si256(
- _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
-
- outReg1 = convolve8_8_avx2(s1, f);
- outReg2 = convolve8_8_avx2(s2, f);
-
- // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
- // contain the first and second convolve result respectively
- outReg1 = _mm_packus_epi16(outReg1, outReg2);
-
- // average if necessary
- if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- }
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
- }
}
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,