Use load_unaligned mem_neon.h helpers in SAD and SAD4D

author Jonathan Wright <jonathan.wright@arm.com>

Tue, 31 Jan 2023 13:32:33 +0000 (13:32 +0000)

committer Jonathan Wright <jonathan.wright@arm.com>

Tue, 31 Jan 2023 15:39:21 +0000 (15:39 +0000)
author Jonathan Wright <jonathan.wright@arm.com>
Tue, 31 Jan 2023 13:32:33 +0000 (13:32 +0000)
committer Jonathan Wright <jonathan.wright@arm.com>
Tue, 31 Jan 2023 15:39:21 +0000 (15:39 +0000)
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c

index 5064770ee677034bbd90c78d7964aa4996c51f25..85f6c1e5b11e73aae2701667151d71edc3e40404 100644 (file)
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -285,35 +285,16 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
  
    int i = 0;
    do {
-    uint32x2_t s, r0, r1, r2, r3;
-    uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi;
-
-    memcpy(&s_lo, src + i * src_stride, 4);
-    memcpy(&r0_lo, ref[0] + i * ref_stride, 4);
-    memcpy(&r1_lo, ref[1] + i * ref_stride, 4);
-    memcpy(&r2_lo, ref[2] + i * ref_stride, 4);
-    memcpy(&r3_lo, ref[3] + i * ref_stride, 4);
-    s = vdup_n_u32(s_lo);
-    r0 = vdup_n_u32(r0_lo);
-    r1 = vdup_n_u32(r1_lo);
-    r2 = vdup_n_u32(r2_lo);
-    r3 = vdup_n_u32(r3_lo);
-
-    memcpy(&s_hi, src + (i + 1) * src_stride, 4);
-    memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4);
-    memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4);
-    memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4);
-    memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4);
-    s = vset_lane_u32(s_hi, s, 1);
-    r0 = vset_lane_u32(r0_hi, r0, 1);
-    r1 = vset_lane_u32(r1_hi, r1, 1);
-    r2 = vset_lane_u32(r2_hi, r2, 1);
-    r3 = vset_lane_u32(r3_hi, r3, 1);
-
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]);
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]);
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]);
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]);
+    uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
+    uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
+    uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
+    uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
+    uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
+
+    sad8_neon(s, r0, &sum[0]);
+    sad8_neon(s, r1, &sum[1]);
+    sad8_neon(s, r2, &sum[2]);
+    sad8_neon(s, r3, &sum[3]);
  
      i += 2;
    } while (i < h);
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c

index 7336edb69403a6375ac4ef1236afe3738e4d0b1f..9382b806261ddbd9eaaafc63b136a3e77dee4f9b 100644 (file)
--- a/vpx_dsp/arm/sad_neon.c
+++ b/vpx_dsp/arm/sad_neon.c
@@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
  
    int i = h / 2;
    do {
-    uint32x2_t s, r;
-    uint32_t s0, s1, r0, r1;
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
  
-    memcpy(&s0, src_ptr, 4);
-    memcpy(&r0, ref_ptr, 4);
-    s = vdup_n_u32(s0);
-    r = vdup_n_u32(r0);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    memcpy(&s1, src_ptr, 4);
-    memcpy(&r1, ref_ptr, 4);
-    s = vset_lane_u32(s1, s, 1);
-    r = vset_lane_u32(r1, r, 1);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
+    sum = vabal_u8(sum, s, r);
  
-    sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r));
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
    } while (--i != 0);
  
    return horizontal_add_uint16x8(sum);
@@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
  
    int i = h / 2;
    do {
-    uint32x2_t s, r;
-    uint32_t s0, s1, r0, r1;
-    uint8x8_t p, avg;
-
-    memcpy(&s0, src_ptr, 4);
-    memcpy(&r0, ref_ptr, 4);
-    s = vdup_n_u32(s0);
-    r = vdup_n_u32(r0);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    memcpy(&s1, src_ptr, 4);
-    memcpy(&r1, ref_ptr, 4);
-    s = vset_lane_u32(s1, s, 1);
-    r = vset_lane_u32(r1, r, 1);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    uint8x8_t p = vld1_u8(second_pred);
  
-    p = vld1_u8(second_pred);
-    avg = vrhadd_u8(vreinterpret_u8_u32(r), p);
+    uint8x8_t avg = vrhadd_u8(r, p);
+    sum = vabal_u8(sum, s, avg);
  
-    sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg);
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
      second_pred += 8;
    } while (--i != 0);
author	Jonathan Wright <jonathan.wright@arm.com>
	Tue, 31 Jan 2023 13:32:33 +0000 (13:32 +0000)
committer	Jonathan Wright <jonathan.wright@arm.com>
	Tue, 31 Jan 2023 15:39:21 +0000 (15:39 +0000)
vpx_dsp/arm/sad4d_neon.c		patch \| blob \| history
vpx_dsp/arm/sad_neon.c		patch \| blob \| history