ref: a28d43658e3347d55d70655e6ee3d87d0d3fba8a
parent: 7ef83148cfbfd0163fa22a0550d128d935fe2bad
author: Jonathan Wright <jonathan.wright@arm.com>
date: Thu May 6 10:51:05 EDT 2021
Optimize Neon SAD reductions using wider ADDP instruction

Implement AArch64-only paths for each of the Neon SAD reduction
functions, making use of a wider pairwise addition instruction only
available on AArch64. This change removes the need for shuffling
between high and low halves of Neon vectors - resulting in a faster
reduction that requires fewer instructions.

Bug: b/181236880
Change-Id: I1c48580b4aec27222538eeab44e38ecc1f2009dc
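For illustration, the standalone sketch below (not part of the patch; the
function reduce4_sketch and its test driver are hypothetical names) shows the
reduction pattern in isolation: four uint16x8_t accumulators are collapsed to
four 32-bit totals, using the wider vpaddq_u16/vpaddq_u32 (ADDP) intrinsics on
AArch64 and the vget_low/vget_high splitting that the generic Neon path still
needs.

/*
 * Minimal sketch of the reduction pattern described above. Each uint16x8_t
 * accumulator holds per-lane partial SAD sums for one reference block; the
 * goal is one 32-bit total per block, stored as the four lanes of res[].
 * Requires a Neon-capable ARM target.
 */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static void reduce4_sketch(const uint16x8_t sum[4], uint32_t res[4]) {
#if defined(__aarch64__)
  /* AArch64: ADDP (vpaddq_*) pairwise-adds across two full 128-bit vectors,
   * so no extraction of vector halves is needed. */
  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
  const uint32x4_t b0 = vpaddlq_u16(a0);
  const uint32x4_t b1 = vpaddlq_u16(a1);
  vst1q_u32(res, vpaddq_u32(b0, b1));
#else
  /* AArch32 Neon: VPADD only operates on 64-bit D registers, so each 128-bit
   * accumulator must first be split into its low and high halves. */
  const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
  const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
  const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
  const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
  const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));
  const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
  const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
  const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
  vst1q_u32(res, vcombine_u32(c0, c1));
#endif
}

int main(void) {
  /* Accumulator i holds the value i+1 in all eight lanes, so the expected
   * totals are 8, 16, 24 and 32. */
  uint16x8_t sum[4];
  uint32_t res[4];
  int i;
  for (i = 0; i < 4; ++i) sum[i] = vdupq_n_u16((uint16_t)(i + 1));
  reduce4_sketch(sum, res);
  printf("%u %u %u %u\n", (unsigned)res[0], (unsigned)res[1], (unsigned)res[2],
         (unsigned)res[3]);
  return 0;
}

Both branches store the same four totals; the AArch64 branch simply gets there
without extracting vector halves, which is where the instruction-count saving
described above comes from.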
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -34,7 +34,9 @@
uint32_t *const res) {
int i;
uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+#if !defined(__aarch64__)
uint16x4_t a[2];
+#endif
uint32x4_t r;
assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
@@ -51,9 +53,14 @@
abs[1] = vabal_u8(abs[1], s, ref23);
}
+#if defined(__aarch64__)
+ abs[0] = vpaddq_u16(abs[0], abs[1]);
+ r = vpaddlq_u16(abs[0]);
+#else
a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
+#endif
vst1q_u32(res, r);
}
@@ -74,6 +81,12 @@
// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
uint32_t *const res) {
+#if defined(__aarch64__)
+ const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+ const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+ const uint16x8_t b0 = vpaddq_u16(a0, a1);
+ const uint32x4_t r = vpaddlq_u16(b0);
+#else
const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -81,6 +94,7 @@
const uint16x4_t b0 = vpadd_u16(a0, a1);
const uint16x4_t b1 = vpadd_u16(a2, a3);
const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
vst1q_u32(res, r);
}
@@ -87,6 +101,14 @@
// Can handle 1024 pixels' sad sum (such as 32x32)
static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
uint32_t *const res) {
+#if defined(__aarch64__)
+ const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+ const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+ const uint32x4_t b0 = vpaddlq_u16(a0);
+ const uint32x4_t b1 = vpaddlq_u16(a1);
+ const uint32x4_t r = vpaddq_u32(b0, b1);
+ vst1q_u32(res, r);
+#else
const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -96,15 +118,26 @@
const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
}
// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
uint32_t *const res) {
+#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+ const uint32x4_t b0 = vpaddq_u32(a0, a1);
+ const uint32x4_t b1 = vpaddq_u32(a2, a3);
+ const uint32x4_t r = vpaddq_u32(b0, b1);
+ vst1q_u32(res, r);
+#else
+ const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+ const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+ const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+ const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
@@ -112,11 +145,13 @@
const uint32x2_t c0 = vpadd_u32(b0, b1);
const uint32x2_t c1 = vpadd_u32(b2, b3);
vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
}
// Can handle 4096 pixels' sad sum (such as 64x64)
static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
uint32_t *const res) {
+#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
@@ -129,6 +164,23 @@
const uint32x4_t b1 = vaddq_u32(a2, a3);
const uint32x4_t b2 = vaddq_u32(a4, a5);
const uint32x4_t b3 = vaddq_u32(a6, a7);
+ const uint32x4_t c0 = vpaddq_u32(b0, b1);
+ const uint32x4_t c1 = vpaddq_u32(b2, b3);
+ const uint32x4_t r = vpaddq_u32(c0, c1);
+ vst1q_u32(res, r);
+#else
+ const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+ const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+ const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+ const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+ const uint32x4_t a4 = vpaddlq_u16(sum[4]);
+ const uint32x4_t a5 = vpaddlq_u16(sum[5]);
+ const uint32x4_t a6 = vpaddlq_u16(sum[6]);
+ const uint32x4_t a7 = vpaddlq_u16(sum[7]);
+ const uint32x4_t b0 = vaddq_u32(a0, a1);
+ const uint32x4_t b1 = vaddq_u32(a2, a3);
+ const uint32x4_t b2 = vaddq_u32(a4, a5);
+ const uint32x4_t b3 = vaddq_u32(a6, a7);
const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
@@ -136,6 +188,7 @@
const uint32x2_t d0 = vpadd_u32(c0, c1);
const uint32x2_t d1 = vpadd_u32(c2, c3);
vst1q_u32(res, vcombine_u32(d0, d1));
+#endif
}
static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,