ref: f7364c05748b70a1e0fd57849665a9d9f0990803
parent: a28d43658e3347d55d70655e6ee3d87d0d3fba8a
author: Jonathan Wright <jonathan.wright@arm.com>
date: Thu May 6 11:11:52 EDT 2021
Manually unroll the inner loop of Neon sad16x_4d(). Manually unrolling the inner loop is sufficient to stop the compiler getting confused and emitting inefficient code. Co-authored by: James Greenhalgh <james.greenhalgh@arm.com> Bug: b/181236880 Change-Id: I860768ce0e6c0e0b6286d3fc1b94f0eae95d0a1a
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -243,7 +243,7 @@
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res, const int height) {
- int i, j;
+ int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
@@ -252,10 +252,15 @@
for (i = 0; i < height; ++i) {
const uint8x16_t s = vld1q_u8(src_ptr);
src_ptr += src_stride;
- for (j = 0; j < 4; ++j) {
- sad16_neon(ref_loop[j], s, &sum[j]);
- ref_loop[j] += ref_stride;
- }
+ /* Manual unrolling here stops the compiler from getting confused. */
+ sad16_neon(ref_loop[0], s, &sum[0]);
+ ref_loop[0] += ref_stride;
+ sad16_neon(ref_loop[1], s, &sum[1]);
+ ref_loop[1] += ref_stride;
+ sad16_neon(ref_loop[2], s, &sum[2]);
+ ref_loop[2] += ref_stride;
+ sad16_neon(ref_loop[3], s, &sum[3]);
+ ref_loop[3] += ref_stride;
}
sad_512_pel_final_neon(sum, res);