ref: 12a14913947b510514746389319b49a188a53579
parent: abc7105acdfbbeaeecf41c675148683a1cb8b4f7
author: James Zern <jzern@google.com>
date: Tue May 4 08:13:17 EDT 2021
vp9_denoiser_neon,horizontal_add_s8x16: use vaddlv w/aarch64. This reduces the number of instructions needed to compute the sum. Change-Id: Icae4d4fb3e343d5b6e5a095c60ac6d171b3e7d54
--- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c
+++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -21,6 +21,9 @@
// Compute the sum of all pixel differences of this MB.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if defined(__aarch64__)
+ return vaddlvq_s8(v_sum_diff_total);
+#else
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
@@ -28,6 +31,7 @@
vget_low_s64(fedcba98_76543210));
const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
return sum_diff;
+#endif
}
// Denoise a 16x1 vector.