ref: fc5a3728144c62b634bb6fb036a6da47ee9bdf8f
parent: 1407506a0dfce7c5ca1184e4cf1452e82109f46c
author: Martin Storsjö <martin@martin.st>
date: Sun Jan 6 17:28:02 EST 2019
arm64: mc: Optimize the mul_mla_8_* macros for Cortex A53

Before:                          Cortex A53  Snapdragon 835
mc_8tap_regular_w2_v_8bpc_neon:       155.1           131.8
mc_8tap_regular_w4_v_8bpc_neon:       199.6           148.1
mc_8tap_regular_w8_v_8bpc_neon:       286.2           225.5
After:
mc_8tap_regular_w2_v_8bpc_neon:       134.1           129.5
mc_8tap_regular_w4_v_8bpc_neon:       157.6           146.5
mc_8tap_regular_w8_v_8bpc_neon:       208.0           225.0
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -546,58 +546,61 @@
mla \d\wd, \s2\wd, v0.h[2]
mla \d\wd, \s3\wd, v0.h[3]
.endm
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, thus keeping mul/mla tightly
+// chained like this.
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0]
- mul \d1\().8h, \s1\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
- mla \d1\().8h, \s2\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
- mla \d1\().8h, \s3\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
- mla \d1\().8h, \s4\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
- mla \d1\().8h, \s5\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
- mla \d1\().8h, \s6\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6]
- mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s1\().8h, v0.h[0]
+ mla \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[7]
.endm
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
mul \d0\().8h, \s0\().8h, v0.h[0]
- mul \d1\().8h, \s2\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
- mla \d1\().8h, \s3\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
- mla \d1\().8h, \s4\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
- mla \d1\().8h, \s5\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
- mla \d1\().8h, \s6\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
- mla \d1\().8h, \s7\().8h, v0.h[5]
- mla \d0\().8h, \s6\().8h, v0.h[6]
- mla \d1\().8h, \s8\().8h, v0.h[6]
- mla \d0\().8h, \s7\().8h, v0.h[7]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s2\().8h, v0.h[0]
+ mla \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
mla \d1\().8h, \s9\().8h, v0.h[7]
.endm
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
mul \d0\().8h, \s0\().8h, v0.h[0]
- mul \d1\().8h, \s4\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
- mla \d1\().8h, \s5\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
- mla \d1\().8h, \s6\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
- mla \d1\().8h, \s7\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
- mla \d1\().8h, \s8\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
- mla \d1\().8h, \s9\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6]
- mla \d1\().8h, \s10\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s4\().8h, v0.h[0]
+ mla \d1\().8h, \s5\().8h, v0.h[1]
+ mla \d1\().8h, \s6\().8h, v0.h[2]
+ mla \d1\().8h, \s7\().8h, v0.h[3]
+ mla \d1\().8h, \s8\().8h, v0.h[4]
+ mla \d1\().8h, \s9\().8h, v0.h[5]
+ mla \d1\().8h, \s10\().8h, v0.h[6]
mla \d1\().8h, \s11\().8h, v0.h[7]
.endm
.macro sqrshrun_b shift, r0, r1, r2, r3