shithub: dav1d

Download patch

ref: fc5a3728144c62b634bb6fb036a6da47ee9bdf8f
parent: 1407506a0dfce7c5ca1184e4cf1452e82109f46c
author: Martin Storsjö <martin@martin.st>
date: Sun Jan 6 17:28:02 EST 2019

arm64: mc: Optimize the mul_mla_8_* macros for Cortex A53

Before:                      Cortex A53   Snapdragon 835
mc_8tap_regular_w2_v_8bpc_neon:   155.1   131.8
mc_8tap_regular_w4_v_8bpc_neon:   199.6   148.1
mc_8tap_regular_w8_v_8bpc_neon:   286.2   225.5
After:
mc_8tap_regular_w2_v_8bpc_neon:   134.1   129.5
mc_8tap_regular_w4_v_8bpc_neon:   157.6   146.5
mc_8tap_regular_w8_v_8bpc_neon:   208.0   225.0

--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -546,58 +546,61 @@
         mla             \d\wd,  \s2\wd,  v0.h[2]
         mla             \d\wd,  \s3\wd,  v0.h[3]
 .endm
+// Interleaving the two mul/mla chains actually hurts performance
+// significantly on Cortex A53, so each accumulator's mul/mla chain is
+// kept back to back: d0 is fully accumulated before d1 is started.
 .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
         mul             \d0\().8h, \s0\().8h, v0.h[0]
-        mul             \d1\().8h, \s1\().8h, v0.h[0]
         mla             \d0\().8h, \s1\().8h, v0.h[1]
-        mla             \d1\().8h, \s2\().8h, v0.h[1]
         mla             \d0\().8h, \s2\().8h, v0.h[2]
-        mla             \d1\().8h, \s3\().8h, v0.h[2]
         mla             \d0\().8h, \s3\().8h, v0.h[3]
-        mla             \d1\().8h, \s4\().8h, v0.h[3]
         mla             \d0\().8h, \s4\().8h, v0.h[4]
-        mla             \d1\().8h, \s5\().8h, v0.h[4]
         mla             \d0\().8h, \s5\().8h, v0.h[5]
-        mla             \d1\().8h, \s6\().8h, v0.h[5]
         mla             \d0\().8h, \s6\().8h, v0.h[6]
-        mla             \d1\().8h, \s7\().8h, v0.h[6]
         mla             \d0\().8h, \s7\().8h, v0.h[7]
+        mul             \d1\().8h, \s1\().8h, v0.h[0]
+        mla             \d1\().8h, \s2\().8h, v0.h[1]
+        mla             \d1\().8h, \s3\().8h, v0.h[2]
+        mla             \d1\().8h, \s4\().8h, v0.h[3]
+        mla             \d1\().8h, \s5\().8h, v0.h[4]
+        mla             \d1\().8h, \s6\().8h, v0.h[5]
+        mla             \d1\().8h, \s7\().8h, v0.h[6]
         mla             \d1\().8h, \s8\().8h, v0.h[7]
 .endm
 .macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
         mul             \d0\().8h, \s0\().8h, v0.h[0]
-        mul             \d1\().8h, \s2\().8h, v0.h[0]
         mla             \d0\().8h, \s1\().8h, v0.h[1]
-        mla             \d1\().8h, \s3\().8h, v0.h[1]
         mla             \d0\().8h, \s2\().8h, v0.h[2]
-        mla             \d1\().8h, \s4\().8h, v0.h[2]
         mla             \d0\().8h, \s3\().8h, v0.h[3]
-        mla             \d1\().8h, \s5\().8h, v0.h[3]
         mla             \d0\().8h, \s4\().8h, v0.h[4]
-        mla             \d1\().8h, \s6\().8h, v0.h[4]
         mla             \d0\().8h, \s5\().8h, v0.h[5]
-        mla             \d1\().8h, \s7\().8h, v0.h[5]
-        mla             \d0\().8h, \s6\().8h, v0.h[6]
-        mla             \d1\().8h, \s8\().8h, v0.h[6]
-        mla             \d0\().8h, \s7\().8h, v0.h[7]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mla             \d0\().8h, \s7\().8h, v0.h[7]
+        mul             \d1\().8h, \s2\().8h, v0.h[0]
+        mla             \d1\().8h, \s3\().8h, v0.h[1]
+        mla             \d1\().8h, \s4\().8h, v0.h[2]
+        mla             \d1\().8h, \s5\().8h, v0.h[3]
+        mla             \d1\().8h, \s6\().8h, v0.h[4]
+        mla             \d1\().8h, \s7\().8h, v0.h[5]
+        mla             \d1\().8h, \s8\().8h, v0.h[6]
         mla             \d1\().8h, \s9\().8h, v0.h[7]
 .endm
 .macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
         mul             \d0\().8h, \s0\().8h,  v0.h[0]
-        mul             \d1\().8h, \s4\().8h,  v0.h[0]
         mla             \d0\().8h, \s1\().8h,  v0.h[1]
-        mla             \d1\().8h, \s5\().8h,  v0.h[1]
         mla             \d0\().8h, \s2\().8h,  v0.h[2]
-        mla             \d1\().8h, \s6\().8h,  v0.h[2]
         mla             \d0\().8h, \s3\().8h,  v0.h[3]
-        mla             \d1\().8h, \s7\().8h,  v0.h[3]
         mla             \d0\().8h, \s4\().8h,  v0.h[4]
-        mla             \d1\().8h, \s8\().8h,  v0.h[4]
         mla             \d0\().8h, \s5\().8h,  v0.h[5]
-        mla             \d1\().8h, \s9\().8h,  v0.h[5]
         mla             \d0\().8h, \s6\().8h,  v0.h[6]
-        mla             \d1\().8h, \s10\().8h, v0.h[6]
         mla             \d0\().8h, \s7\().8h,  v0.h[7]
+        mul             \d1\().8h, \s4\().8h,  v0.h[0]
+        mla             \d1\().8h, \s5\().8h,  v0.h[1]
+        mla             \d1\().8h, \s6\().8h,  v0.h[2]
+        mla             \d1\().8h, \s7\().8h,  v0.h[3]
+        mla             \d1\().8h, \s8\().8h,  v0.h[4]
+        mla             \d1\().8h, \s9\().8h,  v0.h[5]
+        mla             \d1\().8h, \s10\().8h, v0.h[6]
         mla             \d1\().8h, \s11\().8h, v0.h[7]
 .endm
 .macro sqrshrun_b shift, r0, r1, r2, r3