shithub: dav1d

ref: 4ae3f5f7f330fa8b5d3ae0794eaac3c25dc4ae48
parent: 65a1aafda9fc9eda432408477cc7a3d0d7bd0d28
author: Martin Storsjö <martin@martin.st>
date: Thu Sep 3 07:34:14 EDT 2020

arm64: mc: Apply tuning from w4/w8 case to w2 case in 16 bpc 8tap_hv

Narrowing the intermediates from the horizontal pass is beneficial here as
well (on most cores; it is a small slowdown on the A53). It also makes the
code more consistent between the cases.

(The corresponding change in the upcoming arm32 version is beneficial on all
tested cores except the A53: it helps, on some cores a lot, on the A7, A8,
A9, A72 and A73, and is only marginally slower on the A53.)

Before:                        Cortex A53     A72     A73
mc_8tap_regular_w2_hv_16bpc_neon:   457.7   301.0   317.1
After:
mc_8tap_regular_w2_hv_16bpc_neon:   472.0   276.0   284.3
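
To make the transformation concrete, here is a rough sketch of the vertical
accumulation of the 2-pixel path in C with NEON intrinsics. It is only an
illustration of the instruction pattern (dav1d's 16 bpc MC code is hand-written
assembly, and the function and variable names here are hypothetical); the
comments map each intrinsic to the corresponding instruction in the diff below.

    #include <arm_neon.h>

    /* Before: the horizontal intermediates are kept as 32-bit lanes (.4s)
     * and accumulated with 32-bit mul/mla. The vertical filter taps are
     * passed as two pairs only because the _lane_ intrinsics take a
     * 64-bit coefficient vector. */
    static int32x4_t vert_mac_s32(int32x4_t m0, int32x4_t m1,
                                  int32x4_t m2, int32x4_t m3,
                                  int32x2_t c01, int32x2_t c23)
    {
        int32x4_t acc = vmulq_lane_s32(m0, c01, 0); /* mul v2.4s, v16.4s, v1.s[0] */
        acc = vmlaq_lane_s32(acc, m1, c01, 1);      /* mla v2.4s, v17.4s, v1.s[1] */
        acc = vmlaq_lane_s32(acc, m2, c23, 0);      /* mla v2.4s, v18.4s, v1.s[2] */
        acc = vmlaq_lane_s32(acc, m3, c23, 1);      /* mla v2.4s, v24.4s, v1.s[3] */
        return acc;
    }

    /* After: the intermediates have already been narrowed to 16 bits right
     * after the horizontal pass (vmovn_s32, i.e. xtn), so the vertical pass
     * uses widening 16x16->32 multiplies (vmull/vmlal, i.e. smull/smlal). */
    static int32x4_t vert_mac_s16(int16x4_t m0, int16x4_t m1,
                                  int16x4_t m2, int16x4_t m3,
                                  int16x4_t c)
    {
        int32x4_t acc = vmull_lane_s16(m0, c, 0);   /* smull v2.4s, v16.4h, v1.h[0] */
        acc = vmlal_lane_s16(acc, m1, c, 1);        /* smlal v2.4s, v17.4h, v1.h[1] */
        acc = vmlal_lane_s16(acc, m2, c, 2);        /* smlal v2.4s, v18.4h, v1.h[2] */
        acc = vmlal_lane_s16(acc, m3, c, 3);        /* smlal v2.4s, v24.4h, v1.h[3] */
        return acc;
    }

The same number of multiply-accumulates is issued either way; the speedup on
out-of-order cores presumably comes from the narrower intermediates and from
16-bit widening multiplies generally being cheaper there than full 32-bit
vector multiplies.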

--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -2039,7 +2039,6 @@
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
         mov             x15, x30
-        sxtl            v1.4s,   v1.4h
 
         ld1             {v27.8h}, [\src], \s_strd
         ext             v28.16b, v27.16b, v27.16b, #2
@@ -2049,18 +2048,23 @@
         addp            v16.4s,  v27.4s,  v27.4s
         srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
         bl              L(\type\()_8tap_filter_2)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
+        xtn             v16.4h,  v16.4s
 
-        trn1            v16.2d,  v16.2d,  v24.2d
-        mov             v17.16b, v24.16b
+        trn1            v16.2s,  v16.2s,  v24.2s
+        mov             v17.8b,  v24.8b
 
 2:
         bl              L(\type\()_8tap_filter_2)
 
-        ext             v18.16b, v17.16b, v24.16b, #8
-        mul             v2.4s,   v16.4s,  v1.s[0]
-        mla             v2.4s,   v17.4s,  v1.s[1]
-        mla             v2.4s,   v18.4s,  v1.s[2]
-        mla             v2.4s,   v24.4s,  v1.s[3]
+        ext             v18.8b,  v17.8b,  v24.8b,  #4
+        smull           v2.4s,   v16.4h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal           v2.4s,   v24.4h,  v1.h[3]
 
         srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
         sqxtun          v2.4h,   v2.4s
@@ -2069,8 +2073,8 @@
         st1             {v2.s}[0], [\dst], \d_strd
         st1             {v2.s}[1], [\ds2], \d_strd
         b.le            0f
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v24.16b
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v24.8b
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
@@ -2084,8 +2088,6 @@
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
         mov             x15, x30
-        sxtl2           v2.4s,   v1.8h
-        sxtl            v1.4s,   v1.4h
 
         ld1             {v27.8h}, [\src], \s_strd
         ext             v28.16b, v27.16b, v27.16b, #2
@@ -2094,28 +2096,33 @@
         addp            v27.4s,  v27.4s,  v28.4s
         addp            v16.4s,  v27.4s,  v27.4s
         srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
 
         bl              L(\type\()_8tap_filter_2)
-        trn1            v16.2d,  v16.2d,  v24.2d
-        mov             v17.16b, v24.16b
+        xtn             v16.4h,  v16.4s
+        trn1            v16.2s,  v16.2s,  v24.2s
+        mov             v17.8b,  v24.8b
         bl              L(\type\()_8tap_filter_2)
-        ext             v18.16b, v17.16b, v24.16b, #8
-        mov             v19.16b, v24.16b
+        ext             v18.8b,  v17.8b,  v24.8b,  #4
+        mov             v19.8b,  v24.8b
         bl              L(\type\()_8tap_filter_2)
-        ext             v20.16b, v19.16b, v24.16b, #8
-        mov             v21.16b, v24.16b
+        ext             v20.8b,  v19.8b,  v24.8b,  #4
+        mov             v21.8b,  v24.8b
 
 28:
         bl              L(\type\()_8tap_filter_2)
-        ext             v22.16b, v21.16b, v24.16b, #8
-        mul             v3.4s,   v16.4s,  v1.s[0]
-        mla             v3.4s,   v17.4s,  v1.s[1]
-        mla             v3.4s,   v18.4s,  v1.s[2]
-        mla             v3.4s,   v19.4s,  v1.s[3]
-        mla             v3.4s,   v20.4s,  v2.s[0]
-        mla             v3.4s,   v21.4s,  v2.s[1]
-        mla             v3.4s,   v22.4s,  v2.s[2]
-        mla             v3.4s,   v24.4s,  v2.s[3]
+        ext             v22.8b,  v21.8b,  v24.8b,  #4
+        smull           v3.4s,   v16.4h,  v1.h[0]
+        smlal           v3.4s,   v17.4h,  v1.h[1]
+        smlal           v3.4s,   v18.4h,  v1.h[2]
+        smlal           v3.4s,   v19.4h,  v1.h[3]
+        smlal           v3.4s,   v20.4h,  v1.h[4]
+        smlal           v3.4s,   v21.4h,  v1.h[5]
+        smlal           v3.4s,   v22.4h,  v1.h[6]
+        smlal           v3.4s,   v24.4h,  v1.h[7]
 
         srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
         sqxtun          v3.4h,   v3.4s
@@ -2124,12 +2131,12 @@
         st1             {v3.s}[0], [\dst], \d_strd
         st1             {v3.s}[1], [\ds2], \d_strd
         b.le            0f
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v19.16b
-        mov             v18.16b, v20.16b
-        mov             v19.16b, v21.16b
-        mov             v20.16b, v22.16b
-        mov             v21.16b, v24.16b
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v19.8b
+        mov             v18.8b,  v20.8b
+        mov             v19.8b,  v21.8b
+        mov             v20.8b,  v22.8b
+        mov             v21.8b,  v24.8b
         b               28b
 
 0:
@@ -2149,6 +2156,7 @@
         smlal           v24.4s,  v27.4h,  v0.h[2]
         smlal           v24.4s,  v28.4h,  v0.h[3]
         srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        xtn             v24.4h,  v24.4s
         ret
 .endif
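
The last hunk makes the shared 2-pixel horizontal helper return its result
already narrowed to 16 bits, so every caller sees .4h intermediates. A minimal
scalar model of that helper's tail, assuming the usual rounding-right-shift
behaviour of srshl with a negative shift count (the name filter_h_tail is
hypothetical, not dav1d's):

    #include <stdint.h>

    /* The horizontal taps are accumulated in 32 bits elsewhere; this tail
     * applies the rounding right shift by (6 - intermediate_bits) and then
     * narrows to 16 bits. Per the comment in the patch, the result fits in
     * 16 bits without any bias, so the truncating narrow (xtn) loses
     * nothing. */
    static int16_t filter_h_tail(int32_t acc, int intermediate_bits)
    {
        const int sh = 6 - intermediate_bits;  /* encoded as a negative count in v30 */
        acc = (acc + (1 << (sh - 1))) >> sh;   /* srshl v24.4s, v24.4s, v30.4s */
        return (int16_t)acc;                   /* xtn   v24.4h, v24.4s */
    }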