shithub: dav1d

Download patch

ref: dfaa2a101f67f2850f880f28cc77d791966b8a3a
parent: fa6a0924d7aef7fbbdb02c7a8df0714d00e40408
author: Martin Storsjö <martin@martin.st>
date: Sat Oct 5 04:29:20 EDT 2019

arm64: cdef: Improve find_dir

Use .4h instead of .8h additions for the upper half of each sum_alt
accumulator, as only 11 elements are needed, and .8h + .4h gives 12
in total.

Fuse two consecutive ext #8 + ext #2 into ext #10.

Move a few stores further away from the instructions that calculate
the stored values.

Before:         Cortex A53     A72     A73
cdef_dir_8bpc_neon:  404.0   278.2   302.4
After:
cdef_dir_8bpc_neon:  400.0   269.3   282.5

--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -464,15 +464,15 @@
         ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
         ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
         add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
-        add             v7.8h,   v7.8h,   v23.8h      // sum_alt[0]
+        add             v7.4h,   v7.4h,   v23.4h      // sum_alt[0]
         add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
-        add             v17.8h,  v17.8h,  v25.8h      // sum_alt[1]
+        add             v17.4h,  v17.4h,  v25.4h      // sum_alt[1]
 .endif
 .if \i < 6
         ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
         ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
         add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
-        add             v19.8h,  v19.8h,  v23.8h      // sum_alt[2]
+        add             v19.4h,  v19.4h,  v23.4h      // sum_alt[2]
 .else
         add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
 .endif
@@ -484,7 +484,7 @@
         ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
         ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
         add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
-        add             v21.8h,  v21.8h,  v25.8h      // sum_alt[3]
+        add             v21.4h,  v21.4h,  v25.4h      // sum_alt[3]
 .endif
 .endr
 
@@ -501,10 +501,8 @@
 
         rev64           v1.8h,   v1.8h
         rev64           v3.8h,   v3.8h
-        ext             v1.16b,  v1.16b,  v1.16b, #8  // sum_diag[0][15-n]
-        ext             v3.16b,  v3.16b,  v3.16b, #8  // sum_diag[1][15-n]
-        ext             v1.16b,  v1.16b,  v1.16b, #2  // sum_diag[0][14-n]
-        ext             v3.16b,  v3.16b,  v3.16b, #2  // sum_diag[1][14-n]
+        ext             v1.16b,  v1.16b,  v1.16b, #10 // sum_diag[0][14-n]
+        ext             v3.16b,  v3.16b,  v3.16b, #10 // sum_diag[1][14-n]
 
         str             s4,  [sp, #2*4]               // cost[2]
         str             s5,  [sp, #6*4]               // cost[6]
@@ -556,15 +554,16 @@
         addv            \d2, v25.4s                   // *cost_ptr
 .endm
         cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
+        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
         str             s6,  [sp, #1*4]               // cost[1]
         str             s16, [sp, #3*4]               // cost[3]
-        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
-        str             s18, [sp, #5*4]               // cost[5]
-        str             s20, [sp, #7*4]               // cost[7]
 
         mov             w0,  #0                       // best_dir
         mov             w1,  v0.s[0]                  // best_cost
         mov             w3,  #1                       // n
+
+        str             s18, [sp, #5*4]               // cost[5]
+        str             s20, [sp, #7*4]               // cost[7]
 
         mov             w4,  v6.s[0]