shithub: dav1d

Download patch

ref: 8f8dc9281629cf187f542a64b023782b50ed1881
parent: 4f5261a0ed399dcec88c87f34d1095b0152b9ae1
author: Martin Storsjö <martin@martin.st>
date: Tue Mar 5 06:43:25 EST 2019

arm64: cdef: Use a smarter padding constant

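Pad with 0x8000, a value which works both as a large unsigned value and
as a negative signed value (INT16_MIN). This allows doing the max
operation using signed max directly on the input, avoiding the
conditional masking (cmeq + bic) of padding pixels altogether.

Since the absolute difference against the padding value can now reach
0x8000, which is negative when interpreted as signed, the min of
abs(diff) and the clamped threshold term is switched from smin to umin.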

Based on the same idea for x86 by Kyle Siefring.

Before:                  Cortex A53     A72     A73
cdef_filter_4x4_8bpc_neon:    645.5   401.9   422.5
cdef_filter_4x8_8bpc_neon:   1193.7   756.6   782.4
cdef_filter_8x8_8bpc_neon:   2162.4  1361.9  1375.6
After:
cdef_filter_4x4_8bpc_neon:    596.3   377.8   384.8
cdef_filter_4x8_8bpc_neon:   1097.4   705.5   707.1
cdef_filter_8x8_8bpc_neon:   1967.4  1232.3  1239.9

--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -136,8 +136,7 @@
 
 .macro padding_func w, stride, rn, rw
 function cdef_padding\w\()_neon, export=1
-        movi            v30.16b, #255
-        ushr            v30.8h, v30.8h, #1 // INT16_MAX
+        movi            v30.8h,  #0x80, lsl #8
         mov             v31.16b, v30.16b
         sub             x0,  x0,  #2*(2*\stride+2)
         tst             w6,  #4 // CDEF_HAVE_TOP
@@ -290,14 +289,10 @@
 .endif
 .endm
 .macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
-        cmeq            v16.8h,  \s1\().8h,  v31.8h
-        cmeq            v17.8h,  \s2\().8h,  v31.8h
-        bic             v16.16b, \s1\().16b, v16.16b
-        bic             v17.16b, \s2\().16b, v17.16b
         umin            v2.8h,   v2.8h,  \s1\().8h
-        umax            v3.8h,   v3.8h,  v16.8h
+        smax            v3.8h,   v3.8h,  \s1\().8h
         umin            v2.8h,   v2.8h,  \s2\().8h
-        umax            v3.8h,   v3.8h,  v17.8h
+        smax            v3.8h,   v3.8h,  \s2\().8h
 
         cbz             \threshold, 3f
         uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
@@ -308,8 +303,8 @@
         uqsub           v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
         cmhi            v18.8h, v0.8h,  \s1\().8h   // px > p0
         cmhi            v22.8h, v0.8h,  \s2\().8h   // px > p1
-        smin            v17.8h, v17.8h, v16.8h      // imin(abs(diff), imax())
-        smin            v21.8h, v21.8h, v20.8h      // imin(abs(diff), imax())
+        umin            v17.8h, v17.8h, v16.8h      // imin(abs(diff), imax())
+        umin            v21.8h, v21.8h, v20.8h      // imin(abs(diff), imax())
         dup             v19.8h, \tap                // taps[k]/taps[k]
         neg             v16.8h, v17.8h              // -imin()
         neg             v20.8h, v21.8h              // -imin()
@@ -330,10 +325,8 @@
         add             x8,  x8,  w9, uxtw #1
         movrel          x9,  directions\w
         add             x5,  x9,  w5, uxtw #1
-        movi            v31.16b,  #255
         movi            v30.8h,   #15
         dup             v28.8h,   w6                // damping
-        ushr            v31.8h,   v31.8h, #1        // INT16_MAX
 
         dup             v25.8h, w3                  // threshold
         dup             v27.8h, w4                  // threshold