shithub: dav1d

Download patch

ref: 361a3c8ee2d03f87f42a76213ee0f93e49fa9ec3
parent: 6ad9bd5f92621d81a227b6d271c29dfaa578000a
author: Martin Storsjö <martin@martin.st>
date: Tue Jan 28 06:07:14 EST 2020

arm: cdef: Add special-cased versions for when pri_strength or sec_strength is zero

Before:
ARM32:                    Cortex A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:    964.6   599.5   707.9   601.2   465.1   405.2
cdef_filter_4x8_8bpc_neon:   1726.0  1066.2  1238.7  1041.7   798.6   725.3
cdef_filter_8x8_8bpc_neon:   2974.4  1671.8  1943.9  1806.1  1229.8  1242.1
ARM64:
cdef_filter_4x4_8bpc_neon:                            569.2   337.8   348.7
cdef_filter_4x8_8bpc_neon:                           1031.1   623.3   633.6
cdef_filter_8x8_8bpc_neon:                           1847.5  1097.7  1117.5

After:
ARM32:                    Cortex A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:    798.4   524.2   617.3   506.8   432.4   361.1
cdef_filter_4x8_8bpc_neon:   1394.7   910.4  1054.0   863.6   730.2   632.2
cdef_filter_8x8_8bpc_neon:   2364.6  1453.8  1675.1  1466.0  1086.4  1107.7
ARM64:
cdef_filter_4x4_8bpc_neon:                            461.7   303.1   308.6
cdef_filter_4x8_8bpc_neon:                            833.0   547.5   556.0
cdef_filter_8x8_8bpc_neon:                           1459.3   934.1   967.9

--- a/src/arm/32/cdef.S
+++ b/src/arm/32/cdef.S
@@ -311,14 +311,13 @@
         vld1.16         {\d22}, [r9]         // p1
 .endif
 .endm
-.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
-        cmp             \threshold, #0
+.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap, min
+.if \min
         vmin.u16        q2,  q2,  \s1
         vmax.s16        q3,  q3,  \s1
         vmin.u16        q2,  q2,  \s2
         vmax.s16        q3,  q3,  \s2
-
-        beq             3f
+.endif
         vabd.u16        q8,  q0,  \s1        // abs(diff)
         vabd.u16        q11, q0,  \s2        // abs(diff)
         vshl.u16        q9,  q8,  \shift     // abs(diff) >> shift
@@ -342,22 +341,24 @@
 // void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
 //                              const uint16_t *tmp, int pri_strength,
 //                              int sec_strength, int dir, int damping, int h);
-.macro filter w
-function cdef_filter\w\()_neon, export=1
-        push            {r4-r9,lr}
-        vpush           {q4-q7}
-        ldrd            r4,  r5,  [sp, #92]
-        ldrd            r6,  r7,  [sp, #100]
+.macro filter_func w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_neon
+.if \pri
         movrel_local    r8,  pri_taps
         and             r9,  r3,  #1
         add             r8,  r8,  r9, lsl #1
+.endif
         movrel_local    r9,  directions\w
         add             r5,  r9,  r5, lsl #1
         vmov.u16        d17, #15
         vdup.16         d16, r6              // damping
 
+.if \pri
         vdup.16         q5,  r3              // threshold
+.endif
+.if \sec
         vdup.16         q7,  r4              // threshold
+.endif
         vmov.16         d8[0], r3
         vmov.16         d8[1], r4
         vclz.i16        d8,  d8              // clz(threshold)
@@ -364,8 +365,12 @@
         vsub.i16        d8,  d17, d8         // ulog2(threshold)
         vqsub.u16       d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
         vneg.s16        d8,  d8              // -shift
+.if \sec
         vdup.16         q6,  d8[1]
+.endif
+.if \pri
         vdup.16         q4,  d8[0]
+.endif
 
 1:
 .if \w == 8
@@ -377,39 +382,54 @@
 .endif
 
         vmov.u16        q1,  #0              // sum
+.if \min
         vmov.u16        q2,  q0              // min
         vmov.u16        q3,  q0              // max
+.endif
 
         // Instead of loading sec_taps 2, 1 from memory, just set it
         // to 2 initially and decrease for the second round.
+        // This is also used as loop counter.
         mov             lr,  #2              // sec_taps[0]
 
 2:
+.if \pri
         ldrsb           r9,  [r5]            // off1
 
         load_px         d28, d29, d30, d31, \w
+.endif
 
+.if \sec
         add             r5,  r5,  #4         // +2*2
         ldrsb           r9,  [r5]            // off2
+.endif
 
+.if \pri
         ldrb            r12, [r8]            // *pri_taps
 
-        handle_pixel    q14, q15, r3,  q5,  q4,  r12
+        handle_pixel    q14, q15, r3,  q5,  q4,  r12, \min
+.endif
 
+.if \sec
         load_px         d28, d29, d30, d31, \w
 
         add             r5,  r5,  #8         // +2*4
         ldrsb           r9,  [r5]            // off3
 
-        handle_pixel    q14, q15, r4,  q7,  q6,  lr
+        handle_pixel    q14, q15, r4,  q7,  q6,  lr, \min
 
         load_px         d28, d29, d30, d31, \w
 
-        handle_pixel    q14, q15, r4,  q7,  q6,  lr
+        handle_pixel    q14, q15, r4,  q7,  q6,  lr, \min
 
         sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
+.else
+        add             r5,  r5,  #1         // r5 += 1
+.endif
         subs            lr,  lr,  #1         // sec_tap-- (value)
+.if \pri
         add             r8,  r8,  #1         // pri_taps++ (pointer)
+.endif
         bne             2b
 
         vshr.s16        q14, q1,  #15        // -(sum < 0)
@@ -416,8 +436,10 @@
         vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
         vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
         vadd.i16        q0,  q0,  q1         // px + (8 + sum ...) >> 4
+.if \min
         vmin.s16        q0,  q0,  q3
         vmax.s16        q0,  q0,  q2         // iclip(px + .., min, max)
+.endif
         vmovn.u16       d0,  q0
 .if \w == 8
         add             r2,  r2,  #2*16      // tmp += tmp_stride
@@ -432,11 +454,35 @@
 
         // Reset pri_taps and directions back to the original point
         sub             r5,  r5,  #2
+.if \pri
         sub             r8,  r8,  #2
+.endif
 
         bgt             1b
         vpop            {q4-q7}
         pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w
+filter_func \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
+// Exported entry point: shared prologue, then dispatch to one of the three variants above.
+function cdef_filter\w\()_neon, export=1
+        push            {r4-r9,lr}           // undone by the "pop {r4-r9,pc}" that ends each variant
+        vpush           {q4-q7}              // undone by the "vpop {q4-q7}" that ends each variant
+        ldrd            r4,  r5,  [sp, #92]  // sec_strength, dir; 92 = 7*4 + 4*16 bytes pushed just above
+        ldrd            r6,  r7,  [sp, #100] // damping, h (following the prototype's argument order)
+        cmp             r3,  #0 // pri_strength
+        bne             1f
+        b               cdef_filter\w\()_sec_neon // pri_strength == 0: only sec (also taken if sec_strength is 0 too)
+1:
+        cmp             r4,  #0 // sec_strength
+        bne             1f
+        b               cdef_filter\w\()_pri_neon // sec_strength == 0: only pri
+1:
+        b               cdef_filter\w\()_pri_sec_neon // both pri and sec; plain branch, the variant's pop returns to our caller
 endfunc
 .endm
 
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -286,13 +286,13 @@
         ld1             {\d2\().d}[1], [x9]         // p1
 .endif
 .endm
-.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
+.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap, min
+.if \min
         umin            v2.8h,   v2.8h,  \s1\().8h
         smax            v3.8h,   v3.8h,  \s1\().8h
         umin            v2.8h,   v2.8h,  \s2\().8h
         smax            v3.8h,   v3.8h,  \s2\().8h
-
-        cbz             \threshold, 3f
+.endif
         uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
         uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
         ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
@@ -316,25 +316,35 @@
 // void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
 //                              const uint16_t *tmp, int pri_strength,
 //                              int sec_strength, int dir, int damping, int h);
-.macro filter w
-function cdef_filter\w\()_neon, export=1
+.macro filter_func w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_neon
+.if \pri
         movrel          x8,  pri_taps
         and             w9,  w3,  #1
         add             x8,  x8,  w9, uxtw #1
+.endif
         movrel          x9,  directions\w
         add             x5,  x9,  w5, uxtw #1
         movi            v30.4h,   #15
         dup             v28.4h,   w6                // damping
 
+.if \pri
         dup             v25.8h, w3                  // threshold
+.endif
+.if \sec
         dup             v27.8h, w4                  // threshold
+.endif
         trn1            v24.4h, v25.4h, v27.4h
         clz             v24.4h, v24.4h              // clz(threshold)
         sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
         uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
         neg             v24.4h, v24.4h              // -shift
+.if \sec
         dup             v26.8h, v24.h[1]
+.endif
+.if \pri
         dup             v24.8h, v24.h[0]
+.endif
 
 1:
 .if \w == 8
@@ -346,37 +356,52 @@
 .endif
 
         movi            v1.8h,  #0                  // sum
+.if \min
         mov             v2.16b, v0.16b              // min
         mov             v3.16b, v0.16b              // max
+.endif
 
         // Instead of loading sec_taps 2, 1 from memory, just set it
         // to 2 initially and decrease for the second round.
+        // This is also used as loop counter.
         mov             w11, #2                     // sec_taps[0]
 
 2:
+.if \pri
         ldrb            w9,  [x5]                   // off1
 
         load_px         v4,  v5, \w
+.endif
 
+.if \sec
         add             x5,  x5,  #4                // +2*2
         ldrb            w9,  [x5]                   // off2
         load_px         v6,  v7,  \w
+.endif
 
+.if \pri
         ldrb            w10, [x8]                   // *pri_taps
 
-        handle_pixel    v4,  v5,  w3,  v25.8h, v24.8h, w10
+        handle_pixel    v4,  v5,  w3,  v25.8h, v24.8h, w10, \min
+.endif
 
+.if \sec
         add             x5,  x5,  #8                // +2*4
         ldrb            w9,  [x5]                   // off3
         load_px         v4,  v5,  \w
 
-        handle_pixel    v6,  v7,  w4,  v27.8h, v26.8h, w11
+        handle_pixel    v6,  v7,  w4,  v27.8h, v26.8h, w11, \min
 
-        handle_pixel    v4,  v5,  w4,  v27.8h, v26.8h, w11
+        handle_pixel    v4,  v5,  w4,  v27.8h, v26.8h, w11, \min
 
         sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
+.else
+        add             x5,  x5,  #1                // x5 += 1
+.endif
         subs            w11, w11, #1                // sec_tap-- (value)
+.if \pri
         add             x8,  x8,  #1                // pri_taps++ (pointer)
+.endif
         b.ne            2b
 
         sshr            v4.8h,  v1.8h,  #15         // -(sum < 0)
@@ -383,8 +408,10 @@
         add             v1.8h,  v1.8h,  v4.8h       // sum - (sum < 0)
         srshr           v1.8h,  v1.8h,  #4          // (8 + sum - (sum < 0)) >> 4
         add             v0.8h,  v0.8h,  v1.8h       // px + (8 + sum ...) >> 4
+.if \min
         smin            v0.8h,  v0.8h,  v3.8h
         smax            v0.8h,  v0.8h,  v2.8h       // iclip(px + .., min, max)
+.endif
         xtn             v0.8b,  v0.8h
 .if \w == 8
         add             x2,  x2,  #2*16             // tmp += tmp_stride
@@ -399,10 +426,28 @@
 
         // Reset pri_taps and directions back to the original point
         sub             x5,  x5,  #2
+.if \pri
         sub             x8,  x8,  #2
+.endif
 
         b.gt            1b
         ret
+endfunc
+.endm
+
+.macro filter w
+filter_func \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
+// Exported entry point: dispatch to one of the three variants above; no registers are saved here.
+function cdef_filter\w\()_neon, export=1
+        cbnz            w3,  1f // pri_strength != 0: go check sec_strength
+        b               cdef_filter\w\()_sec_neon // pri_strength == 0: only sec (also taken if sec_strength is 0 too)
+1:
+        cbnz            w4,  1f // sec_strength != 0: both strengths are set
+        b               cdef_filter\w\()_pri_neon // sec_strength == 0: only pri
+1:
+        b               cdef_filter\w\()_pri_sec_neon // both pri and sec; plain branch, the variant's ret returns to our caller
 endfunc
 .endm