shithub: dav1d

ref: 61442bee60f45b05da627ddbac10a9a63e243f47
parent: 5647a57eabc454e2e2360429aba494452af00cb3
author: Martin Storsjö <martin@martin.st>
date: Mon Oct 7 09:29:41 EDT 2019

arm: mc: Port the ARM64 warp filter to arm32

Relative speedup over C code:
                  Cortex A7     A8     A9    A53    A72    A73
warp_8x8_8bpc_neon:    2.79   5.45   4.18   3.96   4.16   4.51
warp_8x8t_8bpc_neon:   2.79   5.33   4.18   3.98   4.22   4.25

Runtime comparison with the original ARM64 assembly (lower is better):

ARM64:            Cortex A53     A72     A73
warp_8x8_8bpc_neon:   1854.6  1072.5  1102.5
warp_8x8t_8bpc_neon:  1839.6  1069.4  1089.5
ARM32:
warp_8x8_8bpc_neon:   2132.5  1160.3  1218.0
warp_8x8t_8bpc_neon:  2113.7  1148.0  1209.1
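For orientation, the following scalar sketch models roughly what the new NEON
routine computes for the 8 bpc "put" case, pieced together from the constants
visible in the assembly below: the filter index is (pos + 512) >> 10 into the
64-entry-biased mc_warp_filter table, the horizontal pass uses a rounding
shift of 3 over 15 intermediate rows, and the vertical pass a rounding shift
of 11 with an unsigned 8-bit clamp (the t/"prep" variant instead shifts by 7
and stores 16-bit results). This is an illustrative sketch, not the dav1d C
reference; the function name and the exact table declaration are assumptions,
and the saturating NEON narrowing is approximated by a plain clamp.

#include <stddef.h>
#include <stdint.h>

extern const int8_t dav1d_mc_warp_filter[][8];  // 8-tap warp coefficients

static void warp_affine_8x8_put_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                       const uint8_t *src, ptrdiff_t src_stride,
                                       const int16_t *abcd, int mx, int my)
{
    int16_t mid[15 * 8], *midp = mid;

    src -= 3 * src_stride + 3;          // step back to the top-left tap

    // Horizontal pass: 15 rows of 8 pixels, rounding shift by 3.
    for (int y = 0; y < 15; y++, mx += abcd[1], src += src_stride, midp += 8) {
        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
            const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += f[k] * src[x + k];
            midp[x] = (sum + 4) >> 3;
        }
    }

    // Vertical pass: 8 output rows, rounding shift by 11, clamp to 8 bit.
    midp = mid;
    for (int y = 0; y < 8; y++, my += abcd[3], midp += 8, dst += dst_stride) {
        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
            const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += f[k] * midp[x + k * 8];
            sum = (sum + 1024) >> 11;
            dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
        }
    }
}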

--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -2971,3 +2971,206 @@
 
 filter_fn put,  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
 filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
+
+.macro load_filter_ptr src
+        asr             r12, \src, #10
+        add             r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+        vld1.8          {\dst}, [r12, :64]
+        add             \src, \src, \inc
+.endm
+
+.macro load_filter_row dst, src, inc
+        load_filter_ptr \src
+        load_filter_coef \dst, \src, \inc
+.endm
+
+function warp_filter_horz_neon
+        load_filter_ptr r5                  // filter 0
+        vld1.16         {q7}, [r2], r3
+
+        load_filter_coef d0, r5,  r7        // filter 0
+        vmovl.u8        q6,  d14            // original pixels
+        load_filter_row d2,  r5,  r7        // filter 1
+        vmovl.u8        q7,  d15            // original pixels
+        load_filter_row d4,  r5,  r7        // filter 2
+        vmovl.s8        q0,  d0             // filter 0
+        vext.8          q3,  q6,  q7,  #2*1 // filter 1 pixels
+        load_filter_ptr r5                  // filter 3
+        vmovl.s8        q1,  d2             // filter 1
+        vmul.i16        q5,  q6,  q0        // filter 0 output
+        load_filter_coef d0, r5,  r7        // filter 3
+        vmovl.s8        q2,  d4             // filter 2
+        load_filter_ptr r5                  // filter 4
+        vext.8          q4,  q6,  q7,  #2*2 // filter 2 pixels
+        vmul.i16        q3,  q3,  q1        // filter 1 output
+        load_filter_coef d2, r5,  r7        // filter 4
+        vmul.i16        q4,  q4,  q2        // filter 2 output
+        vext.8          q2,  q6,  q7,  #2*3 // filter 3 pixels
+        vmovl.s8        q0,  d0             // filter 3
+        vpaddl.s16      q5,  q5             // pixel 0 (4x32)
+        vpaddl.s16      q3,  q3             // pixel 1 (4x32)
+        vmul.i16        q0,  q2,  q0        // filter 3 output
+        load_filter_ptr r5                  // filter 5
+        vext.8          q2,  q6,  q7,  #2*4 // filter 4 pixels
+        vmovl.s8        q1,  d2             // filter 4
+        vpaddl.s16      q4,  q4             // pixel 2 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0 (2x32)
+        vpadd.s32       d11, d6,  d7        // pixel 1 (2x32)
+        load_filter_coef d6, r5,  r7        // filter 5
+        vmul.i16        q1,  q2,  q1        // filter 4 output
+        vpadd.s32       d8,  d8,  d9        // pixel 2 (2x32)
+        load_filter_ptr r5                  // filter 6
+        vpaddl.s16      q0,  q0             // pixel 3 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0,1
+        vext.8          q2,  q6,  q7,  #2*5 // filter 5 pixels
+        vmovl.s8        q3,  d6             // filter 5
+        vpaddl.s16      q1,  q1             // pixel 4 (4x32)
+        vpadd.s32       d9,  d0,  d1        // pixel 3 (2x32)
+        load_filter_coef d0, r5,  r7        // filter 6
+        vmul.i16        q2,  q2,  q3        // filter 5 output
+        vpadd.s32       d11, d8,  d9        // pixel 2,3
+        load_filter_ptr r5                  // filter 7
+        vpaddl.s16      q2,  q2             // pixel 5 (4x32)
+        vpadd.s32       d8,  d2,  d3        // pixel 4 (2x32)
+        vext.8          q3,  q6,  q7,  #2*6 // filter 6 pixels
+        vmovl.s8        q0,  d0             // filter 6
+        vpadd.s32       d9,  d4,  d5        // pixel 5 (2x32)
+        load_filter_coef d4, r5,  r7        // filter 7
+        vpadd.s32       d8,  d8,  d9        // pixel 4,5
+        vext.8          q1,  q6,  q7,  #2*7 // filter 7 pixels
+        vmovl.s8        q2,  d4             // filter 7
+        vmul.i16        q3,  q3,  q0        // filter 6 output
+        vmul.i16        q1,  q1,  q2        // filter 7 output
+        sub             r5,  r5,  r7, lsl #3
+        vpaddl.s16      q3,  q3             // pixel 6 (4x32)
+        vpaddl.s16      q1,  q1             // pixel 7 (4x32)
+        vpadd.s32       d6,  d6,  d7        // pixel 6 (2x32)
+        vpadd.s32       d2,  d2,  d3        // pixel 7 (2x32)
+        vpadd.s32       d9,  d6,  d2        // pixel 6,7
+
+        add             r5,  r5,  r8
+
+        vrshrn.s32      d10, q5,  #3
+        vrshrn.s32      d11, q4,  #3
+
+        bx              lr
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        ldrd            r8,  r9,  [r4]
+        sxth            r7,  r8
+        asr             r8,  r8, #16
+        asr             r4,  r9, #16
+        sxth            r9,  r9
+        mov             r10, #8
+        sub             r2,  r2,  r3, lsl #1
+        sub             r2,  r2,  r3
+        sub             r2,  r2,  #3
+        movrel          r11, X(mc_warp_filter), 64*8
+.ifnb \t
+        lsl             r1,  r1,  #1
+.endif
+        add             r5,  r5,  #512
+        add             r6,  r6,  #512
+
+        bl              warp_filter_horz_neon
+        vmov            q8,  q5
+        bl              warp_filter_horz_neon
+        vmov            q9,  q5
+        bl              warp_filter_horz_neon
+        vmov            q10, q5
+        bl              warp_filter_horz_neon
+        vmov            q11, q5
+        bl              warp_filter_horz_neon
+        vmov            q12, q5
+        bl              warp_filter_horz_neon
+        vmov            q13, q5
+        bl              warp_filter_horz_neon
+        vmov            q14, q5
+
+1:
+        bl              warp_filter_horz_neon
+        vmov            q15, q5
+
+        load_filter_row d8,  r6,  r9
+        load_filter_row d9,  r6,  r9
+        load_filter_row d10, r6,  r9
+        load_filter_row d11, r6,  r9
+        load_filter_row d12, r6,  r9
+        load_filter_row d13, r6,  r9
+        load_filter_row d14, r6,  r9
+        load_filter_row d15, r6,  r9
+        transpose_8x8b  q4,  q5,  q6,  q7,  d8,  d9,  d10, d11, d12, d13, d14, d15
+        vmovl.s8        q1,  d8
+        vmovl.s8        q2,  d9
+        vmovl.s8        q3,  d10
+        vmovl.s8        q4,  d11
+        vmovl.s8        q5,  d12
+        vmovl.s8        q6,  d13
+
+        sub             r6,  r6,  r9, lsl #3
+
+        // This ordering of vmull/vmlal is highly beneficial for
+        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+        vmull.s16       q0,  d16,  d2
+        vmlal.s16       q0,  d18,  d4
+        vmlal.s16       q0,  d20,  d6
+        vmlal.s16       q0,  d22,  d8
+        vmlal.s16       q0,  d24,  d10
+        vmlal.s16       q0,  d26,  d12
+        vmull.s16       q1,  d17,  d3
+        vmlal.s16       q1,  d19,  d5
+        vmlal.s16       q1,  d21,  d7
+        vmlal.s16       q1,  d23,  d9
+        vmlal.s16       q1,  d25,  d11
+        vmlal.s16       q1,  d27,  d13
+
+        vmovl.s8        q2,  d14
+        vmovl.s8        q3,  d15
+
+        vmlal.s16       q0,  d28,  d4
+        vmlal.s16       q0,  d30,  d6
+        vmlal.s16       q1,  d29,  d5
+        vmlal.s16       q1,  d31,  d7
+
+        vmov            q8,  q9
+        vmov            q9,  q10
+        vqrshrn.s32     d0,  q0,  #\shift
+        vmov            q10, q11
+        vqrshrn.s32     d1,  q1,  #\shift
+        vmov            q11, q12
+        vmov            q12, q13
+.ifb \t
+        vqmovun.s16     d0,  q0
+.endif
+        vmov            q13, q14
+        vmov            q14, q15
+        subs            r10, r10, #1
+.ifnb \t
+        vst1.16         {q0}, [r0, :128], r1
+.else
+        vst1.8          {d0}, [r0, :64], r1
+.endif
+
+        add             r6,  r6,  r4
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+warp  , 11
+warp t, 7
--- a/src/arm/32/util.S
+++ b/src/arm/32/util.S
@@ -69,4 +69,19 @@
 #endif
 .endm
 
+.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+        vtrn.32         \q0,  \q2
+        vtrn.32         \q1,  \q3
+
+        vtrn.16         \r0,  \r2
+        vtrn.16         \r1,  \r3
+        vtrn.16         \r4,  \r6
+        vtrn.16         \r5,  \r7
+
+        vtrn.8          \r0,  \r1
+        vtrn.8          \r2,  \r3
+        vtrn.8          \r4,  \r5
+        vtrn.8          \r6,  \r7
+.endm
+
 #endif /* DAV1D_SRC_ARM_32_UTIL_S */
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -107,9 +107,7 @@
     c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
     c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
     c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
-#if ARCH_AARCH64
     c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
     c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
-#endif
 #endif
 }