ref: 97ab8290604d5b728113cc816da3c99455542841
parent: 8abcf5dc6739c85a7458985fe566ef4caf0537f8
author: Martin Storsjö <martin@martin.st>
date: Mon Feb 11 18:29:51 EST 2019

arm64: mc: NEON implementation of warp8x8{,t}

Relative speedup vs C code:
                 Cortex A53    A72    A73
warp_8x8_8bpc_neon:    3.19   2.60   3.66
warp_8x8t_8bpc_neon:   3.09   2.50   3.58
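
For context, this is roughly the computation being vectorized: a minimal
scalar sketch of the two-pass 8x8 warp, written here to mirror the
structure of the assembly below rather than copied from dav1d's C
reference (warp8x8_sketch and the warp_filter parameter, standing in for
the dav1d_mc_warp_filter table, are illustrative names):

    #include <stddef.h>
    #include <stdint.h>

    static void warp8x8_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t abcd[4], int mx, int my,
                               const int8_t warp_filter[][8])
    {
        int16_t mid[15][8];

        /* Start at the top-left tap of the 8-tap filter footprint. */
        src -= 3 * src_stride + 3;

        /* Horizontal pass over 15 rows: the x coordinate steps by
         * alpha (abcd[0]) per pixel and beta (abcd[1]) per row; the
         * sums are rounded down to a 16-bit intermediate. */
        for (int y = 0; y < 15; y++, mx += abcd[1], src += src_stride) {
            for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
                const int8_t *f = warp_filter[64 + ((tmx + 512) >> 10)];
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += f[k] * src[x + k];
                mid[y][x] = (int16_t)((sum + 4) >> 3);
            }
        }

        /* Vertical pass: the y coordinate steps by gamma (abcd[2])
         * per column and delta (abcd[3]) per row. The 8x8t variant
         * stores the 16-bit (sum + 64) >> 7 values instead of
         * narrowing to pixels. */
        for (int y = 0; y < 8; y++, my += abcd[3], dst += dst_stride) {
            for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
                const int8_t *f = warp_filter[64 + ((tmy + 512) >> 10)];
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += f[k] * mid[y + k][x];
                sum = (sum + 1024) >> 11;
                dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
            }
        }
    }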

--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -2328,3 +2328,191 @@
 
 filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
 filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
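+// Load the 8-tap filter for the coordinate in \src (i.e. \src >> 10)
+// from the filter table in x11, then advance \src by \inc.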
+.macro load_filter_row dst, src, inc
+        asr             w13, \src, #10
+        ldr             \dst, [x11, w13, sxtw #3]
+        add             \src, \src, \inc
+.endm
+
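+// Produce one horizontally filtered row in v16.8h: reads 16 source
+// pixels from x2 (advancing it by the stride in x3), applies a
+// per-pixel 8-tap filter stepped by alpha (w7), and steps the x
+// coordinate in w5 by beta (w8) for the next row. The sums are
+// narrowed with a rounding shift by 3.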
+function warp_filter_horz
+        add             w12, w5,  #512
+
+        ld1             {v16.8b, v17.8b}, [x2], x3
+
+        load_filter_row d0, w12, w7
+        load_filter_row d1, w12, w7
+        load_filter_row d2, w12, w7
+        sxtl            v0.8h,   v0.8b
+        load_filter_row d3, w12, w7
+        sxtl            v1.8h,   v1.8b
+        load_filter_row d4, w12, w7
+        sxtl            v2.8h,   v2.8b
+        load_filter_row d5, w12, w7
+        sxtl            v3.8h,   v3.8b
+        load_filter_row d6, w12, w7
+        sxtl            v4.8h,   v4.8b
+        load_filter_row d7, w12, w7
+        sxtl            v5.8h,   v5.8b
+        sxtl            v6.8h,   v6.8b
+        sxtl            v7.8h,   v7.8b
+
+        uxtl            v16.8h,  v16.8b
+        uxtl            v17.8h,  v17.8b
+
+        ext             v18.16b, v16.16b, v17.16b, #2*1
+        mul             v23.8h,  v16.8h,  v0.8h
+        ext             v19.16b, v16.16b, v17.16b, #2*2
+        mul             v18.8h,  v18.8h,  v1.8h
+        ext             v20.16b, v16.16b, v17.16b, #2*3
+        mul             v19.8h,  v19.8h,  v2.8h
+        ext             v21.16b, v16.16b, v17.16b, #2*4
+        saddlp          v23.4s,  v23.8h
+        mul             v20.8h,  v20.8h,  v3.8h
+        ext             v22.16b, v16.16b, v17.16b, #2*5
+        saddlp          v18.4s,  v18.8h
+        mul             v21.8h,  v21.8h,  v4.8h
+        saddlp          v19.4s,  v19.8h
+        mul             v22.8h,  v22.8h,  v5.8h
+        saddlp          v20.4s,  v20.8h
+        addv            s23,     v23.4s
+        saddlp          v21.4s,  v21.8h
+        addv            s18,     v18.4s
+        saddlp          v22.4s,  v22.8h
+        addv            s19,     v19.4s
+        trn1            v18.2s,  v23.2s,  v18.2s
+        addv            s20,     v20.4s
+        ext             v23.16b, v16.16b, v17.16b, #2*6
+        trn1            v19.2s,  v19.2s,  v20.2s
+        addv            s21,     v21.4s
+        mul             v23.8h,  v23.8h,  v6.8h
+        ext             v20.16b, v16.16b, v17.16b, #2*7
+        addv            s22,     v22.4s
+        mul             v20.8h,  v20.8h,  v7.8h
+        saddlp          v23.4s,  v23.8h
+        trn1            v21.2s,  v21.2s,  v22.2s
+        saddlp          v20.4s,  v20.8h
+        addv            s23,     v23.4s
+        addv            s20,     v20.4s
+        trn1            v20.2s,  v23.2s,  v20.2s
+        trn1            v18.2d,  v18.2d,  v19.2d
+        trn1            v20.2d,  v21.2d,  v20.2d
+
+        add             w5,  w5,  w8
+
+        rshrn           v16.4h,  v18.4s,  #3
+        rshrn2          v16.8h,  v20.4s,  #3
+
+        ret
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
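+        // Unpack the four 16-bit warp parameters (alpha, beta,
+        // gamma, delta) from abcd[] and sign-extend them to 32 bit.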
+        ldr             x4,  [x4]
+        ubfx            x7,  x4, #0,  #16
+        ubfx            x8,  x4, #16, #16
+        ubfx            x9,  x4, #32, #16
+        ubfx            x4,  x4, #48, #16
+        sxth            w7,  w7
+        sxth            w8,  w8
+        sxth            w9,  w9
+        sxth            w4,  w4
+        mov             w10, #8
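+        // Point src at the top-left tap of the 8-tap filter
+        // footprint: 3 rows up, 3 pixels left.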
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        sub             x2,  x2,  #3
+        movrel          x11, dav1d_mc_warp_filter, 64*8
+        mov             x15, x30
+.ifnb \t
+        lsl             x1,  x1,  #1
+.endif
+
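+        // Compute the first 7 of the 15 horizontally filtered rows
+        // needed by the 8-tap vertical filter; keep them in v24-v30.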
+        bl              warp_filter_horz
+        mov             v24.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v25.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v26.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v27.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v28.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v29.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v30.16b, v16.16b
+
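+        // Main loop, once per output row: filter one more source row
+        // horizontally (into v31), load and transpose the per-column
+        // vertical filters, then accumulate the 8-tap vertical sums.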
+1:
+        add             w14, w6,  #512
+        bl              warp_filter_horz
+        mov             v31.16b, v16.16b
+
+        load_filter_row d0, w14, w9
+        load_filter_row d1, w14, w9
+        load_filter_row d2, w14, w9
+        load_filter_row d3, w14, w9
+        load_filter_row d4, w14, w9
+        load_filter_row d5, w14, w9
+        load_filter_row d6, w14, w9
+        load_filter_row d7, w14, w9
+        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        sxtl            v2.8h,   v2.8b
+        sxtl            v3.8h,   v3.8b
+        sxtl            v4.8h,   v4.8b
+        sxtl            v5.8h,   v5.8b
+        sxtl            v6.8h,   v6.8b
+        sxtl            v7.8h,   v7.8b
+
+        // This ordering of smull/smlal/smull2/smlal2 is highly
+        // beneficial for Cortex A53 here.
+        smull           v16.4s,  v24.4h,  v0.4h
+        smlal           v16.4s,  v25.4h,  v1.4h
+        smlal           v16.4s,  v26.4h,  v2.4h
+        smlal           v16.4s,  v27.4h,  v3.4h
+        smlal           v16.4s,  v28.4h,  v4.4h
+        smlal           v16.4s,  v29.4h,  v5.4h
+        smlal           v16.4s,  v30.4h,  v6.4h
+        smlal           v16.4s,  v31.4h,  v7.4h
+        smull2          v17.4s,  v24.8h,  v0.8h
+        smlal2          v17.4s,  v25.8h,  v1.8h
+        smlal2          v17.4s,  v26.8h,  v2.8h
+        smlal2          v17.4s,  v27.8h,  v3.8h
+        smlal2          v17.4s,  v28.8h,  v4.8h
+        smlal2          v17.4s,  v29.8h,  v5.8h
+        smlal2          v17.4s,  v30.8h,  v6.8h
+        smlal2          v17.4s,  v31.8h,  v7.8h
+
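+        // Shift the row window down by one for the next output row,
+        // interleaved with narrowing and storing this row.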
+        mov             v24.16b, v25.16b
+        mov             v25.16b, v26.16b
+        sqrshrn         v16.4h,  v16.4s,  #\shift
+        mov             v26.16b, v27.16b
+        sqrshrn2        v16.8h,  v17.4s,  #\shift
+        mov             v27.16b, v28.16b
+        mov             v28.16b, v29.16b
+.ifb \t
+        sqxtun          v16.8b,  v16.8h
+.endif
+        mov             v29.16b, v30.16b
+        mov             v30.16b, v31.16b
+        subs            w10, w10, #1
+.ifnb \t
+        st1             {v16.8h}, [x0], x1
+.else
+        st1             {v16.8b}, [x0], x1
+.endif
+
+        add             w6,  w6,  w4
+        b.gt            1b
+
+        br              x15
+endfunc
+.endm
+
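+// The plain variant rounds down by 11 bits and narrows to 8-bit
+// pixels; the t variant keeps 16-bit intermediates, rounded down by
+// 7 bits, for the compound prediction paths.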
+warp  , 11
+warp t, 7
--- a/src/arm/64/util.S
+++ b/src/arm/64/util.S
@@ -59,4 +59,33 @@
 #endif
 .endm
 
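+// Transpose an 8x8 block of bytes held in r0-r7, using r8 and r9 as
+// scratch, via trn1/trn2 passes at 8-, 16- and 32-bit granularity.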
+.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+        trn1            \r8\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \r9\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
+        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
+        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
+        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
+        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b
+
+        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
+        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
+        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
+        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
+        trn1            \r5\().4h,  \r9\().4h,  \r3\().4h
+        trn2            \r9\().4h,  \r9\().4h,  \r3\().4h
+        trn1            \r3\().4h,  \r8\().4h,  \r1\().4h
+        trn2            \r8\().4h,  \r8\().4h,  \r1\().4h
+
+        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
+        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s
+        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
+        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s
+        trn2            \r6\().2s,  \r8\().2s,  \r2\().2s
+        trn1            \r2\().2s,  \r8\().2s,  \r2\().2s
+        trn1            \r3\().2s,  \r9\().2s,  \r7\().2s
+        trn2            \r7\().2s,  \r9\().2s,  \r7\().2s
+.endm
+
 #endif /* DAV1D_SRC_ARM_64_UTIL_S */
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -56,6 +56,9 @@
 decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
 decl_mask_fn(dav1d_mask_8bpc_neon);
 
+decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon);
+
 void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = dav1d_put_##name##_8bpc_##suffix
@@ -91,5 +94,9 @@
     c->avg = dav1d_avg_8bpc_neon;
     c->w_avg = dav1d_w_avg_8bpc_neon;
     c->mask = dav1d_mask_8bpc_neon;
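+    // The NEON warp functions are implemented for AArch64 only so far.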
+#if ARCH_AARCH64
+    c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
+    c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
+#endif
 #endif
 }