shithub: dav1d

Download patch

ref: 9b40bb95e17ec06c966cddb0353759d5d32f794b
parent: 2e271c49c7c372b1b8324b6582c5548774bce2bd
author: Martin Storsjö <martin@martin.st>
date: Mon Jun 29 10:03:31 EDT 2020

arm64: ipred: 8 bpc NEON implementation of the cfl_ac 444 function

Relative speedup over C code:
                      Cortex A53    A72    A73
cfl_ac_444_w4_8bpc_neon:    8.72   8.75  10.50
cfl_ac_444_w8_8bpc_neon:   13.10  10.77  11.23
cfl_ac_444_w16_8bpc_neon:  13.08   9.95  10.49
cfl_ac_444_w32_8bpc_neon:  12.58   9.43  10.63

--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -2080,6 +2080,7 @@
         sub             x0,  x0,  w6, uxtw #4
         urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
         dup             v4.8h,   v4.h[0]
+L(ipred_cfl_ac_420_w8_subtract_dc):
 6:      // Subtract dc from ac
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
         subs            w6,  w6,  #4
@@ -2473,4 +2474,291 @@
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                           const ptrdiff_t stride, const int w_pad,
+//                           const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+        clz             w8,  w5              // x0=ac, x1=ypx, x2=stride, w3=w_pad, w4=h_pad, w5=cw, w6=ch
+        lsl             w4,  w4,  #2         // h_pad *= 4; used below as a count of padded rows
+        adr             x7,  L(ipred_cfl_ac_444_tbl)
+        sub             w8,  w8,  #26        // clz(cw) - 26: 0 for cw=32, 1 for 16, 2 for 8, 3 for 4
+        ldrh            w8,  [x7, w8, uxtw #1] // 16-bit table entry = tbl - target
+        movi            v16.8h,  #0          // v16-v19: running 16-bit sums of everything stored to ac
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+        sub             x7,  x7,  w8, uxtw   // x7 = tbl - (tbl - target) = target
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2         // x10 = ypx + stride, i.e. the second input row
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1         // stride *= 2; x1 and x10 each step over two rows
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7                   // dispatch on width
+
+L(ipred_cfl_ac_444_w4):
+1:      // Copy and expand input (4 rows of 4 pixels per iteration)
+        ld1             {v0.s}[0], [x1],  x2 // rows alternate between the x1 and x10 pointers
+        ld1             {v0.s}[1], [x10], x2
+        ld1             {v1.s}[0], [x1],  x2
+        ld1             {v1.s}[1], [x10], x2
+        ushll           v0.8h,   v0.8b,   #3 // widen u8 -> u16 and multiply by 8
+        ushll           v1.8h,   v1.8b,   #3
+        subs            w8,  w8,  #4         // 4 rows consumed
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            1b
+        trn2            v0.2d,   v1.2d,   v1.2d // v0 = upper half of v1 (the last row) in both halves
+        trn2            v1.2d,   v1.2d,   v1.2d // v1 = the same
+        b               L(ipred_cfl_ac_420_w4_hpad) // label defined outside this hunk; presumably the 420 vertical padding
+
+L(ipred_cfl_ac_444_w8):
+1:      // Copy and expand input (4 rows of 8 pixels per iteration)
+        ld1             {v0.8b}, [x1],  x2
+        ld1             {v1.8b}, [x10], x2
+        ld1             {v2.8b}, [x1],  x2
+        ushll           v0.8h,   v0.8b,   #3 // widen u8 -> u16 and multiply by 8
+        ld1             {v3.8b}, [x10], x2
+        ushll           v1.8h,   v1.8b,   #3
+        ushll           v2.8h,   v2.8b,   #3
+        ushll           v3.8h,   v3.8b,   #3
+        subs            w8,  w8,  #4         // 4 rows consumed
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b      // v0 = v1 = last row loaded
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad) // label defined outside this hunk; presumably the 420 vertical padding
+
+L(ipred_cfl_ac_444_w16):
+        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad) // w_pad != 0: take the path that reads only 8 pixels per row
+1:      // Copy and expand input, without padding
+        ld1             {v0.16b}, [x1],  x2
+        ld1             {v2.16b}, [x10], x2
+        ld1             {v4.16b}, [x1],  x2
+        ushll2          v1.8h,   v0.16b,  #3 // upper 8 pixels first; the next line overwrites v0
+        ushll           v0.8h,   v0.8b,   #3 // lower 8 pixels
+        ld1             {v6.16b}, [x10], x2
+        ushll2          v3.8h,   v2.16b,  #3
+        ushll           v2.8h,   v2.8b,   #3
+        ushll2          v5.8h,   v4.16b,  #3
+        ushll           v4.8h,   v4.8b,   #3
+        ushll2          v7.8h,   v6.16b,  #3
+        ushll           v6.8h,   v6.8b,   #3
+        subs            w8,  w8,  #4         // 4 rows consumed
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        mov             v0.16b,  v6.16b      // v0/v1 and v2/v3 = last row loaded (v6/v7)
+        mov             v1.16b,  v7.16b
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+        b               L(ipred_cfl_ac_420_w16_hpad) // label defined outside this hunk; presumably the 420 vertical padding
+
+L(ipred_cfl_ac_444_w16_wpad):
+1:      // Copy and expand input, padding 8
+        ld1             {v0.8b}, [x1],  x2   // only the left 8 pixels of each row are read
+        ld1             {v2.8b}, [x10], x2
+        ld1             {v4.8b}, [x1],  x2
+        ld1             {v6.8b}, [x10], x2
+        ushll           v0.8h,   v0.8b,   #3
+        ushll           v2.8h,   v2.8b,   #3
+        ushll           v4.8h,   v4.8b,   #3
+        ushll           v6.8h,   v6.8b,   #3
+        dup             v1.8h,   v0.h[7]     // right 8 columns = pixel 7 of the same row, replicated
+        dup             v3.8h,   v2.h[7]
+        dup             v5.8h,   v4.h[7]
+        dup             v7.8h,   v6.h[7]
+        subs            w8,  w8,  #4         // 4 rows consumed
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        mov             v0.16b,  v6.16b      // v0/v1 and v2/v3 = last row stored (v6/v7)
+        mov             v1.16b,  v7.16b
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+        b               L(ipred_cfl_ac_420_w16_hpad) // label defined outside this hunk; presumably the 420 vertical padding
+
+L(ipred_cfl_ac_444_w32):
+        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
+        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
+        sub             x7,  x7,  w3, uxtw   // x7 = tbl - (tbl - target) = target
+        br              x7                   // dispatch on w_pad
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1:      // Copy and expand input, without padding
+        ld1             {v2.16b, v3.16b}, [x1],  x2
+        ld1             {v6.16b, v7.16b}, [x10], x2
+        ushll           v0.8h,   v2.8b,   #3 // first row: bytes in v2/v3 -> halfwords in v0-v3
+        ushll2          v1.8h,   v2.16b,  #3
+        ushll           v2.8h,   v3.8b,   #3
+        ushll2          v3.8h,   v3.16b,  #3
+        ushll           v4.8h,   v6.8b,   #3 // second row: bytes in v6/v7 -> halfwords in v4-v7
+        ushll2          v5.8h,   v6.16b,  #3
+        ushll           v6.8h,   v7.8b,   #3
+        ushll2          v7.8h,   v7.16b,  #3
+        subs            w8,  w8,  #2         // 2 rows consumed
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1:      // Copy and expand input, padding 8
+        ldr             d2,  [x1,  #16]      // pixels 16-23; read before x1 is post-incremented below
+        ld1             {v1.16b}, [x1],  x2
+        ldr             d6,  [x10, #16]      // same for the second row
+        ld1             {v5.16b}, [x10], x2
+        ushll           v2.8h,   v2.8b,   #3
+        ushll           v0.8h,   v1.8b,   #3
+        ushll2          v1.8h,   v1.16b,  #3
+        ushll           v6.8h,   v6.8b,   #3
+        ushll           v4.8h,   v5.8b,   #3
+        ushll2          v5.8h,   v5.16b,  #3
+        dup             v3.8h,   v2.h[7]     // columns 24-31 = pixel 23 replicated
+        dup             v7.8h,   v6.h[7]
+        subs            w8,  w8,  #2         // 2 rows consumed
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1:      // Copy and expand input, padding 16
+        ld1             {v1.16b}, [x1],  x2
+        ld1             {v5.16b}, [x10], x2
+        ushll           v0.8h,   v1.8b,   #3
+        ushll2          v1.8h,   v1.16b,  #3
+        ushll           v4.8h,   v5.8b,   #3
+        ushll2          v5.8h,   v5.16b,  #3
+        dup             v2.8h,   v1.h[7]     // columns 16-31 = pixel 15 replicated
+        dup             v3.8h,   v1.h[7]
+        dup             v6.8h,   v5.h[7]
+        dup             v7.8h,   v5.h[7]
+        subs            w8,  w8,  #2         // 2 rows consumed
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1:      // Copy and expand input, padding 24
+        ld1             {v0.8b}, [x1],  x2
+        ld1             {v4.8b}, [x10], x2
+        ushll           v0.8h,   v0.8b,   #3
+        ushll           v4.8h,   v4.8b,   #3
+        dup             v1.8h,   v0.h[7]     // columns 8-31 = pixel 7 replicated
+        dup             v2.8h,   v0.h[7]
+        dup             v3.8h,   v0.h[7]
+        dup             v5.8h,   v4.h[7]
+        dup             v6.8h,   v4.h[7]
+        dup             v7.8h,   v4.h[7]
+        subs            w8,  w8,  #2         // 2 rows consumed
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b                   // when done, fall through into the hpad code below
+
+L(ipred_cfl_ac_444_w32_hpad):
+        cbz             w4,  3f              // no vertical padding rows to write
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #2         // two padded rows per iteration
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // v4-v7 still hold the last row stored by the loops above
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            2b
+3:
+
+        // Quadruple the height and reuse the w8 subtracting
+        lsl             w6,  w6,  #2         // w6 = ch * 4 = number of 8-halfword vectors stored (4 per row)
+        // Aggregate the sums, with wider intermediates earlier than in
+        // ipred_cfl_ac_420_w8_calc_subtract_dc.
+        uaddlp          v0.4s,   v16.8h      // pairwise add, widening 16 -> 32 bit
+        uaddlp          v1.4s,   v17.8h
+        uaddlp          v2.4s,   v18.8h
+        uaddlp          v3.4s,   v19.8h
+        add             v0.4s,   v0.4s,   v1.4s
+        add             v2.4s,   v2.4s,   v3.4s
+        add             v0.4s,   v0.4s,   v2.4s
+        addv            s0,  v0.4s                // sum
+        sub             x0,  x0,  w6, uxtw #4     // rewind ac by w6 * 16 bytes, back to the first stored vector
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
+        dup             v4.8h,   v4.h[0]          // broadcast the rounded average to all 8 lanes
+        b               L(ipred_cfl_ac_420_w8_subtract_dc) // label this patch adds in front of the 420 w8 "Subtract dc from ac" loop
+
+L(ipred_cfl_ac_444_tbl):
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) // index 0: cw = 32
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) // index 1: cw = 16
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)  // index 2: cw = 8
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)  // index 3: cw = 4
+
+L(ipred_cfl_ac_444_w32_tbl):
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) // byte offset 0: w_pad = 0
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) // byte offset 2: w_pad = 2
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) // byte offset 4: w_pad = 4
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) // byte offset 6: w_pad = 6
 endfunc
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -46,6 +46,7 @@
 
 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
 
 decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
 
@@ -75,6 +76,9 @@
 
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+#if BITDEPTH == 8
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
+#endif
 
     c->pal_pred                  = BF(dav1d_pal_pred, neon);
 #endif