shithub: dav1d

--- a/src/arm/64/ipred16.S

+++ b/src/arm/64/ipred16.S

@@ -2829,3 +2829,248 @@

         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)

         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)

 endfunc

+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,

+//                            const ptrdiff_t stride, const int w_pad,

+//                            const int h_pad, const int cw, const int ch);

+function ipred_cfl_ac_444_16bpc_neon, export=1

+        clz             w8,  w5              // w5 = cw; clz is 26 for cw=32 ... 29 for cw=4

+        lsl             w4,  w4,  #2         // w4 = h_pad * 4, the number of padded rows

+        adr             x7,  L(ipred_cfl_ac_444_tbl)

+        sub             w8,  w8,  #26        // Table index: 0 = w32, 1 = w16, 2 = w8, 3 = w4

+        ldrh            w8,  [x7, w8, uxtw #1] // 16 bit entry = table address - target label

+        movi            v24.4s,  #0          // v24-v27: 32 bit accumulators for the sum

+        movi            v25.4s,  #0

+        movi            v26.4s,  #0

+        movi            v27.4s,  #0

+        sub             x7,  x7,  w8, uxtw   // x7 = address of the width specific loop

+        sub             w8,  w6,  w4         // height - h_pad

+        rbit            w9,  w5              // rbit(width)

+        rbit            w10, w6              // rbit(height)

+        clz             w9,  w9              // ctz(width)

+        clz             w10, w10             // ctz(height)

+        add             w9,  w9,  w10        // log2sz

+        add             x10, x1,  x2         // x10 = pointer to the second input row

+        dup             v31.4s,  w9          // Not read below; NOTE(review): presumably used at the shared subtract_dc label - confirm

+        lsl             x2,  x2,  #1         // Double the stride; x1 and x10 each advance two rows

+        neg             v31.4s,  v31.4s      // -log2sz

+        br              x7

+L(ipred_cfl_ac_444_w4):

+1:      // Copy and expand input

+        ld1             {v0.4h},   [x1],  x2   // Four rows of 4 pixels per iteration,

+        ld1             {v0.d}[1], [x10], x2   // packed two rows per register

+        ld1             {v1.4h},   [x1],  x2

+        ld1             {v1.d}[1], [x10], x2

+        shl             v0.8h,   v0.8h,   #3   // Scale each pixel by 8

+        shl             v1.8h,   v1.8h,   #3

+        subs            w8,  w8,  #4           // Four rows consumed

+        st1             {v0.8h, v1.8h}, [x0], #32

+        uaddw           v24.4s,  v24.4s,  v0.4h  // Add the low four lanes, widened to 32 bit

+        uaddw2          v25.4s,  v25.4s,  v0.8h  // Add the high four lanes, widened to 32 bit

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        b.gt            1b

+        trn2            v0.2d,   v1.2d,   v1.2d  // v0 = the last row loaded, in both halves

+        trn2            v1.2d,   v1.2d,   v1.2d  // v1 = the last row loaded, in both halves

+        b               L(ipred_cfl_ac_420_w4_hpad) // Label defined outside this hunk; presumably the shared 4 wide vertical padding

+L(ipred_cfl_ac_444_w8):

+1:      // Copy and expand input

+        ld1             {v0.8h}, [x1],  x2     // Four rows of 8 pixels per iteration

+        ld1             {v1.8h}, [x10], x2

+        ld1             {v2.8h}, [x1],  x2

+        shl             v0.8h,   v0.8h,   #3   // Scale each pixel by 8

+        ld1             {v3.8h}, [x10], x2

+        shl             v1.8h,   v1.8h,   #3

+        shl             v2.8h,   v2.8h,   #3

+        shl             v3.8h,   v3.8h,   #3

+        subs            w8,  w8,  #4           // Four rows consumed

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            1b

+        mov             v0.16b,  v3.16b        // v3 is the last row loaded; copy it to v0

+        mov             v1.16b,  v3.16b        // and to v1

+        b               L(ipred_cfl_ac_420_w8_hpad) // Label defined outside this hunk; presumably the shared 8 wide vertical padding

+L(ipred_cfl_ac_444_w16):

+        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad) // Any non-zero w_pad takes the padded loop

+1:      // Copy and expand input, without padding

+        ld1             {v0.8h, v1.8h}, [x1],  x2  // Two rows of 16 pixels per iteration

+        ld1             {v2.8h, v3.8h}, [x10], x2

+        shl             v0.8h,   v0.8h,   #3   // Scale each pixel by 8

+        shl             v1.8h,   v1.8h,   #3

+        shl             v2.8h,   v2.8h,   #3

+        shl             v3.8h,   v3.8h,   #3

+        subs            w8,  w8,  #2           // Two rows consumed

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            1b

+        mov             v0.16b,  v2.16b        // v2/v3 are the last row loaded;

+        mov             v1.16b,  v3.16b        // copy that row to v0/v1

+        b               L(ipred_cfl_ac_420_w16_hpad) // Label defined outside this hunk; presumably the shared 16 wide vertical padding

+L(ipred_cfl_ac_444_w16_wpad):

+1:      // Copy and expand input, padding 8

+        ld1             {v0.8h}, [x1],  x2     // Only the left 8 pixels of each row are read

+        ld1             {v2.8h}, [x10], x2

+        shl             v0.8h,   v0.8h,   #3   // Scale each pixel by 8

+        shl             v2.8h,   v2.8h,   #3

+        dup             v1.8h,   v0.h[7]       // Right half = last scaled pixel of the left half

+        dup             v3.8h,   v2.h[7]

+        subs            w8,  w8,  #2           // Two rows consumed

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            1b

+        mov             v0.16b,  v2.16b        // v2/v3 are the last row written;

+        mov             v1.16b,  v3.16b        // copy that row to v0/v1

+        b               L(ipred_cfl_ac_420_w16_hpad) // Label defined outside this hunk; presumably the shared 16 wide vertical padding

+L(ipred_cfl_ac_444_w32):

+        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)

+        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1

+        lsr             x2,  x2,  #1 // Restore the stride to one line increments

+        sub             x7,  x7,  w3, uxtw   // x7 = address of the w_pad specific loop

+        br              x7

+L(ipred_cfl_ac_444_w32_wpad0):

+1:      // Copy and expand input, without padding

+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2  // One row of 32 pixels per iteration

+        shl             v0.8h,   v0.8h,   #3   // Scale each pixel by 8

+        shl             v1.8h,   v1.8h,   #3

+        shl             v2.8h,   v2.8h,   #3

+        shl             v3.8h,   v3.8h,   #3

+        subs            w8,  w8,  #1           // One row consumed

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            1b

+        b               L(ipred_cfl_ac_444_w32_hpad)

+L(ipred_cfl_ac_444_w32_wpad2):

+1:      // Copy and expand input, padding 8

+        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2  // Only the left 24 pixels are read

+        shl             v2.8h,   v2.8h,   #3   // Scaled first, as the dup below depends on it

+        shl             v0.8h,   v0.8h,   #3

+        shl             v1.8h,   v1.8h,   #3

+        dup             v3.8h,   v2.h[7]       // Last 8 outputs = last scaled pixel read

+        subs            w8,  w8,  #1           // One row consumed

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            1b

+        b               L(ipred_cfl_ac_444_w32_hpad)

+L(ipred_cfl_ac_444_w32_wpad4):

+1:      // Copy and expand input, padding 16

+        ld1             {v0.8h, v1.8h}, [x1],  x2  // Only the left 16 pixels are read

+        shl             v1.8h,   v1.8h,   #3   // Scaled first, as the dups below depend on it

+        shl             v0.8h,   v0.8h,   #3

+        dup             v2.8h,   v1.h[7]       // Last 16 outputs = last scaled pixel read

+        dup             v3.8h,   v1.h[7]

+        subs            w8,  w8,  #1           // One row consumed

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            1b

+        b               L(ipred_cfl_ac_444_w32_hpad)

+L(ipred_cfl_ac_444_w32_wpad6):

+1:      // Copy and expand input, padding 24

+        ld1             {v0.8h}, [x1],  x2     // Only the left 8 pixels are read

+        shl             v0.8h,   v0.8h,   #3   // Scale each pixel by 8

+        dup             v1.8h,   v0.h[7]       // Last 24 outputs = last scaled pixel read

+        dup             v2.8h,   v0.h[7]

+        dup             v3.8h,   v0.h[7]

+        subs            w8,  w8,  #1           // One row consumed

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            1b                     // When done, falls through into the vertical padding

+L(ipred_cfl_ac_444_w32_hpad):

+        cbz             w4,  3f                // No padded rows to write

+2:      // Vertical padding (h_pad > 0)

+        subs            w4,  w4,  #2           // Two padded rows are written per iteration

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // v0-v3 still hold the last row written above

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // Second copy of the same row

+        uaddw           v24.4s,  v24.4s,  v0.4h

+        uaddw2          v25.4s,  v25.4s,  v0.8h

+        uaddw           v26.4s,  v26.4s,  v1.4h

+        uaddw2          v27.4s,  v27.4s,  v1.8h

+        uaddw           v24.4s,  v24.4s,  v2.4h

+        uaddw2          v25.4s,  v25.4s,  v2.8h

+        uaddw           v26.4s,  v26.4s,  v3.4h

+        uaddw2          v27.4s,  v27.4s,  v3.8h

+        b.gt            2b

+3:

+        //  Multiply the height by eight and reuse the w4 subtracting

+        lsl             w6,  w6,  #3

+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc) // Label defined outside this hunk

+L(ipred_cfl_ac_444_tbl):

+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) // Indexed by clz(cw) - 26

+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)

+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)

+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)

+L(ipred_cfl_ac_444_w32_tbl):

+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) // Indexed by w_pad (0, 2, 4, 6) used as a byte offset

+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)

+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)

+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)

+endfunc

--- a/src/arm/ipred_init_tmpl.c

+++ b/src/arm/ipred_init_tmpl.c

@@ -76,9 +76,7 @@

     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);

     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);

-#if BITDEPTH == 8

     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);

-#endif

     c->pal_pred                  = BF(dav1d_pal_pred, neon);

 #endif

--

⑨