shithub: dav1d

Download patch

ref: 1dc2dc7d27bd0075684945b00b3539be429886aa
parent: d20d70e83704f2c64855a2a605c0c39aaaf39e1a
author: B Krishnan Iyer <krishnaniyer97@gmail.com>
date: Mon Jul 22 19:20:30 EDT 2019

arm64: mc: NEON implementation of blend, blend_h and blend_v functions

                   	A73	A53
blend_h_w2_8bpc_c:	184.7	301.5
blend_h_w2_8bpc_neon:	58.8	104.1
blend_h_w4_8bpc_c:	291.4	507.3
blend_h_w4_8bpc_neon:	48.7	108.9
blend_h_w8_8bpc_c:	510.1	992.7
blend_h_w8_8bpc_neon:	66.5	99.3
blend_h_w16_8bpc_c:	972	1835.3
blend_h_w16_8bpc_neon:	82.7	145.2
blend_h_w32_8bpc_c:	776.7	912.9
blend_h_w32_8bpc_neon:	155.1	266.9
blend_h_w64_8bpc_c:	1424.3	1635.4
blend_h_w64_8bpc_neon:	273.4	480.9
blend_h_w128_8bpc_c:	3318.1	3774
blend_h_w128_8bpc_neon:	614.1	1097.9
blend_v_w2_8bpc_c:	278.8	427.5
blend_v_w2_8bpc_neon:	113.7	170.4
blend_v_w4_8bpc_c:	960.2	1597.7
blend_v_w4_8bpc_neon:	222.9	351.4
blend_v_w8_8bpc_c:	1694.2	3333.5
blend_v_w8_8bpc_neon:	200.9	333.6
blend_v_w16_8bpc_c:	3115.2	5971.6
blend_v_w16_8bpc_neon:	233.2	494.8
blend_v_w32_8bpc_c:	3949.7	6070.6
blend_v_w32_8bpc_neon:	460.4	841.6
blend_w4_8bpc_c:	244.2	388.3
blend_w4_8bpc_neon:	25.5	66.7
blend_w8_8bpc_c:	616.3	1120.8
blend_w8_8bpc_neon:	46	110.7
blend_w16_8bpc_c:	2193.1	4056.4
blend_w16_8bpc_neon:	140.7	299.3
blend_w32_8bpc_c:	2502.8	2998.5
blend_w32_8bpc_neon:	381.4	725.3

--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -234,6 +234,413 @@
 bidir_fn mask
 
 
+function blend_8bpc_neon, export=1 // as used below: x0=dst, x1=dst stride, x2=tmp, w3=width, w4=row count, x5=per-pixel mask
+        adr             x6,  L(blend_tbl)
+        clz             w3,  w3
+        sub             w3,  w3,  #26 // jump table index = clz(w) - 26: 0 for w=32 ... 3 for w=4
+        ldrh            w3,  [x6,  x3,  lsl #1] // 16-bit distance from the table back to the loop
+        sub             x6,  x6,  w3,  uxtw // x6 = entry point of the loop for this width
+        movi            v4.16b,  #64 // v4 = 64 in every byte lane, used for (64 - mask)
+        add             x8,  x0,  x1 // x8 = dst + stride, i.e. the second row of each pair
+        lsl             x1,  x1,  #1 // two rows per iteration; shift all 64 bits (a W write would clear the top half of x1)
+        br              x6
+4: // w == 4: two rows of four pixels per iteration
+        ld1             {v2.d}[0],   [x5],  #8 // 8 mask bytes (2 rows)
+        ld1             {v1.d}[0],   [x2],  #8 // 8 tmp bytes (2 rows)
+        ld1             {v0.s}[0],   [x0]
+        subs            w4,  w4,  #2 // two rows consumed; flags are used by b.gt below
+        ld1             {v0.s}[1],   [x8]
+        sub             v3.8b,   v4.8b,   v2.8b // 64 - mask
+        umull           v5.8h,   v1.8b,   v2.8b // tmp * mask
+        umlal           v5.8h,   v0.8b,   v3.8b // + dst * (64 - mask)
+        rshrn           v6.8b,   v5.8h,   #6 // rounding shift right by 6, narrow to bytes
+        st1             {v6.s}[0],   [x0],  x1
+        st1             {v6.s}[1],   [x8],  x1
+        b.gt            4b
+        ret
+8: // w == 8: low half of each vector is row 0, high half is row 1
+        ld1             {v2.2d},   [x5],  #16
+        ld1             {v1.2d},   [x2],  #16
+        ld1             {v0.d}[0],   [x0]
+        ld1             {v0.d}[1],   [x8]
+        sub             v3.16b,  v4.16b,  v2.16b // 64 - mask
+        subs            w4,  w4,  #2
+        umull           v5.8h,   v1.8b,   v2.8b
+        umlal           v5.8h,   v0.8b,   v3.8b
+        umull2          v6.8h,   v1.16b,  v2.16b
+        umlal2          v6.8h,   v0.16b,  v3.16b
+        rshrn           v7.8b,   v5.8h,   #6
+        rshrn2          v7.16b,  v6.8h,   #6
+        st1             {v7.d}[0],   [x0],  x1
+        st1             {v7.d}[1],   [x8],  x1
+        b.gt            8b
+        ret
+16: // w == 16: one full vector per row, two rows per iteration
+        ld1             {v1.2d,   v2.2d},   [x5],  #32 // mask for rows 0 and 1
+        ld1             {v5.2d,   v6.2d},   [x2],  #32 // tmp for rows 0 and 1
+        ld1             {v0.2d},   [x0]
+        subs            w4,  w4,  #2
+        sub             v7.16b,  v4.16b,  v1.16b // 64 - mask, row 0
+        sub             v20.16b, v4.16b,  v2.16b // 64 - mask, row 1
+        ld1             {v3.2d},   [x8]
+        umull           v16.8h,  v5.8b,   v1.8b
+        umlal           v16.8h,  v0.8b,   v7.8b
+        umull2          v17.8h,  v5.16b,  v1.16b
+        umlal2          v17.8h,  v0.16b,  v7.16b
+        umull           v21.8h,  v6.8b,   v2.8b
+        umlal           v21.8h,  v3.8b,   v20.8b
+        umull2          v22.8h,  v6.16b,  v2.16b
+        umlal2          v22.8h,  v3.16b,  v20.16b
+        rshrn           v18.8b,  v16.8h,  #6
+        rshrn2          v18.16b, v17.8h,  #6
+        rshrn           v19.8b,  v21.8h,  #6
+        rshrn2          v19.16b, v22.8h,  #6
+        st1             {v18.2d},  [x0],  x1
+        st1             {v19.2d},  [x8],  x1
+        b.gt            16b
+        ret
+32: // w == 32: two vectors per row, two rows per iteration
+        ld1             {v0.2d,   v1.2d,   v2.2d,   v3.2d},   [x5],  #64 // mask: v0-v1 row 0, v2-v3 row 1
+        ld1             {v16.2d,  v17.2d,  v18.2d,  v19.2d},  [x2],  #64 // tmp:  v16-v17 row 0, v18-v19 row 1
+        ld1             {v20.2d,  v21.2d},  [x0]
+        subs            w4,  w4,  #2
+        ld1             {v22.2d,  v23.2d},  [x8]
+        sub             v5.16b,  v4.16b,  v0.16b
+        sub             v6.16b,  v4.16b,  v1.16b
+        sub             v30.16b, v4.16b,  v2.16b
+        sub             v31.16b, v4.16b,  v3.16b
+        umull           v24.8h,  v16.8b,  v0.8b
+        umlal           v24.8h,  v20.8b,  v5.8b
+        umull2          v26.8h,  v16.16b, v0.16b
+        umlal2          v26.8h,  v20.16b, v5.16b
+        umull           v28.8h,  v17.8b,  v1.8b
+        umlal           v28.8h,  v21.8b,  v6.8b
+        umull2          v7.8h,   v17.16b, v1.16b
+        umlal2          v7.8h,   v21.16b, v6.16b
+        umull           v27.8h,  v18.8b,  v2.8b
+        umlal           v27.8h,  v22.8b,  v30.8b
+        umull2          v1.8h,   v18.16b, v2.16b // v1 is reused as an accumulator; its mask value is no longer needed
+        umlal2          v1.8h,   v22.16b, v30.16b
+        umull           v29.8h,  v19.8b,  v3.8b
+        umlal           v29.8h,  v23.8b,  v31.8b
+        umull2          v21.8h,  v19.16b, v3.16b // v21 is reused as an accumulator; its dst value was consumed above
+        umlal2          v21.8h,  v23.16b, v31.16b
+        rshrn           v24.8b,  v24.8h,  #6
+        rshrn2          v24.16b, v26.8h,  #6
+        rshrn           v25.8b,  v28.8h,  #6
+        rshrn2          v25.16b, v7.8h,   #6
+        rshrn           v27.8b,  v27.8h,  #6
+        rshrn2          v27.16b, v1.8h,   #6
+        rshrn           v28.8b,  v29.8h,  #6
+        rshrn2          v28.16b, v21.8h,  #6
+        st1             {v24.2d, v25.2d}, [x0],  x1
+        st1             {v27.2d, v28.2d}, [x8],  x1
+        b.gt            32b
+        ret
+L(blend_tbl): // entries ordered by clz(w) - 26, widest first
+        .hword L(blend_tbl) - 32b
+        .hword L(blend_tbl) - 16b
+        .hword L(blend_tbl) -  8b
+        .hword L(blend_tbl) -  4b
+endfunc
+
+function blend_h_8bpc_neon, export=1 // as used below: x0=dst, x1=dst stride, x2=tmp, w3=width, w4=row count; one mask byte per row from obmc_masks
+        adr             x6, L(blend_h_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w4,  uxtw // x5 = obmc_masks + h
+        sub             w4,  w4,  w4,  lsr #2 // only h - h/4 rows are processed
+        clz             w7,  w3
+        movi            v4.16b,  #64 // v4 = 64 in every byte lane, used for (64 - mask)
+        add             x8,  x0,  x1 // x8 = second row of each pair
+        lsl             x1,  x1,  #1 // two rows per iteration
+        sub             w7,  w7,  #24 // jump table index = clz(w) - 24: 0 for w=128 ... 6 for w=2
+        ldrh            w7,  [x6,  x7,  lsl #1]
+        sub             x6,  x6,  w7, uxtw // x6 = entry point of the loop for this width
+        br              x6
+2: // w == 2
+        ld1             {v0.h}[0],   [x5],  #2 // two mask bytes, one per row
+        ld1             {v1.s}[0],   [x2],  #4 // 2 rows x 2 tmp pixels
+        subs            w4,  w4,  #2
+        ld1             {v2.h}[0],   [x0]
+        zip1            v0.8b,   v0.8b,   v0.8b // m0 m0 m1 m1: each row's mask duplicated per pixel
+        sub             v3.8b,   v4.8b,   v0.8b // 64 - mask
+        ld1             {v2.h}[1],   [x8]
+        umull           v5.8h,   v1.8b,   v0.8b // tmp * mask
+        umlal           v5.8h,   v2.8b,   v3.8b // + dst * (64 - mask)
+        rshrn           v5.8b,   v5.8h,   #6 // rounding shift right by 6, narrow to bytes
+        st1             {v5.h}[0],   [x0],  x1
+        st1             {v5.h}[1],   [x8],  x1
+        b.gt            2b
+        ret
+4: // w == 4
+        ld2r            {v0.8b,   v1.8b},   [x5],  #2 // v0 = row 0 mask in all lanes, v1 = row 1 mask in all lanes
+        ld1             {v2.2s},   [x2],  #8
+        subs            w4,  w4,  #2
+        ext             v0.8b,   v0.8b,   v1.8b,   #4 // low 4 lanes = row 0 mask, high 4 lanes = row 1 mask
+        ld1             {v3.s}[0],   [x0]
+        sub             v5.8b,   v4.8b,   v0.8b
+        ld1             {v3.s}[1],   [x8]
+        umull           v6.8h,   v2.8b,   v0.8b
+        umlal           v6.8h,   v3.8b,   v5.8b
+        rshrn           v6.8b,   v6.8h,   #6
+        st1             {v6.s}[0],   [x0],  x1
+        st1             {v6.s}[1],   [x8],  x1
+        b.gt            4b
+        ret
+8: // w == 8
+        ld2r            {v0.16b,  v1.16b},  [x5],  #2
+        ld1             {v2.16b},  [x2],  #16
+        ld1             {v3.d}[0],   [x0]
+        ext             v0.16b,  v0.16b,  v1.16b,  #8 // low half = row 0 mask, high half = row 1 mask
+        sub             v5.16b,  v4.16b,  v0.16b
+        ld1             {v3.d}[1],   [x8]
+        subs            w4,  w4,  #2
+        umull           v6.8h,   v0.8b,   v2.8b
+        umlal           v6.8h,   v3.8b,   v5.8b
+        umull2          v7.8h,   v0.16b,  v2.16b
+        umlal2          v7.8h,   v3.16b,  v5.16b
+        rshrn           v16.8b,  v6.8h,   #6
+        rshrn2          v16.16b, v7.8h,   #6
+        st1             {v16.d}[0],  [x0],  x1
+        st1             {v16.d}[1],  [x8],  x1
+        b.gt            8b
+        ret
+16: // w == 16: v0/v1 hold the masks of row 0/row 1 in all lanes
+        ld2r            {v0.16b,  v1.16b},  [x5],  #2
+        ld1             {v2.16b,  v3.16b},  [x2],  #32
+        ld1             {v5.16b},  [x0]
+        sub             v7.16b,  v4.16b,  v0.16b
+        sub             v16.16b, v4.16b,  v1.16b
+        ld1             {v6.16b},  [x8]
+        subs            w4,  w4,  #2
+        umull           v17.8h,  v0.8b,   v2.8b
+        umlal           v17.8h,  v5.8b,   v7.8b
+        umull2          v18.8h,  v0.16b,  v2.16b
+        umlal2          v18.8h,  v5.16b,  v7.16b
+        umull           v19.8h,  v1.8b,   v3.8b
+        umlal           v19.8h,  v6.8b,   v16.8b
+        umull2          v20.8h,  v1.16b,  v3.16b
+        umlal2          v20.8h,  v6.16b,  v16.16b
+        rshrn           v21.8b,  v17.8h,  #6
+        rshrn2          v21.16b, v18.8h,  #6
+        rshrn           v22.8b,  v19.8h,  #6
+        rshrn2          v22.16b, v20.8h,  #6
+        st1             {v21.16b}, [x0],  x1
+        st1             {v22.16b}, [x8],  x1
+        b.gt            16b
+        ret
+1280: // w == 128, 64 and 32 share one path that walks each row in 32-pixel steps
+640:
+320:
+        sub             x1,  x1,  w3,  uxtw // x1 = 2*stride - w: the inner loop already advances dst by w
+        add             x7,  x2,  w3,  uxtw // x7 = tmp + w, the second tmp row of each pair
+321: // per pair of rows
+        ld2r            {v0.16b,  v1.16b},  [x5],  #2 // v0 = row 0 mask, v1 = row 1 mask, in all lanes
+        mov             w6,  w3 // w6 = pixels left in this pair of rows
+        sub             v20.16b, v4.16b,  v0.16b
+        sub             v21.16b, v4.16b,  v1.16b
+32: // per 32 pixels of both rows
+        ld1             {v16.16b, v17.16b}, [x2],  #32
+        ld1             {v2.16b,  v3.16b},  [x0]
+        subs            w6,  w6,  #32
+        umull           v23.8h,  v0.8b,   v16.8b
+        umlal           v23.8h,  v2.8b,   v20.8b
+        ld1             {v18.16b, v19.16b}, [x7],  #32
+        umull2          v27.8h,  v0.16b,  v16.16b
+        umlal2          v27.8h,  v2.16b,  v20.16b
+        ld1             {v6.16b,  v7.16b},  [x8]
+        umull           v24.8h,  v0.8b,   v17.8b
+        umlal           v24.8h,  v3.8b,   v20.8b
+        umull2          v28.8h,  v0.16b,  v17.16b
+        umlal2          v28.8h,  v3.16b,  v20.16b
+        umull           v25.8h,  v1.8b,   v18.8b
+        umlal           v25.8h,  v6.8b,   v21.8b
+        umull2          v5.8h,   v1.16b,  v18.16b
+        umlal2          v5.8h,   v6.16b,  v21.16b
+        rshrn           v29.8b,  v23.8h,  #6
+        rshrn2          v29.16b, v27.8h,  #6
+        umull           v26.8h,  v1.8b,   v19.8b
+        umlal           v26.8h,  v7.8b,   v21.8b
+        umull2          v31.8h,  v1.16b,  v19.16b
+        umlal2          v31.8h,  v7.16b,  v21.16b
+        rshrn           v30.8b,  v24.8h,  #6
+        rshrn2          v30.16b, v28.8h,  #6
+        rshrn           v23.8b,  v25.8h,  #6 // v23/v24 are reused for the row 1 result; their sums were narrowed above
+        rshrn2          v23.16b, v5.8h,   #6
+        rshrn           v24.8b,  v26.8h,  #6
+        st1             {v29.16b, v30.16b}, [x0],  #32
+        rshrn2          v24.16b, v31.8h,  #6
+        st1             {v23.16b, v24.16b}, [x8],  #32
+        b.gt            32b // uses the flags from "subs w6" (stores and NEON ops leave them untouched)
+        subs            w4,  w4,  #2
+        add             x0,  x0,  x1 // advance both dst pointers to the next pair of rows
+        add             x8,  x8,  x1
+        add             x2,  x2,  w3,  uxtw // skip the tmp row that was read through the other pointer
+        add             x7,  x7,  w3,  uxtw
+        b.gt            321b // the adds above do not set flags, so this tests "subs w4"
+        ret
+L(blend_h_tbl): // entries ordered by clz(w) - 24, widest first
+        .hword L(blend_h_tbl) - 1280b
+        .hword L(blend_h_tbl) -  640b
+        .hword L(blend_h_tbl) -  320b
+        .hword L(blend_h_tbl) -   16b
+        .hword L(blend_h_tbl) -    8b
+        .hword L(blend_h_tbl) -    4b
+        .hword L(blend_h_tbl) -    2b
+endfunc
+
+function blend_v_8bpc_neon, export=1 // as used below: x0=dst, x1=dst stride, x2=tmp, w3=width, w4=row count; one mask byte per column from obmc_masks
+        adr             x6,  L(blend_v_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w3,  uxtw // x5 = obmc_masks + w
+        clz             w3,  w3
+        movi            v4.16b,  #64 // v4 = 64 in every byte lane, used for (64 - mask)
+        add             x8,  x0,  x1 // x8 = second row of each pair
+        lsl             x1,  x1,  #1 // two rows per iteration
+        sub             w3,  w3,  #26 // jump table index = clz(w) - 26: 0 for w=32 ... 4 for w=2
+        ldrh            w3,  [x6,  x3,  lsl #1]
+        sub             x6,  x6,  w3,  uxtw // x6 = setup code for this width
+        br              x6
+20: // w == 2: only the first pixel of each row is written
+        ld1r            {v0.8b},   [x5] // the single mask byte, in all lanes
+        sub             v1.8b,   v4.8b,   v0.8b // 64 - mask, loop invariant
+2:
+        ld1             {v2.h}[0],   [x2],  #2 // row 0 tmp (2 bytes; only byte 0 is kept)
+        ld1             {v3.b}[0],   [x0]
+        subs            w4,  w4,  #2
+        ld1             {v2.b}[1],   [x2] // row 1 tmp pixel 0 replaces byte 1
+        ld1             {v3.b}[1],   [x8]
+        umull           v5.8h,   v2.8b,   v0.8b // tmp * mask
+        umlal           v5.8h,   v3.8b,   v1.8b // + dst * (64 - mask)
+        rshrn           v5.8b,   v5.8h,   #6 // rounding shift right by 6, narrow to bytes
+        add             x2,  x2,  #2 // step over the row 1 tmp pixels
+        st1             {v5.b}[0],   [x0],  x1
+        st1             {v5.b}[1],   [x8],  x1
+        b.gt            2b
+        ret
+40: // w == 4: 3 pixels per row are written
+        ld1r            {v0.2s},   [x5] // 4 mask bytes, repeated in both halves (one half per row)
+        sub             v1.8b,   v4.8b,   v0.8b
+        sub             x1,  x1,  #3 // the stores below advance dst by 3 bytes
+4:
+        ld1             {v2.8b},   [x2],  #8
+        ld1             {v3.s}[0],   [x0]
+        ld1             {v3.s}[1],   [x8]
+        subs            w4,  w4,  #2
+        umull           v5.8h,   v2.8b,   v0.8b
+        umlal           v5.8h,   v3.8b,   v1.8b
+        rshrn           v5.8b,   v5.8h,   #6
+        st1             {v5.h}[0],   [x0],  #2 // pixels 0-1 of row 0
+        st1             {v5.h}[2],   [x8],  #2 // pixels 0-1 of row 1
+        st1             {v5.b}[2],   [x0],  #1 // pixel 2 of row 0
+        st1             {v5.b}[6],   [x8],  #1 // pixel 2 of row 1
+        add             x0,  x0,  x1
+        add             x8,  x8,  x1
+        b.gt            4b
+        ret
+80: // w == 8: 6 pixels per row are written
+        ld1r            {v0.2d},   [x5] // 8 mask bytes, repeated in both halves (one half per row)
+        sub             v1.16b,  v4.16b,  v0.16b
+        sub             x1,  x1,  #6 // the stores below advance dst by 6 bytes
+8:
+        ld1             {v2.16b},  [x2],  #16
+        ld1             {v3.d}[0],   [x0]
+        ld1             {v3.d}[1],   [x8]
+        subs            w4,  w4,  #2
+        umull           v5.8h,  v0.8b,  v2.8b
+        umlal           v5.8h,  v3.8b,  v1.8b
+        umull2          v6.8h,  v0.16b, v2.16b
+        umlal2          v6.8h,  v3.16b, v1.16b
+        rshrn           v7.8b,  v5.8h,  #6
+        rshrn2          v7.16b, v6.8h,  #6
+        st1             {v7.s}[0],   [x0],  #4 // pixels 0-3 of row 0
+        st1             {v7.s}[2],   [x8],  #4 // pixels 0-3 of row 1
+        st1             {v7.h}[2],   [x0],  #2 // pixels 4-5 of row 0
+        st1             {v7.h}[6],   [x8],  #2 // pixels 4-5 of row 1
+        add             x0,  x0,  x1
+        add             x8,  x8,  x1
+        b.gt            8b
+        ret
+160: // w == 16: 12 pixels per row are written
+        ld1             {v0.16b},  [x5] // 16 mask bytes, shared by both rows
+        sub             v2.16b,  v4.16b,  v0.16b
+        sub             x1,  x1,  #12 // the stores below advance dst by 12 bytes
+16:
+        ld1             {v5.16b,  v6.16b},  [x2],  #32
+        ld1             {v7.16b},  [x0]
+        subs            w4,  w4,  #2
+        ld1             {v16.16b}, [x8]
+        umull           v17.8h,  v5.8b,   v0.8b
+        umlal           v17.8h,  v7.8b,   v2.8b
+        umull2          v18.8h,  v5.16b,  v0.16b
+        umlal2          v18.8h,  v7.16b,  v2.16b
+        umull           v20.8h,  v6.8b,   v0.8b
+        umlal           v20.8h,  v16.8b,  v2.8b
+        umull2          v21.8h,  v6.16b,  v0.16b
+        umlal2          v21.8h,  v16.16b, v2.16b
+        rshrn           v19.8b,  v17.8h,  #6
+        rshrn2          v19.16b, v18.8h,  #6
+        rshrn           v22.8b,  v20.8h,  #6
+        rshrn2          v22.16b, v21.8h,  #6
+        st1             {v19.8b},  [x0],  #8 // pixels 0-7
+        st1             {v22.8b},  [x8],  #8
+        st1             {v19.s}[2],  [x0],  #4 // pixels 8-11
+        st1             {v22.s}[2],  [x8],  #4
+        add             x0,  x0,  x1
+        add             x8,  x8,  x1
+        b.gt            16b
+        ret
+320: // w == 32: 24 pixels per row are written
+        ld1             {v0.16b,  v1.16b},  [x5] // 32 mask bytes, shared by both rows
+        sub             v2.16b,  v4.16b,  v0.16b
+        sub             v3.16b,  v4.16b,  v1.16b
+        sub             x1,  x1,  #24 // the stores below advance dst by 24 bytes
+32:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64 // tmp: v16-v17 row 0, v18-v19 row 1
+        ld1             {v5.16b,  v6.16b},  [x0]
+        subs            w4,  w4,  #2
+        ld1             {v20.16b, v21.16b}, [x8]
+        umull           v22.8h,  v16.8b,  v0.8b
+        umlal           v22.8h,  v5.8b,   v2.8b
+        umull2          v23.8h,  v16.16b, v0.16b
+        umlal2          v23.8h,  v5.16b,  v2.16b
+        umull           v28.8h,  v17.8b,  v1.8b
+        umlal           v28.8h,  v6.8b,   v3.8b
+        umull2          v29.8h,  v17.16b, v1.16b
+        umlal2          v29.8h,  v6.16b,  v3.16b
+        umull           v30.8h,  v18.8b,  v0.8b
+        umlal           v30.8h,  v20.8b,  v2.8b
+        umull2          v31.8h,  v18.16b, v0.16b
+        umlal2          v31.8h,  v20.16b, v2.16b
+        umull           v25.8h,  v19.8b,  v1.8b
+        umlal           v25.8h,  v21.8b,  v3.8b
+        umull2          v26.8h,  v19.16b, v1.16b
+        umlal2          v26.8h,  v21.16b, v3.16b
+        rshrn           v24.8b,  v22.8h,  #6
+        rshrn2          v24.16b, v23.8h,  #6
+        rshrn           v28.8b,  v28.8h,  #6
+        rshrn2          v28.16b, v29.8h,  #6
+        rshrn           v30.8b,  v30.8h,  #6
+        rshrn2          v30.16b, v31.8h,  #6
+        rshrn           v27.8b,  v25.8h,  #6
+        rshrn2          v27.16b, v26.8h,  #6
+        st1             {v24.16b}, [x0],  #16 // pixels 0-15
+        st1             {v30.16b}, [x8],  #16
+        st1             {v28.8b},  [x0],  #8 // pixels 16-23
+        st1             {v27.8b},  [x8],  #8
+        add             x0,  x0,  x1
+        add             x8,  x8,  x1
+        b.gt            32b
+        ret
+L(blend_v_tbl): // entries ordered by clz(w) - 26, widest first
+        .hword L(blend_v_tbl) - 320b
+        .hword L(blend_v_tbl) - 160b
+        .hword L(blend_v_tbl) -  80b
+        .hword L(blend_v_tbl) -  40b
+        .hword L(blend_v_tbl) -  20b
+endfunc
+
+
 // This has got the same signature as the put_8tap functions,
 // and assumes that x8 is set to (clz(w)-24).
 function put_neon
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -101,13 +101,13 @@
     c->avg = dav1d_avg_8bpc_neon;
     c->w_avg = dav1d_w_avg_8bpc_neon;
     c->mask = dav1d_mask_8bpc_neon;
+    c->blend = dav1d_blend_8bpc_neon;
+    c->blend_h = dav1d_blend_h_8bpc_neon;
+    c->blend_v = dav1d_blend_v_8bpc_neon;
 #if ARCH_AARCH64
     c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
     c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
 #elif ARCH_ARM
-    c->blend = dav1d_blend_8bpc_neon;
-    c->blend_h = dav1d_blend_h_8bpc_neon;
-    c->blend_v = dav1d_blend_v_8bpc_neon;
     c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
     c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
     c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;