ref: f7743da199f9ccca9e15e8faf37c0bfa36d482f1
parent: a91a03b0e143f77bd647b481aff034eb3315ee7b
author: Martin Storsjö <martin@martin.st>
date: Tue Sep 17 20:07:08 EDT 2019
arm64: ipred: NEON implementation of dc/h/v prediction modes

Relative speedups over the C code:
                                   Cortex A53    A72    A73
intra_pred_dc_128_w4_8bpc_neon:          2.08   1.47   2.17
intra_pred_dc_128_w8_8bpc_neon:          3.33   2.49   4.03
intra_pred_dc_128_w16_8bpc_neon:         3.93   3.86   3.75
intra_pred_dc_128_w32_8bpc_neon:         3.14   3.79   2.90
intra_pred_dc_128_w64_8bpc_neon:         3.68   1.97   2.42
intra_pred_dc_left_w4_8bpc_neon:         2.41   1.70   2.23
intra_pred_dc_left_w8_8bpc_neon:         3.53   2.41   3.32
intra_pred_dc_left_w16_8bpc_neon:        3.87   3.54   3.34
intra_pred_dc_left_w32_8bpc_neon:        4.10   3.60   2.76
intra_pred_dc_left_w64_8bpc_neon:        3.72   2.00   2.39
intra_pred_dc_top_w4_8bpc_neon:          2.27   1.66   2.07
intra_pred_dc_top_w8_8bpc_neon:          3.83   2.69   3.43
intra_pred_dc_top_w16_8bpc_neon:         3.66   3.60   3.20
intra_pred_dc_top_w32_8bpc_neon:         3.92   3.54   2.66
intra_pred_dc_top_w64_8bpc_neon:         3.60   1.98   2.30
intra_pred_dc_w4_8bpc_neon:              2.29   1.42   2.16
intra_pred_dc_w8_8bpc_neon:              3.56   2.83   3.05
intra_pred_dc_w16_8bpc_neon:             3.46   3.37   3.15
intra_pred_dc_w32_8bpc_neon:             3.79   3.41   2.74
intra_pred_dc_w64_8bpc_neon:             3.52   2.01   2.41
intra_pred_h_w4_8bpc_neon:              10.34   5.74   5.94
intra_pred_h_w8_8bpc_neon:              12.13   6.33   6.43
intra_pred_h_w16_8bpc_neon:             10.66   7.31   5.85
intra_pred_h_w32_8bpc_neon:              6.28   4.18   2.88
intra_pred_h_w64_8bpc_neon:              3.96   1.85   1.75
intra_pred_v_w4_8bpc_neon:              11.44   6.12   7.57
intra_pred_v_w8_8bpc_neon:              14.76   7.58   7.95
intra_pred_v_w16_8bpc_neon:             11.34   6.28   5.88
intra_pred_v_w32_8bpc_neon:              6.56   3.33   3.34
intra_pred_v_w64_8bpc_neon:              4.57   1.24   1.97
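
For readers coming from the C side, a minimal sketch of what the three modes
compute (illustrative helper names, 8 bpc only, all edges assumed available;
this is not the actual templated code in src/ipred_tmpl.c). topleft[0] is the
top-left corner pixel, topleft[1..w] is the row above the block and
topleft[-1..-h] is the column to its left, which is how the assembly below
indexes x2:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    static void dc_pred_sketch(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *topleft, int w, int h)
    {
        unsigned sum = (unsigned)(w + h) >> 1;                /* rounding */
        for (int i = 0; i < w; i++) sum += topleft[1 + i];    /* top row */
        for (int i = 0; i < h; i++) sum += topleft[-(1 + i)]; /* left column */
        const uint8_t dc = (uint8_t)(sum / (unsigned)(w + h));
        for (int y = 0; y < h; y++, dst += stride)
            memset(dst, dc, w);                               /* splat dc */
    }

    static void v_pred_sketch(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *topleft, int w, int h)
    {
        for (int y = 0; y < h; y++, dst += stride)
            memcpy(dst, topleft + 1, w);              /* repeat the top row */
    }

    static void h_pred_sketch(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *topleft, int w, int h)
    {
        for (int y = 0; y < h; y++, dst += stride)
            memset(dst, topleft[-(1 + y)], w);        /* splat one left pixel */
    }

dc_128 stores the constant 128 instead of a computed average, and dc_top/
dc_left average only the top row or left column. Since width + height is
always a power of two times 1, 3 or 5, the assembly replaces the division by
a rounding right shift plus, when width != height, an sqdmulh fixed-point
multiply by roughly 1/3 (0x5556) or 1/5 (0x3334), which is where those
constants in the dc paths come from.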
--- /dev/null
+++ b/src/arm/64/ipred.S
@@ -1,0 +1,688 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_neon, export=1
+ clz w3, w3 // clz(width) = 29..25 for width = 4..64
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25 // jump table index: w = 64..4 -> 0..4
+ ldrh w3, [x5, w3, uxtw #1]
+ movi v0.16b, #128
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ movi v1.16b, #128
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ movi v1.16b, #128
+ movi v2.16b, #128
+ movi v3.16b, #128
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 16b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1 {v0.s}[0], [x2]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v0.8b}, [x2]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ ld1 {v0.16b}, [x2], #16
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ ld1 {v0.16b, v1.16b}, [x2]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ sub x2, x2, #4
+ sub x5, x5, w3, uxtw
+ mov x7, #-4
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v1.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.8b}, [x0], x1
+ st1 {v2.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1r {v0.2s}, [x2] // the 4 top pixels, duplicated into both halves
+ uaddlv h0, v0.8b // 2*sum(top)
+ rshrn v0.8b, v0.8h, #3 // (2*sum + 4) >> 3 = (sum + 2) >> 2
+ dup v0.8b, v0.b[0]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.8b, v0.b[0]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v2.4h, v0.4h, v1.4h
+ rshrn v2.8b, v2.8h, #5
+ dup v0.16b, v2.b[0]
+ dup v1.16b, v2.b[0]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v4.4h, v0.4h, v1.4h
+ add v5.4h, v2.4h, v3.4h
+ add v4.4h, v4.4h, v5.4h
+ rshrn v4.8b, v4.8h, #6
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_neon, export=1
+ sub x2, x2, w4, uxtw
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w7, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_left_h4):
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w4):
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w8):
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w16):
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt L(ipred_dc_left_w16)
+ ret
+
+L(ipred_dc_left_h32):
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #5
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w32):
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add v0.4h, v0.4h, v2.4h
+ rshrn v0.8b, v0.8h, #6
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w64):
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_neon, export=1
+ sub x2, x2, w4, uxtw
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.8h, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w6, uxtw
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w7 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_h4):
+ ld1 {v0.s}[0], [x2], #4
+ ins v0.s[1], wzr
+ uaddlv h0, v0.8b
+ br x3
+L(ipred_dc_w4):
+ add x2, x2, #1
+ ld1 {v1.s}[0], [x2]
+ ins v1.s[1], wzr
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #4
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height), via negative shift count
+ b.eq 1f
+ // h = 8/16
+ mov w16, #(0x3334/2) // 0x3334 ~= 1/5: for h=16 (w+h=20)
+ movk w16, #(0x5556/2), lsl #16 // 0x5556 ~= 1/3: for h=8 (w+h=12), in the high half
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17 // h=8: >>16 selects 1/3; h=16: shift of 32 acts as 0, keeping 1/5
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h // multiply by ~1/3 or ~1/5 (constants halved since sqdmulh doubles)
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ ld1 {v0.8b}, [x2], #8
+ uaddlv h0, v0.8b
+ br x3
+L(ipred_dc_w8):
+ add x2, x2, #1
+ ld1 {v1.8b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #8
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ ld1 {v0.16b}, [x2], #16
+ uaddlv h0, v0.16b
+ br x3
+L(ipred_dc_w16):
+ add x2, x2, #1
+ ld1 {v1.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ cmp w4, #16
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+2:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ ld1 {v0.16b, v1.16b}, [x2], #32
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ br x3
+L(ipred_dc_w32):
+ add x2, x2, #1
+ ld1 {v1.16b, v2.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ cmp w4, #32
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+ dup v1.16b, v0.b[0]
+2:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add v0.4h, v0.4h, v2.4h
+ br x3
+L(ipred_dc_w64):
+ add x2, x2, #1
+ ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ uaddlv h4, v4.16b
+ add v1.4h, v1.4h, v2.4h
+ add v3.4h, v3.4h, v4.4h
+ cmp w4, #64
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v3.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 16/32
+ mov w16, #(0x5556/2)
+ movk w16, #(0x3334/2), lsl #16
+ lsr w16, w16, w4
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+ dup v1.16b, v0.b[0]
+ dup v2.16b, v0.b[0]
+ dup v3.16b, v0.b[0]
+2:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
--- /dev/null
+++ b/src/arm/ipred_init_tmpl.c
@@ -1,0 +1,50 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(dav1d_ipred_dc_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_neon);
+decl_angular_ipred_fn(dav1d_ipred_h_neon);
+decl_angular_ipred_fn(dav1d_ipred_v_neon);
+
+COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 && ARCH_AARCH64
+ c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon;
+ c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon;
+ c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon;
+ c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon;
+ c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon;
+ c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon;
+#endif
+}
--- a/src/ipred.h
+++ b/src/ipred.h
@@ -89,6 +89,7 @@
} Dav1dIntraPredDSPContext;
bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
+bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
#endif /* DAV1D_SRC_IPRED_H */
--- a/src/ipred_tmpl.c
+++ b/src/ipred_tmpl.c
@@ -751,7 +751,11 @@
c->pal_pred = pal_pred_c;
-#if HAVE_ASM && ARCH_X86
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ bitfn(dav1d_intra_pred_dsp_init_arm)(c);
+#elif ARCH_X86
bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+#endif
#endif
}
--- a/src/meson.build
+++ b/src/meson.build
@@ -93,6 +93,7 @@
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',
+ 'arm/ipred_init_tmpl.c',
'arm/itx_init_tmpl.c',
'arm/loopfilter_init_tmpl.c',
'arm/looprestoration_init_tmpl.c',
@@ -101,6 +102,7 @@
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
'arm/64/cdef.S',
+ 'arm/64/ipred.S',
'arm/64/itx.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',