shithub: dav1d

ref: c4dea948cb87cab375a4c54859ad75442a388b7d
dir: /src/arm/64/ipred.S/

View raw version
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
function ipred_dc_128_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        movi            v0.16b,  #128
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        movi            v1.16b,  #128
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        movi            v1.16b,  #128
        movi            v2.16b,  #128
        movi            v3.16b,  #128
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) -  16b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc

// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft,
//                   const int width, const int height, const int a,
//                   const int max_width, const int max_height);
function ipred_v_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #1
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        ld1             {v0.s}[0],  [x2]
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v0.8b},  [x2]
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:
        ld1             {v0.16b}, [x2], #16
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        ld1             {v0.16b, v1.16b}, [x2]
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc

// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft,
//                   const int width, const int height, const int a,
//                   const int max_width, const int max_height);
function ipred_h_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2,  #4
        sub             x5,  x5,  w3, uxtw
        mov             x7,  #-4
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.8b},  [x0], x1
        st1             {v2.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
32:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            32b
        ret
64:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc

// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
function ipred_dc_top_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #1
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3
        dup             v0.8b,   v0.b[0]
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3
        dup             v0.8b,   v0.b[0]
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        rshrn           v0.8b,   v0.8h,   #4
        dup             v0.16b,  v0.b[0]
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        add             v2.4h,   v0.4h,   v1.4h
        rshrn           v2.8b,   v2.8h,   #5
        dup             v0.16b,  v2.b[0]
        dup             v1.16b,  v2.b[0]
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v4.4h,   v0.4h,   v1.4h
        add             v5.4h,   v2.4h,   v3.4h
        add             v4.4h,   v4.4h,   v5.4h
        rshrn           v4.8b,   v4.8h,   #6
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
        dup             v2.16b,  v4.b[0]
        dup             v3.16b,  v4.b[0]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc

// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
function ipred_dc_left_neon, export=1
        sub             x2,  x2,  w4, uxtw
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3, uxtw
        sub             x5,  x5,  w7, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5

L(ipred_dc_left_h4):
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w4):
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w8):
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        rshrn           v0.8b,   v0.8h,   #4
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w16):
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            L(ipred_dc_left_w16)
        ret

L(ipred_dc_left_h32):
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        add             v0.4h,   v0.4h,   v1.4h
        rshrn           v0.8b,   v0.8h,   #5
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w32):
        mov             v1.16b,  v0.16b
1:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v0.4h,   v1.4h
        add             v2.4h,   v2.4h,   v3.4h
        add             v0.4h,   v0.4h,   v2.4h
        rshrn           v0.8b,   v0.8h,   #6
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w64):
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
//                    const pixel *const topleft,
//                    const int width, const int height, const int a,
//                    const int max_width, const int max_height);
function ipred_dc_neon, export=1
        sub             x2,  x2,  w4, uxtw
        add             w7,  w3,  w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.8h, w7               // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                  // rbit(width + height)
        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                  // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                  // -ctz(width + height)
        sub             x3,  x5,  w3, uxtw
        sub             x5,  x5,  w6, uxtw
        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
        dup             v17.8h,  w7              // -ctz(width + height)
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5

L(ipred_dc_h4):
        ld1             {v0.s}[0],  [x2], #4
        ins             v0.s[1], wzr
        uaddlv          h0,      v0.8b
        br              x3
L(ipred_dc_w4):
        add             x2,  x2,  #1
        ld1             {v1.s}[0],  [x2]
        ins             v1.s[1], wzr
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h1,      v1.8b
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v1.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 8/16
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8b,   v0.b[0]
2:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        ld1             {v0.8b},  [x2], #8
        uaddlv          h0,      v0.8b
        br              x3
L(ipred_dc_w8):
        add             x2,  x2,  #1
        ld1             {v1.8b},  [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h1,      v1.8b
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v1.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8b,   v0.b[0]
2:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0,      v0.16b
        br              x3
L(ipred_dc_w16):
        add             x2,  x2,  #1
        ld1             {v1.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h1,      v1.16b
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v1.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 4/8/32/64
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.16b,  v0.b[0]
2:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        ld1             {v0.16b, v1.16b}, [x2], #32
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        add             v0.4h,   v0.4h,   v1.4h
        br              x3
L(ipred_dc_w32):
        add             x2,  x2,  #1
        ld1             {v1.16b, v2.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v1.4h
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.16b,  v0.b[0]
        dup             v1.16b,  v0.b[0]
2:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v0.4h,   v1.4h
        add             v2.4h,   v2.4h,   v3.4h
        add             v0.4h,   v0.4h,   v2.4h
        br              x3
L(ipred_dc_w64):
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
2:
        add             x2,  x2,  #1
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        uaddlv          h4,      v4.16b
        add             v1.4h,   v1.4h,   v2.4h
        add             v3.4h,   v3.4h,   v4.4h
        cmp             w4,  #64
        add             v0.4h,   v0.4h,   v1.4h
        add             v0.4h,   v0.4h,   v3.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 16/32
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        lsr             w16, w16, w4
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.16b,  v0.b[0]
        dup             v1.16b,  v0.b[0]
        dup             v2.16b,  v0.b[0]
        dup             v3.16b,  v0.b[0]
2:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc

// void ipred_paeth_neon(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height);
function ipred_paeth_neon, export=1
        clz             w9,  w3
        adr             x5,  L(ipred_paeth_tbl)
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x2]
        add             x8,  x2,  #1
        sub             x2,  x2,  #4
        sub             x5,  x5,  w9, uxtw
        mov             x7,  #-4
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        ld1r            {v5.4s},  [x8]
        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        zip1            v0.2s,   v0.2s,   v1.2s
        zip1            v2.2s,   v2.2s,   v3.2s
        uaddw           v16.8h,  v6.8h,   v0.8b
        uaddw           v17.8h,  v6.8h,   v2.8b
        sqxtun          v16.8b,  v16.8h           // base
        sqxtun2         v16.16b, v17.8h
        zip1            v0.2d,   v0.2d,   v2.2d
        uabd            v20.16b, v5.16b,  v16.16b // tdiff
        uabd            v22.16b, v4.16b,  v16.16b // tldiff
        uabd            v16.16b, v0.16b,  v16.16b // ldiff
        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
        bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ...
        st1             {v20.s}[3], [x0], x1
        st1             {v20.s}[2], [x6], x1
        subs            w4,  w4,  #4
        st1             {v20.s}[1], [x0], x1
        st1             {v20.s}[0], [x6], x1
        b.gt            4b
        ret
80:
        ld1r            {v5.2d},  [x8]
        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        uaddw           v16.8h,  v6.8h,   v0.8b
        uaddw           v17.8h,  v6.8h,   v1.8b
        uaddw           v18.8h,  v6.8h,   v2.8b
        uaddw           v19.8h,  v6.8h,   v3.8b
        sqxtun          v16.8b,  v16.8h           // base
        sqxtun2         v16.16b, v17.8h
        sqxtun          v18.8b,  v18.8h
        sqxtun2         v18.16b, v19.8h
        zip1            v2.2d,   v2.2d,   v3.2d
        zip1            v0.2d,   v0.2d,   v1.2d
        uabd            v21.16b, v5.16b,  v18.16b // tdiff
        uabd            v20.16b, v5.16b,  v16.16b
        uabd            v23.16b, v4.16b,  v18.16b // tldiff
        uabd            v22.16b, v4.16b,  v16.16b
        uabd            v17.16b, v2.16b,  v18.16b // ldiff
        uabd            v16.16b, v0.16b,  v16.16b
        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
        umin            v18.16b, v20.16b, v22.16b
        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
        cmhs            v20.16b, v22.16b, v20.16b
        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
        cmhs            v16.16b, v18.16b, v16.16b
        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v21.d}[1], [x0], x1
        st1             {v21.d}[0], [x6], x1
        subs            w4,  w4,  #4
        st1             {v20.d}[1], [x0], x1
        st1             {v20.d}[0], [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        ld1             {v5.16b},  [x8], #16
        mov             w9,  w3
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw
1:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
2:
        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
        usubl2          v7.8h,   v5.16b,  v4.16b
        uaddw           v24.8h,  v6.8h,   v0.8b
        uaddw           v25.8h,  v7.8h,   v0.8b
        uaddw           v26.8h,  v6.8h,   v1.8b
        uaddw           v27.8h,  v7.8h,   v1.8b
        uaddw           v28.8h,  v6.8h,   v2.8b
        uaddw           v29.8h,  v7.8h,   v2.8b
        uaddw           v30.8h,  v6.8h,   v3.8b
        uaddw           v31.8h,  v7.8h,   v3.8b
        sqxtun          v17.8b,  v26.8h           // base
        sqxtun2         v17.16b, v27.8h
        sqxtun          v16.8b,  v24.8h
        sqxtun2         v16.16b, v25.8h
        sqxtun          v19.8b,  v30.8h
        sqxtun2         v19.16b, v31.8h
        sqxtun          v18.8b,  v28.8h
        sqxtun2         v18.16b, v29.8h
        uabd            v23.16b, v5.16b,  v19.16b // tdiff
        uabd            v22.16b, v5.16b,  v18.16b
        uabd            v21.16b, v5.16b,  v17.16b
        uabd            v20.16b, v5.16b,  v16.16b
        uabd            v27.16b, v4.16b,  v19.16b // tldiff
        uabd            v26.16b, v4.16b,  v18.16b
        uabd            v25.16b, v4.16b,  v17.16b
        uabd            v24.16b, v4.16b,  v16.16b
        uabd            v19.16b, v3.16b,  v19.16b // ldiff
        uabd            v18.16b, v2.16b,  v18.16b
        uabd            v17.16b, v1.16b,  v17.16b
        uabd            v16.16b, v0.16b,  v16.16b
        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
        umin            v30.16b, v22.16b, v26.16b
        umin            v29.16b, v21.16b, v25.16b
        umin            v28.16b, v20.16b, v24.16b
        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
        cmhs            v22.16b, v26.16b, v22.16b
        cmhs            v21.16b, v25.16b, v21.16b
        cmhs            v20.16b, v24.16b, v20.16b
        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
        cmhs            v18.16b, v30.16b, v18.16b
        cmhs            v17.16b, v29.16b, v17.16b
        cmhs            v16.16b, v28.16b, v16.16b
        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b,  v4.16b
        bsl             v21.16b, v5.16b,  v4.16b
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b,  v18.16b
        bit             v21.16b, v1.16b,  v17.16b
        bit             v20.16b, v0.16b,  v16.16b
        subs            w3,  w3,  #16
        st1             {v23.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v21.16b}, [x5],  #16
        st1             {v20.16b}, [x10], #16
        b.le            8f
        ld1             {v5.16b},  [x8], #16
        b               2b
8:
        subs            w4,  w4,  #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        // Load the top row as early as possible
        ld1             {v5.16b},  [x8], #16
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_paeth_tbl):
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) -  80b
        .hword L(ipred_paeth_tbl) -  40b
endfunc

// void ipred_smooth_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
function ipred_smooth_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw
        add             x10, x10, w3, uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4, uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x12] // bottom
        add             x8,  x2,  #1
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        sub             x2,  x2,  #4
        mov             x7,  #-4
        ld1r            {v6.2s}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        dup             v5.16b,  v6.b[3]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        uhadd           v20.8h,  v20.8h,  v22.8h
        uhadd           v21.8h,  v21.8h,  v23.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        sub             x2,  x2,  #4
        mov             x7,  #-4
        ld1             {v6.8b}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        dup             v5.16b,  v6.b[7]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        uhadd           v20.8h,  v20.8h,  v24.8h
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        add             x12, x2,  w3, uxtw
        sub             x2,  x2,  #2
        mov             x7,  #-2
        ld1r            {v5.16b}, [x12]           // right
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3

1:
        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v3.16b}, [x8],   #16     // top
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v0.8h,   v6.8h
        mla             v23.8h,  v0.8h,   v7.8h
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v3.8h,   v16.8h
        mla             v26.8h,  v2.8h,   v17.8h
        mla             v27.8h,  v3.8h,   v17.8h
        uhadd           v20.8h,  v20.8h,  v24.8h
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9, uxtw
        sub             x10, x10, w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc

// void ipred_smooth_v_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
function ipred_smooth_v_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x8] // bottom
        add             x2,  x2,  #1
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        ld1r            {v6.2s}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v22.s}[0], [x0], x1
        st1             {v22.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v23.s}[0], [x0], x1
        st1             {v23.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v6.8b}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        rshrn           v24.8b,  v24.8h,  #8
        rshrn           v25.8b,  v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn           v27.8b,  v27.8h,  #8
        st1             {v24.8b}, [x0], x1
        st1             {v25.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v26.8b}, [x0], x1
        st1             {v27.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
2:
        ld1             {v3.16b}, [x2],   #16     // top
        shll            v20.8h,  v4.8b,   #8      // bottom*256
        shll            v21.8h,  v4.8b,   #8
        shll            v22.8h,  v4.8b,   #8
        shll            v23.8h,  v4.8b,   #8
        shll            v24.8h,  v4.8b,   #8
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v21.8h,  v3.8h,   v16.8h
        mla             v22.8h,  v2.8h,   v17.8h
        mla             v23.8h,  v3.8h,   v17.8h
        mla             v24.8h,  v2.8h,   v18.8h
        mla             v25.8h,  v3.8h,   v18.8h
        mla             v26.8h,  v2.8h,   v19.8h
        mla             v27.8h,  v3.8h,   v19.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x8],  #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc

// void ipred_smooth_h_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
function ipred_smooth_h_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.16b},  [x12] // right
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        uxtl            v7.8h,   v7.8b            // weights_hor
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        uxtl            v7.8h,   v7.8b            // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v0.8h,   v0.8b,   v5.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        sub             x2,  x2,  #4
        mov             x7,  #-4
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3

1:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
2:
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        shll            v24.8h,  v5.8b,   #8
        shll            v25.8h,  v5.8b,   #8
        shll            v26.8h,  v5.8b,   #8
        shll            v27.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v2.8h,   v6.8h
        mla             v23.8h,  v2.8h,   v7.8h
        mla             v24.8h,  v1.8h,   v6.8h
        mla             v25.8h,  v1.8h,   v7.8h
        mla             v26.8h,  v0.8h,   v6.8h
        mla             v27.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x10], #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc

// void ipred_filter_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int filt_idx,
//                        const int max_width, const int max_height);
function ipred_filter_neon, export=1
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        adr             x5,  L(ipred_filter_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26
        ldrh            w9,  [x5, w9, uxtw #1]
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        sub             x5,  x5,  w9, uxtw
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        br              x5
40:
        ldur            s0,  [x2, #1]             // top (0-3)
        sub             x2,  x2,  #2
        mov             x7,  #-2
        uxtl            v0.8h,   v0.8b            // top (0-3)
4:
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.8b,   v2.8h,   #4
        subs            w4,  w4,  #2
        st1             {v2.s}[0], [x0], x1
        uxtl            v0.8h,   v2.8b
        st1             {v2.s}[1], [x6], x1
        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
        b.gt            4b
        ret
80:
        ldur            d0,  [x2, #1]             // top (0-7)
        sub             x2,  x2,  #2
        mov             x7,  #-2
        uxtl            v0.8h,   v0.8b            // top (0-7)
8:
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.8b,   v2.8h,   #4
        uxtl            v1.8h,   v2.8b            // first block, in 16 bit
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.8b,   v3.8h,   #4
        subs            w4,  w4,  #2
        st2             {v2.s, v3.s}[0], [x0], x1
        zip2            v0.2s,   v2.2s,   v3.2s
        st2             {v2.s, v3.s}[1], [x6], x1
        uxtl            v0.8h,   v0.8b
        b.gt            8b
        ret
160:
320:
        add             x8,  x2,  #1
        sub             x2,  x2,  #2
        mov             x7,  #-2
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3

1:
        ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
        uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2)
2:
        ld1             {v2.16b}, [x8],   #16     // top(0-15)
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        uxtl            v1.8h,   v2.8b            // top(0-7)
        uxtl2           v2.8h,   v2.16b           // top(8-15)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.8b,   v3.8h,   #4
        uxtl            v0.8h,   v3.8b            // first block, in 16 bit
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.8b,   v4.8h,   #4
        uxtl            v0.8h,   v4.8b            // second block, in 16 bit
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.8b,   v5.8h,   #4
        uxtl            v0.8h,   v5.8b            // third block, in 16 bit
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.8b,   v6.8h,   #4

        ins             v0.h[2], v2.h[7]
        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
        ins             v0.b[0], v6.b[7]
        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
        ins             v0.b[2], v6.b[3]
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_filter_tbl):
        .hword L(ipred_filter_tbl) - 320b
        .hword L(ipred_filter_tbl) - 160b
        .hword L(ipred_filter_tbl) -  80b
        .hword L(ipred_filter_tbl) -  40b
endfunc

// void pal_pred_neon(pixel *dst, const ptrdiff_t stride,
//                    const uint16_t *const pal, const uint8_t *idx,
//                    const int w, const int h);
function pal_pred_neon, export=1
        ld1             {v0.8h}, [x2]
        clz             w9,  w4
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25
        ldrh            w9,  [x6, w9, uxtw #1]
        xtn             v0.8b,  v0.8h
        sub             x6,  x6,  w9, uxtw
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
        br              x6
4:
        ld1             {v1.16b}, [x3], #16
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
8:
        ld1             {v1.16b, v2.16b}, [x3], #32
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
16:
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
32:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5,  w5,  #4
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
64:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5,  w5,  #2
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret

L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 64b
        .hword L(pal_pred_tbl) - 32b
        .hword L(pal_pred_tbl) - 16b
        .hword L(pal_pred_tbl) -  8b
        .hword L(pal_pred_tbl) -  4b
endfunc

// void ipred_cfl_128_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height,
//                         const int16_t *ac, const int alpha);
function ipred_cfl_128_neon, export=1
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26
        ldrh            w9,  [x7, w9, uxtw #1]
        movi            v0.8h,   #128 // dc
        dup             v1.8h,   w6   // alpha
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x7
L(ipred_cfl_splat_w4):
        ld1             {v2.8h, v3.8h}, [x5], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        sshr            v4.8h,   v2.8h,   #15    // sign = diff >> 15
        sshr            v5.8h,   v3.8h,   #15
        add             v2.8h,   v2.8h,   v4.8h  // diff + sign
        add             v3.8h,   v3.8h,   v5.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        st1             {v2.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v3.s}[0],  [x0], x1
        st1             {v3.s}[1],  [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        sshr            v16.8h,  v2.8h,   #15    // sign = diff >> 15
        sshr            v17.8h,  v3.8h,   #15
        sshr            v18.8h,  v4.8h,   #15
        sshr            v19.8h,  v5.8h,   #15
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        add             x7,  x5,  w3, uxtw #1
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        sshr            v16.8h,  v2.8h,   #15    // sign = diff >> 15
        sshr            v17.8h,  v3.8h,   #15
        sshr            v18.8h,  v4.8h,   #15
        sshr            v19.8h,  v5.8h,   #15
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        subs            w3,  w3,  #16
        st1             {v2.8b, v3.8b},  [x0], #16
        st1             {v4.8b, v5.8b},  [x6], #16
        b.gt            1b
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b.gt            1b
        ret

L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc

// void ipred_cfl_top_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height,
//                         const int16_t *ac, const int alpha);
function ipred_cfl_top_neon, export=1
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #1
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x7
4:
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.8h,   v0.8h,   #3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.8h,   v0.8h,   #3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        urshr           v0.8h,   v0.8h,   #4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h
        urshr           v2.8h,   v2.8h,   #5
        dup             v0.8h,   v2.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc

// void ipred_cfl_left_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height,
//                          const int16_t *ac, const int alpha);
function ipred_cfl_left_neon, export=1
        sub             x2,  x2,  w4, uxtw
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26
        sub             w8,  w8,  #26
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6   // alpha
        sub             x9,  x10, w9, uxtw
        sub             x7,  x7,  w8, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x7

L(ipred_cfl_left_h4):
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.8h,   v0.8h,   #3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.8h,   v0.8h,   #3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        urshr           v0.8h,   v0.8h,   #4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h
        urshr           v2.8h,   v2.8h,   #5
        dup             v0.8h,   v2.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc

// void ipred_cfl_neon(pixel *dst, const ptrdiff_t stride,
//                     const pixel *const topleft,
//                     const int width, const int height,
//                     const int16_t *ac, const int alpha);
function ipred_cfl_neon, export=1
        sub             x2,  x2,  w4, uxtw
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.8h, w8               // width + height
        adr             x7,  L(ipred_cfl_tbl)
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 22 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrh            w9,  [x7, w9, uxtw #1]
        ldrh            w6,  [x7, w6, uxtw #1]
        neg             w8,  w8                  // -ctz(width + height)
        sub             x9,  x7,  w9, uxtw
        sub             x7,  x7,  w6, uxtw
        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
        dup             v17.8h,  w8              // -ctz(width + height)
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x7

L(ipred_cfl_h4):
        ld1             {v0.s}[0],  [x2], #4
        ins             v0.s[1], wzr
        uaddlv          h0,      v0.8b
        br              x9
L(ipred_cfl_w4):
        add             x2,  x2,  #1
        ld1             {v2.s}[0],  [x2]
        ins             v2.s[1], wzr
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.8b
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 8/16
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        ld1             {v0.8b},  [x2], #8
        uaddlv          h0,      v0.8b
        br              x9
L(ipred_cfl_w8):
        add             x2,  x2,  #1
        ld1             {v2.8b},  [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.8b
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0,      v0.16b
        br              x9
L(ipred_cfl_w16):
        add             x2,  x2,  #1
        ld1             {v2.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.16b
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 4/8/32
        cmp             w4,  #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        ld1             {v2.16b, v3.16b}, [x2], #32
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v2.4h,   v3.4h
        br              x9
L(ipred_cfl_w32):
        add             x2,  x2,  #1
        ld1             {v2.16b, v3.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v2.4h
        add             v0.4h,   v0.4h,   v3.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 8/16
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc

// void cfl_ac_420_neon(int16_t *const ac, const pixel *const ypx,
//                      const ptrdiff_t stride, const int w_pad,
//                      const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_420_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2
        adr             x7,  L(ipred_cfl_ac_420_tbl)
        sub             w8,  w8,  #27
        ldrh            w8,  [x7, w8, uxtw #1]
        sub             x7,  x7,  w8, uxtw
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        movi            v16.4s,  #1
        add             x10, x1,  x2
        lsl             x2,  x2,  #1
        dup             v17.4s,  w9
        sshl            v16.4s,  v16.4s,  v17.4s // 1 << log2sz
        neg             v17.4s,  v17.4s          // -log2sz
        ushr            v16.4s,  v16.4s,  #1     // 1 << (log2sz - 1)
        mov             w9,  w6
        br              x7

L(ipred_cfl_ac_420_w4):
1:      // Copy and subsample input
        ld1             {v0.8b},   [x1],  x2
        ld1             {v1.8b},   [x10], x2
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h}, [x0], #16
        b.gt            1b
        trn2            v1.2d,   v0.2d,   v0.2d
        trn2            v0.2d,   v0.2d,   v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            2b
3:
        sub             x0,  x0,  w6, uxtw #3
        // Sum the produced ac values
        subs            w6,  w6,  #4
        ld1             {v0.8h, v1.8h}, [x0], #32
        b.le            5f
4:
        ld1             {v2.8h, v3.8h}, [x0], #32
        subs            w6,  w6,  #4
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        b.gt            4b
5:
        add             v0.8h,   v0.8h,   v1.8h
        uaddlv          s0,  v0.8h                // sum
        sub             x0,  x0,  w9, uxtw #3
        add             v0.2s,   v0.2s,   v16.2s  // sum += 1 << (log2sz - 1)
        ushl            v4.2s,   v0.2s,   v17.2s  // sum >>= log2sz
        dup             v4.8h,   v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w9,  w9,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1],  x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        add             v0.8h,   v0.8h,   v1.8h
        add             v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v2.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        mov             v0.16b,  v1.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b},   [x1],  x2
        ld1             {v1.8b},   [x10], x2
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1
        dup             v1.4h,   v0.h[3]
        dup             v3.4h,   v0.h[7]
        trn2            v2.2d,   v0.2d,   v0.2d
        subs            w8,  w8,  #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        b.gt            1b
        trn1            v0.2d,   v2.2d,   v3.2d
        trn1            v1.2d,   v2.2d,   v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            2b
3:

L(ipred_cfl_ac_420_w8_calc_subtract_dc):
        sub             x0,  x0,  w6, uxtw #4
        // Sum the produced ac values
        subs            w6,  w6,  #4
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.le            5f
4:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        subs            w6,  w6,  #4
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        b.gt            4b
5:
        add             v0.8h,   v0.8h,   v1.8h
        add             v2.8h,   v2.8h,   v3.8h
        uaddlp          v0.4s,   v0.8h
        uaddlp          v2.4s,   v2.8h
        add             v0.4s,   v0.4s,   v2.4s
        addv            s0,  v0.4s                // sum
        sub             x0,  x0,  w9, uxtw #4
        add             v0.2s,   v0.2s,   v16.2s  // sum += 1 << (log2sz - 1)
        ushl            v4.2s,   v0.2s,   v17.2s  // sum >>= log2sz
        dup             v4.8h,   v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w9,  w9,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        sub             v2.8h,   v2.8h,   v4.8h
        sub             v3.8h,   v3.8h,   v4.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w16):
        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1],  x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v4.16b, v5.16b}, [x1],  x2
        uaddlp          v1.8h,   v1.16b
        ld1             {v6.16b, v7.16b}, [x10], x2
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v5.8h,   v5.16b
        uaddlp          v6.8h,   v6.16b
        uaddlp          v7.8h,   v7.16b
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        add             v4.8h,   v4.8h,   v6.8h
        add             v5.8h,   v5.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v1.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        shl             v3.8h,   v5.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
1:      // Copy and subsample input, padding 4
        ldr             d1,  [x1,  #16]
        ld1             {v0.16b}, [x1],  x2
        ldr             d3,  [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,   v1.8b
        ldr             d5,  [x1,  #16]
        uaddlp          v0.8h,   v0.16b
        ld1             {v4.16b}, [x1],  x2
        uaddlp          v3.4h,   v3.8b
        ldr             d7,  [x10, #16]
        uaddlp          v2.8h,   v2.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v5.4h,   v5.8b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v7.4h,   v7.8b
        uaddlp          v6.8h,   v6.16b
        add             v1.4h,   v1.4h,   v3.4h
        add             v0.8h,   v0.8h,   v2.8h
        add             v5.4h,   v5.4h,   v7.4h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v1.4h,   v1.4h,   #1
        shl             v0.8h,   v0.8h,   #1
        shl             v3.4h,   v5.4h,   #1
        shl             v2.8h,   v4.8h,   #1
        dup             v4.4h,   v1.h[3]
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v2.8h,   v2.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v6.8h,   v6.16b
        add             v0.8h,   v0.8h,   v2.8h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        dup             v1.8h,   v0.h[7]
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1],  x2
        ld1             {v2.8b}, [x10], x2
        ld1             {v4.8b}, [x1],  x2
        uaddlp          v0.4h,   v0.8b
        ld1             {v6.8b}, [x10], x2
        uaddlp          v2.4h,   v2.8b
        uaddlp          v4.4h,   v4.8b
        uaddlp          v6.4h,   v6.8b
        add             v0.4h,   v0.4h,   v2.4h
        add             v4.4h,   v4.4h,   v6.4h
        shl             v0.4h,   v0.4h,   #1
        shl             v2.4h,   v4.4h,   #1
        dup             v1.8h,   v0.h[3]
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            2b
3:

        // Double the height and reuse the w8 summing/subtracting
        lsl             w6,  w6,  #1
        lsl             w9,  w9,  #1
        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)

L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0

L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc

// void cfl_ac_422_neon(int16_t *const ac, const pixel *const ypx,
//                      const ptrdiff_t stride, const int w_pad,
//                      const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_422_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2
        adr             x7,  L(ipred_cfl_ac_422_tbl)
        sub             w8,  w8,  #27
        ldrh            w8,  [x7, w8, uxtw #1]
        sub             x7,  x7,  w8, uxtw
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        movi            v16.4s,  #1
        add             x10, x1,  x2
        lsl             x2,  x2,  #1
        dup             v17.4s,  w9
        sshl            v16.4s,  v16.4s,  v17.4s // 1 << log2sz
        neg             v17.4s,  v17.4s          // -log2sz
        ushr            v16.4s,  v16.4s,  #1     // 1 << (log2sz - 1)
        mov             w9,  w6
        br              x7

L(ipred_cfl_ac_422_w4):
1:      // Copy and subsample input
        ld1             {v0.8b},   [x1],  x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v1.8b},   [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v1.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        trn2            v0.2d,   v1.2d,   v1.2d
        trn2            v1.2d,   v1.2d,   v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1],  x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v1.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        shl             v3.8h,   v3.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b},   [x1],  x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v2.8b},   [x1],  x2
        ld1             {v2.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v2.8h,   v2.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v0.h[3]
        dup             v5.8h,   v0.h[7]
        dup             v6.4h,   v2.h[3]
        dup             v7.8h,   v2.h[7]
        trn2            v1.2d,   v0.2d,   v5.2d
        trn1            v0.2d,   v0.2d,   v4.2d
        trn2            v3.2d,   v2.2d,   v7.2d
        trn1            v2.2d,   v2.2d,   v6.2d
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1],  x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v1.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        shl             v3.8h,   v3.8h,   #2
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
1:      // Copy and subsample input, padding 4
        ldr             d1,  [x1,  #16]
        ld1             {v0.16b}, [x1],  x2
        ldr             d3,  [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,   v1.8b
        uaddlp          v0.8h,   v0.16b
        uaddlp          v3.4h,   v3.8b
        uaddlp          v2.8h,   v2.16b
        shl             v1.4h,   v1.4h,   #2
        shl             v0.8h,   v0.8h,   #2
        shl             v3.4h,   v3.4h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v1.h[3]
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v2.8h,   v2.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v1.8h,   v0.h[7]
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1],  x2
        ld1             {v2.8b}, [x10], x2
        uaddlp          v0.4h,   v0.8b
        uaddlp          v2.4h,   v2.8b
        shl             v0.4h,   v0.4h,   #2
        shl             v2.4h,   v2.4h,   #2
        dup             v1.8h,   v0.h[3]
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_tbl):
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
        .hword 0

L(ipred_cfl_ac_422_w16_tbl):
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc