// ref: ff3054feb26ed4476f632965d6a76b6af1d4f31c
// dir: /src/arm/64/ipred16.S
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height,
// const int bitdepth_max);
function ipred_dc_128_16bpc_neon, export=1
// Fill a w x h block with the mid-grey DC value (bitdepth_max + 1) >> 1.
// x0 = dst, x1 = stride, w3 = width, w4 = height; bitdepth_max is the
// 9th argument and is therefore passed on the stack.
ldr w8, [sp]                         // w8 = bitdepth_max
clz w3, w3                           // map width 64/32/16/8/4 ...
adr x5, L(ipred_dc_128_tbl)
sub w3, w3, #25                      // ... to table index 0/1/2/3/4
ldrh w3, [x5, w3, uxtw #1]           // load 16-bit offset from jump table
dup v0.8h, w8
sub x5, x5, w3, uxtw                 // x5 = branch target (table base - offset)
add x6, x0, x1                       // x6 = second row pointer
lsl x1, x1, #1                       // step two rows per store pair
urshr v0.8h, v0.8h, #1               // v0 = (bitdepth_max + 1) >> 1, replicated
br x5
4:
// w == 4: store 4 rows per iteration via the x0/x6 pointer pair.
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
8:
// w == 8
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
160:
// w == 16: widen the fill value to two vectors, then fall into the loop.
mov v1.16b, v0.16b
16:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 16b
ret
320:
// w == 32: four vectors cover a full row.
mov v1.16b, v0.16b
mov v2.16b, v0.16b
mov v3.16b, v0.16b
32:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 32b
ret
640:
// w == 64: each row needs two 64-byte stores, so advance by an immediate
// #64 first and compensate by shrinking the stride.
mov v1.16b, v0.16b
mov v2.16b, v0.16b
mov v3.16b, v0.16b
sub x1, x1, #64
64:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 64b
ret
L(ipred_dc_128_tbl):
// Backwards offsets from the table itself to each width handler.
.hword L(ipred_dc_128_tbl) - 640b
.hword L(ipred_dc_128_tbl) - 320b
.hword L(ipred_dc_128_tbl) - 160b
.hword L(ipred_dc_128_tbl) - 8b
.hword L(ipred_dc_128_tbl) - 4b
endfunc
// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_v_16bpc_neon, export=1
// Vertical prediction: replicate the row above (topleft[1..w]) down the block.
// x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
clz w3, w3                           // width 64/32/16/8/4 -> index 0..4
adr x5, L(ipred_v_tbl)
sub w3, w3, #25
ldrh w3, [x5, w3, uxtw #1]
add x2, x2, #2                       // skip topleft pixel; x2 = top row
sub x5, x5, w3, uxtw                 // x5 = branch target
add x6, x0, x1                       // two-row pointer pair
lsl x1, x1, #1
br x5
40:
ld1 {v0.4h}, [x2]                    // load 4 top pixels once
4:
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4                      // 4 rows per iteration
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
80:
ld1 {v0.8h}, [x2]
8:
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
160:
ld1 {v0.8h, v1.8h}, [x2]
16:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 16b
ret
320:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
32:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 32b
ret
640:
// w == 64: the top row occupies 8 vectors (v0-v7); rows are written as
// two 64-byte stores, with the stride pre-reduced to compensate.
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
sub x1, x1, #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
64:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
b.gt 64b
ret
L(ipred_v_tbl):
.hword L(ipred_v_tbl) - 640b
.hword L(ipred_v_tbl) - 320b
.hword L(ipred_v_tbl) - 160b
.hword L(ipred_v_tbl) - 80b
.hword L(ipred_v_tbl) - 40b
endfunc
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_h_16bpc_neon, export=1
// Horizontal prediction: each output row is its left-neighbour pixel
// replicated across the row. The left column sits immediately below
// topleft in memory, so it is walked backwards (x7 = -8) while ld4r
// broadcasts four consecutive left pixels into v0..v3 per iteration.
clz w3, w3
adr x5, L(ipred_h_tbl)
sub w3, w3, #25
ldrh w3, [x5, w3, uxtw #1]
sub x2, x2, #8                       // point at left[h-4..h-1] group
sub x5, x5, w3, uxtw
mov x7, #-8                          // step 4 left pixels backwards
add x6, x0, x1
lsl x1, x1, #1
br x5
4:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
// v3 is the topmost of the four left pixels, hence stored first.
st1 {v3.4h}, [x0], x1
st1 {v2.4h}, [x6], x1
subs w4, w4, #4
st1 {v1.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
8:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
16:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
// Second half of each row via plain str at offset #16; the st1 with
// post-increment then writes the first half and advances the pointer.
str q3, [x0, #16]
str q2, [x6, #16]
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
str q1, [x0, #16]
str q0, [x6, #16]
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 16b
ret
32:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
str q3, [x0, #16]
str q2, [x6, #16]
stp q3, q3, [x0, #32]                // bytes 32..63 of the row
stp q2, q2, [x6, #32]
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
str q1, [x0, #16]
str q0, [x6, #16]
stp q1, q1, [x0, #32]
stp q0, q0, [x6, #32]
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 32b
ret
64:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
// Offsets 16/32/64/96 plus the final st1 cover all 128 bytes of a row.
str q3, [x0, #16]
str q2, [x6, #16]
stp q3, q3, [x0, #32]
stp q2, q2, [x6, #32]
stp q3, q3, [x0, #64]
stp q2, q2, [x6, #64]
stp q3, q3, [x0, #96]
stp q2, q2, [x6, #96]
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
str q1, [x0, #16]
str q0, [x6, #16]
stp q1, q1, [x0, #32]
stp q0, q0, [x6, #32]
stp q1, q1, [x0, #64]
stp q0, q0, [x6, #64]
stp q1, q1, [x0, #96]
stp q0, q0, [x6, #96]
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 64b
ret
L(ipred_h_tbl):
.hword L(ipred_h_tbl) - 64b
.hword L(ipred_h_tbl) - 32b
.hword L(ipred_h_tbl) - 16b
.hword L(ipred_h_tbl) - 8b
.hword L(ipred_h_tbl) - 4b
endfunc
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_dc_top_16bpc_neon, export=1
// DC-top prediction: fill the block with the rounded average of the
// w pixels above it. Since w is a power of two the average is a simple
// rounding shift by log2(w).
clz w3, w3
adr x5, L(ipred_dc_top_tbl)
sub w3, w3, #25
ldrh w3, [x5, w3, uxtw #1]
add x2, x2, #2                       // x2 = top row (skip topleft)
sub x5, x5, w3, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
40:
ld1 {v0.4h}, [x2]
addv h0, v0.4h                       // sum of 4 top pixels
urshr v0.4h, v0.4h, #2               // (sum + 2) >> 2
dup v0.4h, v0.h[0]
4:
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
80:
ld1 {v0.8h}, [x2]
addv h0, v0.8h                       // sum of 8
urshr v0.4h, v0.4h, #3               // (sum + 4) >> 3
dup v0.8h, v0.h[0]
8:
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
160:
ld1 {v0.8h, v1.8h}, [x2]
addp v0.8h, v0.8h, v1.8h             // pairwise-reduce 16 -> 8
addv h0, v0.8h
urshr v2.4h, v0.4h, #4               // (sum + 8) >> 4
dup v0.8h, v2.h[0]
dup v1.8h, v2.h[0]
16:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 16b
ret
320:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h
uaddlv s0, v0.8h                     // widen to 32 bit; 32 * 16-bit max overflows 16 bits
rshrn v4.4h, v0.4s, #5               // (sum + 16) >> 5, narrowed back to 16 bit
dup v0.8h, v4.h[0]
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
32:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 32b
ret
640:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h             // 64 pixels reduced to 8 partial sums
uaddlv s0, v0.8h
rshrn v4.4h, v0.4s, #6               // (sum + 32) >> 6
sub x1, x1, #64                      // stride compensation for the #64 stores
dup v0.8h, v4.h[0]
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
64:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 64b
ret
L(ipred_dc_top_tbl):
.hword L(ipred_dc_top_tbl) - 640b
.hword L(ipred_dc_top_tbl) - 320b
.hword L(ipred_dc_top_tbl) - 160b
.hword L(ipred_dc_top_tbl) - 80b
.hword L(ipred_dc_top_tbl) - 40b
endfunc
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_dc_left_16bpc_neon, export=1
// DC-left prediction: fill the block with the rounded average of the
// h pixels to its left. Uses a two-stage jump table: first branch to a
// height handler (computes the average, indexed by h), which then
// branches via x3 to a width handler (the store loop, indexed by w).
sub x2, x2, w4, uxtw #1              // x2 = start of left column (topleft - h pixels)
clz w3, w3
clz w7, w4
adr x5, L(ipred_dc_left_tbl)
sub w3, w3, #20 // 25 leading bits, minus table offset 5
sub w7, w7, #25
ldrh w3, [x5, w3, uxtw #1]
ldrh w7, [x5, w7, uxtw #1]
sub x3, x5, w3, uxtw                 // x3 = width (store-loop) handler
sub x5, x5, w7, uxtw                 // x5 = height (average) handler
add x6, x0, x1
lsl x1, x1, #1
br x5
L(ipred_dc_left_h4):
ld1 {v0.4h}, [x2]
addv h0, v0.4h                       // sum of 4 left pixels
urshr v0.4h, v0.4h, #2               // (sum + 2) >> 2
dup v0.8h, v0.h[0]
br x3
L(ipred_dc_left_w4):
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt L(ipred_dc_left_w4)
ret
L(ipred_dc_left_h8):
ld1 {v0.8h}, [x2]
addv h0, v0.8h
urshr v0.4h, v0.4h, #3               // (sum + 4) >> 3
dup v0.8h, v0.h[0]
br x3
L(ipred_dc_left_w8):
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt L(ipred_dc_left_w8)
ret
L(ipred_dc_left_h16):
ld1 {v0.8h, v1.8h}, [x2]
addp v0.8h, v0.8h, v1.8h
addv h0, v0.8h
urshr v2.4h, v0.4h, #4               // (sum + 8) >> 4
dup v0.8h, v2.h[0]
dup v1.8h, v2.h[0]
br x3
L(ipred_dc_left_w16):
mov v1.16b, v0.16b
1:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 1b
ret
L(ipred_dc_left_h32):
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h
uaddlp v0.4s, v0.8h                  // widen: 32-pixel sum can exceed 16 bits
addv s0, v0.4s
rshrn v4.4h, v0.4s, #5               // (sum + 16) >> 5
dup v0.8h, v4.h[0]
br x3
L(ipred_dc_left_w32):
mov v1.16b, v0.16b
mov v2.16b, v0.16b
mov v3.16b, v0.16b
1:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 1b
ret
L(ipred_dc_left_h64):
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h             // 64 pixels -> 8 partial sums
uaddlv s0, v0.8h
rshrn v4.4h, v0.4s, #6               // (sum + 32) >> 6
dup v0.8h, v4.h[0]
br x3
L(ipred_dc_left_w64):
mov v1.16b, v0.16b
mov v2.16b, v0.16b
mov v3.16b, v0.16b
sub x1, x1, #64                      // stride compensation for the #64 stores
1:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 1b
ret
L(ipred_dc_left_tbl):
// First five entries: height handlers; last five: width handlers
// (hence the #20 vs #25 index bias above).
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
.hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_dc_16bpc_neon, export=1
// Full DC prediction: average of the w top and h left pixels, i.e.
// (sum + (w+h)/2) / (w+h). The division is done as a shift by
// ctz(w+h); when w != h that leaves a residual factor of 3 or 5,
// handled by a fixed-point reciprocal multiply (0xAAAB ~= 2^17/3,
// 0x6667 ~= 2^17/5, followed by >> 17).
// Two-stage jump table like dc_left: height handler (sums the left
// column) branches via x3 into the width handler (sums the top row,
// combines, divides, stores).
sub x2, x2, w4, uxtw #1              // x2 = start of left column
add w7, w3, w4 // width + height
clz w3, w3
clz w6, w4
dup v16.4s, w7 // width + height
adr x5, L(ipred_dc_tbl)
rbit w7, w7 // rbit(width + height)
sub w3, w3, #20 // 25 leading bits, minus table offset 5
sub w6, w6, #25
clz w7, w7 // ctz(width + height)
ldrh w3, [x5, w3, uxtw #1]
ldrh w6, [x5, w6, uxtw #1]
neg w7, w7 // -ctz(width + height)
sub x3, x5, w3, uxtw                 // x3 = width handler
sub x5, x5, w6, uxtw                 // x5 = height handler
ushr v16.4s, v16.4s, #1 // (width + height) >> 1, the rounding bias
dup v17.4s, w7 // -ctz(width + height), shift amount for ushl
add x6, x0, x1
lsl x1, x1, #1
br x5
L(ipred_dc_h4):
ld1 {v0.4h}, [x2], #8
uaddlv s0, v0.4h                     // s0 = sum of left column
br x3
L(ipred_dc_w4):
add x2, x2, #2                       // skip topleft; x2 = top row
ld1 {v1.4h}, [x2]
add v0.2s, v0.2s, v16.2s             // left sum + rounding bias
uaddlv s1, v1.4h                     // top-row sum
cmp w4, #4
add v0.2s, v0.2s, v1.2s              // total sum
ushl v0.2s, v0.2s, v17.2s            // >> ctz(w+h)
b.eq 1f
// h = 8/16: residual divide by 3 (w+h=12) or 5 (w+h=20)
cmp w4, #16
mov w16, #0x6667                     // ~2^17/5
mov w17, #0xAAAB                     // ~2^17/3
csel w16, w16, w17, eq               // h==16 -> /5, h==8 -> /3
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17
1:
dup v0.4h, v0.h[0]
2:
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h8):
ld1 {v0.8h}, [x2], #16
uaddlv s0, v0.8h
br x3
L(ipred_dc_w8):
add x2, x2, #2
ld1 {v1.8h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s1, v1.8h
cmp w4, #8
add v0.2s, v0.2s, v1.2s
ushl v0.2s, v0.2s, v17.2s
b.eq 1f
// h = 4/16/32: residual /3 (w+h=12, 24) or /5 (w+h=40)
cmp w4, #32
mov w16, #0x6667
mov w17, #0xAAAB
csel w16, w16, w17, eq               // h==32 -> /5, otherwise /3
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17
1:
dup v0.8h, v0.h[0]
2:
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h16):
ld1 {v0.8h, v1.8h}, [x2], #32
addp v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
br x3
L(ipred_dc_w16):
add x2, x2, #2
ld1 {v1.8h, v2.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
uaddlv s1, v1.8h
cmp w4, #16
add v0.2s, v0.2s, v1.2s
ushl v4.2s, v0.2s, v17.2s
b.eq 1f
// h = 4/8/32/64
tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
mov w16, #0x6667
mov w17, #0xAAAB
csel w16, w16, w17, eq               // h=4/8 -> /5, h=32/64 -> /3
dup v16.2s, w16
mul v4.2s, v4.2s, v16.2s
ushr v4.2s, v4.2s, #17
1:
dup v0.8h, v4.h[0]
dup v1.8h, v4.h[0]
2:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h32):
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h
uaddlv s0, v0.8h
br x3
L(ipred_dc_w32):
add x2, x2, #2
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
addp v3.8h, v3.8h, v4.8h
addp v1.8h, v1.8h, v3.8h
uaddlv s1, v1.8h
cmp w4, #32
add v0.2s, v0.2s, v1.2s
ushl v4.2s, v0.2s, v17.2s
b.eq 1f
// h = 8/16/64: residual /5 (w+h=40) or /3 (w+h=48, 96)
cmp w4, #8
mov w16, #0x6667
mov w17, #0xAAAB
csel w16, w16, w17, eq               // h==8 -> /5, otherwise /3
dup v16.2s, w16
mul v4.2s, v4.2s, v16.2s
ushr v4.2s, v4.2s, #17
1:
dup v0.8h, v4.h[0]
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
2:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h64):
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h
uaddlv s0, v0.8h
br x3
L(ipred_dc_w64):
add x2, x2, #2
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
addp v3.8h, v3.8h, v4.8h
addp v20.8h, v20.8h, v21.8h
addp v22.8h, v22.8h, v23.8h
addp v1.8h, v1.8h, v3.8h
addp v20.8h, v20.8h, v22.8h
addp v1.8h, v1.8h, v20.8h
uaddlv s1, v1.8h
cmp w4, #64
add v0.2s, v0.2s, v1.2s
ushl v4.2s, v0.2s, v17.2s
b.eq 1f
// h = 16/32: residual /5 (w+h=80) or /3 (w+h=96)
cmp w4, #16
mov w16, #0x6667
mov w17, #0xAAAB
csel w16, w16, w17, eq               // h==16 -> /5, h==32 -> /3
dup v16.2s, w16
mul v4.2s, v4.2s, v16.2s
ushr v4.2s, v4.2s, #17
1:
sub x1, x1, #64                      // stride compensation for the #64 stores
dup v0.8h, v4.h[0]
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
2:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_tbl):
// First five entries: height handlers; last five: width handlers.
.hword L(ipred_dc_tbl) - L(ipred_dc_h64)
.hword L(ipred_dc_tbl) - L(ipred_dc_h32)
.hword L(ipred_dc_tbl) - L(ipred_dc_h16)
.hword L(ipred_dc_tbl) - L(ipred_dc_h8)
.hword L(ipred_dc_tbl) - L(ipred_dc_h4)
.hword L(ipred_dc_tbl) - L(ipred_dc_w64)
.hword L(ipred_dc_tbl) - L(ipred_dc_w32)
.hword L(ipred_dc_tbl) - L(ipred_dc_w16)
.hword L(ipred_dc_tbl) - L(ipred_dc_w8)
.hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc
// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_paeth_16bpc_neon, export=1
// Paeth prediction: for each pixel, base = left + top - topleft; output
// whichever of (left, top, topleft) is closest to base. Implemented
// branch-free with sabd (absolute differences) and cmge/bsl/bit selects.
// v4 = topleft, v5 = top; left pixels are broadcast per row via ld4r.
clz w9, w3
adr x5, L(ipred_paeth_tbl)
sub w9, w9, #25
ldrh w9, [x5, w9, uxtw #1]
ld1r {v4.8h}, [x2]                   // v4 = topleft, replicated
add x8, x2, #2                       // x8 = top row
sub x2, x2, #8                       // x2 walks the left column backwards
sub x5, x5, w9, uxtw
mov x7, #-8
add x6, x0, x1
lsl x1, x1, #1
br x5
40:
ld1r {v5.2d}, [x8]                   // 4 top pixels in both vector halves
sub v6.8h, v5.8h, v4.8h // top - topleft
4:
ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
// Pack two rows' left pixels per vector: high half = earlier row.
zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d
add v16.8h, v6.8h, v0.8h // base
add v17.8h, v6.8h, v2.8h
sabd v20.8h, v5.8h, v16.8h // tdiff
sabd v21.8h, v5.8h, v17.8h
sabd v22.8h, v4.8h, v16.8h // tldiff
sabd v23.8h, v4.8h, v17.8h
sabd v16.8h, v0.8h, v16.8h // ldiff
sabd v17.8h, v2.8h, v17.8h
umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff)
umin v19.8h, v21.8h, v23.8h
cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff
cmge v21.8h, v23.8h, v21.8h
cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff
cmge v17.8h, v19.8h, v17.8h
bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
bsl v20.16b, v5.16b, v4.16b
bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
bit v20.16b, v0.16b, v16.16b
st1 {v21.d}[1], [x0], x1             // high half first: row order
st1 {v21.d}[0], [x6], x1
subs w4, w4, #4
st1 {v20.d}[1], [x0], x1
st1 {v20.d}[0], [x6], x1
b.gt 4b
ret
80:
160:
320:
640:
// w >= 8: process 8 columns x 4 rows per inner iteration, tiling
// horizontally across the block, then stepping down 4 rows.
ld1 {v5.8h}, [x8], #16
mov w9, w3                           // w9 = saved width for row restarts
// Set up pointers for four rows in parallel; x0, x6, x5, x10
add x5, x0, x1
add x10, x6, x1
lsl x1, x1, #1
sub x1, x1, w3, uxtw #1              // stride minus row width (post-increment compensation)
1:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
2:
sub v6.8h, v5.8h, v4.8h // top - topleft
add v16.8h, v6.8h, v0.8h // base
add v17.8h, v6.8h, v1.8h
add v18.8h, v6.8h, v2.8h
add v19.8h, v6.8h, v3.8h
sabd v20.8h, v5.8h, v16.8h // tdiff
sabd v21.8h, v5.8h, v17.8h
sabd v22.8h, v5.8h, v18.8h
sabd v23.8h, v5.8h, v19.8h
sabd v24.8h, v4.8h, v16.8h // tldiff
sabd v25.8h, v4.8h, v17.8h
sabd v26.8h, v4.8h, v18.8h
sabd v27.8h, v4.8h, v19.8h
sabd v16.8h, v0.8h, v16.8h // ldiff
sabd v17.8h, v1.8h, v17.8h
sabd v18.8h, v2.8h, v18.8h
sabd v19.8h, v3.8h, v19.8h
umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff)
umin v29.8h, v21.8h, v25.8h
umin v30.8h, v22.8h, v26.8h
umin v31.8h, v23.8h, v27.8h
cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff
cmge v21.8h, v25.8h, v21.8h
cmge v22.8h, v26.8h, v22.8h
cmge v23.8h, v27.8h, v23.8h
cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff
cmge v17.8h, v29.8h, v17.8h
cmge v18.8h, v30.8h, v18.8h
cmge v19.8h, v31.8h, v19.8h
bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
bsl v22.16b, v5.16b, v4.16b
bsl v21.16b, v5.16b, v4.16b
bsl v20.16b, v5.16b, v4.16b
bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
bit v22.16b, v2.16b, v18.16b
bit v21.16b, v1.16b, v17.16b
bit v20.16b, v0.16b, v16.16b
st1 {v23.8h}, [x0], #16
st1 {v22.8h}, [x6], #16
subs w3, w3, #8
st1 {v21.8h}, [x5], #16
st1 {v20.8h}, [x10], #16
b.le 8f
ld1 {v5.8h}, [x8], #16               // next 8 top pixels of the same rows
b 2b
8:
subs w4, w4, #4
b.le 9f
// End of horizontal loop, move pointers to next four rows
sub x8, x8, w9, uxtw #1              // rewind top pointer to row start
add x0, x0, x1
add x6, x6, x1
// Load the top row as early as possible
ld1 {v5.8h}, [x8], #16
add x5, x5, x1
add x10, x10, x1
mov w3, w9                           // restore width counter
b 1b
9:
ret
L(ipred_paeth_tbl):
.hword L(ipred_paeth_tbl) - 640b
.hword L(ipred_paeth_tbl) - 320b
.hword L(ipred_paeth_tbl) - 160b
.hword L(ipred_paeth_tbl) - 80b
.hword L(ipred_paeth_tbl) - 40b
endfunc
// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_smooth_16bpc_neon, export=1
// Smooth prediction: each pixel is a weighted blend of top/bottom
// (vertical weights) and left/right (horizontal weights), i.e.
// pred = (w_ver*top + (256-w_ver)*bottom
//       + w_hor*left + (256-w_hor)*right + 256) >> 9,
// computed here as ((bottom+right)*256 + (left-right)*w_hor
//                 + (top-bottom)*w_ver + 256) >> 9.
// Weights come from the shared sm_weights[] table, indexed by w and h.
movrel x10, X(sm_weights)
add x11, x10, w4, uxtw               // x11 = vertical weights (by height)
add x10, x10, w3, uxtw               // x10 = horizontal weights (by width)
clz w9, w3
adr x5, L(ipred_smooth_tbl)
sub x12, x2, w4, uxtw #1             // x12 = &left[h-1] mirror, i.e. bottom pixel
sub w9, w9, #25
ldrh w9, [x5, w9, uxtw #1]
ld1r {v4.8h}, [x12] // bottom
add x8, x2, #2                       // x8 = top row
sub x5, x5, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
40:
sub x2, x2, #8                       // walk left column backwards
mov x7, #-8
ld1r {v6.2d}, [x8] // top
ld1r {v7.2s}, [x10] // weights_hor
dup v5.8h, v6.h[3] // right
sub v6.8h, v6.8h, v4.8h // top-bottom
uxtl v7.8h, v7.8b // weights_hor
add v31.4h, v4.4h, v5.4h // bottom+right
4:
ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
ushll v20.4s, v31.4h, #8 // (bottom+right)*256
ushll v21.4s, v31.4h, #8
ushll v22.4s, v31.4h, #8
ushll v23.4s, v31.4h, #8
zip1 v1.2d, v1.2d, v0.2d // left, flipped
zip1 v0.2d, v3.2d, v2.2d
zip1 v16.2s, v16.2s, v17.2s // weights_ver
zip1 v18.2s, v18.2s, v19.2s
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
uxtl v16.8h, v16.8b // weights_ver
uxtl v18.8h, v18.8b
smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor
smlal2 v21.4s, v0.8h, v7.8h
smlal v22.4s, v1.4h, v7.4h
smlal2 v23.4s, v1.8h, v7.8h
smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
smlal2 v21.4s, v6.8h, v16.8h
smlal v22.4s, v6.4h, v18.4h
smlal2 v23.4s, v6.8h, v18.8h
rshrn v20.4h, v20.4s, #9             // round and narrow: (... + 256) >> 9
rshrn v21.4h, v21.4s, #9
rshrn v22.4h, v22.4s, #9
rshrn v23.4h, v23.4s, #9
st1 {v20.4h}, [x0], x1
st1 {v21.4h}, [x6], x1
subs w4, w4, #4
st1 {v22.4h}, [x0], x1
st1 {v23.4h}, [x6], x1
b.gt 4b
ret
80:
sub x2, x2, #8
mov x7, #-8
ld1 {v6.8h}, [x8] // top
ld1 {v7.8b}, [x10] // weights_hor
dup v5.8h, v6.h[7] // right
sub v6.8h, v6.8h, v4.8h // top-bottom
uxtl v7.8h, v7.8b // weights_hor
add v31.4h, v4.4h, v5.4h // bottom+right
8:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
ushll v20.4s, v31.4h, #8 // (bottom+right)*256
ushll v21.4s, v31.4h, #8
ushll v22.4s, v31.4h, #8
ushll v23.4s, v31.4h, #8
ushll v24.4s, v31.4h, #8
ushll v25.4s, v31.4h, #8
ushll v26.4s, v31.4h, #8
ushll v27.4s, v31.4h, #8
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
sub v2.8h, v2.8h, v5.8h
sub v3.8h, v3.8h, v5.8h
uxtl v16.8h, v16.8b // weights_ver
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor
smlal2 v21.4s, v3.8h, v7.8h // (left flipped)
smlal v22.4s, v2.4h, v7.4h
smlal2 v23.4s, v2.8h, v7.8h
smlal v24.4s, v1.4h, v7.4h
smlal2 v25.4s, v1.8h, v7.8h
smlal v26.4s, v0.4h, v7.4h
smlal2 v27.4s, v0.8h, v7.8h
smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
smlal2 v21.4s, v6.8h, v16.8h
smlal v22.4s, v6.4h, v17.4h
smlal2 v23.4s, v6.8h, v17.8h
smlal v24.4s, v6.4h, v18.4h
smlal2 v25.4s, v6.8h, v18.8h
smlal v26.4s, v6.4h, v19.4h
smlal2 v27.4s, v6.8h, v19.8h
rshrn v20.4h, v20.4s, #9
rshrn2 v20.8h, v21.4s, #9
rshrn v21.4h, v22.4s, #9
rshrn2 v21.8h, v23.4s, #9
rshrn v22.4h, v24.4s, #9
rshrn2 v22.8h, v25.4s, #9
rshrn v23.4h, v26.4s, #9
rshrn2 v23.8h, v27.4s, #9
st1 {v20.8h}, [x0], x1
st1 {v21.8h}, [x6], x1
subs w4, w4, #4
st1 {v22.8h}, [x0], x1
st1 {v23.8h}, [x6], x1
b.gt 8b
ret
160:
320:
640:
// w >= 16: two rows at a time, tiling 16 columns per inner iteration.
add x12, x2, w3, uxtw #1
sub x1, x1, w3, uxtw #1              // post-increment compensation
ld1r {v5.8h}, [x12] // right
sub x2, x2, #4                       // walk left column backwards, 2 pixels/step
mov x7, #-4
mov w9, w3                           // w9 = saved width
add v31.4h, v4.4h, v5.4h // bottom+right
1:
ld2r {v0.8h, v1.8h}, [x2], x7 // left
ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
uxtl v16.8h, v16.8b // weights_ver
uxtl v17.8h, v17.8b
2:
ld1 {v7.16b}, [x10], #16 // weights_hor
ld1 {v2.8h, v3.8h}, [x8], #32 // top
ushll v20.4s, v31.4h, #8 // (bottom+right)*256
ushll v21.4s, v31.4h, #8
ushll v22.4s, v31.4h, #8
ushll v23.4s, v31.4h, #8
ushll v24.4s, v31.4h, #8
ushll v25.4s, v31.4h, #8
ushll v26.4s, v31.4h, #8
ushll v27.4s, v31.4h, #8
uxtl v6.8h, v7.8b // weights_hor
uxtl2 v7.8h, v7.16b
sub v2.8h, v2.8h, v4.8h // top-bottom
sub v3.8h, v3.8h, v4.8h
smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor
smlal2 v21.4s, v1.8h, v6.8h // (left flipped)
smlal v22.4s, v1.4h, v7.4h
smlal2 v23.4s, v1.8h, v7.8h
smlal v24.4s, v0.4h, v6.4h
smlal2 v25.4s, v0.8h, v6.8h
smlal v26.4s, v0.4h, v7.4h
smlal2 v27.4s, v0.8h, v7.8h
smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver
smlal2 v21.4s, v2.8h, v16.8h
smlal v22.4s, v3.4h, v16.4h
smlal2 v23.4s, v3.8h, v16.8h
smlal v24.4s, v2.4h, v17.4h
smlal2 v25.4s, v2.8h, v17.8h
smlal v26.4s, v3.4h, v17.4h
smlal2 v27.4s, v3.8h, v17.8h
rshrn v20.4h, v20.4s, #9
rshrn2 v20.8h, v21.4s, #9
rshrn v21.4h, v22.4s, #9
rshrn2 v21.8h, v23.4s, #9
rshrn v22.4h, v24.4s, #9
rshrn2 v22.8h, v25.4s, #9
rshrn v23.4h, v26.4s, #9
rshrn2 v23.8h, v27.4s, #9
subs w3, w3, #16
st1 {v20.8h, v21.8h}, [x0], #32
st1 {v22.8h, v23.8h}, [x6], #32
b.gt 2b
subs w4, w4, #2
b.le 9f
sub x8, x8, w9, uxtw #1              // rewind top pointer
sub x10, x10, w9, uxtw               // rewind horizontal weights
add x0, x0, x1
add x6, x6, x1
mov w3, w9
b 1b
9:
ret
L(ipred_smooth_tbl):
.hword L(ipred_smooth_tbl) - 640b
.hword L(ipred_smooth_tbl) - 320b
.hword L(ipred_smooth_tbl) - 160b
.hword L(ipred_smooth_tbl) - 80b
.hword L(ipred_smooth_tbl) - 40b
endfunc
// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_smooth_v_16bpc_neon, export=1
// Smooth-vertical prediction: blend each top pixel towards the bottom
// pixel with per-row weights: pred = bottom + ((top-bottom)*w + 128)>>8.
// The >>8 with rounding is done via sqrdmulh on (weight << 7):
// sqrdmulh(x, w<<7) = (2*x*(w<<7) + 2^15) >> 16 = (x*w + 128) >> 8.
movrel x7, X(sm_weights)
add x7, x7, w4, uxtw                 // x7 = vertical weights (by height)
clz w9, w3
adr x5, L(ipred_smooth_v_tbl)
sub x8, x2, w4, uxtw #1              // x8 = &left[h-1], the bottom pixel
sub w9, w9, #25
ldrh w9, [x5, w9, uxtw #1]
ld1r {v4.8h}, [x8] // bottom
add x2, x2, #2                       // x2 = top row
sub x5, x5, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
40:
ld1r {v6.2d}, [x2] // top
sub v6.8h, v6.8h, v4.8h // top-bottom
4:
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
zip1 v16.2s, v16.2s, v17.2s // weights_ver
zip1 v18.2s, v18.2s, v19.2s
ushll v16.8h, v16.8b, #7 // weights_ver << 7
ushll v18.8h, v18.8b, #7
sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
sqrdmulh v21.8h, v6.8h, v18.8h
add v20.8h, v20.8h, v4.8h            // + bottom
add v21.8h, v21.8h, v4.8h
st1 {v20.d}[0], [x0], x1
st1 {v20.d}[1], [x6], x1
subs w4, w4, #4
st1 {v21.d}[0], [x0], x1
st1 {v21.d}[1], [x6], x1
b.gt 4b
ret
80:
ld1 {v6.8h}, [x2] // top
sub v6.8h, v6.8h, v4.8h // top-bottom
8:
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
ushll v16.8h, v16.8b, #7 // weights_ver << 7
ushll v17.8h, v17.8b, #7
ushll v18.8h, v18.8b, #7
ushll v19.8h, v19.8b, #7
sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
sqrdmulh v21.8h, v6.8h, v17.8h
sqrdmulh v22.8h, v6.8h, v18.8h
sqrdmulh v23.8h, v6.8h, v19.8h
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v4.8h
add v22.8h, v22.8h, v4.8h
add v23.8h, v23.8h, v4.8h
st1 {v20.8h}, [x0], x1
st1 {v21.8h}, [x6], x1
subs w4, w4, #4
st1 {v22.8h}, [x0], x1
st1 {v23.8h}, [x6], x1
b.gt 8b
ret
160:
320:
640:
// w >= 16: four rows in parallel, tiling 16 columns per inner iteration.
// Set up pointers for four rows in parallel; x0, x6, x5, x8
add x5, x0, x1
add x8, x6, x1
lsl x1, x1, #1
sub x1, x1, w3, uxtw #1              // post-increment compensation
mov w9, w3                           // w9 = saved width
1:
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
ushll v16.8h, v16.8b, #7 // weights_ver << 7
ushll v17.8h, v17.8b, #7
ushll v18.8h, v18.8b, #7
ushll v19.8h, v19.8b, #7
2:
ld1 {v2.8h, v3.8h}, [x2], #32 // top
sub v2.8h, v2.8h, v4.8h // top-bottom
sub v3.8h, v3.8h, v4.8h
sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
sqrdmulh v21.8h, v3.8h, v16.8h
sqrdmulh v22.8h, v2.8h, v17.8h
sqrdmulh v23.8h, v3.8h, v17.8h
sqrdmulh v24.8h, v2.8h, v18.8h
sqrdmulh v25.8h, v3.8h, v18.8h
sqrdmulh v26.8h, v2.8h, v19.8h
sqrdmulh v27.8h, v3.8h, v19.8h
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v4.8h
add v22.8h, v22.8h, v4.8h
add v23.8h, v23.8h, v4.8h
add v24.8h, v24.8h, v4.8h
add v25.8h, v25.8h, v4.8h
add v26.8h, v26.8h, v4.8h
add v27.8h, v27.8h, v4.8h
subs w3, w3, #16
st1 {v20.8h, v21.8h}, [x0], #32
st1 {v22.8h, v23.8h}, [x6], #32
st1 {v24.8h, v25.8h}, [x5], #32
st1 {v26.8h, v27.8h}, [x8], #32
b.gt 2b
subs w4, w4, #4
b.le 9f
sub x2, x2, w9, uxtw #1              // rewind top pointer to row start
add x0, x0, x1
add x6, x6, x1
add x5, x5, x1
add x8, x8, x1
mov w3, w9
b 1b
9:
ret
L(ipred_smooth_v_tbl):
.hword L(ipred_smooth_v_tbl) - 640b
.hword L(ipred_smooth_v_tbl) - 320b
.hword L(ipred_smooth_v_tbl) - 160b
.hword L(ipred_smooth_v_tbl) - 80b
.hword L(ipred_smooth_v_tbl) - 40b
endfunc
// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_smooth_h_16bpc_neon, export=1
// Smooth-horizontal prediction: blend each left pixel towards the right
// pixel with per-column weights: pred = right + ((left-right)*w + 128)>>8.
// Same sqrdmulh fixed-point trick as smooth_v (weight pre-shifted by 7).
movrel x8, X(sm_weights)
add x8, x8, w3, uxtw                 // x8 = horizontal weights (by width)
clz w9, w3
adr x5, L(ipred_smooth_h_tbl)
add x12, x2, w3, uxtw #1             // x12 = &top[w-1], the right pixel
sub w9, w9, #25
ldrh w9, [x5, w9, uxtw #1]
ld1r {v5.8h}, [x12] // right
sub x5, x5, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
40:
ld1r {v7.2s}, [x8] // weights_hor
sub x2, x2, #8                       // walk left column backwards, 4 pixels/step
mov x7, #-8
ushll v7.8h, v7.8b, #7 // weights_hor << 7
4:
ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
zip1 v1.2d, v1.2d, v0.2d // left, flipped
zip1 v0.2d, v3.2d, v2.2d
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
sqrdmulh v21.8h, v1.8h, v7.8h
add v20.8h, v20.8h, v5.8h            // + right
add v21.8h, v21.8h, v5.8h
st1 {v20.d}[0], [x0], x1
st1 {v20.d}[1], [x6], x1
subs w4, w4, #4
st1 {v21.d}[0], [x0], x1
st1 {v21.d}[1], [x6], x1
b.gt 4b
ret
80:
ld1 {v7.8b}, [x8] // weights_hor
sub x2, x2, #8
mov x7, #-8
ushll v7.8h, v7.8b, #7 // weights_hor << 7
8:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
sub v3.8h, v3.8h, v5.8h // left-right
sub v2.8h, v2.8h, v5.8h
sub v1.8h, v1.8h, v5.8h
sub v0.8h, v0.8h, v5.8h
sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped)
sqrdmulh v22.8h, v1.8h, v7.8h
sqrdmulh v23.8h, v0.8h, v7.8h
add v20.8h, v20.8h, v5.8h
add v21.8h, v21.8h, v5.8h
add v22.8h, v22.8h, v5.8h
add v23.8h, v23.8h, v5.8h
st1 {v20.8h}, [x0], x1
st1 {v21.8h}, [x6], x1
subs w4, w4, #4
st1 {v22.8h}, [x0], x1
st1 {v23.8h}, [x6], x1
b.gt 8b
ret
160:
320:
640:
// w >= 16: four rows in parallel, tiling 16 columns per inner iteration.
sub x2, x2, #8
mov x7, #-8
// Set up pointers for four rows in parallel; x0, x6, x5, x10
add x5, x0, x1
add x10, x6, x1
lsl x1, x1, #1
sub x1, x1, w3, uxtw #1              // post-increment compensation
mov w9, w3                           // w9 = saved width
1:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
sub v2.8h, v2.8h, v5.8h
sub v3.8h, v3.8h, v5.8h
2:
ld1 {v7.16b}, [x8], #16 // weights_hor
ushll v6.8h, v7.8b, #7 // weights_hor << 7
ushll2 v7.8h, v7.16b, #7
sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8
sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped)
sqrdmulh v22.8h, v2.8h, v6.8h
sqrdmulh v23.8h, v2.8h, v7.8h
sqrdmulh v24.8h, v1.8h, v6.8h
sqrdmulh v25.8h, v1.8h, v7.8h
sqrdmulh v26.8h, v0.8h, v6.8h
sqrdmulh v27.8h, v0.8h, v7.8h
add v20.8h, v20.8h, v5.8h
add v21.8h, v21.8h, v5.8h
add v22.8h, v22.8h, v5.8h
add v23.8h, v23.8h, v5.8h
add v24.8h, v24.8h, v5.8h
add v25.8h, v25.8h, v5.8h
add v26.8h, v26.8h, v5.8h
add v27.8h, v27.8h, v5.8h
subs w3, w3, #16
st1 {v20.8h, v21.8h}, [x0], #32
st1 {v22.8h, v23.8h}, [x6], #32
st1 {v24.8h, v25.8h}, [x5], #32
st1 {v26.8h, v27.8h}, [x10], #32
b.gt 2b
subs w4, w4, #4
b.le 9f
sub x8, x8, w9, uxtw                 // rewind horizontal weights
add x0, x0, x1
add x6, x6, x1
add x5, x5, x1
add x10, x10, x1
mov w3, w9
b 1b
9:
ret
L(ipred_smooth_h_tbl):
.hword L(ipred_smooth_h_tbl) - 640b
.hword L(ipred_smooth_h_tbl) - 320b
.hword L(ipred_smooth_h_tbl) - 160b
.hword L(ipred_smooth_h_tbl) - 80b
.hword L(ipred_smooth_h_tbl) - 40b
endfunc
// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int filt_idx,
// const int max_width, const int max_height,
// const int bitdepth_max);
//
// FILTER_INTRA prediction, instantiated per bitdepth (10bpc uses 16-bit
// mul/mla accumulation; 12bpc needs 32-bit smull/smlal to avoid overflow).
// v16-v22 hold the seven sign-extended filter tap vectors; each 4x2 output
// tile is a weighted sum of 4 top pixels, topleft and 2 left pixels,
// clamped to [0, bitdepth_max] (v30 = 0, v31 = bitdepth_max).
.macro filter_fn bpc
function ipred_filter_\bpc\()bpc_neon
and w5, w5, #511 // filt_idx &= 511
movrel x6, X(filter_intra_taps)
lsl w5, w5, #6 // 64 bytes of taps per filter
add x6, x6, w5, uxtw
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
clz w9, w3
adr x5, L(ipred_filter\bpc\()_tbl)
ld1 {v20.8b, v21.8b, v22.8b}, [x6]
sub w9, w9, #26
ldrh w9, [x5, w9, uxtw #1]
sxtl v16.8h, v16.8b // widen the signed 8-bit taps to 16 bit
sxtl v17.8h, v17.8b
sub x5, x5, w9, uxtw
sxtl v18.8h, v18.8b
sxtl v19.8h, v19.8b
add x6, x0, x1 // x6 = dst + stride (second output row)
lsl x1, x1, #1
sxtl v20.8h, v20.8b
sxtl v21.8h, v21.8b
sxtl v22.8h, v22.8b
dup v31.8h, w8 // bitdepth_max (upper clamp)
movi v30.8h, #0 // lower clamp
br x5
40: // width == 4
ldur d0, [x2, #2] // top (0-3)
sub x2, x2, #4
mov x7, #-4 // walk the left edge downwards
4:
ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
.if \bpc == 10
mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
srshr v2.8h, v2.8h, #4
smax v2.8h, v2.8h, v30.8h
.else
smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
sqrshrun v2.4h, v2.4s, #4
sqrshrun2 v2.8h, v3.4s, #4
.endif
smin v2.8h, v2.8h, v31.8h
subs w4, w4, #2
st1 {v2.d}[0], [x0], x1
ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
st1 {v2.d}[1], [x6], x1
b.gt 4b
ret
80: // width == 8; second 4x2 tile feeds off the first tile's output
ldur q0, [x2, #2] // top (0-7)
sub x2, x2, #4
mov x7, #-4
8:
ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
.if \bpc == 10
mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
srshr v2.8h, v2.8h, #4
smax v2.8h, v2.8h, v30.8h
smin v2.8h, v2.8h, v31.8h
mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
srshr v3.8h, v3.8h, #4
smax v3.8h, v3.8h, v30.8h
.else
smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1)
smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2)
smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3)
sqrshrun v2.4h, v2.4s, #4
sqrshrun2 v2.8h, v3.4s, #4
smin v2.8h, v2.8h, v31.8h
smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4)
smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0)
smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5)
smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6)
smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0)
smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
sqrshrun v3.4h, v4.4s, #4
sqrshrun2 v3.8h, v5.4s, #4
.endif
smin v3.8h, v3.8h, v31.8h
subs w4, w4, #2
st2 {v2.d, v3.d}[0], [x0], x1
zip2 v0.2d, v2.2d, v3.2d // bottom rows become the next top
st2 {v2.d, v3.d}[1], [x6], x1
b.gt 8b
ret
160: // widths 16/32: iterate 16 columns at a time, 2 rows per pass
320:
add x8, x2, #2 // x8 = top row pointer
sub x2, x2, #4
mov x7, #-4
sub x1, x1, w3, uxtw #1 // stride compensated for in-loop dst advance
mov w9, w3 // w9 = saved width
1:
ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2)
2:
ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
srshr v3.8h, v3.8h, #4
smax v3.8h, v3.8h, v30.8h
smin v3.8h, v3.8h, v31.8h
mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
srshr v4.8h, v4.8h, #4
smax v4.8h, v4.8h, v30.8h
smin v4.8h, v4.8h, v31.8h
mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
srshr v5.8h, v5.8h, #4
smax v5.8h, v5.8h, v30.8h
smin v5.8h, v5.8h, v31.8h
mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
subs w3, w3, #16
srshr v6.8h, v6.8h, #4
smax v6.8h, v6.8h, v30.8h
.else
smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0)
smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5)
smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6)
smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1)
smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2)
smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3)
smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4)
smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0)
smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1)
smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2)
smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3)
sqrshrun v3.4h, v3.4s, #4
sqrshrun2 v3.8h, v4.4s, #4
smin v3.8h, v3.8h, v31.8h
smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4)
smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0)
smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5)
smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6)
smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0)
smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
sqrshrun v4.4h, v5.4s, #4
sqrshrun2 v4.8h, v6.4s, #4
smin v4.8h, v4.8h, v31.8h
smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
sqrshrun v5.4h, v24.4s, #4
sqrshrun2 v5.8h, v25.4s, #4
smin v5.8h, v5.8h, v31.8h
smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
subs w3, w3, #16
sqrshrun v6.4h, v26.4s, #4
sqrshrun2 v6.8h, v27.4s, #4
.endif
smin v6.8h, v6.8h, v31.8h
ins v0.h[2], v2.h[7] // carry rightmost top as next topleft
st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
ins v0.h[0], v6.h[7] // carry bottom-right outputs as next left
st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
ins v0.h[1], v6.h[3]
b.gt 2b
subs w4, w4, #2
b.le 9f
sub x8, x6, w9, uxtw #1 // new top = previous bottom output row
add x0, x0, x1
add x6, x6, x1
mov w3, w9
b 1b
9:
ret
L(ipred_filter\bpc\()_tbl):
.hword L(ipred_filter\bpc\()_tbl) - 320b
.hword L(ipred_filter\bpc\()_tbl) - 160b
.hword L(ipred_filter\bpc\()_tbl) - 80b
.hword L(ipred_filter\bpc\()_tbl) - 40b
endfunc
.endm
filter_fn 10
filter_fn 12
// Bitdepth dispatcher for the filter_fn instantiations above: the 9th
// argument (bitdepth_max, passed on the stack) selects the 10bpc variant
// (max <= 0x3ff) or the 12bpc variant.
function ipred_filter_16bpc_neon, export=1
ldr w8, [sp] // w8 = bitdepth_max (stack argument)
cmp w8, 0x3ff
b.le ipred_filter_10bpc_neon // tail call; arguments passed through unchanged
b ipred_filter_12bpc_neon
endfunc
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const int w, const int h);
//
// Palette prediction: map 8-bit palette indices to 16-bit palette entries
// via byte-wise TBL. Each index i is turned into the byte pair
// (2*i, 2*i+1) - done by doubling, self-zipping and adding v31 (0x0100 per
// 16-bit lane) - which selects the low/high bytes of pal[i] held in v30.
function pal_pred_16bpc_neon, export=1
ld1 {v30.8h}, [x2] // v30 = 8-entry palette (16 bytes, TBL source)
clz w9, w4
adr x6, L(pal_pred_tbl)
sub w9, w9, #25
ldrh w9, [x6, w9, uxtw #1]
movi v31.8h, #1, lsl #8 // 0x0100 per lane: +1 on the odd (high) byte
sub x6, x6, w9, uxtw
br x6
40: // w == 4
add x2, x0, x1
lsl x1, x1, #1
4:
ld1 {v1.16b}, [x3], #16 // 16 indices = 4 rows of 4
subs w5, w5, #4
// Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
add v1.16b, v1.16b, v1.16b
zip1 v0.16b, v1.16b, v1.16b
zip2 v1.16b, v1.16b, v1.16b
add v0.8h, v0.8h, v31.8h
add v1.8h, v1.8h, v31.8h
tbl v0.16b, {v30.16b}, v0.16b // look up palette bytes
st1 {v0.d}[0], [x0], x1
tbl v1.16b, {v30.16b}, v1.16b
st1 {v0.d}[1], [x2], x1
st1 {v1.d}[0], [x0], x1
st1 {v1.d}[1], [x2], x1
b.gt 4b
ret
80: // w == 8
add x2, x0, x1
lsl x1, x1, #1
8:
ld1 {v2.16b, v3.16b}, [x3], #32 // 32 indices = 4 rows of 8
subs w5, w5, #4
add v2.16b, v2.16b, v2.16b
add v3.16b, v3.16b, v3.16b
zip1 v0.16b, v2.16b, v2.16b
zip2 v1.16b, v2.16b, v2.16b
zip1 v2.16b, v3.16b, v3.16b
zip2 v3.16b, v3.16b, v3.16b
add v0.8h, v0.8h, v31.8h
add v1.8h, v1.8h, v31.8h
add v2.8h, v2.8h, v31.8h
add v3.8h, v3.8h, v31.8h
tbl v0.16b, {v30.16b}, v0.16b
tbl v1.16b, {v30.16b}, v1.16b
st1 {v0.8h}, [x0], x1
tbl v2.16b, {v30.16b}, v2.16b
st1 {v1.8h}, [x2], x1
tbl v3.16b, {v30.16b}, v3.16b
st1 {v2.8h}, [x0], x1
st1 {v3.8h}, [x2], x1
b.gt 8b
ret
160: // w == 16
add x2, x0, x1
lsl x1, x1, #1
16:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 // 64 indices = 4 rows of 16
subs w5, w5, #4
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b
add v7.16b, v7.16b, v7.16b
zip1 v0.16b, v4.16b, v4.16b
zip2 v1.16b, v4.16b, v4.16b
zip1 v2.16b, v5.16b, v5.16b
zip2 v3.16b, v5.16b, v5.16b
zip1 v4.16b, v6.16b, v6.16b
zip2 v5.16b, v6.16b, v6.16b
zip1 v6.16b, v7.16b, v7.16b
zip2 v7.16b, v7.16b, v7.16b
add v0.8h, v0.8h, v31.8h
add v1.8h, v1.8h, v31.8h
add v2.8h, v2.8h, v31.8h
add v3.8h, v3.8h, v31.8h
add v4.8h, v4.8h, v31.8h
tbl v0.16b, {v30.16b}, v0.16b
add v5.8h, v5.8h, v31.8h
tbl v1.16b, {v30.16b}, v1.16b
add v6.8h, v6.8h, v31.8h
tbl v2.16b, {v30.16b}, v2.16b
add v7.8h, v7.8h, v31.8h
tbl v3.16b, {v30.16b}, v3.16b
tbl v4.16b, {v30.16b}, v4.16b
tbl v5.16b, {v30.16b}, v5.16b
st1 {v0.8h, v1.8h}, [x0], x1
tbl v6.16b, {v30.16b}, v6.16b
st1 {v2.8h, v3.8h}, [x2], x1
tbl v7.16b, {v30.16b}, v7.16b
st1 {v4.8h, v5.8h}, [x0], x1
st1 {v6.8h, v7.8h}, [x2], x1
b.gt 16b
ret
320: // w == 32
add x2, x0, x1
lsl x1, x1, #1
32:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 // 64 indices = 2 rows of 32
subs w5, w5, #2
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b
add v7.16b, v7.16b, v7.16b
zip1 v0.16b, v4.16b, v4.16b
zip2 v1.16b, v4.16b, v4.16b
zip1 v2.16b, v5.16b, v5.16b
zip2 v3.16b, v5.16b, v5.16b
zip1 v4.16b, v6.16b, v6.16b
zip2 v5.16b, v6.16b, v6.16b
zip1 v6.16b, v7.16b, v7.16b
zip2 v7.16b, v7.16b, v7.16b
add v0.8h, v0.8h, v31.8h
add v1.8h, v1.8h, v31.8h
add v2.8h, v2.8h, v31.8h
add v3.8h, v3.8h, v31.8h
add v4.8h, v4.8h, v31.8h
tbl v0.16b, {v30.16b}, v0.16b
add v5.8h, v5.8h, v31.8h
tbl v1.16b, {v30.16b}, v1.16b
add v6.8h, v6.8h, v31.8h
tbl v2.16b, {v30.16b}, v2.16b
add v7.8h, v7.8h, v31.8h
tbl v3.16b, {v30.16b}, v3.16b
tbl v4.16b, {v30.16b}, v4.16b
tbl v5.16b, {v30.16b}, v5.16b
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
tbl v6.16b, {v30.16b}, v6.16b
tbl v7.16b, {v30.16b}, v7.16b
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
b.gt 32b
ret
640: // w == 64: one row per pass, split across x0 and x2 = x0 + 64 bytes
add x2, x0, #64
64:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 // 64 indices = 1 row of 64
subs w5, w5, #1
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b
add v7.16b, v7.16b, v7.16b
zip1 v0.16b, v4.16b, v4.16b
zip2 v1.16b, v4.16b, v4.16b
zip1 v2.16b, v5.16b, v5.16b
zip2 v3.16b, v5.16b, v5.16b
zip1 v4.16b, v6.16b, v6.16b
zip2 v5.16b, v6.16b, v6.16b
zip1 v6.16b, v7.16b, v7.16b
zip2 v7.16b, v7.16b, v7.16b
add v0.8h, v0.8h, v31.8h
add v1.8h, v1.8h, v31.8h
add v2.8h, v2.8h, v31.8h
add v3.8h, v3.8h, v31.8h
add v4.8h, v4.8h, v31.8h
tbl v0.16b, {v30.16b}, v0.16b
add v5.8h, v5.8h, v31.8h
tbl v1.16b, {v30.16b}, v1.16b
add v6.8h, v6.8h, v31.8h
tbl v2.16b, {v30.16b}, v2.16b
add v7.8h, v7.8h, v31.8h
tbl v3.16b, {v30.16b}, v3.16b
tbl v4.16b, {v30.16b}, v4.16b
tbl v5.16b, {v30.16b}, v5.16b
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
tbl v6.16b, {v30.16b}, v6.16b
tbl v7.16b, {v30.16b}, v7.16b
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
b.gt 64b
ret
L(pal_pred_tbl):
.hword L(pal_pred_tbl) - 640b
.hword L(pal_pred_tbl) - 320b
.hword L(pal_pred_tbl) - 160b
.hword L(pal_pred_tbl) - 80b
.hword L(pal_pred_tbl) - 40b
endfunc
// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height,
// const int16_t *ac, const int alpha,
// const int bitdepth_max);
//
// CfL prediction with fixed dc = (bitdepth_max + 1)/2 (the "128" variant).
// The L(ipred_cfl_splat_*) bodies below are shared by the other cfl entry
// points: they expect v0 = dc splat, v1 = alpha splat, v30 = 0,
// v31 = bitdepth_max, x5 = ac buffer, and emit
// clamp(dc + apply_sign((ac*alpha + 32) >> 6), 0, bitdepth_max).
function ipred_cfl_128_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
clz w9, w3
adr x7, L(ipred_cfl_128_tbl)
sub w9, w9, #26
ldrh w9, [x7, w9, uxtw #1]
urshr v0.8h, v31.8h, #1 // dc = (bitdepth_max + 1) >> 1
dup v1.8h, w6 // alpha
sub x7, x7, w9, uxtw
add x6, x0, x1 // x6 = dst + stride (second row)
lsl x1, x1, #1
movi v30.8h, #0
br x7
L(ipred_cfl_splat_w4): // 4 rows per iteration
ld1 {v4.8h, v5.8h}, [x5], #32
subs w4, w4, #4
smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
smull2 v3.4s, v4.8h, v1.8h
smull v4.4s, v5.4h, v1.4h
smull2 v5.4s, v5.8h, v1.8h
sshr v16.4s, v2.4s, #31 // sign = diff >> 31
sshr v17.4s, v3.4s, #31
sshr v18.4s, v4.4s, #31
sshr v19.4s, v5.4s, #31
add v2.4s, v2.4s, v16.4s // diff + sign
add v3.4s, v3.4s, v17.4s
add v4.4s, v4.4s, v18.4s
add v5.4s, v5.4s, v19.4s
rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
rshrn2 v2.8h, v3.4s, #6
rshrn v3.4h, v4.4s, #6
rshrn2 v3.8h, v5.4s, #6
add v2.8h, v2.8h, v0.8h // dc + apply_sign()
add v3.8h, v3.8h, v0.8h
smax v2.8h, v2.8h, v30.8h // clamp to [0, bitdepth_max]
smax v3.8h, v3.8h, v30.8h
smin v2.8h, v2.8h, v31.8h
smin v3.8h, v3.8h, v31.8h
st1 {v2.d}[0], [x0], x1
st1 {v2.d}[1], [x6], x1
st1 {v3.d}[0], [x0], x1
st1 {v3.d}[1], [x6], x1
b.gt L(ipred_cfl_splat_w4)
ret
L(ipred_cfl_splat_w8): // 2 rows per iteration
ld1 {v4.8h, v5.8h}, [x5], #32
subs w4, w4, #2
smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
smull2 v3.4s, v4.8h, v1.8h
smull v4.4s, v5.4h, v1.4h
smull2 v5.4s, v5.8h, v1.8h
sshr v16.4s, v2.4s, #31 // sign = diff >> 31
sshr v17.4s, v3.4s, #31
sshr v18.4s, v4.4s, #31
sshr v19.4s, v5.4s, #31
add v2.4s, v2.4s, v16.4s // diff + sign
add v3.4s, v3.4s, v17.4s
add v4.4s, v4.4s, v18.4s
add v5.4s, v5.4s, v19.4s
rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
rshrn2 v2.8h, v3.4s, #6
rshrn v3.4h, v4.4s, #6
rshrn2 v3.8h, v5.4s, #6
add v2.8h, v2.8h, v0.8h // dc + apply_sign()
add v3.8h, v3.8h, v0.8h
smax v2.8h, v2.8h, v30.8h // clamp to [0, bitdepth_max]
smax v3.8h, v3.8h, v30.8h
smin v2.8h, v2.8h, v31.8h
smin v3.8h, v3.8h, v31.8h
st1 {v2.8h}, [x0], x1
st1 {v3.8h}, [x6], x1
b.gt L(ipred_cfl_splat_w8)
ret
L(ipred_cfl_splat_w16): // widths >= 16: 2 rows, 16 columns per inner pass
add x7, x5, w3, uxtw #1 // x7 = ac pointer for the second row
sub x1, x1, w3, uxtw #1 // stride compensated for in-loop dst advance
mov w9, w3 // w9 = saved width
1:
ld1 {v2.8h, v3.8h}, [x5], #32
ld1 {v4.8h, v5.8h}, [x7], #32
subs w3, w3, #16
smull v16.4s, v2.4h, v1.4h // diff = ac * alpha
smull2 v17.4s, v2.8h, v1.8h
smull v18.4s, v3.4h, v1.4h
smull2 v19.4s, v3.8h, v1.8h
smull v2.4s, v4.4h, v1.4h
smull2 v3.4s, v4.8h, v1.8h
smull v4.4s, v5.4h, v1.4h
smull2 v5.4s, v5.8h, v1.8h
sshr v20.4s, v16.4s, #31 // sign = diff >> 31
sshr v21.4s, v17.4s, #31
sshr v22.4s, v18.4s, #31
sshr v23.4s, v19.4s, #31
sshr v24.4s, v2.4s, #31
sshr v25.4s, v3.4s, #31
sshr v26.4s, v4.4s, #31
sshr v27.4s, v5.4s, #31
add v16.4s, v16.4s, v20.4s // diff + sign
add v17.4s, v17.4s, v21.4s
add v18.4s, v18.4s, v22.4s
add v19.4s, v19.4s, v23.4s
add v2.4s, v2.4s, v24.4s
add v3.4s, v3.4s, v25.4s
add v4.4s, v4.4s, v26.4s
add v5.4s, v5.4s, v27.4s
rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
rshrn2 v16.8h, v17.4s, #6
rshrn v17.4h, v18.4s, #6
rshrn2 v17.8h, v19.4s, #6
rshrn v6.4h, v2.4s, #6
rshrn2 v6.8h, v3.4s, #6
rshrn v7.4h, v4.4s, #6
rshrn2 v7.8h, v5.4s, #6
add v2.8h, v16.8h, v0.8h // dc + apply_sign()
add v3.8h, v17.8h, v0.8h
add v4.8h, v6.8h, v0.8h
add v5.8h, v7.8h, v0.8h
smax v2.8h, v2.8h, v30.8h // clamp to [0, bitdepth_max]
smax v3.8h, v3.8h, v30.8h
smax v4.8h, v4.8h, v30.8h
smax v5.8h, v5.8h, v30.8h
smin v2.8h, v2.8h, v31.8h
smin v3.8h, v3.8h, v31.8h
smin v4.8h, v4.8h, v31.8h
smin v5.8h, v5.8h, v31.8h
st1 {v2.8h, v3.8h}, [x0], #32
st1 {v4.8h, v5.8h}, [x6], #32
b.gt 1b
subs w4, w4, #2 // 2 rows done
add x5, x5, w9, uxtw #1 // skip the row already consumed via x7
add x7, x7, w9, uxtw #1
add x0, x0, x1
add x6, x6, x1
mov w3, w9 // restore width counter
b.gt 1b
ret
L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
.hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
.hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
.hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
.hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc
// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height,
// const int16_t *ac, const int alpha,
// const int bitdepth_max);
//
// CfL prediction with dc = rounded average of the `width` top neighbours,
// then falls through to the shared L(ipred_cfl_splat_*) bodies.
function ipred_cfl_top_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
clz w9, w3
adr x7, L(ipred_cfl_top_tbl)
sub w9, w9, #26
ldrh w9, [x7, w9, uxtw #1]
dup v1.8h, w6 // alpha
add x2, x2, #2 // skip topleft; x2 = &top[0]
sub x7, x7, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
br x7
4: // dc = (sum of 4 top pixels + 2) >> 2
ld1 {v0.4h}, [x2]
addv h0, v0.4h
urshr v0.4h, v0.4h, #2
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w4)
8: // dc = (sum of 8 + 4) >> 3
ld1 {v0.8h}, [x2]
addv h0, v0.8h
urshr v0.4h, v0.4h, #3
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w8)
16: // dc = (sum of 16 + 8) >> 4
ld1 {v2.8h, v3.8h}, [x2]
addp v0.8h, v2.8h, v3.8h
addv h0, v0.8h
urshr v0.4h, v0.4h, #4
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
32: // dc = (sum of 32 + 16) >> 5; widen to 32 bit before the final sum
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v0.8h, v2.8h, v4.8h
uaddlv s0, v0.8h
rshrn v0.4h, v0.4s, #5
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
L(ipred_cfl_top_tbl):
.hword L(ipred_cfl_top_tbl) - 32b
.hword L(ipred_cfl_top_tbl) - 16b
.hword L(ipred_cfl_top_tbl) - 8b
.hword L(ipred_cfl_top_tbl) - 4b
endfunc
// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height,
// const int16_t *ac, const int alpha,
// const int bitdepth_max);
//
// CfL prediction with dc = rounded average of the `height` left neighbours.
// Two indirect branches: x7 picks the height-specific averaging body,
// which then jumps via x9 to the width-specific L(ipred_cfl_splat_*) body.
function ipred_cfl_left_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
sub x2, x2, w4, uxtw #1 // x2 = &left[-height] (leftmost/bottom pixel)
clz w9, w3
clz w8, w4
adr x10, L(ipred_cfl_splat_tbl)
adr x7, L(ipred_cfl_left_tbl)
sub w9, w9, #26
sub w8, w8, #26
ldrh w9, [x10, w9, uxtw #1]
ldrh w8, [x7, w8, uxtw #1]
dup v1.8h, w6 // alpha
sub x9, x10, w9, uxtw // x9 = splat body for this width
sub x7, x7, w8, uxtw // x7 = averaging body for this height
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
br x7
L(ipred_cfl_left_h4): // dc = (sum of 4 left pixels + 2) >> 2
ld1 {v0.4h}, [x2]
addv h0, v0.4h
urshr v0.4h, v0.4h, #2
dup v0.8h, v0.h[0]
br x9
L(ipred_cfl_left_h8): // dc = (sum of 8 + 4) >> 3
ld1 {v0.8h}, [x2]
addv h0, v0.8h
urshr v0.4h, v0.4h, #3
dup v0.8h, v0.h[0]
br x9
L(ipred_cfl_left_h16): // dc = (sum of 16 + 8) >> 4
ld1 {v2.8h, v3.8h}, [x2]
addp v0.8h, v2.8h, v3.8h
addv h0, v0.8h
urshr v0.4h, v0.4h, #4
dup v0.8h, v0.h[0]
br x9
L(ipred_cfl_left_h32): // dc = (sum of 32 + 16) >> 5; 32-bit final sum
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v0.8h, v2.8h, v4.8h
uaddlv s0, v0.8h
rshrn v0.4h, v0.4s, #5
dup v0.8h, v0.h[0]
br x9
L(ipred_cfl_left_tbl):
.hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
.hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
.hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
.hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height,
// const int16_t *ac, const int alpha,
// const int bitdepth_max);
//
// CfL prediction with dc = rounded average of width top + height left
// neighbours: dc = (sum + (w+h)/2) / (w+h). Division is done as a shift
// by ctz(w+h) plus, when w != h, a fixed-point multiply by 1/3 or 1/5:
// 0xAAAB ~= (1/3) << 17 and 0x6667 ~= (1/5) << 17, consumed by >> 17.
// One table serves two dispatches: x7 = height body (sums the left edge),
// x9 = width body (adds the top edge, divides, splats, jumps to splat_*).
function ipred_cfl_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
sub x2, x2, w4, uxtw #1 // x2 = &left[-height]
add w8, w3, w4 // width + height
dup v1.8h, w6 // alpha
clz w9, w3
clz w6, w4
dup v16.4s, w8 // width + height
adr x7, L(ipred_cfl_tbl)
rbit w8, w8 // rbit(width + height)
sub w9, w9, #22 // 22 leading bits, minus table offset 4
sub w6, w6, #26
clz w8, w8 // ctz(width + height)
ldrh w9, [x7, w9, uxtw #1]
ldrh w6, [x7, w6, uxtw #1]
neg w8, w8 // -ctz(width + height)
sub x9, x7, w9, uxtw // x9 = width body
sub x7, x7, w6, uxtw // x7 = height body
ushr v16.4s, v16.4s, #1 // (width + height) >> 1 = rounding bias
dup v17.4s, w8 // -ctz(width + height)
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
br x7
L(ipred_cfl_h4): // sum 4 left pixels into s0
ld1 {v0.4h}, [x2], #8
uaddlv s0, v0.4h
br x9
L(ipred_cfl_w4): // add top sum, divide by w+h, splat dc
add x2, x2, #2
ld1 {v2.4h}, [x2]
add v0.2s, v0.2s, v16.2s // + rounding bias
uaddlv s2, v2.4h
cmp w4, #4
add v0.2s, v0.2s, v2.2s
ushl v0.2s, v0.2s, v17.2s // >> ctz(w+h)
b.eq 1f // square block: shift alone suffices
// h = 8/16
cmp w4, #16
mov w16, #0x6667 // ~ (1/5) << 17 (w+h = 20)
mov w17, #0xAAAB // ~ (1/3) << 17 (w+h = 12)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17
1:
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w4)
L(ipred_cfl_h8): // sum 8 left pixels
ld1 {v0.8h}, [x2], #16
uaddlv s0, v0.8h
br x9
L(ipred_cfl_w8):
add x2, x2, #2
ld1 {v2.8h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s2, v2.8h
cmp w4, #8
add v0.2s, v0.2s, v2.2s
ushl v0.2s, v0.2s, v17.2s
b.eq 1f
// h = 4/16/32
cmp w4, #32
mov w16, #0x6667 // ~ (1/5) << 17 (w+h = 40)
mov w17, #0xAAAB // ~ (1/3) << 17 (w+h = 12 or 24)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17
1:
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w8)
L(ipred_cfl_h16): // sum 16 left pixels
ld1 {v2.8h, v3.8h}, [x2], #32
addp v0.8h, v2.8h, v3.8h
uaddlv s0, v0.8h
br x9
L(ipred_cfl_w16):
add x2, x2, #2
ld1 {v2.8h, v3.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v2.8h, v2.8h, v3.8h
uaddlv s2, v2.8h
cmp w4, #16
add v0.2s, v0.2s, v2.2s
ushl v0.2s, v0.2s, v17.2s
b.eq 1f
// h = 4/8/32
tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
mov w16, #0x6667 // ~ (1/5) << 17 (h = 4, w+h = 20)
mov w17, #0xAAAB // ~ (1/3) << 17 (h = 8/32)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17
1:
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
L(ipred_cfl_h32): // sum 32 left pixels
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v0.8h, v2.8h, v4.8h
uaddlv s0, v0.8h
br x9
L(ipred_cfl_w32):
add x2, x2, #2
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
add v0.4s, v0.4s, v16.4s
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v2.8h, v2.8h, v4.8h
cmp w4, #32
uaddlv s2, v2.8h
add v0.2s, v0.2s, v2.2s
ushl v0.2s, v0.2s, v17.2s
b.eq 1f
// h = 8/16
cmp w4, #8
mov w16, #0x6667 // ~ (1/5) << 17 (w+h = 40)
mov w17, #0xAAAB // ~ (1/3) << 17 (w+h = 48)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17
1:
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
L(ipred_cfl_tbl): // 8 entries: 4 height bodies, then 4 width bodies
.hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
.hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
.hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
.hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
.hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
.hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
.hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
.hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc
// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
// const ptrdiff_t stride, const int w_pad,
// const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_420_16bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
adr x7, L(ipred_cfl_ac_420_tbl)
sub w8, w8, #27
ldrh w8, [x7, w8, uxtw #1]
movi v24.4s, #0
movi v25.4s, #0
movi v26.4s, #0
movi v27.4s, #0
sub x7, x7, w8, uxtw
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
clz w9, w9 // ctz(width)
clz w10, w10 // ctz(height)
add w9, w9, w10 // log2sz
add x10, x1, x2
dup v31.4s, w9
lsl x2, x2, #1
neg v31.4s, v31.4s // -log2sz
br x7
L(ipred_cfl_ac_420_w4):
1: // Copy and subsample input
ld1 {v0.8h}, [x1], x2
ld1 {v1.8h}, [x10], x2
ld1 {v2.8h}, [x1], x2
ld1 {v3.8h}, [x10], x2
addp v0.8h, v0.8h, v2.8h
addp v1.8h, v1.8h, v3.8h
add v0.8h, v0.8h, v1.8h
shl v0.8h, v0.8h, #1
subs w8, w8, #2
st1 {v0.8h}, [x0], #16
uaddw v24.4s, v24.4s, v0.4h
uaddw2 v25.4s, v25.4s, v0.8h
b.gt 1b
trn2 v1.2d, v0.2d, v0.2d
trn2 v0.2d, v0.2d, v0.2d
L(ipred_cfl_ac_420_w4_hpad):
cbz w4, 3f
2: // Vertical padding (h_pad > 0)
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], #32
uaddw v24.4s, v24.4s, v0.4h
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
b.gt 2b
3:
L(ipred_cfl_ac_420_w4_calc_subtract_dc):
// Aggregate the sums
add v24.4s, v24.4s, v25.4s
add v26.4s, v26.4s, v27.4s
add v0.4s, v24.4s, v26.4s
addv s0, v0.4s // sum
sub x0, x0, w6, uxtw #3
urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
dup v4.8h, v4.h[0]
6: // Subtract dc from ac
ld1 {v0.8h, v1.8h}, [x0]
subs w6, w6, #4
sub v0.8h, v0.8h, v4.8h
sub v1.8h, v1.8h, v4.8h
st1 {v0.8h, v1.8h}, [x0], #32
b.gt 6b
ret
L(ipred_cfl_ac_420_w8):
cbnz w3, L(ipred_cfl_ac_420_w8_wpad)
1: // Copy and subsample input, without padding
ld1 {v0.8h, v1.8h}, [x1], x2
ld1 {v2.8h, v3.8h}, [x10], x2
ld1 {v4.8h, v5.8h}, [x1], x2
addp v0.8h, v0.8h, v1.8h
ld1 {v6.8h, v7.8h}, [x10], x2
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
add v0.8h, v0.8h, v2.8h
add v4.8h, v4.8h, v6.8h
shl v0.8h, v0.8h, #1
shl v1.8h, v4.8h, #1
subs w8, w8, #2
st1 {v0.8h, v1.8h}, [x0], #32
uaddw v24.4s, v24.4s, v0.4h
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
b.gt 1b
mov v0.16b, v1.16b
b L(ipred_cfl_ac_420_w8_hpad)
L(ipred_cfl_ac_420_w8_wpad):
1: // Copy and subsample input, padding 4
ld1 {v0.8h}, [x1], x2
ld1 {v1.8h}, [x10], x2
ld1 {v2.8h}, [x1], x2
ld1 {v3.8h}, [x10], x2
addp v0.8h, v0.8h, v2.8h
addp v1.8h, v1.8h, v3.8h
add v0.8h, v0.8h, v1.8h
shl v0.8h, v0.8h, #1
dup v1.4h, v0.h[3]
dup v3.4h, v0.h[7]
trn2 v2.2d, v0.2d, v0.2d
subs w8, w8, #2
st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
uaddw v24.4s, v24.4s, v0.4h
uaddw v25.4s, v25.4s, v1.4h
uaddw v26.4s, v26.4s, v2.4h
uaddw v27.4s, v27.4s, v3.4h
b.gt 1b
trn1 v0.2d, v2.2d, v3.2d
trn1 v1.2d, v2.2d, v3.2d
L(ipred_cfl_ac_420_w8_hpad):
// Vertical padding for the 8-wide case: {v0,v1} hold the last two
// (identical or consecutive-last) output rows to replicate.
cbz w4, 3f // no vertical padding needed
2: // Vertical padding (h_pad > 0)
subs w4, w4, #4 // four padded rows per iteration
st1 {v0.8h, v1.8h}, [x0], #32
uaddw v24.4s, v24.4s, v0.4h // padded rows also feed the DC sums
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
st1 {v0.8h, v1.8h}, [x0], #32
uaddw v24.4s, v24.4s, v0.4h
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
b.gt 2b
3:
// Double the height and reuse the w4 summing/subtracting
lsl w6, w6, #1
lsl w9, w9, #1
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
L(ipred_cfl_ac_420_w16):
// 16-wide 4:2:0: dispatch on w_pad (w3 = 0..3) through the per-padding
// jump table; entries are offsets back from the table base.
adr x7, L(ipred_cfl_ac_420_w16_tbl)
ldrh w3, [x7, w3, uxtw #1]
sub x7, x7, w3, uxtw
br x7
L(ipred_cfl_ac_420_w16_wpad0):
1: // Copy and subsample input, without padding (32 real luma pixels/row)
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // luma row 0
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 // luma row 1
addp v0.8h, v0.8h, v1.8h // horizontal pair sums, row 0
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h // pair sums, row 1
addp v6.8h, v6.8h, v7.8h
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 // luma row 2
add v0.8h, v0.8h, v4.8h // 2x2 sums -> output row 0 (left half)
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 // luma row 3
add v2.8h, v2.8h, v6.8h // output row 0 (right half)
addp v16.8h, v16.8h, v17.8h // pair sums, row 2
addp v18.8h, v18.8h, v19.8h
addp v20.8h, v20.8h, v21.8h // pair sums, row 3
addp v22.8h, v22.8h, v23.8h
add v16.8h, v16.8h, v20.8h // 2x2 sums -> output row 1 (left half)
add v18.8h, v18.8h, v22.8h // output row 1 (right half)
shl v0.8h, v0.8h, #1 // scale; the later DC pass normalizes
shl v1.8h, v2.8h, #1
shl v2.8h, v16.8h, #1
shl v3.8h, v18.8h, #1
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_420_w16_wpad1):
1: // Copy and subsample input, padding 4 (24 real luma pixels -> 12 outputs)
ldr q2, [x1, #32] // luma row 0, pixels 16-23
ld1 {v0.8h, v1.8h}, [x1], x2 // luma row 0, pixels 0-15
ldr q5, [x10, #32] // luma row 1, pixels 16-23
ld1 {v3.8h, v4.8h}, [x10], x2 // luma row 1, pixels 0-15
addp v2.8h, v2.8h, v2.8h // pair sums for columns 8-11 (duplicated)
addp v0.8h, v0.8h, v1.8h // pair sums for columns 0-7
addp v5.8h, v5.8h, v5.8h
addp v3.8h, v3.8h, v4.8h
ldr q18, [x1, #32] // luma row 2, pixels 16-23
add v2.4h, v2.4h, v5.4h // 2x2 sums, out row 0 columns 8-11
ld1 {v16.8h, v17.8h}, [x1], x2 // luma row 2, pixels 0-15
add v0.8h, v0.8h, v3.8h // 2x2 sums, out row 0 columns 0-7
ldr q21, [x10, #32] // luma row 3, pixels 16-23
ld1 {v19.8h, v20.8h}, [x10], x2 // luma row 3, pixels 0-15
addp v18.8h, v18.8h, v18.8h
addp v16.8h, v16.8h, v17.8h
addp v21.8h, v21.8h, v21.8h
addp v19.8h, v19.8h, v20.8h
add v18.4h, v18.4h, v21.4h // 2x2 sums, out row 1 columns 8-11
add v16.8h, v16.8h, v19.8h // 2x2 sums, out row 1 columns 0-7
shl v1.4h, v2.4h, #1 // scale; the later DC pass normalizes
shl v0.8h, v0.8h, #1
shl v3.4h, v18.4h, #1
shl v2.8h, v16.8h, #1
dup v4.4h, v1.h[3] // pad columns 12-15 with the last real column
dup v5.4h, v3.h[3]
trn1 v1.2d, v1.2d, v4.2d // out row 0: real cols 8-11 + padding
trn1 v3.2d, v3.2d, v5.2d // out row 1: real cols 8-11 + padding
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_420_w16_wpad2):
1: // Copy and subsample input, padding 8 (16 real luma pixels -> 8 outputs)
ld1 {v0.8h, v1.8h}, [x1], x2 // luma row 0
ld1 {v2.8h, v3.8h}, [x10], x2 // luma row 1
ld1 {v4.8h, v5.8h}, [x1], x2 // luma row 2
addp v0.8h, v0.8h, v1.8h // horizontal pair sums, row 0
ld1 {v6.8h, v7.8h}, [x10], x2 // luma row 3
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
add v0.8h, v0.8h, v2.8h // 2x2 sums -> output row 0 (left half)
add v4.8h, v4.8h, v6.8h // 2x2 sums -> output row 1 (left half)
shl v0.8h, v0.8h, #1 // scale; the later DC pass normalizes
shl v2.8h, v4.8h, #1
dup v1.8h, v0.h[7] // pad right half of out row 0 with last real column
dup v3.8h, v2.h[7] // pad right half of out row 1
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_420_w16_wpad3):
1: // Copy and subsample input, padding 12 (8 real luma pixels -> 4 outputs)
ld1 {v0.8h}, [x1], x2 // luma row 0
ld1 {v2.8h}, [x10], x2 // luma row 1
ld1 {v4.8h}, [x1], x2 // luma row 2
ld1 {v6.8h}, [x10], x2 // luma row 3
addp v0.8h, v0.8h, v4.8h // pair sums: low 4h = row 0, high 4h = row 2
addp v2.8h, v2.8h, v6.8h // pair sums: low 4h = row 1, high 4h = row 3
add v0.8h, v0.8h, v2.8h // 2x2 sums: low = out row 0, high = out row 1
shl v0.8h, v0.8h, #1
dup v1.8h, v0.h[3] // padding value for out row 0
dup v3.8h, v0.h[7] // padding value for out row 1
trn2 v2.2d, v0.2d, v3.2d // out row 1: real cols + padding start
trn1 v0.2d, v0.2d, v1.2d // out row 0: real cols + padding start
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_420_w16_hpad):
// Vertical padding for the 16-wide case: {v0,v1},{v2,v3} hold the last
// two output rows to replicate downwards.
cbz w4, 3f // no vertical padding needed
2: // Vertical padding (h_pad > 0)
subs w4, w4, #4 // four padded rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // padded rows also feed the DC sums
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 2b
3:
// Quadruple the height and reuse the w4 summing/subtracting
lsl w6, w6, #2
lsl w9, w9, #2
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
L(ipred_cfl_ac_420_tbl):
// Width dispatch table: offsets back from the table base, indexed by
// clz(width) (ascending clz: w16, w8, w4; the last entry is padding).
.hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
.hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
.hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
.hword 0
L(ipred_cfl_ac_420_w16_tbl):
// 16-wide padding dispatch table, indexed by w_pad (0..3).
.hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
.hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
.hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
.hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc
// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
// 4:2:2 chroma-from-luma AC: horizontal-only subsampling. Shares the
// hpad/DC-subtract tails with the 4:2:0 function above.
// x0=ac, x1=ypx, x2=stride, w3=w_pad, w4=h_pad, w5=cw (width), w6=ch (height)
function ipred_cfl_ac_422_16bpc_neon, export=1
clz w8, w5 // classify width (4/8/16) for the jump table
lsl w4, w4, #2 // scale h_pad; presumably units of 4 rows -- matches caller
adr x7, L(ipred_cfl_ac_422_tbl)
sub w8, w8, #27 // clz(16)=27 -> index 0, width 8 -> 1, width 4 -> 2
ldrh w8, [x7, w8, uxtw #1]
movi v24.4s, #0 // zero the four sum accumulators for the DC pass
movi v25.4s, #0
movi v26.4s, #0
movi v27.4s, #0
sub x7, x7, w8, uxtw
sub w8, w6, w4 // height - h_pad = rows actually read from ypx
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
clz w9, w9 // ctz(width)
clz w10, w10 // ctz(height)
add w9, w9, w10 // log2sz = log2(width*height)
add x10, x1, x2 // x10 = second input row pointer
dup v31.4s, w9
lsl x2, x2, #1 // both pointers advance two rows at a time
neg v31.4s, v31.4s // -log2sz, for the rounding right shift
br x7
L(ipred_cfl_ac_422_w4):
1: // Copy and subsample input (2x1 horizontal sums, scaled <<2)
ld1 {v0.8h}, [x1], x2 // luma row 0
ld1 {v1.8h}, [x10], x2 // luma row 1
ld1 {v2.8h}, [x1], x2 // luma row 2
ld1 {v3.8h}, [x10], x2 // luma row 3
addp v0.8h, v0.8h, v1.8h // pair sums: low = out row 0, high = out row 1
addp v2.8h, v2.8h, v3.8h // pair sums: low = out row 2, high = out row 3
shl v0.8h, v0.8h, #2 // <<2 matches the <<1 of the 2x2 sums in 4:2:0
shl v1.8h, v2.8h, #2
subs w8, w8, #4 // four output rows per iteration (no vertical subsample)
st1 {v0.8h, v1.8h}, [x0], #32
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
b.gt 1b
trn2 v0.2d, v1.2d, v1.2d // v0 = v1 = last output row,
trn2 v1.2d, v1.2d, v1.2d // for vertical padding
b L(ipred_cfl_ac_420_w4_hpad)
L(ipred_cfl_ac_422_w8):
cbnz w3, L(ipred_cfl_ac_422_w8_wpad)
1: // Copy and subsample input, without padding
ld1 {v0.8h, v1.8h}, [x1], x2 // luma row 0
ld1 {v2.8h, v3.8h}, [x10], x2 // luma row 1
ld1 {v4.8h, v5.8h}, [x1], x2 // luma row 2
addp v0.8h, v0.8h, v1.8h // 2x1 pair sums -> output row 0
ld1 {v6.8h, v7.8h}, [x10], x2 // luma row 3
addp v2.8h, v2.8h, v3.8h // output row 1
addp v4.8h, v4.8h, v5.8h // output row 2
addp v6.8h, v6.8h, v7.8h // output row 3
shl v0.8h, v0.8h, #2 // scale the 2x1 sums (<<2)
shl v1.8h, v2.8h, #2
shl v2.8h, v4.8h, #2
shl v3.8h, v6.8h, #2
subs w8, w8, #4 // four output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v3.16b // v0 = v1 = last output row, for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w8_hpad)
L(ipred_cfl_ac_422_w8_wpad):
1: // Copy and subsample input, padding 4 (left 4 outputs real per row)
ld1 {v0.8h}, [x1], x2 // luma row 0
ld1 {v1.8h}, [x10], x2 // luma row 1
ld1 {v2.8h}, [x1], x2 // luma row 2
ld1 {v3.8h}, [x10], x2 // luma row 3
addp v0.8h, v0.8h, v1.8h // pair sums: low = out row 0, high = out row 1
addp v2.8h, v2.8h, v3.8h // pair sums: low = out row 2, high = out row 3
shl v0.8h, v0.8h, #2
shl v2.8h, v2.8h, #2
dup v4.4h, v0.h[3] // padding value for out row 0
dup v5.8h, v0.h[7] // padding value for out row 1
dup v6.4h, v2.h[3] // padding value for out row 2
dup v7.8h, v2.h[7] // padding value for out row 3
trn2 v1.2d, v0.2d, v5.2d // out row 1: real cols + padding
trn1 v0.2d, v0.2d, v4.2d // out row 0: real cols + padding
trn2 v3.2d, v2.2d, v7.2d // out row 3: real cols + padding
trn1 v2.2d, v2.2d, v6.2d // out row 2: real cols + padding
subs w8, w8, #4 // four output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v3.16b // v0 = v1 = last output row, for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w8_hpad)
L(ipred_cfl_ac_422_w16):
// 16-wide 4:2:2: dispatch on w_pad (w3 = 0..3) via the jump table.
adr x7, L(ipred_cfl_ac_422_w16_tbl)
ldrh w3, [x7, w3, uxtw #1]
sub x7, x7, w3, uxtw
br x7
L(ipred_cfl_ac_422_w16_wpad0):
1: // Copy and subsample input, without padding (32 real luma pixels/row)
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // luma row 0
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 // luma row 1
addp v0.8h, v0.8h, v1.8h // 2x1 pair sums -> out row 0 (left half)
addp v2.8h, v2.8h, v3.8h // out row 0 (right half)
addp v4.8h, v4.8h, v5.8h // out row 1 (left half)
addp v6.8h, v6.8h, v7.8h // out row 1 (right half)
shl v0.8h, v0.8h, #2 // scale the 2x1 sums (<<2)
shl v1.8h, v2.8h, #2
shl v2.8h, v4.8h, #2
shl v3.8h, v6.8h, #2
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_422_w16_wpad1):
1: // Copy and subsample input, padding 4 (24 real luma pixels -> 12 outputs)
ldr q2, [x1, #32] // luma row 0, pixels 16-23
ld1 {v0.8h, v1.8h}, [x1], x2 // luma row 0, pixels 0-15
ldr q6, [x10, #32] // luma row 1, pixels 16-23
ld1 {v4.8h, v5.8h}, [x10], x2 // luma row 1, pixels 0-15
addp v2.8h, v2.8h, v2.8h // pair sums, row 0 cols 8-11 (duplicated)
addp v0.8h, v0.8h, v1.8h // pair sums, row 0 cols 0-7
addp v6.8h, v6.8h, v6.8h // pair sums, row 1 cols 8-11
addp v4.8h, v4.8h, v5.8h // pair sums, row 1 cols 0-7
shl v1.4h, v2.4h, #2 // scale the 2x1 sums (<<2)
shl v0.8h, v0.8h, #2
shl v3.4h, v6.4h, #2
shl v2.8h, v4.8h, #2
dup v4.4h, v1.h[3] // pad cols 12-15 with the last real column
dup v5.4h, v3.h[3]
trn1 v1.2d, v1.2d, v4.2d // out row 0: real cols 8-11 + padding
trn1 v3.2d, v3.2d, v5.2d // out row 1: real cols 8-11 + padding
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_422_w16_wpad2):
1: // Copy and subsample input, padding 8 (16 real luma pixels -> 8 outputs)
ld1 {v0.8h, v1.8h}, [x1], x2 // luma row 0
ld1 {v2.8h, v3.8h}, [x10], x2 // luma row 1
addp v0.8h, v0.8h, v1.8h // 2x1 pair sums -> out row 0 (left half)
addp v2.8h, v2.8h, v3.8h // out row 1 (left half)
shl v0.8h, v0.8h, #2 // scale the 2x1 sums (<<2)
shl v2.8h, v2.8h, #2
dup v1.8h, v0.h[7] // pad right half of out row 0 with last real column
dup v3.8h, v2.h[7] // pad right half of out row 1
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_422_w16_wpad3):
1: // Copy and subsample input, padding 12 (8 real luma pixels -> 4 outputs)
ld1 {v0.8h}, [x1], x2 // luma row 0
ld1 {v2.8h}, [x10], x2 // luma row 1
addp v0.8h, v0.8h, v0.8h // 2x1 pair sums, out row 0 (duplicated)
addp v2.8h, v2.8h, v2.8h // out row 1
shl v0.4h, v0.4h, #2 // scale the 2x1 sums (<<2)
shl v2.4h, v2.4h, #2
dup v1.8h, v0.h[3] // padding value for out row 0
dup v3.8h, v2.h[3] // padding value for out row 1
trn1 v0.2d, v0.2d, v1.2d // out row 0: real cols + padding start
trn1 v2.2d, v2.2d, v3.2d // out row 1: real cols + padding start
subs w8, w8, #2 // two output rows per iteration
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
uaddw v24.4s, v24.4s, v0.4h // accumulate sums for the DC pass
uaddw2 v25.4s, v25.4s, v0.8h
uaddw v26.4s, v26.4s, v1.4h
uaddw2 v27.4s, v27.4s, v1.8h
uaddw v24.4s, v24.4s, v2.4h
uaddw2 v25.4s, v25.4s, v2.8h
uaddw v26.4s, v26.4s, v3.4h
uaddw2 v27.4s, v27.4s, v3.8h
b.gt 1b
mov v0.16b, v2.16b // keep last output row {v0,v1} for vertical padding
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_422_tbl):
// Width dispatch table: offsets back from the table base, indexed by
// clz(width)-27 (w16, w8, w4; the last entry is padding).
.hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
.hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
.hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
.hword 0
L(ipred_cfl_ac_422_w16_tbl):
// 16-wide padding dispatch table, indexed by w_pad (0..3).
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc