ref: e3dbf92664918ecc830b4fde74b7cc0f6cd2065c
dir: /src/arm/64/ipred.S/
/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/arm/asm.S"
#include "util.S"

// Conventions used by every predictor in this group:
//  - Arguments arrive in prototype order: x0 = dst, x1 = stride,
//    x2 = topleft, w3 = width, w4 = height.
//  - Each function ends with a table of 16-bit entries, each holding
//    "table address - label address".  The entry is loaded with ldrh and
//    subtracted from the table address to obtain the branch target.
//  - clz(width) - 25 is 0, 1, 2, 3, 4 for width 64, 32, 16, 8, 4, which is
//    the order in which the tables list their entries.
//  - x6 = dst + stride and x1 is doubled, so x0 and x6 walk alternate rows.
//    Every store loop writes four rows per iteration (subs w4, w4, #4).

// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Fills the width x height block with the constant value 128.
function ipred_dc_128_neon, export=1
        clz     w3, w3
        adr     x5, L(ipred_dc_128_tbl)
        sub     w3, w3, #25
        ldrh    w3, [x5, w3, uxtw #1]
        movi    v0.16b, #128
        sub     x5, x5, w3, uxtw
        add     x6, x0, x1
        lsl     x1, x1, #1
        br      x5
4:
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        subs    w4, w4, #4
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        b.gt    4b
        ret
8:
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        b.gt    8b
        ret
16:
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    16b
        ret
320:
        // Entry point for width 32: one more register of 128s is needed.
        movi    v1.16b, #128
32:
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        b.gt    32b
        ret
640:
        // Entry point for width 64: three more registers of 128s.
        movi    v1.16b, #128
        movi    v2.16b, #128
        movi    v3.16b, #128
64:
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt    64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) - 16b
        .hword L(ipred_dc_128_tbl) - 8b
        .hword L(ipred_dc_128_tbl) - 4b
endfunc

// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft,
//                   const int width, const int height, const int a,
//                   const int max_width, const int max_height);
//
// Vertical prediction: loads `width` pixels starting at topleft + 1 once
// and stores that same row into every row of the block.
function ipred_v_neon, export=1
        clz     w3, w3
        adr     x5, L(ipred_v_tbl)
        sub     w3, w3, #25
        ldrh    w3, [x5, w3, uxtw #1]
        add     x2, x2, #1              // x2 = first pixel of the top row
        sub     x5, x5, w3, uxtw
        add     x6, x0, x1
        lsl     x1, x1, #1
        br      x5
40:
        ld1     {v0.s}[0], [x2]
4:
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        subs    w4, w4, #4
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        b.gt    4b
        ret
80:
        ld1     {v0.8b}, [x2]
8:
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        b.gt    8b
        ret
160:
        // x2 is not read again after this load, so no address writeback is
        // needed (same addressing form as the other width paths).
        ld1     {v0.16b}, [x2]
16:
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    16b
        ret
320:
        ld1     {v0.16b, v1.16b}, [x2]
32:
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        b.gt    32b
        ret
640:
        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
64:
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt    64b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) - 80b
        .hword L(ipred_v_tbl) - 40b
endfunc

// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft,
//                   const int width, const int height, const int a,
//                   const int max_width, const int max_height);
//
// Horizontal prediction: every output row is filled with one left-edge
// pixel.  The left pixels are read four at a time from topleft - 4, moving
// towards lower addresses (x7 = -4).  ld4r broadcasts the four bytes into
// v0..v3, so v3 holds the byte nearest to topleft and is stored first.
function ipred_h_neon, export=1
        clz     w3, w3
        adr     x5, L(ipred_h_tbl)
        sub     w3, w3, #25
        ldrh    w3, [x5, w3, uxtw #1]
        sub     x2, x2, #4
        sub     x5, x5, w3, uxtw
        mov     x7, #-4
        add     x6, x0, x1
        lsl     x1, x1, #1
        br      x5
4:
        ld4r    {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
        st1     {v3.s}[0], [x0], x1
        st1     {v2.s}[0], [x6], x1
        subs    w4, w4, #4
        st1     {v1.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        b.gt    4b
        ret
8:
        ld4r    {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
        st1     {v3.8b}, [x0], x1
        st1     {v2.8b}, [x6], x1
        subs    w4, w4, #4
        st1     {v1.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        b.gt    8b
        ret
16:
        ld4r    {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
        st1     {v3.16b}, [x0], x1
        st1     {v2.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v1.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    16b
        ret
32:
        ld4r    {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
        // Bytes 16..31 of each row are written with an offset store before
        // the post-incrementing st1 writes bytes 0..15 and advances the row.
        str     q3, [x0, #16]
        str     q2, [x6, #16]
        st1     {v3.16b}, [x0], x1
        st1     {v2.16b}, [x6], x1
        subs    w4, w4, #4
        str     q1, [x0, #16]
        str     q0, [x6, #16]
        st1     {v1.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    32b
        ret
64:
        ld4r    {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
        // Same scheme as width 32, with stp covering bytes 32..63.
        str     q3, [x0, #16]
        str     q2, [x6, #16]
        stp     q3, q3, [x0, #32]
        stp     q2, q2, [x6, #32]
        st1     {v3.16b}, [x0], x1
        st1     {v2.16b}, [x6], x1
        subs    w4, w4, #4
        str     q1, [x0, #16]
        str     q0, [x6, #16]
        stp     q1, q1, [x0, #32]
        stp     q0, q0, [x6, #32]
        st1     {v1.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) - 8b
        .hword L(ipred_h_tbl) - 4b
endfunc

// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `width` pixels that start
// at topleft + 1.  uaddlv sums the bytes and rshrn performs the rounding
// division by the (power of two) number of summed bytes.
function ipred_dc_top_neon, export=1
        clz     w3, w3
        adr     x5, L(ipred_dc_top_tbl)
        sub     w3, w3, #25
        ldrh    w3, [x5, w3, uxtw #1]
        add     x2, x2, #1
        sub     x5, x5, w3, uxtw
        add     x6, x0, x1
        lsl     x1, x1, #1
        br      x5
40:
        // ld1r duplicates the four pixels into both halves of the vector,
        // so the sum covers 8 bytes and the shift is 3, as for width 8.
        ld1r    {v0.2s}, [x2]
        uaddlv  h0, v0.8b
        rshrn   v0.8b, v0.8h, #3
        dup     v0.8b, v0.b[0]
4:
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        subs    w4, w4, #4
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        b.gt    4b
        ret
80:
        ld1     {v0.8b}, [x2]
        uaddlv  h0, v0.8b
        rshrn   v0.8b, v0.8h, #3
        dup     v0.8b, v0.b[0]
8:
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        b.gt    8b
        ret
160:
        ld1     {v0.16b}, [x2]
        uaddlv  h0, v0.16b
        rshrn   v0.8b, v0.8h, #4
        dup     v0.16b, v0.b[0]
16:
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    16b
        ret
320:
        ld1     {v0.16b, v1.16b}, [x2]
        uaddlv  h0, v0.16b
        uaddlv  h1, v1.16b
        add     v2.4h, v0.4h, v1.4h
        rshrn   v2.8b, v2.8h, #5
        dup     v0.16b, v2.b[0]
        dup     v1.16b, v2.b[0]
32:
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        b.gt    32b
        ret
640:
        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv  h0, v0.16b
        uaddlv  h1, v1.16b
        uaddlv  h2, v2.16b
        uaddlv  h3, v3.16b
        add     v4.4h, v0.4h, v1.4h
        add     v5.4h, v2.4h, v3.4h
        add     v4.4h, v4.4h, v5.4h
        rshrn   v4.8b, v4.8h, #6
        dup     v0.16b, v4.b[0]
        dup     v1.16b, v4.b[0]
        dup     v2.16b, v4.b[0]
        dup     v3.16b, v4.b[0]
64:
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt    64b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) - 80b
        .hword L(ipred_dc_top_tbl) - 40b
endfunc

// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` pixels that
// start at topleft - height.
//
// Dispatch is two-stage and uses one 10-entry table: entries 0-4 are the
// height-specific averaging stubs (h64..h4), entries 5-9 the width-specific
// store loops (w64..w4).  x5 is the averaging stub for this height and is
// entered first; x3 is the store loop for this width, entered by the stub's
// final "br x3".
function ipred_dc_left_neon, export=1
        sub     x2, x2, w4, uxtw        // x2 = topleft - height
        clz     w3, w3
        clz     w7, w4
        adr     x5, L(ipred_dc_left_tbl)
        sub     w3, w3, #20             // 25 leading bits, minus table offset 5
        sub     w7, w7, #25
        ldrh    w3, [x5, w3, uxtw #1]
        ldrh    w7, [x5, w7, uxtw #1]
        sub     x3, x5, w3, uxtw
        sub     x5, x5, w7, uxtw
        add     x6, x0, x1
        lsl     x1, x1, #1
        br      x5

L(ipred_dc_left_h4):
        // Four pixels duplicated into 8 bytes: the sum is doubled, shift by 3.
        ld1r    {v0.2s}, [x2]
        uaddlv  h0, v0.8b
        rshrn   v0.8b, v0.8h, #3
        dup     v0.16b, v0.b[0]
        br      x3
L(ipred_dc_left_w4):
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        subs    w4, w4, #4
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        b.gt    L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        ld1     {v0.8b}, [x2]
        uaddlv  h0, v0.8b
        rshrn   v0.8b, v0.8h, #3
        dup     v0.16b, v0.b[0]
        br      x3
L(ipred_dc_left_w8):
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        b.gt    L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        ld1     {v0.16b}, [x2]
        uaddlv  h0, v0.16b
        rshrn   v0.8b, v0.8h, #4
        dup     v0.16b, v0.b[0]
        br      x3
L(ipred_dc_left_w16):
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    L(ipred_dc_left_w16)
        ret

L(ipred_dc_left_h32):
        ld1     {v0.16b, v1.16b}, [x2]
        uaddlv  h0, v0.16b
        uaddlv  h1, v1.16b
        add     v0.4h, v0.4h, v1.4h
        rshrn   v0.8b, v0.8h, #5
        dup     v0.16b, v0.b[0]
        br      x3
L(ipred_dc_left_w32):
        // The averaging stubs only fill v0; replicate it for the wide store.
        mov     v1.16b, v0.16b
1:
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        b.gt    1b
        ret

L(ipred_dc_left_h64):
        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv  h0, v0.16b
        uaddlv  h1, v1.16b
        uaddlv  h2, v2.16b
        uaddlv  h3, v3.16b
        add     v0.4h, v0.4h, v1.4h
        add     v2.4h, v2.4h, v3.4h
        add     v0.4h, v0.4h, v2.4h
        rshrn   v0.8b, v0.8h, #6
        dup     v0.16b, v0.b[0]
        br      x3
L(ipred_dc_left_w64):
        mov     v1.16b, v0.16b
        mov     v2.16b, v0.16b
        mov     v3.16b, v0.16b
1:
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt    1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
//                    const pixel *const topleft,
//                    const int width, const int height, const int a,
//                    const int max_width, const int max_height);
//
// Fills the block with
//     (sum(left) + sum(top) + ((width + height) >> 1)) / (width + height).
//
// Same two-stage dispatch as ipred_dc_left_neon: the h-stub (x5) sums the
// `height` left pixels into v0 and leaves x2 at topleft; the w-stub (x3)
// adds the rounding term (v16) and the sum of the top row, then divides.
// The division is a right shift by ctz(width + height) (ushl by the
// negative count held in v17).  For non-square blocks width + height is
// 3 or 5 times a power of two, so the shifted value is then multiplied by
// 0x5556 (~65536/3) or 0x3334 (~65536/5).  The constants are pre-halved
// because sqdmulh doubles the product before taking the high half.
function ipred_dc_neon, export=1
        sub     x2, x2, w4, uxtw
        add     w7, w3, w4              // width + height
        clz     w3, w3
        clz     w6, w4
        dup     v16.8h, w7              // width + height
        adr     x5, L(ipred_dc_tbl)
        rbit    w7, w7                  // rbit(width + height)
        sub     w3, w3, #20             // 25 leading bits, minus table offset 5
        sub     w6, w6, #25
        clz     w7, w7                  // ctz(width + height)
        ldrh    w3, [x5, w3, uxtw #1]
        ldrh    w6, [x5, w6, uxtw #1]
        neg     w7, w7                  // -ctz(width + height)
        sub     x3, x5, w3, uxtw
        sub     x5, x5, w6, uxtw
        ushr    v16.8h, v16.8h, #1      // (width + height) >> 1
        dup     v17.8h, w7              // -ctz(width + height)
        add     x6, x0, x1
        lsl     x1, x1, #1
        br      x5

L(ipred_dc_h4):
        ld1     {v0.s}[0], [x2], #4
        ins     v0.s[1], wzr            // clear lanes 4-7 so only 4 bytes are summed
        uaddlv  h0, v0.8b
        br      x3
L(ipred_dc_w4):
        add     x2, x2, #1
        ld1     {v1.s}[0], [x2]
        ins     v1.s[1], wzr
        add     v0.4h, v0.4h, v16.4h
        uaddlv  h1, v1.8b
        cmp     w4, #4
        add     v0.4h, v0.4h, v1.4h
        ushl    v0.4h, v0.4h, v17.4h
        b.eq    1f
        // h = 8/16
        // Both multipliers are packed into w16 and the wanted one is shifted
        // into the low half: a shift by 16 (h = 8) selects 0x5556/2; a shift
        // by 32 (h = 16) is taken modulo 32 and leaves 0x3334/2 in place.
        mov     w16, #(0x3334/2)
        movk    w16, #(0x5556/2), lsl #16
        add     w17, w4, w4             // w17 = 2*h = 16 or 32
        lsr     w16, w16, w17
        dup     v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup     v0.8b, v0.b[0]
2:
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        subs    w4, w4, #4
        st1     {v0.s}[0], [x0], x1
        st1     {v0.s}[0], [x6], x1
        b.gt    2b
        ret

L(ipred_dc_h8):
        ld1     {v0.8b}, [x2], #8
        uaddlv  h0, v0.8b
        br      x3
L(ipred_dc_w8):
        add     x2, x2, #1
        ld1     {v1.8b}, [x2]
        add     v0.4h, v0.4h, v16.4h
        uaddlv  h1, v1.8b
        cmp     w4, #8
        add     v0.4h, v0.4h, v1.4h
        ushl    v0.4h, v0.4h, v17.4h
        b.eq    1f
        // h = 4/16/32
        cmp     w4, #32
        mov     w16, #(0x3334/2)
        mov     w17, #(0x5556/2)
        csel    w16, w16, w17, eq       // h == 32: 1/5, otherwise 1/3
        dup     v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup     v0.8b, v0.b[0]
2:
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x6], x1
        b.gt    2b
        ret

L(ipred_dc_h16):
        ld1     {v0.16b}, [x2], #16
        uaddlv  h0, v0.16b
        br      x3
L(ipred_dc_w16):
        add     x2, x2, #1
        ld1     {v1.16b}, [x2]
        add     v0.4h, v0.4h, v16.4h
        uaddlv  h1, v1.16b
        cmp     w4, #16
        add     v0.4h, v0.4h, v1.4h
        ushl    v0.4h, v0.4h, v17.4h
        b.eq    1f
        // h = 4/8/32/64
        tst     w4, #(32+16+8)          // 16 added to make a consecutive bitmask
        mov     w16, #(0x3334/2)
        mov     w17, #(0x5556/2)
        csel    w16, w16, w17, eq       // h == 4 or 64: 1/5, h == 8 or 32: 1/3
        dup     v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup     v0.16b, v0.b[0]
2:
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x6], x1
        b.gt    2b
        ret

L(ipred_dc_h32):
        ld1     {v0.16b, v1.16b}, [x2], #32
        uaddlv  h0, v0.16b
        uaddlv  h1, v1.16b
        add     v0.4h, v0.4h, v1.4h
        br      x3
L(ipred_dc_w32):
        add     x2, x2, #1
        ld1     {v1.16b, v2.16b}, [x2]
        add     v0.4h, v0.4h, v16.4h
        uaddlv  h1, v1.16b
        uaddlv  h2, v2.16b
        cmp     w4, #32
        add     v0.4h, v0.4h, v1.4h
        add     v0.4h, v0.4h, v2.4h
        ushl    v0.4h, v0.4h, v17.4h
        b.eq    1f
        // h = 8/16/64
        cmp     w4, #8
        mov     w16, #(0x3334/2)
        mov     w17, #(0x5556/2)
        csel    w16, w16, w17, eq       // h == 8: 1/5, otherwise 1/3
        dup     v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup     v0.16b, v0.b[0]
        dup     v1.16b, v0.b[0]
2:
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b}, [x0], x1
        st1     {v0.16b, v1.16b}, [x6], x1
        b.gt    2b
        ret

L(ipred_dc_h64):
        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
        uaddlv  h0, v0.16b
        uaddlv  h1, v1.16b
        uaddlv  h2, v2.16b
        uaddlv  h3, v3.16b
        add     v0.4h, v0.4h, v1.4h
        add     v2.4h, v2.4h, v3.4h
        add     v0.4h, v0.4h, v2.4h
        br      x3
L(ipred_dc_w64):
        // v0 holds the sum of the left pixels here; v1-v4 receive the top
        // row, so nothing needs to be copied out of v0 first.
        add     x2, x2, #1
        ld1     {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
        add     v0.4h, v0.4h, v16.4h
        uaddlv  h1, v1.16b
        uaddlv  h2, v2.16b
        uaddlv  h3, v3.16b
        uaddlv  h4, v4.16b
        add     v1.4h, v1.4h, v2.4h
        add     v3.4h, v3.4h, v4.4h
        cmp     w4, #64
        add     v0.4h, v0.4h, v1.4h
        add     v0.4h, v0.4h, v3.4h
        ushl    v0.4h, v0.4h, v17.4h
        b.eq    1f
        // h = 16/32
        // Packed multipliers again: a shift by 16 (h = 16) selects 0x3334/2;
        // a shift by 32 (h = 32) is taken modulo 32 and leaves 0x5556/2.
        mov     w16, #(0x5556/2)
        movk    w16, #(0x3334/2), lsl #16
        lsr     w16, w16, w4
        dup     v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup     v0.16b, v0.b[0]
        dup     v1.16b, v0.b[0]
        dup     v2.16b, v0.b[0]
        dup     v3.16b, v0.b[0]
2:
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs    w4, w4, #4
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt    2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc

// void ipred_paeth_neon(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int
max_height); function ipred_paeth_neon, export=1 clz w9, w3 adr x5, L(ipred_paeth_tbl) sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x2] add x8, x2, #1 sub x2, x2, #4 sub x5, x5, w9, uxtw mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 br x5 40: ld1r {v5.4s}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 zip1 v0.2s, v0.2s, v1.2s zip1 v2.2s, v2.2s, v3.2s uaddw v16.8h, v6.8h, v0.8b uaddw v17.8h, v6.8h, v2.8b sqxtun v16.8b, v16.8h // base sqxtun2 v16.16b, v17.8h zip1 v0.2d, v0.2d, v2.2d uabd v20.16b, v5.16b, v16.16b // tdiff uabd v22.16b, v4.16b, v16.16b // tldiff uabd v16.16b, v0.16b, v16.16b // ldiff umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ... st1 {v20.s}[3], [x0], x1 st1 {v20.s}[2], [x6], x1 subs w4, w4, #4 st1 {v20.s}[1], [x0], x1 st1 {v20.s}[0], [x6], x1 b.gt 4b ret 80: ld1r {v5.2d}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 uaddw v16.8h, v6.8h, v0.8b uaddw v17.8h, v6.8h, v1.8b uaddw v18.8h, v6.8h, v2.8b uaddw v19.8h, v6.8h, v3.8b sqxtun v16.8b, v16.8h // base sqxtun2 v16.16b, v17.8h sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h zip1 v2.2d, v2.2d, v3.2d zip1 v0.2d, v0.2d, v1.2d uabd v21.16b, v5.16b, v18.16b // tdiff uabd v20.16b, v5.16b, v16.16b uabd v23.16b, v4.16b, v18.16b // tldiff uabd v22.16b, v4.16b, v16.16b uabd v17.16b, v2.16b, v18.16b // ldiff uabd v16.16b, v0.16b, v16.16b umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) umin v18.16b, v20.16b, v22.16b cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff cmhs v20.16b, v22.16b, v20.16b cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff cmhs v16.16b, v18.16b, v16.16b bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? 
top : topleft bsl v20.16b, v5.16b, v4.16b bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... bit v20.16b, v0.16b, v16.16b st1 {v21.d}[1], [x0], x1 st1 {v21.d}[0], [x6], x1 subs w4, w4, #4 st1 {v20.d}[1], [x0], x1 st1 {v20.d}[0], [x6], x1 b.gt 8b ret 160: 320: 640: ld1 {v5.16b}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw 1: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 2: usubl v6.8h, v5.8b, v4.8b // top - topleft usubl2 v7.8h, v5.16b, v4.16b uaddw v24.8h, v6.8h, v0.8b uaddw v25.8h, v7.8h, v0.8b uaddw v26.8h, v6.8h, v1.8b uaddw v27.8h, v7.8h, v1.8b uaddw v28.8h, v6.8h, v2.8b uaddw v29.8h, v7.8h, v2.8b uaddw v30.8h, v6.8h, v3.8b uaddw v31.8h, v7.8h, v3.8b sqxtun v17.8b, v26.8h // base sqxtun2 v17.16b, v27.8h sqxtun v16.8b, v24.8h sqxtun2 v16.16b, v25.8h sqxtun v19.8b, v30.8h sqxtun2 v19.16b, v31.8h sqxtun v18.8b, v28.8h sqxtun2 v18.16b, v29.8h uabd v23.16b, v5.16b, v19.16b // tdiff uabd v22.16b, v5.16b, v18.16b uabd v21.16b, v5.16b, v17.16b uabd v20.16b, v5.16b, v16.16b uabd v27.16b, v4.16b, v19.16b // tldiff uabd v26.16b, v4.16b, v18.16b uabd v25.16b, v4.16b, v17.16b uabd v24.16b, v4.16b, v16.16b uabd v19.16b, v3.16b, v19.16b // ldiff uabd v18.16b, v2.16b, v18.16b uabd v17.16b, v1.16b, v17.16b uabd v16.16b, v0.16b, v16.16b umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) umin v30.16b, v22.16b, v26.16b umin v29.16b, v21.16b, v25.16b umin v28.16b, v20.16b, v24.16b cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff cmhs v22.16b, v26.16b, v22.16b cmhs v21.16b, v25.16b, v21.16b cmhs v20.16b, v24.16b, v20.16b cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff cmhs v18.16b, v30.16b, v18.16b cmhs v17.16b, v29.16b, v17.16b cmhs v16.16b, v28.16b, v16.16b bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v22.16b, v5.16b, v4.16b bsl v21.16b, v5.16b, v4.16b bsl v20.16b, v5.16b, v4.16b bit v23.16b, v3.16b, v19.16b // ldiff <= min ? 
left : ... bit v22.16b, v2.16b, v18.16b bit v21.16b, v1.16b, v17.16b bit v20.16b, v0.16b, v16.16b subs w3, w3, #16 st1 {v23.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v21.16b}, [x5], #16 st1 {v20.16b}, [x10], #16 b.le 8f ld1 {v5.16b}, [x8], #16 b 2b 8: subs w4, w4, #4 b.le 9f // End of horizontal loop, move pointers to next four rows sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 // Load the top row as early as possible ld1 {v5.16b}, [x8], #16 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_paeth_tbl): .hword L(ipred_paeth_tbl) - 640b .hword L(ipred_paeth_tbl) - 320b .hword L(ipred_paeth_tbl) - 160b .hword L(ipred_paeth_tbl) - 80b .hword L(ipred_paeth_tbl) - 40b endfunc // void ipred_smooth_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_tbl) sub x12, x2, w4, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x12] // bottom add x8, x2, #1 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: sub x2, x2, #4 mov x7, #-4 ld1r {v6.2s}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor dup v5.16b, v6.b[3] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 zip1 v1.2s, v1.2s, v0.2s // left, flipped zip1 v0.2s, v3.2s, v2.2s zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s shll v22.8h, v4.8b, #8 // bottom*256 shll v23.8h, v4.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h mla 
v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v23.8h, v6.8h, v18.8h uhadd v20.8h, v20.8h, v22.8h uhadd v21.8h, v21.8h, v23.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 subs w4, w4, #4 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x6], x1 b.gt 4b ret 80: sub x2, x2, #4 mov x7, #-4 ld1 {v6.8b}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor dup v5.16b, v6.b[7] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b usubl v2.8h, v2.8b, v5.8b usubl v3.8h, v3.8b, v5.8b shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v2.8h, v7.8h // (left flipped) mla v22.8h, v1.8h, v7.8h mla v23.8h, v0.8h, v7.8h mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v6.8h, v17.8h mla v26.8h, v6.8h, v18.8h mla v27.8h, v6.8h, v19.8h uhadd v20.8h, v20.8h, v24.8h uhadd v21.8h, v21.8h, v25.8h uhadd v22.8h, v22.8h, v26.8h uhadd v23.8h, v23.8h, v27.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v20.8b}, [x0], x1 st1 {v21.8b}, [x6], x1 subs w4, w4, #4 st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: add x12, x2, w3, uxtw sub x2, x2, #2 mov x7, #-2 ld1r {v5.16b}, [x12] // right sub x1, x1, w3, uxtw mov w9, w3 1: ld2r {v0.8b, v1.8b}, [x2], x7 // left ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b 
// weights_ver uxtl v17.8h, v17.8b 2: ld1 {v7.16b}, [x10], #16 // weights_hor ld1 {v3.16b}, [x8], #16 // top shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b usubl v2.8h, v3.8b, v4.8b // top-bottom usubl2 v3.8h, v3.16b, v4.16b mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h // (left flipped) mla v22.8h, v0.8h, v6.8h mla v23.8h, v0.8h, v7.8h shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v3.8h, v16.8h mla v26.8h, v2.8h, v17.8h mla v27.8h, v3.8h, v17.8h uhadd v20.8h, v20.8h, v24.8h uhadd v21.8h, v21.8h, v25.8h uhadd v22.8h, v22.8h, v26.8h uhadd v23.8h, v23.8h, v27.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x8, w9, uxtw sub x10, x10, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_tbl): .hword L(ipred_smooth_tbl) - 640b .hword L(ipred_smooth_tbl) - 320b .hword L(ipred_smooth_tbl) - 160b .hword L(ipred_smooth_tbl) - 80b .hword L(ipred_smooth_tbl) - 40b endfunc // void ipred_smooth_v_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 adr x5, L(ipred_smooth_v_tbl) sub x8, x2, w4, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x8] // bottom add x2, x2, #1 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: ld1r {v6.2s}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 4: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver shll v22.8h, v4.8b, 
#8 // bottom*256 shll v23.8h, v4.8b, #8 zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v23.8h, v6.8h, v18.8h rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v22.s}[0], [x0], x1 st1 {v22.s}[1], [x6], x1 subs w4, w4, #4 st1 {v23.s}[0], [x0], x1 st1 {v23.s}[1], [x6], x1 b.gt 4b ret 80: ld1 {v6.8b}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 8: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v6.8h, v17.8h mla v26.8h, v6.8h, v18.8h mla v27.8h, v6.8h, v19.8h rshrn v24.8b, v24.8h, #8 rshrn v25.8b, v25.8h, #8 rshrn v26.8b, v26.8h, #8 rshrn v27.8b, v27.8h, #8 st1 {v24.8b}, [x0], x1 st1 {v25.8b}, [x6], x1 subs w4, w4, #4 st1 {v26.8b}, [x0], x1 st1 {v27.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b 2: ld1 {v3.16b}, [x2], #16 // top shll v20.8h, v4.8b, #8 // bottom*256 shll v21.8h, v4.8b, #8 shll v22.8h, v4.8b, #8 shll v23.8h, v4.8b, #8 shll v24.8h, v4.8b, #8 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 usubl v2.8h, v3.8b, v4.8b // top-bottom usubl2 v3.8h, v3.16b, v4.16b mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v21.8h, v3.8h, v16.8h mla v22.8h, v2.8h, v17.8h mla v23.8h, v3.8h, v17.8h mla v24.8h, v2.8h, v18.8h mla v25.8h, v3.8h, v18.8h mla v26.8h, v2.8h, v19.8h mla v27.8h, v3.8h, 
v19.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 rshrn v24.8b, v24.8h, #8 rshrn2 v24.16b, v25.8h, #8 rshrn v26.8b, v26.8h, #8 rshrn2 v26.16b, v27.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v24.16b}, [x5], #16 st1 {v26.16b}, [x8], #16 b.gt 2b subs w4, w4, #4 b.le 9f sub x2, x2, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x8, x8, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_v_tbl): .hword L(ipred_smooth_v_tbl) - 640b .hword L(ipred_smooth_v_tbl) - 320b .hword L(ipred_smooth_v_tbl) - 160b .hword L(ipred_smooth_v_tbl) - 80b .hword L(ipred_smooth_v_tbl) - 40b endfunc // void ipred_smooth_h_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_h_tbl) add x12, x2, w3, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v5.16b}, [x12] // right sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 uxtl v7.8h, v7.8b // weights_hor 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 zip1 v1.2s, v1.2s, v0.2s // left, flipped zip1 v0.2s, v3.2s, v2.2s usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 subs w4, w4, #4 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x6], x1 b.gt 4b ret 80: ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 uxtl v7.8h, v7.8b // weights_hor 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll 
v23.8h, v5.8b, #8 usubl v3.8h, v3.8b, v5.8b // left-right usubl v2.8h, v2.8b, v5.8b usubl v1.8h, v1.8b, v5.8b usubl v0.8h, v0.8b, v5.8b mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v2.8h, v7.8h // (left flipped) mla v22.8h, v1.8h, v7.8h mla v23.8h, v0.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v20.8b}, [x0], x1 st1 {v21.8b}, [x6], x1 subs w4, w4, #4 st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: sub x2, x2, #4 mov x7, #-4 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b usubl v2.8h, v2.8b, v5.8b usubl v3.8h, v3.8b, v5.8b 2: ld1 {v7.16b}, [x8], #16 // weights_hor shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 shll v24.8h, v5.8b, #8 shll v25.8h, v5.8b, #8 shll v26.8h, v5.8b, #8 shll v27.8h, v5.8b, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor mla v21.8h, v3.8h, v7.8h // (left flipped) mla v22.8h, v2.8h, v6.8h mla v23.8h, v2.8h, v7.8h mla v24.8h, v1.8h, v6.8h mla v25.8h, v1.8h, v7.8h mla v26.8h, v0.8h, v6.8h mla v27.8h, v0.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 rshrn v24.8b, v24.8h, #8 rshrn2 v24.16b, v25.8h, #8 rshrn v26.8b, v26.8h, #8 rshrn2 v26.16b, v27.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v24.16b}, [x5], #16 st1 {v26.16b}, [x10], #16 b.gt 2b subs w4, w4, #4 b.le 9f sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 640b .hword L(ipred_smooth_h_tbl) - 320b .hword 
L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) - 80b
        .hword L(ipred_smooth_h_tbl) - 40b
endfunc

// void ipred_filter_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int filt_idx,
//                        const int max_width, const int max_height);
//
// Filter-based intra prediction. The output is produced in 4x2 sub-blocks;
// each sub-block is a 7-tap weighted sum of the pixel above-left of it (p0),
// the four pixels above it (p1-p4) and the two pixels left of it (p5-p6),
// rounded, shifted right by 4 and saturated to 8 bit.
//
// Registers as read by the code (arguments in prototype order):
//   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height,
//   w5 = filt_idx. max_width/max_height are never read.
// v16-v22 hold taps 0-6, sign-extended to 16 bit. Every accumulator has 8
// lanes: lanes 0-3 become the first row of a sub-block (stored via x0) and
// lanes 4-7 the second row (stored via x6).
function ipred_filter_neon, export=1
        // Keep only the low 9 bits of filt_idx.
        // NOTE(review): presumably strips flag bits packed above the index -
        // confirm against the caller.
        and             w5, w5, #511
        movrel          x6, X(filter_intra_taps)
        lsl             w5, w5, #6              // 64 bytes of taps per filter
        add             x6, x6, w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        // Jump table index: clz(width) - 26 gives 0 for 32 ... 3 for 4.
        clz             w9, w3
        adr             x5, L(ipred_filter_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9, w9, #26
        ldrh            w9, [x5, w9, uxtw #1]
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
        sub             x5, x5, w9, uxtw        // table stores (table - target)
        sxtl            v18.8h, v18.8b
        sxtl            v19.8h, v19.8b
        add             x6, x0, x1              // x6 = second row of each row pair
        lsl             x1, x1, #1              // both row pointers step by 2*stride
        sxtl            v20.8h, v20.8b
        sxtl            v21.8h, v21.8b
        sxtl            v22.8h, v22.8b
        br              x5
40:
        // Width 4: one sub-block per row pair.
        ldur            s0, [x2, #1] // top (0-3)
        sub             x2, x2, #2              // x2 -> left[1]; bytes: left[1], left[0], topleft
        mov             x7, #-2                 // the left pointer walks 2 rows per iteration
        uxtl            v0.8h, v0.8b // top (0-3)
4:
        ld1             {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
        mul             v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h, v1.8b // left (0-1) + topleft (2)
        mla             v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.8b, v2.8h, #4        // round, >> 4, saturate to u8
        subs            w4, w4, #2
        st1             {v2.s}[0], [x0], x1
        uxtl            v0.8h, v2.8b            // the second output row is the next top row
        st1             {v2.s}[1], [x6], x1
        ext             v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3]
        b.gt            4b
        ret
80:
        // Width 8: two sub-blocks per row pair. The second sub-block uses
        // the first one's right column as its left neighbours.
        ldur            d0, [x2, #1] // top (0-7)
        sub             x2, x2, #2
        mov             x7, #-2
        uxtl            v0.8h, v0.8b // top (0-7)
8:
        ld1             {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
        mul             v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h, v1.8b // left (0-1) + topleft (2)
        mla             v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        // Second sub-block: the p1-p4 comments below are relative to that
        // sub-block, i.e. top[4-7] of the row.
        mul             v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.8b, v2.8h, #4
        uxtl            v1.8h, v2.8b // first block, in 16 bit
        mla             v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.8b, v3.8h, #4
        subs            w4, w4, #2
        st2             {v2.s, v3.s}[0], [x0], x1 // row 0 of both sub-blocks
        zip2            v0.2s, v2.2s, v3.2s     // row 1 of both sub-blocks = next top row
        st2             {v2.s, v3.s}[1], [x6], x1 // row 1 of both sub-blocks
        uxtl            v0.8h, v0.8b
        b.gt            8b
        ret
160:
320:
        // Widths 16 and 32: each row pair is handled in 16-pixel chunks of
        // four chained sub-blocks (inner loop 2:), outer loop 1: per row pair.
        add             x8, x2, #1              // x8 = top row
        sub             x2, x2, #2
        mov             x7, #-2
        sub             x1, x1, w3, uxtw        // 2*stride - width, undoes the #16 chunk advances
        mov             w9, w3                  // keep width for the next row pair

1:
        ld1             {v0.s}[0], [x2], x7 // left (0-1) + topleft (2)
        uxtl            v0.8h, v0.8b // left (0-1) + topleft (2)
2:
        ld1             {v2.16b}, [x8], #16 // top(0-15)
        mul             v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
        uxtl            v1.8h, v2.8b // top(0-7)
        uxtl2           v2.8h, v2.16b // top(8-15)
        mla             v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.8b, v3.8h, #4
        uxtl            v0.8h, v3.8b // first block, in 16 bit
        mla             v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.8b, v4.8h, #4
        uxtl            v0.8h, v4.8b // second block, in 16 bit
        mla             v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.8b, v5.8h, #4
        uxtl            v0.8h, v5.8b // third block, in 16 bit
        mla             v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)

        subs            w3, w3, #16
        sqrshrun        v6.8b, v6.8h, #4

        // Seed v0 for the next 16-pixel chunk: topleft = top[15], and the
        // left pair = right column of the fourth sub-block. The high bytes
        // of h[0]/h[1] are already zero from the uxtl above.
        ins             v0.h[2], v2.h[7]
        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
        ins             v0.b[0], v6.b[7]
        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
        ins             v0.b[2], v6.b[3]
        b.gt            2b
        subs            w4, w4, #2
        b.le            9f
        // x6 has advanced by width, so x6 - width is the start of the second
        // row just written; it is the top row for the next row pair.
        sub             x8, x6, w9, uxtw
        add             x0, x0, x1
        add             x6, x6, x1
        mov             w3, w9
        b               1b
9:
        ret

// Offsets are stored as (table - target); indexed by clz(width) - 26.
L(ipred_filter_tbl):
        .hword L(ipred_filter_tbl) - 320b
        .hword L(ipred_filter_tbl) - 160b
        .hword L(ipred_filter_tbl) - 80b
        .hword L(ipred_filter_tbl) - 40b
endfunc

// void pal_pred_neon(pixel *dst, const ptrdiff_t stride,
//                    const uint16_t *const pal, const uint8_t *idx,
//                    const int w, const int h);
//
// Palette prediction: every byte read from idx is used as an index into an
// 8-entry palette, and the looked-up pixel is written to dst.
//
// Registers as read by the code (arguments in prototype order):
//   x0 = dst, x1 = stride, x2 = pal (reused as dst + stride once the palette
//   is loaded), x3 = idx, w4 = w, w5 = h.
// idx is consumed linearly, w bytes per output row.
function pal_pred_neon, export=1
        ld1             {v0.8h}, [x2]           // eight 16-bit palette entries
        // Jump table index: clz(w) - 25 gives 0 for 64 ... 4 for 4.
        clz             w9, w4
        adr             x6, L(pal_pred_tbl)
        sub             w9, w9, #25
        ldrh            w9, [x6, w9, uxtw #1]
        xtn             v0.8b, v0.8h            // narrow entries to bytes for tbl
        sub             x6, x6, w9, uxtw
        add             x2, x0, x1              // x2 = odd rows, x0 = even rows
        lsl             x1, x1, #1
        br              x6
4:
        // 16 indices = 4 rows of 4 pixels.
        ld1             {v1.16b}, [x3], #16
        subs            w5, w5, #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
8:
        // 32 indices = 4 rows of 8 pixels.
        ld1             {v1.16b, v2.16b}, [x3], #32
        subs            w5, w5, #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
16:
        // 64 indices = 4 rows of 16 pixels.
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
        subs            w5, w5, #4
        tbl             v1.16b, {v0.16b}, v1.16b
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
32:
        // 128 indices = 4 rows of 32 pixels.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5, w5, #4
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
64:
        // 128 indices = 2 rows of 64 pixels, hence the row counter drops by 2.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5, w5, #2
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret

// Offsets are stored as (table - target); indexed by clz(w) - 25.
L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 64b
        .hword L(pal_pred_tbl) - 32b
        .hword L(pal_pred_tbl) - 16b
        .hword L(pal_pred_tbl) - 8b
        .hword L(pal_pred_tbl) - 4b
endfunc

// void ipred_cfl_128_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height,
//                         const int16_t *ac, const int alpha);
//
// Chroma-from-luma prediction with a fixed dc of 128. For every pixel:
//   diff = ac * alpha; sign = diff >> 15;
//   dst  = saturate_u8(dc + ((diff + sign + 32) >> 6))
//
// Registers as read by the code (arguments in prototype order):
//   x0 = dst, x1 = stride, w3 = width, w4 = height, x5 = ac, w6 = alpha.
//   topleft (x2) is not read by this variant.
//
// The L(ipred_cfl_splat_w*) loops below are also branched to by the other
// ipred_cfl_* functions in this file. On entry they use: v0.8h = dc,
// v1.8h = alpha, x0/x6 = even/odd row pointers, x1 = 2*stride, x5 = ac,
// w3 = width, w4 = height.
function ipred_cfl_128_neon, export=1
        // Jump table index: clz(width) - 26 gives 0 for 32 ... 3 for 4.
        clz             w9, w3
        adr             x7, L(ipred_cfl_128_tbl)
        sub             w9, w9, #26
        ldrh            w9, [x7, w9, uxtw #1]
        movi            v0.8h, #128 // dc
        dup             v1.8h, w6 // alpha
        sub             x7, x7, w9, uxtw
        add             x6, x0, x1              // w6 was consumed above; x6 = odd rows
        lsl             x1, x1, #1
        br              x7
L(ipred_cfl_splat_w4):
        // 16 ac values = 4 rows of 4 pixels per iteration.
        ld1             {v2.8h, v3.8h}, [x5], #32
        mul             v2.8h, v2.8h, v1.8h // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        sshr            v4.8h, v2.8h, #15 // sign = diff >> 15
        sshr            v5.8h, v3.8h, #15
        add             v2.8h, v2.8h, v4.8h // diff + sign
        add             v3.8h, v3.8h, v5.8h
        srshr           v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        add             v2.8h, v2.8h, v0.8h // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        sqxtun          v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[1], [x6], x1
        subs            w4, w4, #4
        st1             {v3.s}[0], [x0], x1
        st1             {v3.s}[1], [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        // 32 ac values = 4 rows of 8 pixels per iteration.
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
        mul             v2.8h, v2.8h, v1.8h // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        mul             v4.8h, v4.8h, v1.8h
        mul             v5.8h, v5.8h, v1.8h
        sshr            v16.8h, v2.8h, #15 // sign = diff >> 15
        sshr            v17.8h, v3.8h, #15
        sshr            v18.8h, v4.8h, #15
        sshr            v19.8h, v5.8h, #15
        add             v2.8h, v2.8h, v16.8h // diff + sign
        add             v3.8h, v3.8h, v17.8h
        add             v4.8h, v4.8h, v18.8h
        add             v5.8h, v5.8h, v19.8h
        srshr           v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        srshr           v4.8h, v4.8h, #6
        srshr           v5.8h, v5.8h, #6
        add             v2.8h, v2.8h, v0.8h // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        add             v4.8h, v4.8h, v0.8h
        add             v5.8h, v5.8h, v0.8h
        sqxtun          v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x6], x1
        subs            w4, w4, #4
        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        // Widths 16 and 32: two rows at a time, 16 pixels per inner pass.
        add             x7, x5, w3, uxtw #1     // x7 = ac for the odd row (width int16 further on)
        sub             x1, x1, w3, uxtw        // 2*stride - width, undoes the #16 advances
        mov             w9, w3                  // keep width for the next row pair
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h, v2.8h, v1.8h // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        mul             v4.8h, v4.8h, v1.8h
        mul             v5.8h, v5.8h, v1.8h
        sshr            v16.8h, v2.8h, #15 // sign = diff >> 15
        sshr            v17.8h, v3.8h, #15
        sshr            v18.8h, v4.8h, #15
        sshr            v19.8h, v5.8h, #15
        add             v2.8h, v2.8h, v16.8h // diff + sign
        add             v3.8h, v3.8h, v17.8h
        add             v4.8h, v4.8h, v18.8h
        add             v5.8h, v5.8h, v19.8h
        srshr           v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        srshr           v4.8h, v4.8h, #6
        srshr           v5.8h, v5.8h, #6
        add             v2.8h, v2.8h, v0.8h // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        add             v4.8h, v4.8h, v0.8h
        add             v5.8h, v5.8h, v0.8h
        sqxtun          v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        subs            w3, w3, #16
        st1             {v2.8b, v3.8b}, [x0], #16
        st1             {v4.8b, v5.8b}, [x6], #16
        b.gt            1b
        // Row pair done. The flags set by this subs are consumed by the
        // b.gt below; the add/mov in between do not modify them.
        subs            w4, w4, #2
        add             x5, x5, w9, uxtw #1     // skip the row the other pointer just read
        add             x7, x7, w9, uxtw #1
        add             x0, x0, x1
        add             x6, x6, x1
        mov             w3, w9
        b.gt            1b
        ret

// One table under two names: ipred_cfl_128_neon indexes it directly and
// ipred_cfl_left_neon takes its address as L(ipred_cfl_splat_tbl). Offsets
// are stored as (table - target), indexed by clz(width) - 26; widths 32 and
// 16 share the w16 loop.
L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc

// void ipred_cfl_top_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height,
//                         const int16_t *ac, const int alpha);
//
// Chroma-from-luma prediction where dc is the rounded average of the `width`
// pixels starting at topleft + 1. After computing dc in v0.8h the code
// branches into the shared L(ipred_cfl_splat_w*) loops.
//   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height, x5 = ac,
//   w6 = alpha.
function ipred_cfl_top_neon, export=1
        // Jump table index: clz(width) - 26 gives 0 for 32 ... 3 for 4.
        clz             w9, w3
        adr             x7, L(ipred_cfl_top_tbl)
        sub             w9, w9, #26
        ldrh            w9, [x7, w9, uxtw #1]
        dup             v1.8h, w6 // alpha
        add             x2, x2, #1              // x2 = first pixel of the top row
        sub             x7, x7, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7
4:
        // ld1r duplicates the 4 pixels, so the 8-byte sum is twice the real
        // sum and the rounding shift by 3 yields the average of 4.
        ld1r            {v0.2s}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.8h, v0.8h, #3
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        ld1             {v0.8b}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.8h, v0.8h, #3        // (sum + 4) >> 3
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        ld1             {v0.16b}, [x2]
        uaddlv          h0, v0.16b
        urshr           v0.8h, v0.8h, #4        // (sum + 8) >> 4
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v2.4h, v2.4h, v3.4h
        urshr           v2.8h, v2.8h, #5        // (sum + 16) >> 5
        dup             v0.8h, v2.h[0]
        b               L(ipred_cfl_splat_w16)

// Offsets are stored as (table - target); indexed by clz(width) - 26.
L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) - 8b
        .hword L(ipred_cfl_top_tbl) - 4b
endfunc

// void ipred_cfl_left_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height,
//                          const int16_t *ac, const int alpha);
//
// Chroma-from-luma prediction where dc is the rounded average of the
// `height` pixels that end just before topleft. Two dispatches are set up:
// x7 selects the averaging code by height, x9 selects the shared
// L(ipred_cfl_splat_w*) loop by width.
//   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height, x5 = ac,
//   w6 = alpha.
function ipred_cfl_left_neon, export=1
        sub             x2, x2, w4, uxtw        // x2 = topleft - height = start of the left column
        clz             w9, w3
        clz             w8, w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7, L(ipred_cfl_left_tbl)
        sub             w9, w9, #26             // width index, 0 for 32 ... 3 for 4
        sub             w8, w8, #26             // height index, 0 for 32 ... 3 for 4
        ldrh            w9, [x10, w9, uxtw #1]
        ldrh            w8, [x7, w8, uxtw #1]
        dup             v1.8h, w6 // alpha
        sub             x9, x10, w9, uxtw       // x9 = splat loop for this width
        sub             x7, x7, w8, uxtw        // x7 = averaging code for this height
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7

L(ipred_cfl_left_h4):
        // ld1r duplicates the 4 pixels; sum of 8 bytes >> 3 = average of 4.
        ld1r            {v0.2s}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.8h, v0.8h, #3
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        ld1             {v0.8b}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.8h, v0.8h, #3
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        ld1             {v0.16b}, [x2]
        uaddlv          h0, v0.16b
        urshr           v0.8h, v0.8h, #4
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v2.4h, v2.4h, v3.4h
        urshr           v2.8h, v2.8h, #5
        dup             v0.8h, v2.h[0]
        br              x9

// Offsets are stored as (table - target); indexed by clz(height) - 26.
L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc

// void ipred_cfl_neon(pixel *dst, const ptrdiff_t stride,
//                     const pixel *const topleft,
//                     const int width, const int height,
//                     const int16_t *ac, const int alpha);
//
// Chroma-from-luma prediction where dc is the rounded average of both the
// left column and the top row:
//   dc = (sum_left + sum_top + ((width + height) >> 1)) / (width + height)
// The division is a right shift by ctz(width + height). For non-square
// blocks what remains is a division by 3 or 5, done as a fixed-point
// multiply by 0x5556 (~2^16/3) or 0x3334 (~2^16/5). sqdmulh doubles the
// product before taking the high half, hence the constants are halved.
//
//   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height, x5 = ac,
//   w6 = alpha.
// x7 dispatches on height (L(ipred_cfl_h*), sums the left column into v0),
// x9 dispatches on width (L(ipred_cfl_w*), adds the top row and finishes).
function ipred_cfl_neon, export=1
        sub             x2, x2, w4, uxtw        // x2 = start of the left column
        add             w8, w3, w4 // width + height
        dup             v1.8h, w6 // alpha
        clz             w9, w3
        clz             w6, w4
        dup             v16.8h, w8 // width + height
        adr             x7, L(ipred_cfl_tbl)
        rbit            w8, w8 // rbit(width + height)
        sub             w9, w9, #22 // 22 leading bits, minus table offset 4
        sub             w6, w6, #26
        clz             w8, w8 // ctz(width + height)
        ldrh            w9, [x7, w9, uxtw #1]
        ldrh            w6, [x7, w6, uxtw #1]
        neg             w8, w8 // -ctz(width + height)
        sub             x9, x7, w9, uxtw
        sub             x7, x7, w6, uxtw
        ushr            v16.8h, v16.8h, #1 // (width + height) >> 1
        dup             v17.8h, w8 // -ctz(width + height)
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7

L(ipred_cfl_h4):
        // The post-increments in the h* blocks leave x2 at topleft.
        ld1             {v0.s}[0], [x2], #4
        ins             v0.s[1], wzr            // clear bytes 4-7 so uaddlv sums only 4 pixels
        uaddlv          h0, v0.8b
        br              x9
L(ipred_cfl_w4):
        add             x2, x2, #1              // step over topleft to the top row
        ld1             {v2.s}[0], [x2]
        ins             v2.s[1], wzr
        add             v0.4h, v0.4h, v16.4h    // sum_left + rounding term
        uaddlv          h2, v2.8b
        cmp             w4, #4
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h    // negative shift count = right shift by ctz
        b.eq            1f                      // square block: the shift was the whole division
        // h = 8/16
        // Both constants are packed in one register; shifting by 2*h picks
        // the high half for h = 8 (16 bits) and, because a register shift
        // count of 32 wraps to 0, the low half for h = 16.
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4, w4 // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        ld1             {v0.8b}, [x2], #8
        uaddlv          h0, v0.8b
        br              x9
L(ipred_cfl_w8):
        add             x2, x2, #1
        ld1             {v2.8b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.8b
        cmp             w4, #8
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 4/16/32
        // h = 32 leaves a division by 5, h = 4 or 16 a division by 3.
        cmp             w4, #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0, v0.16b
        br              x9
L(ipred_cfl_w16):
        add             x2, x2, #1
        ld1             {v2.16b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.16b
        cmp             w4, #16
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 4/8/32
        // h = 4 leaves a division by 5, h = 8 or 32 a division by 3.
        cmp             w4, #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        ld1             {v2.16b, v3.16b}, [x2], #32
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v0.4h, v2.4h, v3.4h
        br              x9
L(ipred_cfl_w32):
        add             x2, x2, #1
        ld1             {v2.16b, v3.16b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        cmp             w4, #32
        add             v0.4h, v0.4h, v2.4h
        add             v0.4h, v0.4h, v3.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 8/16
        // Same packed-constant selection as for width 4, with the halves
        // swapped: h = 8 picks 0x3334 (divide by 5), h = 16 picks 0x5556
        // (divide by 3).
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4, w4 // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

// Entries 0-3 are indexed by clz(height) - 26, entries 4-7 by
// clz(width) - 22. Offsets are stored as (table - target).
L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc

// void ipred_cfl_ac_420_neon(int16_t *const ac, const pixel *const ypx,
//                            const ptrdiff_t stride, const int w_pad,
//                            const int h_pad, const int cw, const int ch);
//
// Builds the CfL ac buffer from luma with 2x2 subsampling. Every output is
// the sum of a 2x2 luma group shifted left by 1 (8x the group's average).
// Columns and rows covered by w_pad/h_pad are filled by repeating the last
// computed column/row. Finally the rounded mean of the whole buffer,
// (sum + (1 << (log2sz - 1))) >> log2sz with log2sz = ctz(cw) + ctz(ch), is
// subtracted from every entry.
//
// Registers as read by the code (arguments in prototype order):
//   x0 = ac, x1 = ypx, x2 = stride, w3 = w_pad, w4 = h_pad, w5 = cw,
//   w6 = ch.
// w_pad and h_pad are counted in units of 4 output pixels/rows (h_pad is
// multiplied by 4 below; the wpad1/2/3 variants pad 4/8/12 columns).
function ipred_cfl_ac_420_neon, export=1
        // Jump table index: clz(cw) - 27 gives 0 for 16, 1 for 8, 2 for 4.
        clz             w8, w5
        lsl             w4, w4, #2              // h_pad in rows
        adr             x7, L(ipred_cfl_ac_420_tbl)
        sub             w8, w8, #27
        ldrh            w8, [x7, w8, uxtw #1]
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4 // height - h_pad
        rbit            w9, w5 // rbit(width)
        rbit            w10, w6 // rbit(height)
        clz             w9, w9 // ctz(width)
        clz             w10, w10 // ctz(height)
        add             w9, w9, w10 // log2sz
        movi            v16.4s, #1
        add             x10, x1, x2             // x10 = odd luma rows, x1 = even luma rows
        lsl             x2, x2, #1
        dup             v17.4s, w9
        sshl            v16.4s, v16.4s, v17.4s // 1 << log2sz
        neg             v17.4s, v17.4s // -log2sz
        ushr            v16.4s, v16.4s, #1 // 1 << (log2sz - 1)
        mov             w9, w6                  // second copy of the height for the subtract loop
        br              x7

L(ipred_cfl_ac_420_w4):
1:      // Copy and subsample input
        // Two output rows per iteration: v0 low/high = luma rows 0/2,
        // v1 low/high = luma rows 1/3.
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v0.d}[1], [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b           // horizontal pair sums
        uaddlp          v1.8h, v1.16b
        add             v0.8h, v0.8h, v1.8h     // add the vertical pairs
        shl             v0.8h, v0.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h}, [x0], #16
        b.gt            1b
        // Replicate the last output row (upper half of v0) for padding.
        trn2            v1.2d, v0.2d, v0.2d
        trn2            v0.2d, v0.2d, v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        // Shared with the 4:2:2 variant; expects the last row replicated
        // in v0 and v1.
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], #32 // 4 rows of 4
        b.gt            2b
3:
        sub             x0, x0, w6, uxtw #3     // rewind to the buffer start (8 bytes per row)
        // Sum the produced ac values
        subs            w6, w6, #4
        ld1             {v0.8h, v1.8h}, [x0], #32
        b.le            5f
4:
        ld1             {v2.8h, v3.8h}, [x0], #32
        subs            w6, w6, #4
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        b.gt            4b
5:
        add             v0.8h, v0.8h, v1.8h
        uaddlv          s0, v0.8h // sum
        sub             x0, x0, w9, uxtw #3     // rewind again
        add             v0.2s, v0.2s, v16.2s // sum += 1 << (log2sz - 1)
        ushl            v4.2s, v0.2s, v17.2s // sum >>= log2sz
        dup             v4.8h, v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w9, w9, #4
        sub             v0.8h, v0.8h, v4.8h
        sub             v1.8h, v1.8h, v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        cbnz            w3, L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h, v1.16b
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        add             v0.8h, v0.8h, v1.8h
        add             v2.8h, v2.8h, v3.8h
        shl             v0.8h, v0.8h, #1
        shl             v1.8h, v2.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        mov             v0.16b, v1.16b          // last row in both v0 and v1 for padding
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v0.d}[1], [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        add             v0.8h, v0.8h, v1.8h
        shl             v0.8h, v0.8h, #1
        dup             v1.4h, v0.h[3]          // pad row 0 with its last value
        dup             v3.4h, v0.h[7]          // pad row 1 with its last value
        trn2            v2.2d, v0.2d, v0.2d     // row 1 into the low half of v2
        subs            w8, w8, #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        b.gt            1b
        // Last full row (row 1 followed by its padding) in v0 and v1.
        trn1            v0.2d, v2.2d, v3.2d
        trn1            v1.2d, v2.2d, v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        // Shared with the 4:2:2 variant; expects the last row in v0 and v1.
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        st1             {v0.8h, v1.8h}, [x0], #32 // 4 rows of 8 in total
        b.gt            2b
3:

L(ipred_cfl_ac_420_w8_calc_subtract_dc):
        // Also reached from the w16 path with w6/w9 doubled, so that one
        // step of this loop (64 bytes) covers two 16-wide rows.
        sub             x0, x0, w6, uxtw #4     // rewind to the buffer start (16 bytes per row)
        // Sum the produced ac values
        subs            w6, w6, #4
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.le            5f
4:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        subs            w6, w6, #4
        add             v0.8h, v0.8h, v4.8h
        add             v1.8h, v1.8h, v5.8h
        add             v2.8h, v2.8h, v6.8h
        add             v3.8h, v3.8h, v7.8h
        b.gt            4b
5:
        add             v0.8h, v0.8h, v1.8h
        add             v2.8h, v2.8h, v3.8h
        uaddlp          v0.4s, v0.8h            // widen to 32 bit before the final reduction
        uaddlp          v2.4s, v2.8h
        add             v0.4s, v0.4s, v2.4s
        addv            s0, v0.4s // sum
        sub             x0, x0, w9, uxtw #4
        add             v0.2s, v0.2s, v16.2s // sum += 1 << (log2sz - 1)
        ushl            v4.2s, v0.2s, v17.2s // sum >>= log2sz
        dup             v4.8h, v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w9, w9, #4
        sub             v0.8h, v0.8h, v4.8h
        sub             v1.8h, v1.8h, v4.8h
        sub             v2.8h, v2.8h, v4.8h
        sub             v3.8h, v3.8h, v4.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w16):
        // Second-level dispatch on w_pad (0-3).
        adr             x7, L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3, [x7, w3, uxtw #1]
        sub             x7, x7, w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1], x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v4.16b, v5.16b}, [x1], x2
        uaddlp          v1.8h, v1.16b
        ld1             {v6.16b, v7.16b}, [x10], x2
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        uaddlp          v4.8h, v4.16b
        uaddlp          v5.8h, v5.16b
        uaddlp          v6.8h, v6.16b
        uaddlp          v7.8h, v7.16b
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        add             v4.8h, v4.8h, v6.8h
        add             v5.8h, v5.8h, v7.8h
        shl             v0.8h, v0.8h, #1
        shl             v1.8h, v1.8h, #1
        shl             v2.8h, v4.8h, #1
        shl             v3.8h, v5.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        // Last row (v2, v3) into v0, v1 as well, for vertical padding.
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
1:      // Copy and subsample input, padding 4
        // Each ldr of bytes 16-23 must precede the post-incrementing ld1 of
        // the same row.
        ldr             d1, [x1, #16]
        ld1             {v0.16b}, [x1], x2
        ldr             d3, [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h, v1.8b
        ldr             d5, [x1, #16]
        uaddlp          v0.8h, v0.16b
        ld1             {v4.16b}, [x1], x2
        uaddlp          v3.4h, v3.8b
        ldr             d7, [x10, #16]
        uaddlp          v2.8h, v2.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v5.4h, v5.8b
        uaddlp          v4.8h, v4.16b
        uaddlp          v7.4h, v7.8b
        uaddlp          v6.8h, v6.16b
        add             v1.4h, v1.4h, v3.4h
        add             v0.8h, v0.8h, v2.8h
        add             v5.4h, v5.4h, v7.4h
        add             v4.8h, v4.8h, v6.8h
        shl             v1.4h, v1.4h, #1
        shl             v0.8h, v0.8h, #1
        shl             v3.4h, v5.4h, #1
        shl             v2.8h, v4.8h, #1
        dup             v4.4h, v1.h[3]          // last valid value of row 0
        dup             v5.4h, v3.h[3]          // last valid value of row 1
        trn1            v1.2d, v1.2d, v4.2d     // outputs 8-11 + 4 padded
        trn1            v3.2d, v3.2d, v5.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1], x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v2.8h, v2.16b
        uaddlp          v4.8h, v4.16b
        uaddlp          v6.8h, v6.16b
        add             v0.8h, v0.8h, v2.8h
        add             v4.8h, v4.8h, v6.8h
        shl             v0.8h, v0.8h, #1
        shl             v2.8h, v4.8h, #1
        dup             v1.8h, v0.h[7]          // right half of row 0 = its last value
        dup             v3.8h, v2.h[7]          // right half of row 1 = its last value
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1], x2
        ld1             {v2.8b}, [x10], x2
        ld1             {v4.8b}, [x1], x2
        uaddlp          v0.4h, v0.8b
        ld1             {v6.8b}, [x10], x2
        uaddlp          v2.4h, v2.8b
        uaddlp          v4.4h, v4.8b
        uaddlp          v6.4h, v6.8b
        add             v0.4h, v0.4h, v2.4h
        add             v4.4h, v4.4h, v6.4h
        shl             v0.4h, v0.4h, #1
        shl             v2.4h, v4.4h, #1
        dup             v1.8h, v0.h[3]
        dup             v3.8h, v2.h[3]
        trn1            v0.2d, v0.2d, v1.2d     // 4 computed + 4 padded
        trn1            v2.2d, v2.2d, v3.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_hpad):
        // Shared with the 4:2:2 variant; expects the last row in both
        // v0, v1 and v2, v3.
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // 4 rows of 16 in total
        b.gt            2b
3:

        // Double the height and reuse the w8 summing/subtracting
        lsl             w6, w6, #1
        lsl             w9, w9, #1
        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)

// Indexed by clz(cw) - 27; offsets are stored as (table - target).
L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0        // no target; NOTE(review): presumably an unused slot/padding - confirm

// Indexed by w_pad (0-3).
L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc

// void ipred_cfl_ac_422_neon(int16_t *const ac, const pixel *const ypx,
//                            const ptrdiff_t stride, const int w_pad,
//                            const int h_pad, const int cw, const int ch);
//
// Builds the CfL ac buffer from luma with horizontal-only subsampling. Every
// output is the sum of a horizontal luma pair shifted left by 2 (8x the
// pair's average). Each luma row yields one output row. Padding, summing
// and dc subtraction reuse the labels inside ipred_cfl_ac_420_neon, so the
// register state handed over (last row replicated in v0-v3, w4 = padding
// rows, w6 = w9 = ch, v16/v17 = rounding/shift) must match what those
// labels expect.
//
// Registers as read by the code (arguments in prototype order):
//   x0 = ac, x1 = ypx, x2 = stride, w3 = w_pad, w4 = h_pad, w5 = cw,
//   w6 = ch.
function ipred_cfl_ac_422_neon, export=1
        // Jump table index: clz(cw) - 27 gives 0 for 16, 1 for 8, 2 for 4.
        clz             w8, w5
        lsl             w4, w4, #2              // h_pad in rows
        adr             x7, L(ipred_cfl_ac_422_tbl)
        sub             w8, w8, #27
        ldrh            w8, [x7, w8, uxtw #1]
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4 // height - h_pad
        rbit            w9, w5 // rbit(width)
        rbit            w10, w6 // rbit(height)
        clz             w9, w9 // ctz(width)
        clz             w10, w10 // ctz(height)
        add             w9, w9, w10 // log2sz
        movi            v16.4s, #1
        add             x10, x1, x2             // x10 = odd rows, x1 = even rows
        lsl             x2, x2, #1
        dup             v17.4s, w9
        sshl            v16.4s, v16.4s, v17.4s // 1 << log2sz
        neg             v17.4s, v17.4s // -log2sz
        ushr            v16.4s, v16.4s, #1 // 1 << (log2sz - 1)
        mov             w9, w6
        br              x7

L(ipred_cfl_ac_422_w4):
1:      // Copy and subsample input
        // Four output rows per iteration: v0 = rows 0/1, v1 = rows 2/3.
        ld1             {v0.8b}, [x1], x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v1.8b}, [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        shl             v0.8h, v0.8h, #2
        shl             v1.8h, v1.8h, #2
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        // Replicate the last row (upper half of v1) into v0 and v1.
        trn2            v0.2d, v1.2d, v1.2d
        trn2            v1.2d, v1.2d, v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        cbnz            w3, L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h, v1.16b
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        shl             v0.8h, v0.8h, #2
        shl             v1.8h, v1.8h, #2
        shl             v2.8h, v2.8h, #2
        shl             v3.8h, v3.8h, #2
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        // Last row (v3) into v0 and v1 for vertical padding.
        mov             v0.16b, v3.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b}, [x1], x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v2.8b}, [x1], x2
        ld1             {v2.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v2.8h, v2.16b
        shl             v0.8h, v0.8h, #2
        shl             v2.8h, v2.8h, #2
        // The padding for rows 1 and 3 is taken from the high half by the
        // trn2 below, hence the 8h dups for those.
        dup             v4.4h, v0.h[3]
        dup             v5.8h, v0.h[7]
        dup             v6.4h, v2.h[3]
        dup             v7.8h, v2.h[7]
        trn2            v1.2d, v0.2d, v5.2d     // row 1 + padding
        trn1            v0.2d, v0.2d, v4.2d     // row 0 + padding
        trn2            v3.2d, v2.2d, v7.2d     // row 3 + padding
        trn1            v2.2d, v2.2d, v6.2d     // row 2 + padding
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v3.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        // Second-level dispatch on w_pad (0-3).
        adr             x7, L(ipred_cfl_ac_422_w16_tbl)
        ldrh            w3, [x7, w3, uxtw #1]
        sub             x7, x7, w3, uxtw
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1], x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        shl             v0.8h, v0.8h, #2
        shl             v1.8h, v1.8h, #2
        shl             v2.8h, v2.8h, #2
        shl             v3.8h, v3.8h, #2
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
1:      // Copy and subsample input, padding 4
        // Each ldr of bytes 16-23 must precede the post-incrementing ld1 of
        // the same row.
        ldr             d1, [x1, #16]
        ld1             {v0.16b}, [x1], x2
        ldr             d3, [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h, v1.8b
        uaddlp          v0.8h, v0.16b
        uaddlp          v3.4h, v3.8b
        uaddlp          v2.8h, v2.16b
        shl             v1.4h, v1.4h, #2
        shl             v0.8h, v0.8h, #2
        shl             v3.4h, v3.4h, #2
        shl             v2.8h, v2.8h, #2
        dup             v4.4h, v1.h[3]
        dup             v5.4h, v3.h[3]
        trn1            v1.2d, v1.2d, v4.2d
        trn1            v3.2d, v3.2d, v5.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1], x2
        ld1             {v2.16b}, [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v2.8h, v2.16b
        shl             v0.8h, v0.8h, #2
        shl             v2.8h, v2.8h, #2
        dup             v1.8h, v0.h[7]
        dup             v3.8h, v2.h[7]
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1], x2
        ld1             {v2.8b}, [x10], x2
        uaddlp          v0.4h, v0.8b
        uaddlp          v2.4h, v2.8b
        shl             v0.4h, v0.4h, #2
        shl             v2.4h, v2.4h, #2
        dup             v1.8h, v0.h[3]
        dup             v3.8h, v2.h[3]
        trn1            v0.2d, v0.2d, v1.2d
        trn1            v2.2d, v2.2d, v3.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

// Indexed by clz(cw) - 27; offsets are stored as (table - target).
L(ipred_cfl_ac_422_tbl):
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
        .hword 0        // no target; NOTE(review): presumably an unused slot/padding - confirm

// Indexed by w_pad (0-3).
L(ipred_cfl_ac_422_w16_tbl):
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc