ref: f7743da199f9ccca9e15e8faf37c0bfa36d482f1
parent: a91a03b0e143f77bd647b481aff034eb3315ee7b
author: Martin Storsjö <martin@martin.st>
date: Tue Sep 17 20:07:08 EDT 2019
arm64: ipred: NEON implementation of dc/h/v prediction modes

Relative speedups over the C code:
                                   Cortex A53    A72    A73
intra_pred_dc_128_w4_8bpc_neon:          2.08   1.47   2.17
intra_pred_dc_128_w8_8bpc_neon:          3.33   2.49   4.03
intra_pred_dc_128_w16_8bpc_neon:         3.93   3.86   3.75
intra_pred_dc_128_w32_8bpc_neon:         3.14   3.79   2.90
intra_pred_dc_128_w64_8bpc_neon:         3.68   1.97   2.42
intra_pred_dc_left_w4_8bpc_neon:         2.41   1.70   2.23
intra_pred_dc_left_w8_8bpc_neon:         3.53   2.41   3.32
intra_pred_dc_left_w16_8bpc_neon:        3.87   3.54   3.34
intra_pred_dc_left_w32_8bpc_neon:        4.10   3.60   2.76
intra_pred_dc_left_w64_8bpc_neon:        3.72   2.00   2.39
intra_pred_dc_top_w4_8bpc_neon:          2.27   1.66   2.07
intra_pred_dc_top_w8_8bpc_neon:          3.83   2.69   3.43
intra_pred_dc_top_w16_8bpc_neon:         3.66   3.60   3.20
intra_pred_dc_top_w32_8bpc_neon:         3.92   3.54   2.66
intra_pred_dc_top_w64_8bpc_neon:         3.60   1.98   2.30
intra_pred_dc_w4_8bpc_neon:              2.29   1.42   2.16
intra_pred_dc_w8_8bpc_neon:              3.56   2.83   3.05
intra_pred_dc_w16_8bpc_neon:             3.46   3.37   3.15
intra_pred_dc_w32_8bpc_neon:             3.79   3.41   2.74
intra_pred_dc_w64_8bpc_neon:             3.52   2.01   2.41
intra_pred_h_w4_8bpc_neon:              10.34   5.74   5.94
intra_pred_h_w8_8bpc_neon:              12.13   6.33   6.43
intra_pred_h_w16_8bpc_neon:             10.66   7.31   5.85
intra_pred_h_w32_8bpc_neon:              6.28   4.18   2.88
intra_pred_h_w64_8bpc_neon:              3.96   1.85   1.75
intra_pred_v_w4_8bpc_neon:              11.44   6.12   7.57
intra_pred_v_w8_8bpc_neon:              14.76   7.58   7.95
intra_pred_v_w16_8bpc_neon:             11.34   6.28   5.88
intra_pred_v_w32_8bpc_neon:              6.56   3.33   3.34
intra_pred_v_w64_8bpc_neon:              4.57   1.24   1.97
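
For readers coming from the C side, a minimal sketch of what the three modes
compute (illustrative helper names, 8 bpc only, all edges assumed available;
this is not the actual templated code in src/ipred_tmpl.c). topleft[0] is the
top-left corner pixel, topleft[1..w] is the row above the block and
topleft[-1..-h] is the column to its left, which is how the assembly below
indexes x2:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    static void dc_pred_sketch(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *topleft, int w, int h)
    {
        unsigned sum = (unsigned)(w + h) >> 1;                /* rounding */
        for (int i = 0; i < w; i++) sum += topleft[1 + i];    /* top row */
        for (int i = 0; i < h; i++) sum += topleft[-(1 + i)]; /* left column */
        const uint8_t dc = (uint8_t)(sum / (unsigned)(w + h));
        for (int y = 0; y < h; y++, dst += stride)
            memset(dst, dc, w);                               /* splat dc */
    }

    static void v_pred_sketch(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *topleft, int w, int h)
    {
        for (int y = 0; y < h; y++, dst += stride)
            memcpy(dst, topleft + 1, w);              /* repeat the top row */
    }

    static void h_pred_sketch(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *topleft, int w, int h)
    {
        for (int y = 0; y < h; y++, dst += stride)
            memset(dst, topleft[-(1 + y)], w);        /* splat one left pixel */
    }

dc_128 stores the constant 128 instead of a computed average, and dc_top/
dc_left average only the top row or left column. Since width + height is
always a power of two times 1, 3 or 5, the assembly replaces the division by
a rounding right shift plus, when width != height, an sqdmulh fixed-point
multiply by roughly 1/3 (0x5556) or 1/5 (0x3334), which is where those
constants in the dc paths come from.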
--- /dev/null
+++ b/src/arm/64/ipred.S
@@ -1,0 +1,688 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_neon, export=1
+ clz w3, w3 // clz(width) = 29..25 for width = 4..64
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25 // jump table index: w = 64..4 -> 0..4
+ ldrh w3, [x5, w3, uxtw #1]
+ movi v0.16b, #128
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ movi v1.16b, #128
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ movi v1.16b, #128
+ movi v2.16b, #128
+ movi v3.16b, #128
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 16b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1 {v0.s}[0], [x2]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v0.8b}, [x2]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ ld1 {v0.16b}, [x2], #16
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ ld1 {v0.16b, v1.16b}, [x2]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ sub x2, x2, #4
+ sub x5, x5, w3, uxtw
+ mov x7, #-4
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v1.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.8b}, [x0], x1
+ st1 {v2.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1r {v0.2s}, [x2] // the 4 top pixels, duplicated into both halves
+ uaddlv h0, v0.8b // 2*sum(top)
+ rshrn v0.8b, v0.8h, #3 // (2*sum + 4) >> 3 = (sum + 2) >> 2
+ dup v0.8b, v0.b[0]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.8b, v0.b[0]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v2.4h, v0.4h, v1.4h
+ rshrn v2.8b, v2.8h, #5
+ dup v0.16b, v2.b[0]
+ dup v1.16b, v2.b[0]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v4.4h, v0.4h, v1.4h
+ add v5.4h, v2.4h, v3.4h
+ add v4.4h, v4.4h, v5.4h
+ rshrn v4.8b, v4.8h, #6
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_neon, export=1
+ sub x2, x2, w4, uxtw
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w7, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_left_h4):
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w4):
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w8):
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w16):
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt L(ipred_dc_left_w16)
+ ret
+
+L(ipred_dc_left_h32):
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #5
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w32):
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add v0.4h, v0.4h, v2.4h
+ rshrn v0.8b, v0.8h, #6
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w64):
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_neon, export=1
+ sub x2, x2, w4, uxtw
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.8h, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w6, uxtw
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w7 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_h4):
+ ld1 {v0.s}[0], [x2], #4
+ ins v0.s[1], wzr
+ uaddlv h0, v0.8b
+ br x3
+L(ipred_dc_w4):
+ add x2, x2, #1
+ ld1 {v1.s}[0], [x2]
+ ins v1.s[1], wzr
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #4
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height), via negative shift count
+ b.eq 1f
+ // h = 8/16
+ mov w16, #(0x3334/2) // 0x3334 ~= 1/5: for h=16 (w+h=20)
+ movk w16, #(0x5556/2), lsl #16 // 0x5556 ~= 1/3: for h=8 (w+h=12), in the high half
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17 // h=8: >>16 selects 1/3; h=16: shift of 32 acts as 0, keeping 1/5
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h // multiply by ~1/3 or ~1/5 (constants halved since sqdmulh doubles)
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ ld1 {v0.8b}, [x2], #8
+ uaddlv h0, v0.8b
+ br x3
+L(ipred_dc_w8):
+ add x2, x2, #1
+ ld1 {v1.8b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #8
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ ld1 {v0.16b}, [x2], #16
+ uaddlv h0, v0.16b
+ br x3
+L(ipred_dc_w16):
+ add x2, x2, #1
+ ld1 {v1.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ cmp w4, #16
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+2:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ ld1 {v0.16b, v1.16b}, [x2], #32
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ br x3
+L(ipred_dc_w32):
+ add x2, x2, #1
+ ld1 {v1.16b, v2.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ cmp w4, #32
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+ dup v1.16b, v0.b[0]
+2:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add v0.4h, v0.4h, v2.4h
+ br x3
+L(ipred_dc_w64):
+ add x2, x2, #1
+ ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ uaddlv h4, v4.16b
+ add v1.4h, v1.4h, v2.4h
+ add v3.4h, v3.4h, v4.4h
+ cmp w4, #64
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v3.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 16/32
+ mov w16, #(0x5556/2)
+ movk w16, #(0x3334/2), lsl #16
+ lsr w16, w16, w4
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+ dup v1.16b, v0.b[0]
+ dup v2.16b, v0.b[0]
+ dup v3.16b, v0.b[0]
+2:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
--- /dev/null
+++ b/src/arm/ipred_init_tmpl.c
@@ -1,0 +1,50 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(dav1d_ipred_dc_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_neon);
+decl_angular_ipred_fn(dav1d_ipred_h_neon);
+decl_angular_ipred_fn(dav1d_ipred_v_neon);
+
+COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 && ARCH_AARCH64
+ c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon;
+ c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon;
+ c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon;
+ c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon;
+ c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon;
+ c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon;
+#endif
+}
--- a/src/ipred.h
+++ b/src/ipred.h
@@ -89,6 +89,7 @@
} Dav1dIntraPredDSPContext;
bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
+bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
#endif /* DAV1D_SRC_IPRED_H */
--- a/src/ipred_tmpl.c
+++ b/src/ipred_tmpl.c
@@ -751,7 +751,11 @@
c->pal_pred = pal_pred_c;
-#if HAVE_ASM && ARCH_X86
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ bitfn(dav1d_intra_pred_dsp_init_arm)(c);
+#elif ARCH_X86
bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+#endif
#endif
}
--- a/src/meson.build
+++ b/src/meson.build
@@ -93,6 +93,7 @@
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',
+ 'arm/ipred_init_tmpl.c',
'arm/itx_init_tmpl.c',
'arm/loopfilter_init_tmpl.c',
'arm/looprestoration_init_tmpl.c',
@@ -101,6 +102,7 @@
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
'arm/64/cdef.S',
+ 'arm/64/ipred.S',
'arm/64/itx.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',