ref: 91d324ebb5b293c4378176131e165bbaca8ffd58
parent: dcbbf775ea72bca6a342f25a490bb6d7a75fe548
author: B Krishnan Iyer <krishnaniyer97@gmail.com>
date: Thu Oct 10 15:48:16 EDT 2019
arm: ipred: NEON implementation of dc/h/v prediction functions

                                           A73              A53
                                      Earlier     Now   Earlier     Now
intra_pred_dc_top_w64_8bpc_neon:        344.4   344.6     253.4   252.3
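
The functions below implement 8 bpc intra prediction: v copies the row
above into every output row, h replicates each left-neighbour pixel
across its row, and dc fills the block with the rounded average of the
available top/left neighbours (a constant 128 when neither is used).
As a rough guide to what the assembly computes, here is a minimal
scalar sketch of the dc case (illustrative only; dc_pred_ref and its
buffer layout are assumptions, not dav1d's actual C reference):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    // topleft[0] is the top-left neighbour; topleft[1..w] is the row
    // above the block, topleft[-1..-h] the column to its left.
    static void dc_pred_ref(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *topleft, int w, int h)
    {
        unsigned sum = 0;
        for (int x = 0; x < w; x++) sum += topleft[1 + x];    // top row
        for (int y = 0; y < h; y++) sum += topleft[-(1 + y)]; // left col
        // Rounded division by w+h; the NEON code below does this with a
        // shift by ctz(w+h) plus, when w != h, a Q15 reciprocal multiply.
        const uint8_t dc = (sum + ((w + h) >> 1)) / (w + h);
        for (int y = 0; y < h; y++, dst += stride)
            memset(dst, dc, w);  // every row is the same constant
    }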
--- /dev/null
+++ b/src/arm/32/ipred.S
@@ -0,0 +1,825 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * Copyright © 2019, B Krishnan Iyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_neon, export=1
+ push {r4, lr}
+ ldr r4, [sp, #8]
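+ // clz(width) is 25..29 for width 64..4; subtracting 25 gives the
+ // index into the jump table below, with the w=64 entry first.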
+ clz r3, r3
+ adr r2, L(ipred_dc_128_tbl)
+ sub r3, r3, #25
+ ldr r3, [r2, r3, lsl #2]
+ mov lr, #128
+ vdup.8 q0, lr
+ add r2, r2, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r2
+
+ .align 2
+L(ipred_dc_128_tbl):
+ .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vdup.8 q1, lr
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vdup.8 q1, lr
+ vdup.8 q2, lr
+ vdup.8 q3, lr
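+ // 64-byte rows are written as two 32-byte stores; the first one
+ // post-increments r0/r12, so shrink the row stride by 32.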
+ sub r1, r1, #32
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_neon, export=1
+ push {r4, lr}
+ ldr lr, [sp, #8]
+ clz r3, r3
+ adr r4, L(ipred_v_tbl)
+ sub r3, r3, #25
+ ldr r3, [r4, r3, lsl #2]
+ add r2, r2, #1
+ add r4, r4, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r4
+
+ .align 2
+L(ipred_v_tbl):
+ .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_v_tbl) + CONFIG_THUMB
+40:
+ vld1.32 {d0[0]}, [r2]
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs lr, lr, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+80:
+ vld1.8 {d0}, [r2]
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs lr, lr, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+160:
+ vld1.8 {q0}, [r2]
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vld1.8 {q0, q1}, [r2]
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vld1.8 {q0, q1}, [r2]!
+ sub r1, r1, #32
+ vld1.8 {q2, q3}, [r2]
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_h_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
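+ // Step r2 back to topleft-4 so each vld4 below fetches four
+ // left-column pixels at once; lr = -4 keeps walking up the column.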
+ sub r2, r2, #4
+ mov lr, #-4
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_h_tbl):
+ .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_h_tbl) + CONFIG_THUMB
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vst1.32 {d3[0]}, [r0, :32], r1
+ vst1.32 {d2[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+8:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vst1.8 {d3}, [r0, :64], r1
+ vst1.8 {d2}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d1}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ add r2, r2, #3
+ mov lr, #-1
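+ // r2 now points at topleft[-1]; load one left pixel per row and
+ // broadcast it across the whole row, stepping upwards with lr = -1.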
+16:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128], r1
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ add r2, r2, #3
+ mov lr, #-1
+ sub r1, r1, #16
+32:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ add r2, r2, #3
+ mov lr, #-1
+ sub r1, r1, #48
+64:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_dc_top_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ add r2, r2, #1
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_top_tbl):
+ .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+40:
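+ // Sum the four top pixels; vrshrn.u16 #2 then gives the rounded
+ // average (sum + 2) >> 2, which is broadcast to all lanes below.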
+ vld1.32 {d0[]}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 d0, d0[0]
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ vld1.8 {d0}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #3
+ vdup.8 d0, d0[0]
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ vld1.8 {d0, d1}, [r2]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ vld1.8 {d0, d1, d2, d3}, [r2]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d4, q0, #5
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ vld1.8 {d0, d1, d2, d3}, [r2]!
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2]
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d18, q0, #6
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+ vdup.8 q2, d18[0]
+ vdup.8 q3, d18[0]
+ sub r1, r1, #32
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ sub r2, r2, r4
+ clz r3, r3
+ clz lr, r4
+ sub lr, lr, #25
+ adr r5, L(ipred_dc_left_tbl)
+ sub r3, r3, #20
+ ldr r3, [r5, r3, lsl #2]
+ ldr lr, [r5, lr, lsl #2]
+ add r3, r5, r3
+ add r5, r5, lr
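+ // Two-level dispatch: bx r5 runs the height handler, which leaves
+ // the DC value in q0 and tail-jumps via r3 to the width store loop.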
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_left_tbl):
+ .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4):
+ vld1.32 {d0[]}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w4):
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt L(ipred_dc_left_w4)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h8):
+ vld1.8 {d0}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #3
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w8):
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt L(ipred_dc_left_w8)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h16):
+ vld1.8 {d0, d1}, [r2]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w16):
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt L(ipred_dc_left_w16)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h32):
+ vld1.8 {d0, d1, d2, d3}, [r2]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #5
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w32):
+ vmov.8 q1, q0
+1:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h64):
+ vld1.8 {d0, d1, d2, d3}, [r2]!
+ vld1.8 {d4, d5, d6, d7}, [r2]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #6
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w64):
+ sub r1, r1, #32
+ vmov.8 q1, q0
+ vmov.8 q2, q0
+ vmov.8 q3, q0
+1:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_neon, export=1
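+ // dc = (sum(top, w) + sum(left, h) + ((w+h) >> 1)) / (w + h),
+ // dispatched like ipred_dc_left above: first to a height handler
+ // (r5), which then jumps to the width handler (r3).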
+ push {r4-r6, lr}
+ ldr r4, [sp, #16]
+ sub r2, r2, r4
+ add lr, r3, r4 // width + height
+ clz r3, r3
+ clz r12, r4
+ vdup.16 q15, lr // width + height
+ mov r6, #0
+ adr r5, L(ipred_dc_tbl)
+ rbit lr, lr // rbit(width + height)
+ sub r3, r3, #20 // 25 leading bits, minus table offset 5
+ sub r12, r12, #25
+ clz lr, lr // ctz(width + height)
+ ldr r3, [r5, r3, lsl #2]
+ ldr r12, [r5, r12, lsl #2]
+ neg lr, lr // -ctz(width + height)
+ add r3, r5, r3
+ add r5, r5, r12
+ vshr.u16 q15, q15, #1 // (width + height) >> 1
+ vdup.16 q14, lr // -ctz(width + height)
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_tbl):
+ .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4):
+ vld1.32 {d0[0]}, [r2]!
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w4):
+ add r2, r2, #1
+ vld1.32 {d1[0]}, [r2]
+ vmov.32 d1[1], r6
+ vadd.s16 d0, d0, d30
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1
+ vpadd.u16 d1, d1
+ cmp r4, #4
+ vadd.s16 d0, d0, d1
+ vshl.u16 d0, d0, d28
+ beq 1f // h = 8/16
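+ // w+h = 12 or 20: the vshl above already divided by 4 (ctz), so
+ // finish with a Q15 reciprocal via vqdmulh ((2*x*c) >> 16):
+ // 0x5556/2 ~ 1/3 for w+h = 12, 0x3334/2 ~ 1/5 for w+h = 20.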
+ mov lr, #(0x3334/2)
+ mov r5, #(0x5556/2)
+ cmp r4, #16
+ it ne
+ movne lr, r5
+ vdup.16 d30, lr
+ vqdmulh.s16 d0, d0, d30
+1:
+ vdup.8 d0, d0[0]
+2:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h8):
+ vld1.8 {d0}, [r2]!
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w8):
+ add r2, r2, #1
+ vld1.8 {d2}, [r2]
+ vadd.s16 d0, d0, d30
+ vpaddl.u8 d2, d2
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #8
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28
+ beq 1f // h = 4/16/32
+ cmp r4, #32
+ mov lr, #(0x3334/2)
+ mov r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 q12, lr
+ vqdmulh.s16 d0, d0, d24
+1:
+ vdup.8 d0, d0[0]
+2:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h16):
+ vld1.8 {d0, d1}, [r2]!
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w16):
+ add r2, r2, #1
+ vld1.8 {d2, d3}, [r2]
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #16
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28
+ beq 1f // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov lr, #(0x3334/2)
+ mov r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 q12, lr
+ vqdmulh.s16 d0, d0, d24
+1:
+ vdup.8 q0, d0[0]
+2:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h32):
+ vld1.8 {d0, d1, d2, d3}, [r2]!
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w32):
+ add r2, r2, #1
+ vld1.8 {d2, d3, d4, d5}, [r2]
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q2, d4, d5
+ vadd.u16 d4, d4, d5
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ cmp r4, #32
+ vadd.s16 d0, d0, d4
+ vadd.s16 d0, d0, d2
+ vshl.u16 d4, d0, d28
+ beq 1f // h = 8/16/64
+ cmp r4, #8
+ mov lr, #(0x3334/2)
+ mov r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 q12, lr
+ vqdmulh.s16 d4, d4, d24
+1:
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+2:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h64):
+ vld1.8 {d0, d1, d2, d3}, [r2]!
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2]!
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w64):
+ vmov.8 q1, q0
+ vmov.8 q2, q0
+ vmov.8 q3, q0
+2:
+ add r2, r2, #1
+ vld1.8 {d2, d3, d4, d5}, [r2]!
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d4, d4, d5
+ vadd.u16 d2, d2, d3
+ vld1.8 {d16, d17, d18, d19}, [r2]
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vaddl.u8 q8, d16, d17
+ vaddl.u8 q9, d18, d19
+ vadd.u16 d16, d16, d17
+ vadd.u16 d18, d18, d19
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vadd.u16 d2, d2, d4
+ vadd.u16 d3, d16, d18
+ cmp r4, #64
+ vadd.s16 d0, d0, d2
+ vadd.s16 d0, d0, d3
+ vshl.u16 d18, d0, d28
+ beq 1f // h = 16/32
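+ // Both Q15 reciprocals packed into lr: 1/3 in the low half, 1/5 in
+ // the high half. h & 31 is 0 for h=32 (w+h = 96 = 32*3) and 16 for
+ // h=16 (w+h = 80 = 16*5), so lsr selects the right constant.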
+ mov lr, #(0x5556/2)
+ movt lr, #(0x3334/2)
+ mov r5, r4
+ and r5, r5, #31
+ lsr lr, lr, r5
+ vdup.16 d30, lr
+ vqdmulh.s16 d18, d18, d30
+1:
+ sub r1, r1, #32
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+ vdup.8 q2, d18[0]
+ vdup.8 q3, d18[0]
+2:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -54,7 +54,7 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon;
c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon;
c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon;
@@ -61,6 +61,7 @@
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon;
+#if ARCH_AARCH64
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_neon;
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_neon;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
@@ -76,5 +77,6 @@
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_neon;
c->pal_pred = dav1d_pal_pred_neon;
+#endif
#endif
}
--- a/src/meson.build
+++ b/src/meson.build
@@ -112,6 +112,7 @@
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources += files(
'arm/32/cdef.S',
+ 'arm/32/ipred.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',