ref: 515e26673dff988985d136cc9246724efe5866b9
parent: 128715b5fdd513f9824bef32119285efa44c1d1b
author: Martin Storsjö <martin@martin.st>
date: Sat Oct 20 20:58:37 EDT 2018
arm/mc: Add 8 bit neon asm for avg, w_avg and mask

checkasm --bench numbers from a Snapdragon 835:

nop:                       23.0
avg_w4_8bpc_c:            385.0
avg_w4_8bpc_neon:          34.0
avg_w8_8bpc_c:            590.5
avg_w8_8bpc_neon:          65.5
avg_w16_8bpc_c:          1304.4
avg_w16_8bpc_neon:        161.3
avg_w32_8bpc_c:          4098.4
avg_w32_8bpc_neon:        589.2
avg_w64_8bpc_c:          8405.0
avg_w64_8bpc_neon:       1367.1
avg_w128_8bpc_c:        19667.9
avg_w128_8bpc_neon:      3409.0
w_avg_w4_8bpc_c:          453.8
w_avg_w4_8bpc_neon:        50.0
w_avg_w8_8bpc_c:          749.0
w_avg_w8_8bpc_neon:       105.7
w_avg_w16_8bpc_c:        1851.2
w_avg_w16_8bpc_neon:      283.7
w_avg_w32_8bpc_c:        5991.5
w_avg_w32_8bpc_neon:     1080.9
w_avg_w64_8bpc_c:       12763.5
w_avg_w64_8bpc_neon:     2544.4
w_avg_w128_8bpc_c:      30311.3
w_avg_w128_8bpc_neon:    6350.5
mask_w4_8bpc_c:           492.9
mask_w4_8bpc_neon:         57.7
mask_w8_8bpc_c:          1108.5
mask_w8_8bpc_neon:        123.0
mask_w16_8bpc_c:         2880.3
mask_w16_8bpc_neon:       349.2
mask_w32_8bpc_c:         8996.4
mask_w32_8bpc_neon:      1368.1
mask_w64_8bpc_c:        19570.3
mask_w64_8bpc_neon:      3263.5
mask_w128_8bpc_c:       46757.4
mask_w128_8bpc_neon:     8743.1
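
As a rough C model of what the three functions compute for 8 bpc (an
illustrative sketch only, not dav1d's C code; the helper names below are made
up, and the NEON version reaches the same results via vqdmulh/vqrshrun on the
(tmp2 - tmp1) difference instead of these widening multiplies):

    #include <stdint.h>

    static inline uint8_t clip_u8(const int v) {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* avg: rounded average of the two int16_t compound buffers */
    static inline uint8_t avg_px(const int16_t t1, const int16_t t2) {
        return clip_u8((t1 + t2 + 16) >> 5);
    }

    /* w_avg: 4-bit weight applied to tmp1, (16 - weight) to tmp2 */
    static inline uint8_t w_avg_px(const int16_t t1, const int16_t t2,
                                   const int weight) {
        return clip_u8((t1 * weight + t2 * (16 - weight) + 128) >> 8);
    }

    /* mask: per-pixel blend factor m in 0..64 applied to tmp1 */
    static inline uint8_t mask_px(const int16_t t1, const int16_t t2,
                                  const int m) {
        return clip_u8((t1 * m + t2 * (64 - m) + 512) >> 10);
    }
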
--- /dev/null
+++ b/src/arm/32/mc.S
@@ -1,0 +1,219 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#if BITDEPTH == 8
+
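+// avg: dst = saturate_u8((tmp1 + tmp2 + 16) >> 5); each invocation loads 16
+// int16_t values from each compound buffer and produces 16 output pixels in
+// \dst0/\dst1.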
+.macro avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vadd.i16 \t0, \t0, \t2
+ vadd.i16 \t1, \t1, \t3
+ vqrshrun.s16 \dst0, \t0, #5
+ vqrshrun.s16 \dst1, \t1, #5
+.endm
+
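+// w_avg: q15 holds -weight << 11, so vqdmulh(tmp2 - tmp1, q15) is effectively
+// (tmp1 - tmp2) * weight / 16; adding tmp2 back gives
+// (tmp1 * weight + tmp2 * (16 - weight)) / 16 before the rounding narrow by 4.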
+.macro w_avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vsub.i16 \t0, \t2, \t0
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q15
+ vqdmulh.s16 \t1, \t1, q15
+ vadd.i16 \t0, \t2, \t0
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
+
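+// mask: like w_avg, but with a per-pixel 6 bit factor (0..64) loaded from the
+// mask pointer in lr; the factor is multiplied by -2 and widened by 8 bits
+// (giving -m << 9), so vqdmulh(tmp2 - tmp1, ...) is effectively
+// (tmp1 - tmp2) * m / 64.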
+.macro mask dst0, dst1, t0, t1, t2, t3
+ vld1.8 {q14}, [lr, :128]!
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vmul.i8 q14, q14, q15
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vshll.i8 q13, d28, #8
+ vshll.i8 q14, d29, #8
+ vsub.i16 \t0, \t2, \t0
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q13
+ vqdmulh.s16 \t1, \t1, q14
+ vadd.i16 \t0, \t2, \t0
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
+
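+// Common function body for avg, w_avg and mask: loads w and h (plus the
+// weight or mask argument for the non-avg variants) from the stack, then
+// jumps to a per-width store loop through a relative jump table indexed by
+// ctz(w) (rbit + clz). CONFIG_THUMB is added to the table entries so that the
+// bx target keeps the Thumb bit set when assembling in Thumb mode.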
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+ push {r4-r6,lr}
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
+.ifnc \type, avg
+ ldr lr, [sp, #24]
+.endif
+.ifc \type, w_avg
+ vdup.s16 q15, lr
+ vneg.s16 q15, q15
+ vshl.i16 q15, q15, #11
+.endif
+.ifc \type, mask
+ vmov.i8 q15, #256-2
+.endif
+ rbit r4, r4
+ adr r12, L(\type\()_tbl)
+ clz r4, r4
+ ldr r4, [r12, r4, lsl #2]
+ \type d16, d17, q0, q1, q2, q3
+ add r12, r12, r4
+ bx r12
+ .align 2
+L(\type\()_tbl):
+ .word 0, 0
+ .word 4f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+4:
+ add r6, r0, r1
+ lsl r1, r1, #1
+ cmp r5, #4
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ beq 0f
+ \type d18, d19, q0, q1, q2, q3
+ cmp r5, #8
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ beq 0f
+ \type d16, d17, q0, q1, q2, q3
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ pop {r4-r6,pc}
+80:
+ add r6, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.8 {d16}, [r0, :64], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {d17}, [r6, :64], r1
+ vst1.8 {d18}, [r0, :64], r1
+ subs r5, r5, #4
+ vst1.8 {d19}, [r6, :64], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 8b
+160:
+ add r6, r0, r1
+ lsl r1, r1, #1
+16:
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {q8}, [r0, :128], r1
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q9}, [r6, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q10}, [r0, :128], r1
+ subs r5, r5, #4
+ vst1.8 {q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 16b
+320:
+ add r6, r0, r1
+ lsl r1, r1, #1
+32:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 32b
+640:
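+ // 64 pixels wide: r0 writes the first 32 bytes of each row and r6 the
+ // second 32 bytes, two rows per loop iteration.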
+ add r6, r0, #32
+64:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r6, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 64b
+1280:
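+ // 128 pixels wide: each pointer does one writeback store (+32 bytes) plus
+ // one post-indexed store per row, so the stride is reduced by 32 to
+ // compensate for the writeback.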
+ sub r1, r1, #32
+ add r6, r0, #64
+128:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128]!
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r0, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r6, :128]!
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #1
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 128b
+
+0:
+ pop {r4-r6,pc}
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+#endif /* BITDEPTH == 8 */
--- a/src/arm/asm.S
+++ b/src/arm/asm.S
@@ -30,6 +30,30 @@
#include "config.h"
+#if ARCH_ARM
+ .syntax unified
+#ifdef __ELF__
+ .fpu neon
+ .eabi_attribute 10, 0 // suppress Tag_FP_arch
+ .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
+#endif
+
+#ifdef _WIN32
+#define CONFIG_THUMB 1
+#else
+#define CONFIG_THUMB 0
+#endif
+
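+// A and T expand to the comment character for whichever instruction set is
+// not being assembled, so lines prefixed with A are ARM-only and lines
+// prefixed with T are Thumb-only.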
+#if CONFIG_THUMB
+ .thumb
+#define A @
+#define T
+#else
+#define A
+#define T @
+#endif
+#endif
+
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif
--- a/src/arm/mc_init.c
+++ b/src/arm/mc_init.c
@@ -39,7 +39,7 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;
--- a/src/meson.build
+++ b/src/meson.build
@@ -91,6 +91,7 @@
)
elif host_machine.cpu_family().startswith('arm')
libdav1d_tmpl_sources += files(
+ 'arm/32/mc.S',
)
endif
elif host_machine.cpu_family().startswith('x86')