ref: fa6a0924d7aef7fbbdb02c7a8df0714d00e40408
parent: 1f83575018b39d12410407dc08bdc9c445504406
author: Martin Storsjö <martin@martin.st>
date: Fri Oct 4 09:53:49 EDT 2019
arm64: cdef: Calculate two initial parameters in the same vector. As there are only two individual parameters, we can insert them into the same vector, reducing the number of actual calculation instructions, but adding a few more instructions to dup the results to the final vectors instead.
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -323,19 +323,18 @@
add x8, x8, w9, uxtw #1
movrel x9, directions\w
add x5, x9, w5, uxtw #1
- movi v30.8h, #15
- dup v28.8h, w6 // damping
+ movi v30.4h, #15
+ dup v28.4h, w6 // damping
dup v25.8h, w3 // threshold
dup v27.8h, w4 // threshold
- clz v24.8h, v25.8h // clz(threshold)
- clz v26.8h, v27.8h // clz(threshold)
- sub v24.8h, v30.8h, v24.8h // ulog2(threshold)
- sub v26.8h, v30.8h, v26.8h // ulog2(threshold)
- uqsub v24.8h, v28.8h, v24.8h // shift = imax(0, damping - ulog2(threshold))
- uqsub v26.8h, v28.8h, v26.8h // shift = imax(0, damping - ulog2(threshold))
- neg v24.8h, v24.8h // -shift
- neg v26.8h, v26.8h // -shift
+ trn1 v24.4h, v25.4h, v27.4h
+ clz v24.4h, v24.4h // clz(threshold)
+ sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
+ uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
+ neg v24.4h, v24.4h // -shift
+ dup v26.8h, v24.h[1]
+ dup v24.8h, v24.h[0]
1:
.if \w == 8