ref: 641ef4cc9fa2a12d53be2e75e9690e7b8ff4e605
parent: ba64f052353906398cf78e79bab7fbb8c468b002
author: Martin Storsjö <martin@martin.st>
date: Wed Nov 13 06:09:05 EST 2019
arm64: ipred: Avoid data dependencies with consecutive dup instructions. This is around one cycle faster.
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -608,7 +608,7 @@
cmp w4, #32
add v0.4h, v0.4h, v1.4h
add v0.4h, v0.4h, v2.4h
- ushl v0.4h, v0.4h, v17.4h
+ ushl v4.4h, v0.4h, v17.4h
b.eq 1f
// h = 8/16/64
cmp w4, #8
@@ -616,10 +616,10 @@
mov w17, #(0x5556/2)
csel w16, w16, w17, eq
dup v16.4h, w16
- sqdmulh v0.4h, v0.4h, v16.4h
+ sqdmulh v4.4h, v4.4h, v16.4h
1:
- dup v0.16b, v0.b[0]
- dup v1.16b, v0.b[0]
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
2:
st1 {v0.16b, v1.16b}, [x0], x1
st1 {v0.16b, v1.16b}, [x6], x1
@@ -656,7 +656,7 @@
cmp w4, #64
add v0.4h, v0.4h, v1.4h
add v0.4h, v0.4h, v3.4h
- ushl v0.4h, v0.4h, v17.4h
+ ushl v4.4h, v0.4h, v17.4h
b.eq 1f
// h = 16/32
mov w16, #(0x5556/2)
@@ -663,12 +663,12 @@
movk w16, #(0x3334/2), lsl #16
lsr w16, w16, w4
dup v16.4h, w16
- sqdmulh v0.4h, v0.4h, v16.4h
+ sqdmulh v4.4h, v4.4h, v16.4h
1:
- dup v0.16b, v0.b[0]
- dup v1.16b, v0.b[0]
- dup v2.16b, v0.b[0]
- dup v3.16b, v0.b[0]
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
2:
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1