ref: b6b1394b06ea2cce03c9c97c77510cb8f2a207e2
parent: 208a2abd16bb4132018810765e9982a457f62fa0
author: Martin Storsjö <martin@martin.st>
date: Mon Apr 27 20:17:04 EDT 2020
arm64: itx: Minor optimizations for the 8x32 functions

This gives a speedup of a couple of cycles.
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -2317,6 +2317,7 @@
mov w8, #2*\h
1:
+ ldrh w12, [x13], #2
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x2]
st1 {v0.8h}, [x2], x8
@@ -2329,6 +2330,7 @@
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ cmp w3, w12
.if \w == 8
load_add_store_8x8 x0, x7, shiftbits=2
.else
@@ -2335,8 +2337,6 @@
load_add_store_8x8 x0, x7, shiftbits=3
.endif
- ldrh w12, [x13], #2
- cmp w3, w12
b.lt 9f
.if \w == 8
sub x2, x2, x8, lsl #3
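
The three hunks above are one scheduling change: the ldrh that fetches
the next eob threshold from the table at x13 is hoisted to the top of
the loop, and the cmp moves ahead of the load_add_store_8x8 expansion,
so the load, compare and branch are no longer back to back and the
flags are presumably ready well before b.lt consumes them. A minimal
sketch of the pattern, with the loop body elided; this is an
illustration of the scheduling idea, not the dav1d code:

        // Before: load, compare and branch back to back; b.lt has to
        // wait for the ldrh result to propagate into the flags.
1:
        // ... coefficient loads, transpose, load_add_store_8x8 ...
        ldrh            w12, [x13], #2  // next eob threshold
        cmp             w3, w12         // flags depend on the load
        b.lt            9f

        // After: the load issues first in the iteration and the
        // compare is scheduled mid-body, so the flags are set long
        // before the branch needs them.
1:
        ldrh            w12, [x13], #2  // load the threshold early
        // ... coefficient loads and transpose ...
        cmp             w3, w12         // set flags mid-body
        // ... load_add_store_8x8 expansion ...
        b.lt            9f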
@@ -2509,16 +2509,15 @@
mov x8, #2*32
mov w9, #32
mov x6, sp
- mov x7, x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x7]
- st1 {v28.8h}, [x7], x8
+ ld1 {v\i\().8h}, [x2]
+ st1 {v28.8h}, [x2], x8
.endr
ldrh w12, [x13], #2
+ sub x2, x2, x8, lsl #3
sub w9, w9, #8
- sub x7, x7, x8, lsl #3
- add x7, x7, #2*8
+ add x2, x2, #2*8
bl inv_dct_8x8_neon
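
The hunk above drops the separate walking pointer: previously x2 was
copied to x7 once per call and the loop walked x7, now the loop walks
x2 directly and rewinds it in place for the next 8-column pass, saving
the mov and keeping x7 free. A sketch of the pointer bookkeeping, with
x8 = 2*32 bytes (one row of 32 int16_t coefficients); illustrative,
not the exact dav1d code:

        // Before: x7 mirrors x2 so the base pointer survives the loop.
        mov             x7, x2              // one extra copy per call
1:
        // ... ld1/st1 advance x7 by x8 per row, 8 rows ...
        sub             x7, x7, x8, lsl #3  // rewind the 8 rows
        add             x7, x7, #2*8        // step to the next 8 columns

        // After: x2 itself is walked and rewound; the mov disappears.
1:
        // ... ld1/st1 advance x2 by x8 per row, 8 rows ...
        sub             x2, x2, x8, lsl #3  // rewind the 8 rows
        add             x2, x2, #2*8        // step to the next 8 columns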
@@ -2528,10 +2527,9 @@
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
cmp w3, w12
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- st1 {v\i\().8h}, [x6], #16
-.endr
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
b.ge 1b
cbz w9, 3f
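
The final hunk merges the eight single-register stores from the .irp
loop into two four-register st1 stores of the transposed rows to the
stack buffer at x6, and moves the cmp between them, presumably so the
flags for b.ge are computed in the shadow of the first store. Before
and after, as an illustrative sketch:

        // Before: compare, then eight separate 16-byte stores.
        cmp             w3, w12
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        st1             {v\i\().8h}, [x6], #16
.endr
        b.ge            1b

        // After: two 64-byte multi-register stores, with the compare
        // scheduled between them.
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
        cmp             w3, w12
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
        b.ge            1b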