ref: 0bad117eb0f97594a938f17ba05d3ca89ba81a9f
parent: 2e68c1f36e560af6fa05fcb77c9ae77a76cfef6a
author: Martin Storsjö <martin@martin.st>
date: Sat Feb 1 09:33:49 EST 2020
arm64: mc: Simplify avg/w_avg/mask by always using the w16 macro

This shortens the source by 40 lines, and gives a significant speedup
on A53, a small speedup on A72 and a very minor slowdown for avg/w_avg
on A73.

Before:              Cortex A53    A72    A73
avg_w4_8bpc_neon:          67.4   26.1   25.4
avg_w8_8bpc_neon:         158.7   56.3   59.1
avg_w16_8bpc_neon:        382.9  154.1  160.7
w_avg_w4_8bpc_neon:        99.9   43.6   39.4
w_avg_w8_8bpc_neon:       253.2   98.3   99.0
w_avg_w16_8bpc_neon:      543.1  285.0  301.8
mask_w4_8bpc_neon:        110.6   51.4   45.1
mask_w8_8bpc_neon:        295.0  129.9  114.0
mask_w16_8bpc_neon:       654.6  365.8  369.7
After:
avg_w4_8bpc_neon:          60.8   26.3   29.0
avg_w8_8bpc_neon:         142.8   52.9   64.1
avg_w16_8bpc_neon:        378.2  153.4  160.8
w_avg_w4_8bpc_neon:        78.7   41.0   40.9
w_avg_w8_8bpc_neon:       190.6   90.1  105.1
w_avg_w16_8bpc_neon:      531.1  279.3  301.4
mask_w4_8bpc_neon:         86.6   47.2   44.9
mask_w8_8bpc_neon:        222.0  114.3  114.9
mask_w16_8bpc_neon:       639.5  356.0  369.8
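For context, here is a rough scalar sketch in C (not part of this patch, and
not dav1d's actual C code) of the per-pixel blend that the unified 16-wide
`avg` macro vectorizes for 8 bpc. It assumes tmp1/tmp2 are the 16-bit
intermediate values produced by the prep functions, so the NEON sequence
`add` + `sqrshrun`/`sqrshrun2 #5` amounts to (tmp1 + tmp2 + 16) >> 5,
saturated to [0, 255]. The names clip_pixel_8bpc and avg_row_8bpc are
illustrative only; w_avg and mask follow the same skeleton, but scale the
difference tmp2 - tmp1 by a weight via sqdmulh before the narrowing shift.

    #include <stdint.h>

    /* Saturate an intermediate result to the 8 bpc pixel range. */
    static inline uint8_t clip_pixel_8bpc(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* One row of the plain average: add, round, shift by 5 and saturate,
     * mirroring what sqrshrun #5 does on the summed 8h vectors. */
    static void avg_row_8bpc(uint8_t *dst, const int16_t *tmp1,
                             const int16_t *tmp2, int w)
    {
        for (int x = 0; x < w; x++)
            dst[x] = clip_pixel_8bpc((tmp1[x] + tmp2[x] + 16) >> 5);
    }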
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -29,14 +29,7 @@
#include "src/arm/asm.S"
#include "util.S"
-.macro avg dst, t0, t1
- ld1 {\t0\().8h}, [x2], 16
- ld1 {\t1\().8h}, [x3], 16
- add \t0\().8h, \t0\().8h, \t1\().8h
- sqrshrun \dst\().8b, \t0\().8h, #5
-.endm
-
-.macro avg16 dst, t0, t1, t2, t3
+.macro avg dst, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
add \t0\().8h, \t0\().8h, \t2\().8h
@@ -45,16 +38,7 @@
sqrshrun2 \dst\().16b, \t1\().8h, #5
.endm
-.macro w_avg dst, t0, t1
- ld1 {\t0\().8h}, [x2], 16
- ld1 {\t1\().8h}, [x3], 16
- sub \t0\().8h, \t1\().8h, \t0\().8h
- sqdmulh \t0\().8h, \t0\().8h, v30.8h
- add \t0\().8h, \t1\().8h, \t0\().8h
- sqrshrun \dst\().8b, \t0\().8h, #4
-.endm
-
-.macro w_avg16 dst, t0, t1, t2, t3
+.macro w_avg dst, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
sub \t0\().8h, \t2\().8h, \t0\().8h
@@ -67,19 +51,7 @@
sqrshrun2 \dst\().16b, \t1\().8h, #4
.endm
-.macro mask dst, t0, t1
- ld1 {v30.8b}, [x6], 8
- ld1 {\t0\().8h}, [x2], 16
- mul v30.8b, v30.8b, v31.8b
- ld1 {\t1\().8h}, [x3], 16
- shll v30.8h, v30.8b, #8
- sub \t0\().8h, \t1\().8h, \t0\().8h
- sqdmulh \t0\().8h, \t0\().8h, v30.8h
- add \t0\().8h, \t1\().8h, \t0\().8h
- sqrshrun \dst\().8b, \t0\().8h, #4
-.endm
-
-.macro mask16 dst, t0, t1, t2, t3
+.macro mask dst, t0, t1, t2, t3
ld1 {v30.16b}, [x6], 16
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
mul v30.16b, v30.16b, v31.16b
@@ -109,9 +81,8 @@
.endif
adr x7, L(\type\()_tbl)
sub w4, w4, #24
- \type v4, v0, v1
ldrh w4, [x7, x4, lsl #1]
- \type v5, v2, v3
+ \type v4, v0, v1, v2, v3
sub x7, x7, w4, uxtw
br x7
4:
@@ -118,104 +89,94 @@
cmp w5, #4
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x0], x1
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x0], x1
+ b.eq 0f
+ \type v5, v0, v1, v2, v3
+ cmp w5, #8
st1 {v5.s}[0], [x0], x1
st1 {v5.s}[1], [x0], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x0], x1
b.eq 0f
- \type v6, v0, v1
- \type v7, v2, v3
- cmp w5, #8
- st1 {v6.s}[0], [x0], x1
- st1 {v6.s}[1], [x0], x1
- st1 {v7.s}[0], [x0], x1
- st1 {v7.s}[1], [x0], x1
- b.eq 0f
- \type v4, v0, v1
- \type v5, v2, v3
+ \type v4, v0, v1, v2, v3
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x0], x1
- \type v6, v0, v1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x0], x1
st1 {v5.s}[0], [x0], x1
st1 {v5.s}[1], [x0], x1
- \type v7, v2, v3
- st1 {v6.s}[0], [x0], x1
- st1 {v6.s}[1], [x0], x1
- st1 {v7.s}[0], [x0], x1
- st1 {v7.s}[1], [x0], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x0], x1
ret
8:
- st1 {v4.8b}, [x0], x1
- \type v6, v0, v1
- st1 {v5.8b}, [x0], x1
- \type v7, v0, v1
- st1 {v6.8b}, [x0], x1
+ st1 {v4.d}[0], [x0], x1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.d}[1], [x0], x1
+ st1 {v5.d}[0], [x0], x1
subs w5, w5, #4
- st1 {v7.8b}, [x0], x1
+ st1 {v5.d}[1], [x0], x1
b.le 0f
- \type v4, v0, v1
- \type v5, v2, v3
+ \type v4, v0, v1, v2, v3
b 8b
-160:
- trn1 v4.2d, v4.2d, v5.2d
16:
- \type\()16 v5, v0, v1, v2, v3
+ \type v5, v0, v1, v2, v3
st1 {v4.16b}, [x0], x1
- \type\()16 v6, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
st1 {v5.16b}, [x0], x1
- \type\()16 v7, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
st1 {v6.16b}, [x0], x1
subs w5, w5, #4
st1 {v7.16b}, [x0], x1
b.le 0f
- \type\()16 v4, v0, v1, v2, v3
+ \type v4, v0, v1, v2, v3
b 16b
320:
- trn1 v4.2d, v4.2d, v5.2d
add x7, x0, x1
lsl x1, x1, #1
32:
- \type\()16 v5, v0, v1, v2, v3
- \type\()16 v6, v0, v1, v2, v3
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
st1 {v4.16b,v5.16b}, [x0], x1
- \type\()16 v7, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
subs w5, w5, #2
st1 {v6.16b,v7.16b}, [x7], x1
b.le 0f
- \type\()16 v4, v0, v1, v2, v3
+ \type v4, v0, v1, v2, v3
b 32b
640:
- trn1 v4.2d, v4.2d, v5.2d
add x7, x0, x1
lsl x1, x1, #1
64:
- \type\()16 v5, v0, v1, v2, v3
- \type\()16 v6, v0, v1, v2, v3
- \type\()16 v7, v0, v1, v2, v3
- \type\()16 v16, v0, v1, v2, v3
- \type\()16 v17, v0, v1, v2, v3
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
- \type\()16 v18, v0, v1, v2, v3
- \type\()16 v19, v0, v1, v2, v3
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
subs w5, w5, #2
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
b.le 0f
- \type\()16 v4, v0, v1, v2, v3
+ \type v4, v0, v1, v2, v3
b 64b
1280:
- trn1 v4.2d, v4.2d, v5.2d
add x7, x0, #64
128:
- \type\()16 v5, v0, v1, v2, v3
- \type\()16 v6, v0, v1, v2, v3
- \type\()16 v7, v0, v1, v2, v3
- \type\()16 v16, v0, v1, v2, v3
- \type\()16 v17, v0, v1, v2, v3
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
- \type\()16 v18, v0, v1, v2, v3
- \type\()16 v19, v0, v1, v2, v3
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
subs w5, w5, #1
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
b.le 0f
- \type\()16 v4, v0, v1, v2, v3
+ \type v4, v0, v1, v2, v3
b 128b
0:
ret
@@ -223,7 +184,7 @@
.hword L(\type\()_tbl) - 1280b
.hword L(\type\()_tbl) - 640b
.hword L(\type\()_tbl) - 320b
- .hword L(\type\()_tbl) - 160b
+ .hword L(\type\()_tbl) - 16b
.hword L(\type\()_tbl) - 8b
.hword L(\type\()_tbl) - 4b
endfunc