ref: 72af9329c0c003f68639301be33d4632147245b6
parent: fc5a3728144c62b634bb6fb036a6da47ee9bdf8f
author: Martin Storsjö <martin@martin.st>
date: Wed Jan 9 18:27:00 EST 2019
arm64: mc: Simplify the 8tap_2w_hv code slightly

Before:                           Cortex A53   Snapdragon 835
mc_8tap_regular_w2_hv_8bpc_neon:       415.0            286.9
After:
mc_8tap_regular_w2_hv_8bpc_neon:       399.1            269.9
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1307,21 +1307,19 @@
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
- addv h28, v28.4h
- addv h29, v29.4h
- trn1 v16.4h, v28.4h, v29.4h
- srshr v16.4h, v16.4h, #2
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
- trn1 v17.2s, v28.2s, v30.2s
- mov v18.8b, v30.8b
+ mov v17.8b, v28.8b
2:
bl L(\type\()_8tap_filter_2)
- trn1 v18.2s, v18.2s, v28.2s
- trn1 v19.2s, v28.2s, v30.2s
+ ext v18.8b, v17.8b, v28.8b, #4
+ mov v19.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -1335,7 +1333,6 @@
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v19.8b
- mov v18.8b, v30.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -1355,28 +1352,24 @@
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
- addv h28, v28.4h
- addv h29, v29.4h
- trn1 v16.4h, v28.4h, v29.4h
- srshr v16.4h, v16.4h, #2
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
- trn1 v17.2s, v28.2s, v30.2s
- mov v18.8b, v30.8b
+ mov v17.8b, v28.8b
bl L(\type\()_8tap_filter_2)
- trn1 v18.2s, v18.2s, v28.2s
- trn1 v19.2s, v28.2s, v30.2s
- mov v20.8b, v30.8b
+ ext v18.8b, v17.8b, v28.8b, #4
+ mov v19.8b, v28.8b
bl L(\type\()_8tap_filter_2)
- trn1 v20.2s, v20.2s, v28.2s
- trn1 v21.2s, v28.2s, v30.2s
- mov v22.8b, v30.8b
+ ext v20.8b, v19.8b, v28.8b, #4
+ mov v21.8b, v28.8b
28:
bl L(\type\()_8tap_filter_2)
- trn1 v22.2s, v22.2s, v28.2s
- trn1 v23.2s, v28.2s, v30.2s
+ ext v22.8b, v21.8b, v28.8b, #4
+ mov v23.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -1398,7 +1391,6 @@
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v23.8b
- mov v22.8b, v30.8b
b 28b
0:
@@ -1420,7 +1412,6 @@
mla v27.4h, v30.4h, v0.h[2]
mla v27.4h, v31.4h, v0.h[3]
srshr v28.4h, v27.4h, #2
- trn2 v30.2s, v28.2s, v28.2s
ret
.endif