ref: ff41197bc89fe06311cb07d0acf7e3cac76c6946
parent: e570088de116bbbbb0e24ae5b70c0927130e5964
author: Martin Storsjö <martin@martin.st>
date: Fri Oct 4 19:07:49 EDT 2019
arm64: mc: Schedule instructions better in the warp8x8 functions. Before (Cortex A53 / A72 / A73): warp_8x8_8bpc_neon: 1997.3 / 1170.1 / 1199.9; warp_8x8t_8bpc_neon: 1982.4 / 1171.5 / 1192.6. After (Cortex A53 / A72 / A73): warp_8x8_8bpc_neon: 1954.6 / 1159.2 / 1153.3; warp_8x8t_8bpc_neon: 1938.5 / 1146.2 / 1136.7.
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -2975,7 +2975,9 @@
ld1 {v16.8b, v17.8b}, [x2], x3
load_filter_row d0, w12, w7
+ uxtl v16.8h, v16.8b
load_filter_row d1, w12, w7
+ uxtl v17.8h, v17.8b
load_filter_row d2, w12, w7
sxtl v0.8h, v0.8b
load_filter_row d3, w12, w7
@@ -2988,16 +2990,12 @@
sxtl v4.8h, v4.8b
load_filter_row d7, w12, w7
sxtl v5.8h, v5.8b
- sxtl v6.8h, v6.8b
- sxtl v7.8h, v7.8b
-
- uxtl v16.8h, v16.8b
- uxtl v17.8h, v17.8b
-
ext v18.16b, v16.16b, v17.16b, #2*1
mul v23.8h, v16.8h, v0.8h
+ sxtl v6.8h, v6.8b
ext v19.16b, v16.16b, v17.16b, #2*2
mul v18.8h, v18.8h, v1.8h
+ sxtl v7.8h, v7.8b
ext v20.16b, v16.16b, v17.16b, #2*3
mul v19.8h, v19.8h, v2.8h
ext v21.16b, v16.16b, v17.16b, #2*4