ref: 3069ab9453b71b248f8355be89f82439aa57756e
parent: abd07c67d3984734a61caaf3ff3cd857ab307810
author: Martin Storsjö <martin@martin.st>
date: Wed Nov 6 19:10:48 EST 2019
arm64: loopfilter: Reorder instructions and tweak register use to match the arm32 port. This doesn't change performance measurably, but eases potential future maintenance of the code.
--- a/src/arm/64/loopfilter.S
+++ b/src/arm/64/loopfilter.S
@@ -419,6 +419,7 @@
sub v7.8h, v7.8h, v11.8h
uaddl v8.8h, v26.8b, v30.8b // q2 + q6
uaddl2 v9.8h, v26.16b, v30.16b
+ bif v0.16b, v18.16b, v15.16b // out p5
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
uaddl2 v11.8h, v18.16b, v23.16b
rshrn v5.8b, v12.8h, #4 // out p0
@@ -429,56 +430,55 @@
sub v9.8h, v9.8h, v11.8h
uaddl v10.8h, v27.8b, v30.8b // q3 + q6
uaddl2 v11.8h, v27.16b, v30.16b
- bif v0.16b, v18.16b, v15.16b // out p5
- uaddl v14.8h, v19.8b, v24.8b // p4 + q0
- uaddl2 v18.8h, v19.16b, v24.16b
+ bif v1.16b, v19.16b, v15.16b // out p4
+ uaddl v18.8h, v19.8b, v24.8b // p4 + q0
+ uaddl2 v19.8h, v19.16b, v24.16b
rshrn v6.8b, v12.8h, #4 // out q0
rshrn2 v6.16b, v13.8h, #4
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
add v13.8h, v13.8h, v9.8h
- sub v10.8h, v10.8h, v14.8h
- sub v11.8h, v11.8h, v18.8h
- uaddl v14.8h, v28.8b, v30.8b // q4 + q6
- uaddl2 v18.8h, v28.16b, v30.16b
- bif v1.16b, v19.16b, v15.16b // out p4
- uaddl v8.8h, v20.8b, v25.8b // p3 + q1
- uaddl2 v9.8h, v20.16b, v25.16b
+ sub v10.8h, v10.8h, v18.8h
+ sub v11.8h, v11.8h, v19.8h
+ uaddl v8.8h, v28.8b, v30.8b // q4 + q6
+ uaddl2 v9.8h, v28.16b, v30.16b
+ bif v2.16b, v20.16b, v15.16b // out p3
+ uaddl v18.8h, v20.8b, v25.8b // p3 + q1
+ uaddl2 v19.8h, v20.16b, v25.16b
rshrn v7.8b, v12.8h, #4 // out q1
rshrn2 v7.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
add v13.8h, v13.8h, v11.8h
- sub v14.8h, v14.8h, v8.8h
- sub v18.8h, v18.8h, v9.8h
+ sub v18.8h, v8.8h, v18.8h
+ sub v19.8h, v9.8h, v19.8h
uaddl v10.8h, v29.8b, v30.8b // q5 + q6
uaddl2 v11.8h, v29.16b, v30.16b
- bif v2.16b, v20.16b, v15.16b // out p3
- uaddl v19.8h, v21.8b, v26.8b // p2 + q2
- uaddl2 v20.8h, v21.16b, v26.16b
+ bif v3.16b, v21.16b, v15.16b // out p2
+ uaddl v20.8h, v21.8b, v26.8b // p2 + q2
+ uaddl2 v21.8h, v21.16b, v26.16b
rshrn v8.8b, v12.8h, #4 // out q2
rshrn2 v8.16b, v13.8h, #4
- add v12.8h, v12.8h, v14.8h // - (p3 + q1) + (q4 + q6)
- add v13.8h, v13.8h, v18.8h
- sub v10.8h, v10.8h, v19.8h
- sub v11.8h, v11.8h, v20.8h
- uaddl v14.8h, v30.8b, v30.8b // q6 + q6
- uaddl2 v18.8h, v30.16b, v30.16b
- bif v3.16b, v21.16b, v15.16b // out p2
- uaddl v19.8h, v22.8b, v27.8b // p1 + q3
- uaddl2 v20.8h, v22.16b, v27.16b
+ add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
+ add v13.8h, v13.8h, v19.8h
+ sub v10.8h, v10.8h, v20.8h
+ sub v11.8h, v11.8h, v21.8h
+ uaddl v18.8h, v30.8b, v30.8b // q6 + q6
+ uaddl2 v19.8h, v30.16b, v30.16b
+ bif v4.16b, v22.16b, v15.16b // out p1
+ uaddl v20.8h, v22.8b, v27.8b // p1 + q3
+ uaddl2 v21.8h, v22.16b, v27.16b
rshrn v9.8b, v12.8h, #4 // out q3
rshrn2 v9.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
add v13.8h, v13.8h, v11.8h
- sub v14.8h, v14.8h, v19.8h
sub v18.8h, v18.8h, v20.8h
- bif v4.16b, v22.16b, v15.16b // out p1
+ sub v19.8h, v19.8h, v21.8h
+ bif v5.16b, v23.16b, v15.16b // out p0
rshrn v10.8b, v12.8h, #4 // out q4
rshrn2 v10.16b, v13.8h, #4
- add v12.8h, v12.8h, v14.8h // - (p1 + q3) + (q6 + q6)
- add v13.8h, v13.8h, v18.8h
+ add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
+ add v13.8h, v13.8h, v19.8h
rshrn v11.8b, v12.8h, #4 // out q5
rshrn2 v11.16b, v13.8h, #4
- bif v5.16b, v23.16b, v15.16b // out p0
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2