shithub: dav1d

Download patch

ref: 641ef4cc9fa2a12d53be2e75e9690e7b8ff4e605
parent: ba64f052353906398cf78e79bab7fbb8c468b002
author: Martin Storsjö <martin@martin.st>
date: Wed Nov 13 06:09:05 EST 2019

arm64: ipred: Avoid data dependencies with consecutive dup instructions

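Previously the final shift/multiply result was kept in v0, and the
first dup both read v0.b[0] and overwrote v0, so each following dup
(also reading v0.b[0]) depended on the output of the first one. Keep
the scalar result in v4 instead and broadcast from v4.b[0], so the
consecutive dup instructions are independent of each other.

This is around one cycle faster.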

--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -608,7 +608,7 @@
         cmp             w4,  #32
         add             v0.4h,   v0.4h,   v1.4h
         add             v0.4h,   v0.4h,   v2.4h
-        ushl            v0.4h,   v0.4h,   v17.4h
+        ushl            v4.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 8/16/64
         cmp             w4,  #8
@@ -616,10 +616,10 @@
         mov             w17, #(0x5556/2)
         csel            w16, w16, w17, eq
         dup             v16.4h,  w16
-        sqdmulh         v0.4h,   v0.4h,   v16.4h
+        sqdmulh         v4.4h,   v4.4h,   v16.4h
 1:
-        dup             v0.16b,  v0.b[0]
-        dup             v1.16b,  v0.b[0]
+        dup             v0.16b,  v4.b[0]
+        dup             v1.16b,  v4.b[0]
 2:
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
@@ -656,7 +656,7 @@
         cmp             w4,  #64
         add             v0.4h,   v0.4h,   v1.4h
         add             v0.4h,   v0.4h,   v3.4h
-        ushl            v0.4h,   v0.4h,   v17.4h
+        ushl            v4.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 16/32
         mov             w16, #(0x5556/2)
@@ -663,12 +663,12 @@
         movk            w16, #(0x3334/2), lsl #16
         lsr             w16, w16, w4
         dup             v16.4h,  w16
-        sqdmulh         v0.4h,   v0.4h,   v16.4h
+        sqdmulh         v4.4h,   v4.4h,   v16.4h
 1:
-        dup             v0.16b,  v0.b[0]
-        dup             v1.16b,  v0.b[0]
-        dup             v2.16b,  v0.b[0]
-        dup             v3.16b,  v0.b[0]
+        dup             v0.16b,  v4.b[0]
+        dup             v1.16b,  v4.b[0]
+        dup             v2.16b,  v4.b[0]
+        dup             v3.16b,  v4.b[0]
 2:
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1