shithub: dav1d

Download patch

ref: b83cb9643bc647f6ef633330b52cb68882338067
parent: 289ca2ceaef841babb7ec3a302e0c7c625a54899
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Fri Jan 17 08:42:54 EST 2020

x86: replace "mov hb, Xb" by "movzx hd, Xb" in MC

It's a little easier for the CPU to simply overwrite a 32-bit reg rather
than writing its low 8 bits while preserving bits 8 to 31.
In order to do the latter, it actually fetches those bits, merges them
into a 32-bit value, and writes that back to the 32-bit GPR.

As bits 8 to 31 are always cleared at these points, perform a
zero-extending mov to dword instead.

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -566,7 +566,7 @@
     lea                 t2d, [hq+(3<<8)]
 .v_w128_loop:
     PUT_BILIN_V_W32
-    mov                  hb, t2b
+    movzx                hd, t2b
     add                  t0, 32
     add                  t1, 32
     mov                dstq, t0
@@ -1492,7 +1492,7 @@
     add                tmpq, 32*16
     sub                  hd, 2
     jg .v_w128_loop
-    mov                  hb, t2b
+    movzx                hd, t2b
     add                  t0, 64
     add                  t1, 32
     mov                tmpq, t0
@@ -1758,7 +1758,7 @@
     mova        [tmpq-32*4], m2
     sub                  hd, 2
     jg .hv_w64_loop
-    mov                  hb, t2b
+    movzx                hd, t2b
     add                  t0, 32
     add                  t1, 16
     mov                tmpq, t0
@@ -1841,7 +1841,7 @@
     add                tmpq, 32*16
     sub                  hd, 2
     jg .hv_w128_loop
-    mov                  hb, t2b
+    movzx                hd, t2b
     add                  t0, mmsize
     add                  t1, mmsize/2
     mov                tmpq, t0
@@ -2247,7 +2247,7 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w16_loop
-    mov                  hb, r6b
+    movzx                hd, r6b
     add                  r4, 16
     add                  r7, 16
     mov                dstq, r4
@@ -2516,7 +2516,7 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .hv_w8_loop
-    mov                  hb, r6b
+    movzx                hd, r6b
     add                  r4, 8
     add                  r7, 8
     mov                dstq, r4