shithub: dav1d

Download patch

ref: cbec1241fe6f6d43710348a5c04aad1b81059f19
parent: 14dc2038d8f55918ddb49ca13fcdbbbb9bcc1100
author: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
date: Fri Mar 8 15:48:58 EST 2019

x86: optimize AVX2 cdef_dir

The gain from this optimization is too small to be visible in checkasm.
However, since the only actual difference is the removal of a memory load
(the pshufb mask is replaced by a pshuflw immediate), it has to be better.

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -33,7 +33,6 @@
 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
            dd 420, 210, 140, 105
 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
-shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 pw_128: times 2 dw 128
 pw_2048: times 2 dw 2048
@@ -600,9 +599,8 @@
     ; and [upper half]:
     ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
     ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
-    ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
 
-    vbroadcasti128 m14, [shufw_210xxxxx]
     pslldq          m4, m11, 2
     psrldq         m11, 14
     pslldq          m5, m12, 4
@@ -616,7 +614,7 @@
     paddw          m11, m13                 ; partial_sum_alt[3/2] right
     vbroadcasti128 m13, [div_table+32]
     paddw           m4, m5                  ; partial_sum_alt[3/2] left
-    pshufb         m11, m14
+    pshuflw        m11, m11, q3012
     punpckhwd       m6, m4, m11
     punpcklwd       m4, m11
     pmaddwd         m6, m6
@@ -631,7 +629,7 @@
     ; and [upper half]:
     ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
     ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
-    ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
 
     pslldq          m5, m1, 2
     psrldq          m1, 14
@@ -644,7 +642,7 @@
     paddw           m6, m7
     paddw           m1, m3                  ; partial_sum_alt[0/1] right
     paddw           m5, m6                  ; partial_sum_alt[0/1] left
-    pshufb          m1, m14
+    pshuflw         m1, m1, q3012
     punpckhwd       m6, m5, m1
     punpcklwd       m5, m1
     pmaddwd         m6, m6