ref: cbec1241fe6f6d43710348a5c04aad1b81059f19
parent: 14dc2038d8f55918ddb49ca13fcdbbbb9bcc1100
author: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
date: Fri Mar 8 15:48:58 EST 2019
x86: optimize AVX2 cdef_dir. This optimization is so tiny that we can't even see it in checkasm. The only actual difference is the removal of a memory load (the shufw_210xxxxx pshufb mask is replaced by a pshuflw with an immediate), so it has to be better.
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -33,7 +33,6 @@
div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
dd 420, 210, 140, 105
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
-shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_128: times 2 dw 128
pw_2048: times 2 dw 2048
@@ -600,9 +599,8 @@
; and [upper half]:
; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
- ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+ ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
- vbroadcasti128 m14, [shufw_210xxxxx]
pslldq m4, m11, 2
psrldq m11, 14
pslldq m5, m12, 4
@@ -616,7 +614,7 @@
paddw m11, m13 ; partial_sum_alt[3/2] right
vbroadcasti128 m13, [div_table+32]
paddw m4, m5 ; partial_sum_alt[3/2] left
- pshufb m11, m14
+ pshuflw m11, m11, q3012
punpckhwd m6, m4, m11
punpcklwd m4, m11
pmaddwd m6, m6
@@ -631,7 +629,7 @@
; and [upper half]:
; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
- ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+ ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
pslldq m5, m1, 2
psrldq m1, 14
@@ -644,7 +642,7 @@
paddw m6, m7
paddw m1, m3 ; partial_sum_alt[0/1] right
paddw m5, m6 ; partial_sum_alt[0/1] left
- pshufb m1, m14
+ pshuflw m1, m1, q3012
punpckhwd m6, m5, m1
punpcklwd m5, m1
pmaddwd m6, m6