shithub: dav1d

Download patch

ref: de561b3ba1598e269b00847406181158b1f91d1f
parent: f404c7227ec95dab2aba6d3816ccff1eb36f223f
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri Sep 27 19:44:34 EDT 2019

x86: Increase precision of AVX2 IDCT intermediates

The existing code was using 16-bit intermediate precision for certain
calculations which is insufficient for some esoteric edge cases.

--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -50,7 +50,6 @@
 pw_m3344_3344:  dw -3344,  3344
 pw_m3803_3344:  dw -3803,  3344
 pw_m3803_m6688: dw -3803, -6688
-COEF_PAIR           2896,  2896
 pw_2896_m2896:  dw  2896, -2896
 
 pw_5:      times 2 dw 5
@@ -63,6 +62,7 @@
 
 pd_2048: dd 2048
 
+COEF_PAIR 2896, 2896
 COEF_PAIR 1567, 3784
 COEF_PAIR 3784, 1567
 COEF_PAIR  201, 4091
@@ -194,7 +194,7 @@
 
 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
-%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
     punpckhwd           m%3, m%2, m%1
     punpcklwd           m%2, m%1
 %if %7 < 32
@@ -222,20 +222,20 @@
     paddd               m%2, m%5
     psrad               m%3, 12
     psrad               m%2, 12
+%if %0 == 8
+    packssdw            m%8, m%2, m%3
+%else
     packssdw            m%2, m%3
+%endif
 %endmacro
 
 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
-    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
-    vpbroadcastd        m%6, [o(pw_2896x8)]
-    paddw               m%5, m%1, m%3
-    psubw               m%1, m%3
-    pmulhrsw            m%1, m%6 ; t1
-    pmulhrsw            m%5, m%6 ; t0
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 (8th arg: t3 is packed into m%5)
+    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 (8th arg: t0 is packed into m%4)
     psubsw              m%3, m%1, m%2
     paddsw              m%2, m%1
-    paddsw              m%1, m%5, m%4
-    psubsw              m%4, m%5, m%4
+    paddsw              m%1, m%4, m%5 ; t0 + t3
+    psubsw              m%4, m%5      ; t0 - t3
 %endmacro
 
 %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
@@ -246,27 +246,20 @@
     psubsw              m%2, m%6       ; t5a
     paddsw             m%10, m%8, m%4  ; t7
     psubsw              m%8, m%4       ; t6a
-    vpbroadcastd        m%4, [o(pw_2896x8)]
-    psubw               m%6, m%1, m%5
-    paddw               m%1, m%5
-    psubw               m%5, m%8, m%2
-    paddw               m%8, m%2
-    pmulhrsw            m%1, m%4       ; t0
-    pmulhrsw            m%6, m%4       ; t1
-    pmulhrsw            m%8, m%4       ; t6
-    pmulhrsw            m%5, m%4       ; t5
-    psubsw              m%4, m%1, m%7  ; dct4 out3
-    paddsw              m%1, m%7       ; dct4 out0
-    paddsw              m%7, m%6, m%3  ; dct4 out1
-    psubsw              m%6, m%3       ; dct4 out2
-    paddsw              m%2, m%7, m%8  ; out1
-    psubsw              m%7, m%8       ; out6
+    ITX_MULSUB_2W        %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
+    ITX_MULSUB_2W        %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
+    psubsw              m%6, m%1, m%3  ; dct4 out2
+    paddsw              m%3, m%1       ; dct4 out1
+    paddsw              m%1, m%5, m%7  ; dct4 out0
+    psubsw              m%5, m%7       ; dct4 out3
+    psubsw              m%7, m%3, m%2  ; out6
+    paddsw              m%2, m%3       ; out1
+    paddsw              m%3, m%6, m%8  ; out2
+    psubsw              m%6, m%8       ; out5
     psubsw              m%8, m%1, m%10 ; out7
     paddsw              m%1, m%10      ; out0
-    paddsw              m%3, m%6, m%5  ; out2
-    psubsw              m%6, m%5       ; out5
-    psubsw              m%5, m%4, m%9  ; out4
-    paddsw              m%4, m%9       ; out3
+    paddsw              m%4, m%5, m%9  ; out3
+    psubsw              m%5, m%9       ; out4
 %endmacro
 
 ; in1 = %1, in3  = %2, in5  = %3, in7  = %4
@@ -286,20 +279,16 @@
     paddsw              m%1, m%5      ; t8
     ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
     ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
-    vpbroadcastd       m%10, [o(pw_2896x8)]
-    psubsw              m%5, m%2, m%9 ; t10
-    paddsw              m%2, m%9      ; t9
-    psubsw              m%9, m%1, m%3 ; t11a
+    psubsw              m%5, m%1, m%3 ; t11a
     paddsw              m%1, m%3      ; t8a
     psubsw              m%3, m%7, m%4 ; t13
     paddsw              m%7, m%4      ; t14
     psubsw              m%4, m%8, m%6 ; t12a
     paddsw              m%8, m%6      ; t15a
-    paddw               m%6, m%3, m%5 ; t13a
-    psubw               m%3, m%5      ; t10a
-    paddw               m%5, m%4, m%9 ; t12
-    psubw               m%4, m%9      ; t11
-    REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
+    psubsw              m%6, m%2, m%9 ; t10
+    paddsw              m%2, m%9      ; t9
+    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
+    ITX_MULSUB_2W        %4, %5, %9, %10, %11, 2896, 2896 ; t11,  t12
 %endmacro
 
 %macro WRAP_XMM 1+
@@ -446,21 +435,14 @@
 %endif
 %endmacro
 
-%macro IDCT4_1D_PACKED 0-1 ; pw_2896x8
+%macro IDCT4_1D_PACKED 0 ; in/out: m0-m1, uses m2-m4 (optional pw_2896x8 argument dropped)
     vpbroadcastd         m4, [o(pd_2048)]
     punpckhwd            m2, m1, m0
-    psubw                m3, m0, m1
-    paddw                m0, m1
-    punpcklqdq           m0, m3
-    ITX_MUL2X_PACK        2, 1, 3, 4, 1567, 3784
-%if %0 == 1
-    pmulhrsw             m0, m%1
-%else
-    vpbroadcastd         m4, [o(pw_2896x8)]
-    pmulhrsw             m0, m4     ; t0 t1
-%endif
-    psubsw               m1, m0, m2 ; out3 out2
-    paddsw               m0, m2     ; out0 out1
+    punpcklwd            m1, m0     ; low words of m1/m0 interleaved as input to the 2896 multiply
+    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784 ; t3 t2
+    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896 ; t0 t1
+    paddsw               m0, m1, m2 ; out0 out1
+    psubsw               m1, m2     ; out3 out2
 %endmacro
 
 %macro IADST4_1D_PACKED 0
@@ -683,30 +665,30 @@
     vpbroadcastd         m6, [o(pd_2048)]
     punpckhwd            m5, m3, m0 ; in7 in1
     punpckhwd            m4, m1, m2 ; in3 in5
-    punpcklwd            m3, m1     ; in2 in6
-    psubw                m1, m0, m2
-    paddw                m0, m2
-    punpcklqdq           m0, m1     ; in0+in4 in0-in4
-    ITX_MUL2X_PACK        5, 1, 2, 6,  799, 4017, 1 ; t4a t7a
-    ITX_MUL2X_PACK        4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
-    ITX_MUL2X_PACK        3, 1, 2, 6, 1567, 3784    ; t3 t2
-    vpbroadcastd         m6, [o(pw_2896x8)]
-    psubsw               m2, m5, m4 ; t4 t7
-    paddsw               m5, m4     ; t5a t6a
-    pshufd               m4, m2, q1032
-    psubw                m1, m2, m4
-    paddw                m4, m2
-    vpblendd             m4, m4, m1, 0xcc
-    pmulhrsw             m0, m6     ; t0 t1
-    pmulhrsw             m4, m6     ; t6 t5
-    psubsw               m1, m0, m3 ; tmp3 tmp2
-    paddsw               m0, m3     ; tmp0 tmp1
-    shufps               m2, m5, m4, q1032 ; t7 t6
-    vpblendd             m5, m5, m4, 0xcc  ; t4 t5
-    psubsw               m3, m0, m2 ; out7 out6
-    paddsw               m0, m2     ; out0 out1
-    psubsw               m2, m1, m5 ; out4 out5
-    paddsw               m1, m5     ; out3 out2
+    punpcklwd            m3, m1     ; in6 in2
+    punpcklwd            m2, m0     ; in4 in0
+    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
+    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
+    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
+    paddsw               m4, m5     ; t4  t7  (interleaved)
+    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
+    vpbroadcastd         m1, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        0, 1, _, 6, 1, 5, 4 ; t6 t5
+%if mmsize > 16
+    vbroadcasti128       m1, [o(deint_shuf)]
+    pshufb               m4, m1
+%else
+    pshufb               m4, [o(deint_shuf)]
+%endif
+    psubsw               m1, m2, m3 ; tmp3 tmp2
+    paddsw               m3, m2     ; tmp0 tmp1
+    shufps               m2, m4, m0, q1032 ; t7 t6
+    vpblendd             m4, m0, 0xcc      ; t4 t5
+    paddsw               m0, m3, m2 ; out0 out1
+    psubsw               m3, m2     ; out7 out6
+    psubsw               m2, m1, m4 ; out4 out5
+    paddsw               m1, m4     ; out3 out2
 %endmacro
 
 %macro IADST8_1D_PACKED 1 ; pass
@@ -797,10 +779,10 @@
 cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120
     vpermq               m1, [cq+32*1], q3120
-    vpbroadcastd         m5, [o(pw_2896x8)]
-    pmulhrsw             m0, m5
-    pmulhrsw             m1, m5
-    IDCT4_1D_PACKED       5
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    IDCT4_1D_PACKED
     vbroadcasti128       m2, [o(deint_shuf)]
     shufps               m3, m0, m1, q1331
     shufps               m0, m0, m1, q0220
@@ -1011,9 +993,7 @@
     vpbroadcastd        m10, [o(pd_2048)]
 .main2:
     punpckhwd            m8, m7, m0 ; dct16 in15 in1
-    paddw                m9, m0, m4
-    psubw                m0, m4
-    punpcklqdq           m9, m0     ; dct4  in0+in2 in0-in2
+    punpcklwd            m9, m4, m0 ; dct4  in2  in0
     punpckhwd            m0, m3, m4 ; dct16 in7  in9
     punpcklwd            m7, m1     ; dct8  in7  in1
     punpckhwd            m1, m6     ; dct16 in3  in13
@@ -1024,47 +1004,44 @@
     ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
     ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
     ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
-    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 1 ; t4a  t7a
-    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 1 ; t5a  t6a
+    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 3 ; t4a  t7a
+    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 3 ; t5a  t6a
     ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
     psubsw               m2, m8, m0 ; t9  t14
     paddsw               m8, m0     ; t8  t15
     psubsw               m0, m1, m5 ; t10 t13
     paddsw               m1, m5     ; t11 t12
-%if mmsize > 16
-    vbroadcasti128       m5, [o(deint_shuf)]
-%else
-    mova                 m5, [o(deint_shuf)]
-%endif
-    pshufb               m8, m5
-    pshufb               m1, m5
     vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
-    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 4   ; t9a  t14a
+    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 6   ; t9a  t14a
     vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
-    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 4   ; t10a t13a
-    psubsw               m5, m7, m3 ; t5a t6a
-    paddsw               m7, m3     ; t4  t7
+    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 6   ; t10a t13a
     psubsw               m4, m8, m1 ; t11a t12a
     paddsw               m8, m1     ; t8a  t15a
-    paddsw               m1, m2, m0 ; t9   t14
+    psubsw               m1, m7, m3 ; t5a  t6a
+    paddsw               m7, m3     ; t4   t7
+    paddsw               m3, m2, m0 ; t9   t14
     psubsw               m2, m0     ; t10  t13
-    punpckhqdq           m0, m8, m1 ; t15a t14
-    punpcklqdq           m8, m1     ; t8a  t9
-    pshufd               m3, m5, q1032
-    psubw                m1, m5, m3
-    paddw                m3, m5
-    vpblendd             m3, m3, m1, 0xcc ; t6 t5
-    vpbroadcastd         m1, [o(pw_2896x8)]
-    punpckhqdq           m5, m4, m2 ; t12a t13
-    punpcklqdq           m2, m4, m2 ; t11a t10
-    psubw                m4, m5, m2
-    paddw                m5, m2
-    pmulhrsw             m9, m1     ; t0   t1
-    pmulhrsw             m3, m1     ; t6   t5
-    pmulhrsw             m4, m1     ; t11  t10a
-    pmulhrsw             m5, m1     ; t12  t13a
-    shufps               m2, m7, m3, q1032 ; t7 t6
-    vpblendd             m7, m7, m3, 0xcc  ; t4 t5
+%if mmsize > 16
+    vbroadcasti128       m0, [o(deint_shuf)]
+%else
+    mova                 m0, [o(deint_shuf)]
+%endif
+    pshufb               m8, m0
+    pshufb               m7, m0
+    pshufb               m3, m0
+    ITX_MUL2X_PACK        9, 0, 5, 10, 2896, 2896 ; t0   t1
+    vpbroadcastd         m0, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        4, 5, _, 10, 5, 0, 4    ; t11  t12
+    vpbroadcastd         m5, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK        1, 0, _, 10, 0, 5, 4    ; t6   t5
+    vpbroadcastd         m0, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4    ; t13a t10a (no trailing comma: it would add an empty macro argument)
+    punpckhqdq           m0, m8, m3        ; t15a t14
+    punpcklqdq           m8, m3            ; t8a  t9
+    shufps               m5, m4, m2, q1032 ; t12  t13a
+    vpblendd             m4, m2, 0xcc      ; t11  t10a
+    shufps               m2, m7, m1, q1032 ; t7 t6
+    vpblendd             m7, m1, 0xcc      ; t4 t5
     psubsw               m1, m9, m6 ; dct4 out3 out2
     paddsw               m9, m6     ; dct4 out0 out1
     psubsw               m3, m9, m2 ; dct8 out7 out6
@@ -3699,12 +3676,11 @@
     paddsw               m6, m11      ; t17  t30
     psubsw              m11, m0, m14  ; t21  t26
     paddsw               m0, m14      ; t22  t25
-    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 1 ; t18a t29a
-    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 1 ; t19  t28
-    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 1 ; t20  t27
-    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
+    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
+    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
+    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
+    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
     vbroadcasti128      m12, [o(deint_shuf)]
-    REPX    {pshufb x, m12}, m0, m1, m6, m8
     psubsw              m14, m1, m8   ; t23  t24
     paddsw               m1, m8       ; t16  t31
     psubsw               m8, m6, m0   ; t22a t25a
@@ -3713,16 +3689,18 @@
     paddsw              m15, m11      ; t18  t29
     psubsw              m11, m13, m9  ; t20a t27a
     paddsw              m13, m9       ; t19a t28a
-    vpbroadcastd        m12, [o(pw_2896x8)]
-    punpcklqdq           m9, m11, m0  ; t20a t21
-    punpckhqdq          m11, m0       ; t27a t26
-    punpcklqdq           m0, m14, m8  ; t23  t22a
-    punpckhqdq          m14, m8       ; t24  t25a
-    psubw                m8, m11, m9  ; t20  t21a
-    paddw               m11, m9       ; t27  t26a
-    psubw                m9, m14, m0  ; t23a t22
-    paddw               m14, m0       ; t24a t25
-    REPX  {pmulhrsw x, m12}, m8, m9, m14, m11
+    REPX    {pshufb x, m12}, m1, m6, m15, m13
+    ITX_MUL2X_PACK       14,  9, 12, 10, 2896, 2896 ; t24a t23a
+    vpbroadcastd         m9, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        8, 12,  _, 10, 12,  9, 4  ; t22  t25
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK        0, 12,  _, 10, 12,  9, 4  ; t21a t26a
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK       11,  9,  _, 10,  9, 12, 4  ; t27  t20
+    shufps               m9, m14, m8, q1032 ; t23a t22
+    vpblendd            m14, m8, 0xcc       ; t24a t25
+    shufps               m8, m11, m0, q1032 ; t20  t21a
+    vpblendd            m11, m0, 0xcc       ; t27  t26a
     punpcklqdq           m0, m1, m6   ; t16  t17a
     punpckhqdq           m1, m6       ; t31  t30a
     psubsw              m10, m5, m8   ; out20 out21
@@ -4327,7 +4305,6 @@
     mova                 m5, [rsp+gprsize+32*0] ; t22
     mova                 m6, [rsp+gprsize+32*1] ; t23
     mova                 m3, [rsp+gprsize+32*2] ; t24a
-    vpbroadcastd         m8, [o(pw_2896x8)]
     psubsw               m1, m14, m5  ; t22a
     paddsw              m14, m5       ; t17a
     psubsw               m5, m0, m6   ; t23
@@ -4334,26 +4311,23 @@
     paddsw               m0, m6       ; t16
     psubsw               m6, m4, m3   ; t24
     paddsw               m4, m3       ; t31
+    vpbroadcastd         m8, [o(pw_m2896_2896)]
+    vpbroadcastd         m3, [o(pw_2896_2896)]
     mova       [tmp1q-32*4], m0
     mova       [tmp1q-32*3], m14
     mova       [tmp2q+32*3], m4
-    psubw                m3, m13, m9  ; t20
-    paddw               m13, m9       ; t27
-    psubw                m9, m2, m10  ; t21a
-    paddw                m2, m10      ; t26a
-    psubw               m10, m7, m1   ; t22
-    paddw                m7, m1       ; t25
-    psubw                m1, m6, m5   ; t23a
-    paddw                m6, m5       ; t24a
-    REPX   {pmulhrsw x, m8}, m3, m13, m9, m2, m10, m7, m1, m6
-    mova       [tmp1q+32*0], m3
-    mova       [tmp1q+32*1], m9
-    mova       [tmp1q+32*2], m10
-    mova       [tmp1q+32*3], m1
-    mova       [tmp2q-32*4], m6
-    mova       [tmp2q-32*3], m7
-    mova       [tmp2q-32*2], m2
-    mova       [tmp2q-32*1], m13
+    ITX_MULSUB_2W        13,  9,  0,  4, 15,  3,  8 ; t20,  t27
+    ITX_MULSUB_2W         2, 10,  0,  4, 15,  3,  8 ; t21a, t26a
+    ITX_MULSUB_2W         7,  1,  0,  4, 15,  3,  8 ; t22,  t25
+    ITX_MULSUB_2W         6,  5,  0,  4, 15,  3,  8 ; t23a, t24a
+    mova       [tmp1q+32*0], m13
+    mova       [tmp1q+32*1], m2
+    mova       [tmp1q+32*2], m7
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q-32*4], m5
+    mova       [tmp2q-32*3], m1
+    mova       [tmp2q-32*2], m10
+    mova       [tmp2q-32*1], m9
     ret
 ALIGN function_align
 .transpose_2x8x8_round:
@@ -5237,11 +5211,10 @@
     sub                 rax, o_idct64_offset + 8
     vpbroadcastd        m11, [o(pw_1567_3784)]
     vpbroadcastd        m12, [o(pw_m3784_1567)]
-    vpbroadcastd        m13, [o(pw_m1567_m3784)]
-    vpbroadcastd        m14, [o(pw_2896x8)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
 .main_part2_pass1_loop:
     call .main_part2_internal
-    REPX  {pmulhrsw x, m14}, m1, m2, m4, m3
     IDCT64_PART2_END      0,  7,  0,  6,  9, 10
     IDCT64_PART2_END      7,  8,  5,  0,  6,  7
     IDCT64_PART2_END      8,  2,  1,  0,  6,  7
@@ -5251,44 +5224,42 @@
     ret
 .main_part2_internal:
     mova                 m0, [tmp1q-32*12] ; t32a
-    mova                 m1, [tmp2q-32*13] ; t39a
-    mova                 m2, [tmp1q-32* 4] ; t40a
+    mova                 m6, [tmp2q-32*13] ; t39a
+    mova                 m1, [tmp1q-32* 4] ; t40a
     mova                 m5, [tmp2q+32* 3] ; t55a
     add               tmp1q, 32
     sub               tmp2q, 32
-    mova                 m4, [tmp1q+32* 3] ; t48a
-    mova                 m3, [tmp2q-32* 4] ; t47a
-    mova                 m6, [tmp1q+32*11] ; t56a
+    mova                 m2, [tmp1q+32* 3] ; t48a
+    mova                 m4, [tmp2q-32* 4] ; t47a
+    mova                 m3, [tmp1q+32*11] ; t56a
     mova                 m7, [tmp2q+32*12] ; t63a
-    psubsw               m8, m0, m1 ; t39
-    paddsw               m0, m1     ; t32
-    psubsw               m1, m3, m2 ; t40
-    paddsw               m3, m2     ; t47
-    psubsw               m2, m4, m5 ; t55
-    paddsw               m4, m5     ; t48
-    psubsw               m5, m7, m6 ; t56
-    paddsw               m7, m6     ; t63
-    ITX_MULSUB_2W         5,  8,  6,  9, 15, 11, 12 ; t39a, t56a
-    ITX_MULSUB_2W         2,  1,  6,  9, 15, 12, 13 ; t40a, t55a
-    psubsw               m6, m0, m3 ; t47a
-    paddsw               m0, m3     ; t32a
-    psubsw               m3, m7, m4 ; t48a
-    paddsw               m7, m4     ; t63a
-    psubsw               m4, m5, m2 ; t40
-    paddsw               m5, m2     ; t39
-    psubsw               m2, m8, m1 ; t55
-    paddsw               m8, m1     ; t56
-    psubw                m1, m2, m4 ; t40a
-    paddw                m2, m4     ; t55a
-    psubw                m4, m3, m6 ; t47
-    paddw                m3, m6     ; t48
+    psubsw               m8, m0, m6 ; t39
+    paddsw               m0, m6     ; t32
+    psubsw               m6, m4, m1 ; t40
+    paddsw               m4, m1     ; t47
+    psubsw               m1, m2, m5 ; t55
+    paddsw               m2, m5     ; t48
+    psubsw               m5, m7, m3 ; t56
+    paddsw               m7, m3     ; t63
+    ITX_MULSUB_2W         5,  8,  3,  9, 15, 11, 12 ; t39a, t56a
+    vpbroadcastd         m9, [o(pw_m1567_m3784)]
+    ITX_MULSUB_2W         1,  6,  3,  9, 15, 12,  9 ; t40a, t55a
+    psubsw               m3, m0, m4 ; t47a
+    paddsw               m0, m4     ; t32a
+    psubsw               m4, m7, m2 ; t48a
+    paddsw               m7, m2     ; t63a
+    psubsw               m2, m5, m1 ; t40
+    paddsw               m5, m1     ; t39
+    psubsw               m1, m8, m6 ; t55
+    paddsw               m8, m6     ; t56
+    ITX_MULSUB_2W         4,  3,  6,  9, 15, 13, 14 ; t47,  t48
+    ITX_MULSUB_2W         1,  2,  6,  9, 15, 13, 14 ; t40a, t55a
     ret
 .main_part2_pass2:
     sub                 rax, o_idct64_offset + 8
     vpbroadcastd        m11, [o(pw_1567_3784)]
     vpbroadcastd        m12, [o(pw_m3784_1567)]
-    vpbroadcastd        m13, [o(pw_m1567_m3784)]
-    vpbroadcastd        m14, [o(pw_2048)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
     lea                  r9, [strideq*5]    ; stride*5
     lea                  r3, [r9+strideq*1] ; stride*6
     lea                  r7, [r9+strideq*2] ; stride*7
@@ -5295,9 +5266,9 @@
     lea                  r8, [r3+strideq*2] ; stride*8
     lea                  r2, [dstq+r7]
 .main_part2_pass2_loop:
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
     call .main_part2_internal
-    vpbroadcastd        m10, [o(pw_2896x8)]
-    REPX  {pmulhrsw x, m10}, m1, m2, m4, m3
+    vpbroadcastd        m14, [o(pw_2048)]
     IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
     IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
     IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8