shithub: dav1d

Download patch

ref: a755b6e3d3ea865e43f4d033b8c66e42cd559d15
parent: eb01bdb9763f3c1990d748682cc5b853fd05ca69
author: Henrik Gramner <gramner@twoorioles.com>
date: Sat Dec 15 14:02:07 EST 2018

Clip coefficients in SSSE3/AVX2 inverse transform asm

--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -231,10 +231,10 @@
     psubw               m%1, m%3
     pmulhrsw            m%1, m%6 ; t1
     pmulhrsw            m%5, m%6 ; t0
-    psubw               m%3, m%1, m%2
-    paddw               m%2, m%1
-    paddw               m%1, m%5, m%4
-    psubw               m%4, m%5, m%4
+    psubsw              m%3, m%1, m%2
+    paddsw              m%2, m%1
+    paddsw              m%1, m%5, m%4
+    psubsw              m%4, m%5, m%4
 %endmacro
 
 %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
@@ -241,10 +241,10 @@
     ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
     ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
     ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
-    paddw               m%9, m%2, m%6  ; t4
-    psubw               m%2, m%6       ; t5a
-    paddw              m%10, m%8, m%4  ; t7
-    psubw               m%8, m%4       ; t6a
+    paddsw              m%9, m%2, m%6  ; t4
+    psubsw              m%2, m%6       ; t5a
+    paddsw             m%10, m%8, m%4  ; t7
+    psubsw              m%8, m%4       ; t6a
     vpbroadcastd        m%4, [o(pw_2896x8)]
     psubw               m%6, m%1, m%5
     paddw               m%1, m%5
@@ -254,18 +254,18 @@
     pmulhrsw            m%6, m%4       ; t1
     pmulhrsw            m%8, m%4       ; t6
     pmulhrsw            m%5, m%4       ; t5
-    psubw               m%4, m%1, m%7  ; dct4 out3
-    paddw               m%1, m%7       ; dct4 out0
-    paddw               m%7, m%6, m%3  ; dct4 out1
-    psubw               m%6, m%3       ; dct4 out2
-    paddw               m%2, m%7, m%8  ; out1
-    psubw               m%7, m%8       ; out6
-    psubw               m%8, m%1, m%10 ; out7
-    paddw               m%1, m%10      ; out0
-    paddw               m%3, m%6, m%5  ; out2
-    psubw               m%6, m%5       ; out5
-    psubw               m%5, m%4, m%9  ; out4
-    paddw               m%4, m%9       ; out3
+    psubsw              m%4, m%1, m%7  ; dct4 out3
+    paddsw              m%1, m%7       ; dct4 out0
+    paddsw              m%7, m%6, m%3  ; dct4 out1
+    psubsw              m%6, m%3       ; dct4 out2
+    paddsw              m%2, m%7, m%8  ; out1
+    psubsw              m%7, m%8       ; out6
+    psubsw              m%8, m%1, m%10 ; out7
+    paddsw              m%1, m%10      ; out0
+    paddsw              m%3, m%6, m%5  ; out2
+    psubsw              m%6, m%5       ; out5
+    psubsw              m%5, m%4, m%9  ; out4
+    paddsw              m%4, m%9       ; out3
 %endmacro
 
 ; in1 = %1, in3  = %2, in5  = %3, in7  = %4
@@ -275,25 +275,25 @@
     ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
     ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
     ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
-    psubw               m%9, m%2, m%6 ; t13
-    paddw               m%6, m%2      ; t12
-    psubw               m%2, m%8, m%4 ; t14
-    paddw               m%8, m%4      ; t15
-    psubw               m%4, m%7, m%3 ; t10
-    paddw               m%3, m%7      ; t11
-    psubw               m%7, m%1, m%5 ; t9
-    paddw               m%1, m%5      ; t8
+    psubsw              m%9, m%2, m%6 ; t13
+    paddsw              m%6, m%2      ; t12
+    psubsw              m%2, m%8, m%4 ; t14
+    paddsw              m%8, m%4      ; t15
+    psubsw              m%4, m%7, m%3 ; t10
+    paddsw              m%3, m%7      ; t11
+    psubsw              m%7, m%1, m%5 ; t9
+    paddsw              m%1, m%5      ; t8
     ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
     ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
     vpbroadcastd       m%10, [o(pw_2896x8)]
-    psubw               m%5, m%2, m%9 ; t10
-    paddw               m%2, m%9      ; t9
-    psubw               m%9, m%1, m%3 ; t11a
-    paddw               m%1, m%3      ; t8a
-    psubw               m%3, m%7, m%4 ; t13
-    paddw               m%7, m%4      ; t14
-    psubw               m%4, m%8, m%6 ; t12a
-    paddw               m%8, m%6      ; t15a
+    psubsw              m%5, m%2, m%9 ; t10
+    paddsw              m%2, m%9      ; t9
+    psubsw              m%9, m%1, m%3 ; t11a
+    paddsw              m%1, m%3      ; t8a
+    psubsw              m%3, m%7, m%4 ; t13
+    paddsw              m%7, m%4      ; t14
+    psubsw              m%4, m%8, m%6 ; t12a
+    paddsw              m%8, m%6      ; t15a
     paddw               m%6, m%3, m%5 ; t13a
     psubw               m%3, m%5      ; t10a
     paddw               m%5, m%4, m%9 ; t12
@@ -458,8 +458,8 @@
     vpbroadcastd         m4, [o(pw_2896x8)]
     pmulhrsw             m0, m4     ; t0 t1
 %endif
-    psubw                m1, m0, m2 ; out3 out2
-    paddw                m0, m2     ; out0 out1
+    psubsw               m1, m0, m2 ; out3 out2
+    paddsw               m0, m2     ; out0 out1
 %endmacro
 
 %macro IADST4_1D_PACKED 0
@@ -693,8 +693,8 @@
     ITX_MUL2X_PACK        4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
     ITX_MUL2X_PACK        3, 1, 2, 6, 1567, 3784    ; t3 t2
     vpbroadcastd         m6, [o(pw_2896x8)]
-    psubw                m2, m5, m4 ; t4 t7
-    paddw                m5, m4     ; t5a t6a
+    psubsw               m2, m5, m4 ; t4 t7
+    paddsw               m5, m4     ; t5a t6a
     pshufd               m4, m2, q1032
     psubw                m1, m2, m4
     paddw                m4, m2
@@ -701,14 +701,14 @@
     vpblendd             m4, m4, m1, 0xcc
     pmulhrsw             m0, m6     ; t0 t1
     pmulhrsw             m4, m6     ; t6 t5
-    psubw                m1, m0, m3 ; tmp3 tmp2
-    paddw                m0, m3     ; tmp0 tmp1
+    psubsw               m1, m0, m3 ; tmp3 tmp2
+    paddsw               m0, m3     ; tmp0 tmp1
     shufps               m2, m5, m4, q1032 ; t7 t6
     vpblendd             m5, m5, m4, 0xcc  ; t4 t5
-    psubw                m3, m0, m2 ; out7 out6
-    paddw                m0, m2     ; out0 out1
-    psubw                m2, m1, m5 ; out4 out5
-    paddw                m1, m5     ; out3 out2
+    psubsw               m3, m0, m2 ; out7 out6
+    paddsw               m0, m2     ; out0 out1
+    psubsw               m2, m1, m5 ; out4 out5
+    paddsw               m1, m5     ; out3 out2
 %endmacro
 
 %macro IADST8_1D_PACKED 0
@@ -721,19 +721,19 @@
     ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
     ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
     ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
-    psubw                m4, m0, m2 ; t4 t5
-    paddw                m0, m2     ; t0 t1
-    psubw                m5, m1, m3 ; t6 t7
-    paddw                m1, m3     ; t2 t3
+    psubsw               m4, m0, m2 ; t4 t5
+    paddsw               m0, m2     ; t0 t1
+    psubsw               m5, m1, m3 ; t6 t7
+    paddsw               m1, m3     ; t2 t3
     shufps               m2, m5, m4, q1032
     punpckhwd            m4, m2
     punpcklwd            m5, m2
     ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
     ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
-    psubw                m2, m0, m1        ; t2 t3
-    paddw                m0, m1            ; out0 -out7
-    psubw                m1, m4, m5        ; t7 t6
-    paddw                m4, m5            ; out6 -out1
+    psubsw               m2, m0, m1        ; t2 t3
+    paddsw               m0, m1            ; out0 -out7
+    psubsw               m1, m4, m5        ; t7 t6
+    paddsw               m4, m5            ; out6 -out1
     vpbroadcastd         m5, [o(pw_2896x8)]
     vpblendd             m3, m0, m4, 0x33  ; out6 -out7
     vpblendd             m0, m0, m4, 0xcc  ; out0 -out1
@@ -981,10 +981,10 @@
     ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 1 ; t4a  t7a
     ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 1 ; t5a  t6a
     ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
-    psubw                m2, m8, m0 ; t9  t14
-    paddw                m8, m0     ; t8  t15
-    psubw                m0, m1, m5 ; t10 t13
-    paddw                m1, m5     ; t11 t12
+    psubsw               m2, m8, m0 ; t9  t14
+    paddsw               m8, m0     ; t8  t15
+    psubsw               m0, m1, m5 ; t10 t13
+    paddsw               m1, m5     ; t11 t12
 %if mmsize > 16
     vbroadcasti128       m5, [o(deint_shuf)]
 %else
@@ -996,12 +996,12 @@
     ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 4   ; t9a  t14a
     vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
     ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 4   ; t10a t13a
-    psubw                m5, m7, m3 ; t5a t6a
-    paddw                m7, m3     ; t4  t7
-    psubw                m4, m8, m1 ; t11a t12a
-    paddw                m8, m1     ; t8a  t15a
-    paddw                m1, m2, m0 ; t9   t14
-    psubw                m2, m0     ; t10  t13
+    psubsw               m5, m7, m3 ; t5a t6a
+    paddsw               m7, m3     ; t4  t7
+    psubsw               m4, m8, m1 ; t11a t12a
+    paddsw               m8, m1     ; t8a  t15a
+    paddsw               m1, m2, m0 ; t9   t14
+    psubsw               m2, m0     ; t10  t13
     punpckhqdq           m0, m8, m1 ; t15a t14
     punpcklqdq           m8, m1     ; t8a  t9
     pshufd               m3, m5, q1032
@@ -1019,20 +1019,20 @@
     pmulhrsw             m5, m1     ; t12  t13a
     shufps               m2, m7, m3, q1032 ; t7 t6
     vpblendd             m7, m7, m3, 0xcc  ; t4 t5
-    psubw                m1, m9, m6 ; dct4 out3 out2
-    paddw                m9, m6     ; dct4 out0 out1
-    psubw                m3, m9, m2 ; dct8 out7 out6
-    paddw                m9, m2     ; dct8 out0 out1
-    psubw                m2, m1, m7 ; dct8 out4 out5
-    paddw                m1, m7     ; dct8 out3 out2
-    psubw                m7, m9, m0 ; out15 out14
-    paddw                m0, m9     ; out0  out1
-    psubw                m6, m1, m5 ; out12 out13
-    paddw                m1, m5     ; out3  out2
-    psubw                m5, m2, m4 ; out11 out10
-    paddw                m2, m4     ; out4  out5
-    psubw                m4, m3, m8 ; out8  out9
-    paddw                m3, m8     ; out7  out6
+    psubsw               m1, m9, m6 ; dct4 out3 out2
+    paddsw               m9, m6     ; dct4 out0 out1
+    psubsw               m3, m9, m2 ; dct8 out7 out6
+    paddsw               m9, m2     ; dct8 out0 out1
+    psubsw               m2, m1, m7 ; dct8 out4 out5
+    paddsw               m1, m7     ; dct8 out3 out2
+    psubsw               m7, m9, m0 ; out15 out14
+    paddsw               m0, m9     ; out0  out1
+    psubsw               m6, m1, m5 ; out12 out13
+    paddsw               m1, m5     ; out3  out2
+    psubsw               m5, m2, m4 ; out11 out10
+    paddsw               m2, m4     ; out4  out5
+    psubsw               m4, m3, m8 ; out8  out9
+    paddsw               m3, m8     ; out7  out6
 %endmacro
 
 INV_TXFM_4X16_FN dct, dct,      0
@@ -1153,20 +1153,20 @@
     ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
     ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
     ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
-    psubw                m2, m0, m3 ; t9a  t8a  t11a t10a
-    paddw                m0, m3     ; t1a  t0a  t3a  t2a
-    psubw                m3, m1, m4 ; t13a t12a t15a t14a
-    paddw                m1, m4     ; t5a  t4a  t7a  t6a
+    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
+    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
+    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
+    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
     ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
     psubw                m6, m7, m5
     ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
     vpbroadcastd         m6, [o(pw_m3784_1567)]
     vpbroadcastd         m5, [o(pw_1567_3784)]
-    psubw                m4, m0, m1 ; t5   t4   t7   t6
-    paddw                m0, m1     ; t1   t0   t3   t2
-    psubw                m1, m2, m3 ; t13a t12a t15a t14a
-    paddw                m2, m3     ; t9a  t8a  t11a t10a
-    psubw                m3, m7, m6
+    psubsw               m4, m0, m1 ; t5   t4   t7   t6
+    paddsw               m0, m1     ; t1   t0   t3   t2
+    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
+    paddsw               m2, m3     ; t9a  t8a  t11a t10a
+    psubw                m3, m7, m6 ; pw_3784_m1567
     vpblendd             m6, m6, m3, 0xf0
     ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
     ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
@@ -1179,10 +1179,10 @@
     vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
     vpbroadcastd         m5, [o(pw_2896x8)]
     pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
-    psubw                m1, m0, m3        ; t3a t2a t11 t10
-    paddw                m0, m3     ; -out15  out0   out14 -out1
-    paddw                m3, m4, m2 ; -out3   out12  out2  -out13
-    psubw                m4, m2            ; t6 t7 t14a t15a
+    psubsw               m1, m0, m3        ; t3a t2a t11 t10
+    paddsw               m0, m3     ; -out15  out0   out14 -out1
+    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
+    psubsw               m4, m2            ; t6 t7 t14a t15a
     shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
     vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
     paddw                m1, m2, m4
@@ -1902,53 +1902,53 @@
     ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
     ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
     ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
-    psubw                m4, m0, m5 ; t9a  t8a
-    paddw                m0, m5     ; t1a  t0a
-    psubw                m5, m1, m6 ; t11a t10a
-    paddw                m1, m6     ; t3a  t2a
-    psubw                m6, m2, m7 ; t13a t12a
-    paddw                m2, m7     ; t5a  t4a
-    psubw                m7, m3, m8 ; t15a t14a
-    paddw                m3, m8     ; t7a  t6a
+    psubsw               m4, m0, m5 ; t9a  t8a
+    paddsw               m0, m5     ; t1a  t0a
+    psubsw               m5, m1, m6 ; t11a t10a
+    paddsw               m1, m6     ; t3a  t2a
+    psubsw               m6, m2, m7 ; t13a t12a
+    paddsw               m2, m7     ; t5a  t4a
+    psubsw               m7, m3, m8 ; t15a t14a
+    paddsw               m3, m8     ; t7a  t6a
     vpbroadcastd        m11, [o(pw_m4017_799)]
     vpbroadcastd        m12, [o(pw_799_4017)]
     pxor                 m9, m9
     ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
-    psubw                m8, m9, m11
+    psubw                m8, m9, m11 ; pw_4017_m799
     ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
     vpbroadcastd        m11, [o(pw_m2276_3406)]
     vpbroadcastd        m12, [o(pw_3406_2276)]
     ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
-    psubw                m8, m9, m11
+    psubw                m8, m9, m11 ; pw_2276_m3406
     ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
-    psubw                m8, m1, m3 ; t7   t6
-    paddw                m1, m3     ; t3   t2
-    psubw                m3, m0, m2 ; t5   t4
-    paddw                m0, m2     ; t1   t0
-    psubw                m2, m5, m7 ; t14a t15a
-    paddw                m7, m5     ; t10a t11a
-    psubw                m5, m4, m6 ; t12a t13a
-    paddw                m4, m6     ; t8a  t9a
+    psubsw               m8, m1, m3 ; t7   t6
+    paddsw               m1, m3     ; t3   t2
+    psubsw               m3, m0, m2 ; t5   t4
+    paddsw               m0, m2     ; t1   t0
+    psubsw               m2, m5, m7 ; t14a t15a
+    paddsw               m7, m5     ; t10a t11a
+    psubsw               m5, m4, m6 ; t12a t13a
+    paddsw               m4, m6     ; t8a  t9a
     vpbroadcastd        m11, [o(pw_m3784_1567)]
     vpbroadcastd        m12, [o(pw_1567_3784)]
     ITX_MUL2X_PACK        3, 6, _, 10, 11, 12, 4 ; t4a t5a
-    psubw                m6, m9, m11
+    psubw                m6, m9, m11 ; pw_3784_m1567
     ITX_MUL2X_PACK        8, 12, _, 10, 12, 6, 4 ; t6a t7a
     vpbroadcastd        m11, [o(pw_m1567_3784)]
     vpbroadcastd        m12, [o(pw_3784_1567)]
     ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 4 ; t15 t14
-    psubw                m6, m9, m11
+    psubw                m6, m9, m11 ; pw_1567_m3784
     ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 4 ; t13 t12
     vbroadcasti128      m11, [o(deint_shuf)]
     vpbroadcastd        m12, [o(pw_2896x8)]
-    psubw                m6, m0, m1        ;  t3a    t2a
-    paddw                m0, m1            ; -out15  out0
-    paddw                m1, m2, m5        ; -out13  out2
-    psubw                m5, m2            ;  t15a   t14a
-    paddw                m2, m4, m7        ; -out1  out14
-    psubw                m4, m7            ;  t10    t11
-    psubw                m7, m3, m8        ;  t6     t7
-    paddw                m8, m3            ; -out3   out12
+    psubsw               m6, m0, m1        ;  t3a    t2a
+    paddsw               m0, m1            ; -out15  out0
+    paddsw               m1, m2, m5        ; -out13  out2
+    psubsw               m5, m2            ;  t15a   t14a
+    paddsw               m2, m4, m7        ; -out1  out14
+    psubsw               m4, m7            ;  t10    t11
+    psubsw               m7, m3, m8        ;  t6     t7
+    paddsw               m8, m3            ; -out3   out12
     REPX    {pshufb x, m11}, m6, m4, m0, m2
     vpblendd             m3, m6, m4, 0xcc  ;  t3a    t11
     shufps               m6, m6, m4, q1032 ;  t2a    t10
@@ -2580,25 +2580,25 @@
     ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
     ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
     ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
-    psubw                m8, m2, m6 ; t6
-    paddw                m2, m6     ; t2
-    psubw                m6, m0, m4 ; t4
-    paddw                m0, m4     ; t0
-    psubw                m4, m5, m1 ; t7
-    paddw                m5, m1     ; t3
-    psubw                m1, m7, m3 ; t5
-    paddw                m7, m3     ; t1
+    psubsw               m8, m2, m6 ; t6
+    paddsw               m2, m6     ; t2
+    psubsw               m6, m0, m4 ; t4
+    paddsw               m0, m4     ; t0
+    psubsw               m4, m5, m1 ; t7
+    paddsw               m5, m1     ; t3
+    psubsw               m1, m7, m3 ; t5
+    paddsw               m7, m3     ; t1
     ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
     ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
-    psubw                m9, m6, m8 ;  t7
-    paddw                m6, m8     ;  out6
+    psubsw               m9, m6, m8 ;  t7
+    paddsw               m6, m8     ;  out6
     vpbroadcastd         m8, [o(pw_2896x8)]
-    psubw                m3, m7, m5 ;  t3
-    paddw                m7, m5     ; -out7
-    psubw                m5, m0, m2 ;  t2
-    paddw                m0, m2     ;  out0
-    psubw                m2, m1, m4 ;  t6
-    paddw                m1, m4     ; -out1
+    psubsw               m3, m7, m5 ;  t3
+    paddsw               m7, m5     ; -out7
+    psubsw               m5, m0, m2 ;  t2
+    paddsw               m0, m2     ;  out0
+    psubsw               m2, m1, m4 ;  t6
+    paddsw               m1, m4     ; -out1
     psubw                m4, m5, m3
     paddw                m3, m5
     psubw                m5, m2, m9
@@ -2959,25 +2959,25 @@
     mova [rsp+gprsize+32*0], m6  ; tmp3
     IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
     mova                 m6, [rsp+gprsize+32*1] ; tmp5
-    psubw               m15, m0, m14  ; out15
-    paddw                m0, m14      ; out0
-    psubw               m14, m2, m13  ; out14
-    paddw                m2, m13      ; out1
+    psubsw              m15, m0, m14  ; out15
+    paddsw               m0, m14      ; out0
+    psubsw              m14, m2, m13  ; out14
+    paddsw               m2, m13      ; out1
     mova [rsp+gprsize+32*1], m2
-    psubw               m13, m4, m11  ; out13
-    paddw                m2, m4, m11  ; out2
-    psubw               m11, m8, m7   ; out11
-    paddw                m4, m8, m7   ; out4
+    psubsw              m13, m4, m11  ; out13
+    paddsw               m2, m4, m11  ; out2
+    psubsw              m11, m8, m7   ; out11
+    paddsw               m4, m8, m7   ; out4
     mova                 m7, [rsp+gprsize+32*2] ; tmp7
-    psubw               m10, m6, m5   ; out10
-    paddw                m5, m6       ; out5
-    psubw                m8, m7, m9   ; out8
-    paddw                m7, m9       ; out7
-    psubw                m9, m12, m3  ; out9
-    paddw                m6, m12, m3  ; out6
+    psubsw              m10, m6, m5   ; out10
+    paddsw               m5, m6       ; out5
+    psubsw               m8, m7, m9   ; out8
+    paddsw               m7, m9       ; out7
+    psubsw               m9, m12, m3  ; out9
+    paddsw               m6, m12, m3  ; out6
     mova                 m3, [rsp+gprsize+32*0] ; tmp3
-    psubw               m12, m3, m1   ; out12
-    paddw                m3, m1       ; out3
+    psubsw              m12, m3, m1   ; out12
+    paddsw               m3, m1       ; out3
     ret
 
 INV_TXFM_16X16_FN adst, dct
@@ -3012,24 +3012,24 @@
     ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
     ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
     ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
-    psubw                m0, m2, m10  ; t10a
-    paddw                m2, m10      ; t2a
-    psubw               m10, m13, m5  ; t11a
-    paddw               m13, m5       ; t3a
-    psubw                m5, m6, m14  ; t14a
-    paddw                m6, m14      ; t6a
-    psubw               m14, m9, m1   ; t15a
-    paddw                m9, m1       ; t7a
+    psubsw               m0, m2, m10  ; t10a
+    paddsw               m2, m10      ; t2a
+    psubsw              m10, m13, m5  ; t11a
+    paddsw              m13, m5       ; t3a
+    psubsw               m5, m6, m14  ; t14a
+    paddsw               m6, m14      ; t6a
+    psubsw              m14, m9, m1   ; t15a
+    paddsw               m9, m1       ; t7a
     ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
     ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
-    psubw                m1, m10, m14 ; t14a
-    paddw               m10, m14      ; t10a
-    psubw               m14, m0, m5   ; t15a
-    paddw                m0, m5       ; t11a
-    psubw                m5, m2, m6   ; t6
-    paddw                m2, m6       ; t2
-    psubw                m6, m13, m9  ; t7
-    paddw               m13, m9       ; t3
+    psubsw               m1, m10, m14 ; t14a
+    paddsw              m10, m14      ; t10a
+    psubsw              m14, m0, m5   ; t15a
+    paddsw               m0, m5       ; t11a
+    psubsw               m5, m2, m6   ; t6
+    paddsw               m2, m6       ; t2
+    psubsw               m6, m13, m9  ; t7
+    paddsw              m13, m9       ; t3
     ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
     ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
     mova                 m9, [rsp+gprsize+32*0] ; in15
@@ -3042,46 +3042,46 @@
     ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
     ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
     ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
-    psubw               m10, m4, m8  ; t8a
-    paddw                m8, m4      ; t0a
-    psubw                m4, m9, m7  ; t9a
-    paddw                m9, m7      ; t1a
-    psubw                m7, m6, m12 ; t12a
-    paddw                m6, m12     ; t4a
-    psubw               m12, m11, m3 ; t13a
-    paddw               m11, m3      ; t5a
+    psubsw              m10, m4, m8  ; t8a
+    paddsw               m8, m4      ; t0a
+    psubsw               m4, m9, m7  ; t9a
+    paddsw               m9, m7      ; t1a
+    psubsw               m7, m6, m12 ; t12a
+    paddsw               m6, m12     ; t4a
+    psubsw              m12, m11, m3 ; t13a
+    paddsw              m11, m3      ; t5a
     ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
     ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
-    psubw                m3, m9, m11 ; t5
-    paddw                m9, m11     ; t1
-    psubw               m11, m4, m12 ; t12a
-    paddw                m4, m12     ; t8a
-    paddw               m12, m8, m6  ; t0
-    psubw                m8, m6      ; t4
-    paddw                m6, m10, m7 ; t9a
-    psubw               m10, m7      ; t13a
+    psubsw               m3, m9, m11 ; t5
+    paddsw               m9, m11     ; t1
+    psubsw              m11, m4, m12 ; t12a
+    paddsw               m4, m12     ; t8a
+    paddsw              m12, m8, m6  ; t0
+    psubsw               m8, m6      ; t4
+    paddsw               m6, m10, m7 ; t9a
+    psubsw              m10, m7      ; t13a
     ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
     ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
     mova                 m7, [rsp+gprsize+32*0] ; t10a
     mova                 m2, [rsp+gprsize+32*1] ; t6a
-    paddw               m15, m9, m13  ; -out15
-    psubw                m9, m13      ;  t3a
-    paddw               m13, m11, m1  ; -out13
-    psubw               m11, m1       ;  t15a
-    psubw                m1, m4, m7   ;  t10
-    paddw                m7, m4       ; -out1
-    psubw                m4, m3, m2   ;  t6
-    paddw                m3, m2       ; -out3
-    paddw                m2, m10, m14 ;  out2
-    psubw               m10, m14      ;  t14a
-    paddw               m14, m6, m0   ;  out14
-    psubw                m6, m0       ;  t11
+    paddsw              m15, m9, m13  ; -out15
+    psubsw               m9, m13      ;  t3a
+    paddsw              m13, m11, m1  ; -out13
+    psubsw              m11, m1       ;  t15a
+    psubsw               m1, m4, m7   ;  t10
+    paddsw               m7, m4       ; -out1
+    psubsw               m4, m3, m2   ;  t6
+    paddsw               m3, m2       ; -out3
+    paddsw               m2, m10, m14 ;  out2
+    psubsw              m10, m14      ;  t14a
+    paddsw              m14, m6, m0   ;  out14
+    psubsw               m6, m0       ;  t11
     mova                 m0, [rsp+gprsize+32*2] ; t2
     mova [rsp+gprsize+32*1], m7
-    psubw                m7, m12, m0  ;  t2a
-    paddw                m0, m12      ;  out0
-    paddw               m12, m8, m5   ;  out12
-    psubw                m8, m5       ;  t7
+    psubsw               m7, m12, m0  ;  t2a
+    paddsw               m0, m12      ;  out0
+    paddsw              m12, m8, m5   ;  out12
+    psubsw               m8, m5       ;  t7
     paddw                m5, m10, m11 ; -out5
     psubw               m10, m11      ;  out10
     psubw               m11, m4, m8   ; -out11
@@ -3475,26 +3475,26 @@
     ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
     ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
 .main2:
-    psubw                m6, m1, m11  ; t17 t30
-    paddw                m1, m11      ; t16 t31
-    psubw               m11, m9, m14  ; t18 t29
-    paddw                m9, m14      ; t19 t28
-    psubw               m14, m15, m0  ; t21 t26
-    paddw               m15, m0       ; t20 t27
-    psubw                m0, m8, m13  ; t22 t25
-    paddw                m8, m13      ; t23 t24
+    psubsw               m6, m1, m11  ; t17 t30
+    paddsw               m1, m11      ; t16 t31
+    psubsw              m11, m9, m14  ; t18 t29
+    paddsw               m9, m14      ; t19 t28
+    psubsw              m14, m15, m0  ; t21 t26
+    paddsw              m15, m0       ; t20 t27
+    psubsw               m0, m8, m13  ; t22 t25
+    paddsw               m8, m13      ; t23 t24
     ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
     ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
     ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
     ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
-    psubw               m13, m1, m9   ; t19a t28a
-    paddw                m1, m9       ; t16a t31a
-    psubw                m9, m8, m15  ; t20a t27a
-    paddw                m8, m15      ; t23a t24a
-    psubw               m15, m6, m11  ; t18  t29
-    paddw                m6, m11      ; t17  t30
-    psubw               m11, m0, m14  ; t21  t26
-    paddw                m0, m14      ; t22  t25
+    psubsw              m13, m1, m9   ; t19a t28a
+    paddsw               m1, m9       ; t16a t31a
+    psubsw               m9, m8, m15  ; t20a t27a
+    paddsw               m8, m15      ; t23a t24a
+    psubsw              m15, m6, m11  ; t18  t29
+    paddsw               m6, m11      ; t17  t30
+    psubsw              m11, m0, m14  ; t21  t26
+    paddsw               m0, m14      ; t22  t25
     ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 1 ; t18a t29a
     ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 1 ; t19  t28
     ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 1 ; t20  t27
@@ -3501,48 +3501,48 @@
     ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
     vbroadcasti128      m12, [o(deint_shuf)]
     REPX    {pshufb x, m12}, m0, m1, m6, m8
-    psubw               m14, m1, m8   ; t23  t24
-    paddw                m1, m8       ; t16  t31
-    psubw                m8, m6, m0   ; t22a t25a
-    paddw                m6, m0       ; t17a t30a
-    psubw                m0, m15, m11 ; t21  t26
-    paddw               m15, m11      ; t18  t29
-    psubw               m11, m13, m9  ; t20a t27a
-    paddw               m13, m9       ; t19a t28a
+    psubsw              m14, m1, m8   ; t23  t24
+    paddsw               m1, m8       ; t16  t31
+    psubsw               m8, m6, m0   ; t22a t25a
+    paddsw               m6, m0       ; t17a t30a
+    psubsw               m0, m15, m11 ; t21  t26
+    paddsw              m15, m11      ; t18  t29
+    psubsw              m11, m13, m9  ; t20a t27a
+    paddsw              m13, m9       ; t19a t28a
     vpbroadcastd        m12, [o(pw_2896x8)]
-    punpcklqdq            m9, m11, m0 ; t20a t21
-    punpckhqdq           m11, m0      ; t27a t26
-    punpcklqdq            m0, m14, m8 ; t23  t22a
-    punpckhqdq           m14, m8      ; t24  t25a
-    psubw                 m8, m11, m9 ; t20  t21a
-    paddw                m11, m9      ; t27  t26a
-    psubw                 m9, m14, m0 ; t23a t22
-    paddw                m14, m0      ; t24a t25
-    REPX   {pmulhrsw x, m12}, m8, m9, m14, m11
+    punpcklqdq           m9, m11, m0  ; t20a t21
+    punpckhqdq          m11, m0       ; t27a t26
+    punpcklqdq           m0, m14, m8  ; t23  t22a
+    punpckhqdq          m14, m8       ; t24  t25a
+    psubw                m8, m11, m9  ; t20  t21a
+    paddw               m11, m9       ; t27  t26a
+    psubw                m9, m14, m0  ; t23a t22
+    paddw               m14, m0       ; t24a t25
+    REPX  {pmulhrsw x, m12}, m8, m9, m14, m11
     punpcklqdq           m0, m1, m6   ; t16  t17a
     punpckhqdq           m1, m6       ; t31  t30a
-    psubw               m10, m5, m8   ; out20 out21
-    paddw                m5, m8       ; out11 out10
-    psubw                m6, m3, m14  ; out24 out25
-    paddw                m3, m14      ; out7  out6
-    psubw                m8, m7, m0   ; out16 out17
-    paddw                m7, m0       ; out15 out14
+    psubsw              m10, m5, m8   ; out20 out21
+    paddsw               m5, m8       ; out11 out10
+    psubsw               m6, m3, m14  ; out24 out25
+    paddsw               m3, m14      ; out7  out6
+    psubsw               m8, m7, m0   ; out16 out17
+    paddsw               m7, m0       ; out15 out14
     mova                 m0, [rsp+gprsize+0*32]
     punpcklqdq          m12, m13, m15 ; t19a t18
     punpckhqdq          m13, m15      ; t28a t29
-    psubw               m15, m0, m1   ; out31 out30
-    paddw                m0, m1       ; out0  out1
+    psubsw              m15, m0, m1   ; out31 out30
+    paddsw               m0, m1       ; out0  out1
     mova                 m1, [rsp+gprsize+1*32]
     mova [rsp+gprsize+0*32], m6
     mova                 m6, [rsp+gprsize+2*32]
-    psubw               m14, m1, m13  ; out28 out29
-    paddw                m1, m13      ; out3  out2
-    psubw               m13, m2, m11  ; out27 out26
-    paddw                m2, m11      ; out4  out5
-    psubw               m11, m4, m9   ; out23 out22
-    paddw                m4, m9       ; out8  out9
-    psubw                m9, m6, m12  ; out19 out18
-    paddw                m6, m12      ; out12 out13
+    psubsw              m14, m1, m13  ; out28 out29
+    paddsw               m1, m13      ; out3  out2
+    psubsw              m13, m2, m11  ; out27 out26
+    paddsw               m2, m11      ; out4  out5
+    psubsw              m11, m4, m9   ; out23 out22
+    paddsw               m4, m9       ; out8  out9
+    psubsw               m9, m6, m12  ; out19 out18
+    paddsw               m6, m12      ; out12 out13
     ret
 
 %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
@@ -3873,8 +3873,8 @@
 
 %macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
     mova                m%4, [%2]
-    paddw               m%3, m%1, m%4
-    psubw               m%1, m%4
+    paddsw              m%3, m%1, m%4
+    psubsw              m%1, m%4
     pmovzxbw            m%4, [dstq+%6]
     pmulhrsw            m%3, m%5
     pmulhrsw            m%1, m%5
@@ -4057,29 +4057,29 @@
     ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
     ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
 .main2:
-    psubw                m7, m12, m4  ; t18
-    paddw               m12, m4       ; t19
-    psubw                m4, m2, m10  ; t21
-    paddw                m2, m10      ; t20
-    psubw               m10, m14, m6  ; t22
-    paddw               m14, m6       ; t23
-    psubw                m6, m1, m9   ; t25
-    paddw                m1, m9       ; t24
-    psubw                m9, m13, m5  ; t26
-    paddw               m13, m5       ; t27
-    psubw                m5, m3, m11  ; t29
-    paddw                m3, m11      ; t28
+    psubsw               m7, m12, m4  ; t18
+    paddsw              m12, m4       ; t19
+    psubsw               m4, m2, m10  ; t21
+    paddsw               m2, m10      ; t20
+    psubsw              m10, m14, m6  ; t22
+    paddsw              m14, m6       ; t23
+    psubsw               m6, m1, m9   ; t25
+    paddsw               m1, m9       ; t24
+    psubsw               m9, m13, m5  ; t26
+    paddsw              m13, m5       ; t27
+    psubsw               m5, m3, m11  ; t29
+    paddsw               m3, m11      ; t28
     ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
     ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
     ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
-    psubw                m8, m14, m2  ; t20a
-    paddw               m14, m2       ; t23a
-    psubw                m2, m1, m13  ; t27a
-    paddw                m1, m13      ; t24a
-    psubw               m13, m6, m9   ; t21
-    paddw                m6, m9       ; t22
-    psubw                m9, m10, m4  ; t26
-    paddw               m10, m4       ; t25
+    psubsw               m8, m14, m2  ; t20a
+    paddsw              m14, m2       ; t23a
+    psubsw               m2, m1, m13  ; t27a
+    paddsw               m1, m13      ; t24a
+    psubsw              m13, m6, m9   ; t21
+    paddsw               m6, m9       ; t22
+    psubsw               m9, m10, m4  ; t26
+    paddsw              m10, m4       ; t25
     ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
     ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
     mova                 m4, [rsp+gprsize+32*0] ; in31
@@ -4090,31 +4090,31 @@
     mova [rsp+gprsize+32*2], m1  ; t24a
     ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
     ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
-    psubw                m1, m0, m14  ; t17
-    paddw                m0, m14      ; t16
-    psubw               m14, m4, m6   ; t30
-    paddw                m4, m6       ; t31
+    psubsw               m1, m0, m14  ; t17
+    paddsw               m0, m14      ; t16
+    psubsw              m14, m4, m6   ; t30
+    paddsw               m4, m6       ; t31
     ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
-    psubw                m6, m0, m12  ; t19a
-    paddw                m0, m12      ; t16a
-    psubw               m12, m4, m3   ; t28a
-    paddw                m4, m3       ; t31a
-    psubw                m3, m14, m5  ; t18
-    paddw               m14, m5       ; t17
-    psubw                m5, m1, m7   ; t29
-    paddw                m1, m7       ; t30
+    psubsw               m6, m0, m12  ; t19a
+    paddsw               m0, m12      ; t16a
+    psubsw              m12, m4, m3   ; t28a
+    paddsw               m4, m3       ; t31a
+    psubsw               m3, m14, m5  ; t18
+    paddsw              m14, m5       ; t17
+    psubsw               m5, m1, m7   ; t29
+    paddsw               m1, m7       ; t30
     ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
     ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
-    psubw                m7, m1, m10  ; t25a
-    paddw                m1, m10      ; t30a
-    psubw               m10, m5, m9   ; t21
-    paddw                m5, m9       ; t18
-    psubw                m9, m12, m2  ; t20a
-    paddw               m12, m2       ; t19a
-    psubw                m2, m3, m13  ; t26
-    paddw                m3, m13      ; t29
-    psubw               m13, m6, m8   ; t27a
-    paddw                m6, m8       ; t28a
+    psubsw               m7, m1, m10  ; t25a
+    paddsw               m1, m10      ; t30a
+    psubsw              m10, m5, m9   ; t21
+    paddsw               m5, m9       ; t18
+    psubsw               m9, m12, m2  ; t20a
+    paddsw              m12, m2       ; t19a
+    psubsw               m2, m3, m13  ; t26
+    paddsw               m3, m13      ; t29
+    psubsw              m13, m6, m8   ; t27a
+    paddsw               m6, m8       ; t28a
     mova       [tmp1q-32*2], m5
     mova       [tmp1q-32*1], m12
     mova       [tmp2q+32*0], m6
@@ -4124,12 +4124,12 @@
     mova                 m6, [rsp+gprsize+32*1] ; t23
     mova                 m3, [rsp+gprsize+32*2] ; t24a
     vpbroadcastd         m8, [o(pw_2896x8)]
-    psubw                m1, m14, m5  ; t22a
-    paddw               m14, m5       ; t17a
-    psubw                m5, m0, m6   ; t23
-    paddw                m0, m6       ; t16
-    psubw                m6, m4, m3   ; t24
-    paddw                m4, m3       ; t31
+    psubsw               m1, m14, m5  ; t22a
+    paddsw              m14, m5       ; t17a
+    psubsw               m5, m0, m6   ; t23
+    paddsw               m0, m6       ; t16
+    psubsw               m6, m4, m3   ; t24
+    paddsw               m4, m3       ; t31
     mova       [tmp1q-32*4], m0
     mova       [tmp1q-32*3], m14
     mova       [tmp2q+32*3], m4
@@ -4242,13 +4242,13 @@
 ; Perform the final sumsub step and YMM lane shuffling
 %macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
     mova                m%3, [tmp2q+32*( 3-%1)]
-    psubw               m%4, m%1, m%3
-    paddw               m%1, m%3
+    psubsw              m%4, m%1, m%3
+    paddsw              m%1, m%3
     mova                m%3, [tmp1q+32*(11-%2)]
     mova         [tmp1q+32*(11-%2)+16], xm%4
     vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
-    paddw               m%4, m%2, m%3
-    psubw               m%2, m%3
+    paddsw              m%4, m%2, m%3
+    psubsw              m%2, m%3
     mova         [tmp1q+32*(11-%2)], xm%2
     vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
     vperm2i128          m%2, m%1, m%4, 0x31
@@ -4709,12 +4709,12 @@
     mova                m%5, [tmp1q-32*(45-%1)]
     mova                m%4, [tmp2q-32*(20+%1)]
 %endif
-    psubw               m%6, m%5, m%4 ; idct32 out31-n
-    paddw               m%5, m%4      ; idct32 out 0+n
-    psubw               m%4, m%6, m%3 ; out32+n
-    paddw               m%6, m%3      ; out31-n
-    psubw               m%3, m%5, m%2 ; out63-n
-    paddw               m%5, m%2      ; out 0+n
+    psubsw              m%6, m%5, m%4 ; idct32 out31-n
+    paddsw              m%5, m%4      ; idct32 out 0+n
+    psubsw              m%4, m%6, m%3 ; out32+n
+    paddsw              m%6, m%3      ; out31-n
+    psubsw              m%3, m%5, m%2 ; out63-n
+    paddsw              m%5, m%2      ; out 0+n
 %if %0 == 6 ; pass 1
 %if %1 & 1
     mova [tmp2q-32*(19-%1)], m%4
@@ -4949,25 +4949,25 @@
     pmulhrsw             m2, m13 ; t34a
     pmulhrsw             m8, m3  ; t60a
     pmulhrsw             m3, m12 ; t35a
-    psubw               m12, m0, m1   ; t33
-    paddw                m0, m1       ; t32
-    psubw                m1, m3, m2   ; t34
-    paddw                m3, m2       ; t35
-    psubw                m2, m8, m9   ; t61
-    paddw                m8, m9       ; t60
-    psubw                m9, m11, m10 ; t62
-    paddw               m11, m10      ; t63
+    psubsw              m12, m0, m1   ; t33
+    paddsw               m0, m1       ; t32
+    psubsw               m1, m3, m2   ; t34
+    paddsw               m3, m2       ; t35
+    psubsw               m2, m8, m9   ; t61
+    paddsw               m8, m9       ; t60
+    psubsw               m9, m11, m10 ; t62
+    paddsw              m11, m10      ; t63
     ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
     vpbroadcastd        m14, [o(pw_401_4076)]
     ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
-    psubw               m10, m0, m3  ; t35a
-    paddw                m0, m3      ; t32a
-    psubw                m3, m11, m8 ; t60a
-    paddw               m11, m8      ; t63a
-    psubw                m8, m9, m2  ; t34
-    paddw                m9, m2      ; t33
-    psubw                m2, m12, m1 ; t61
-    paddw               m12, m1      ; t62
+    psubsw              m10, m0, m3  ; t35a
+    paddsw               m0, m3      ; t32a
+    psubsw               m3, m11, m8 ; t60a
+    paddsw              m11, m8      ; t63a
+    psubsw               m8, m9, m2  ; t34
+    paddsw               m9, m2      ; t33
+    psubsw               m2, m12, m1 ; t61
+    paddsw              m12, m1      ; t62
     mova       [tmp1q-32*4], m0
     mova       [tmp1q-32*3], m9
     mova       [tmp2q+32*2], m12
@@ -4996,25 +4996,25 @@
     pmulhrsw             m6, m9  ; t38a
     pmulhrsw             m0, m7  ; t56a
     pmulhrsw             m7, m8  ; t39a
-    psubw                m8, m4, m5 ; t37
-    paddw                m4, m5     ; t36
-    psubw                m5, m7, m6 ; t38
-    paddw                m7, m6     ; t39
-    psubw                m6, m0, m1 ; t57
-    paddw                m0, m1     ; t56
-    psubw                m1, m3, m2 ; t58
-    paddw                m3, m2     ; t59
+    psubsw               m8, m4, m5 ; t37
+    paddsw               m4, m5     ; t36
+    psubsw               m5, m7, m6 ; t38
+    paddsw               m7, m6     ; t39
+    psubsw               m6, m0, m1 ; t57
+    paddsw               m0, m1     ; t56
+    psubsw               m1, m3, m2 ; t58
+    paddsw               m3, m2     ; t59
     ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
     vpbroadcastd        m10, [o(pw_3166_2598)]
     ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
-    psubw                m2, m7, m4 ; t36a
-    paddw                m7, m4     ; t39a
-    psubw                m4, m0, m3 ; t59a
-    paddw                m0, m3     ; t56a
-    psubw                m3, m6, m1 ; t37
-    paddw                m6, m1     ; t38
-    psubw                m1, m5, m8 ; t58
-    paddw                m5, m8     ; t57
+    psubsw               m2, m7, m4 ; t36a
+    paddsw               m7, m4     ; t39a
+    psubsw               m4, m0, m3 ; t59a
+    paddsw               m0, m3     ; t56a
+    psubsw               m3, m6, m1 ; t37
+    paddsw               m6, m1     ; t38
+    psubsw               m1, m5, m8 ; t58
+    paddsw               m5, m8     ; t57
     mova       [tmp1q+32*2], m6
     mova       [tmp1q+32*3], m7
     mova       [tmp2q-32*4], m0
@@ -5056,24 +5056,24 @@
     mova                 m3, [tmp2q-32* 4] ; t47a
     mova                 m6, [tmp1q+32*11] ; t56a
     mova                 m7, [tmp2q+32*12] ; t63a
-    psubw                m8, m0, m1 ; t39
-    paddw                m0, m1     ; t32
-    psubw                m1, m3, m2 ; t40
-    paddw                m3, m2     ; t47
-    psubw                m2, m4, m5 ; t55
-    paddw                m4, m5     ; t48
-    psubw                m5, m7, m6 ; t56
-    paddw                m7, m6     ; t63
+    psubsw               m8, m0, m1 ; t39
+    paddsw               m0, m1     ; t32
+    psubsw               m1, m3, m2 ; t40
+    paddsw               m3, m2     ; t47
+    psubsw               m2, m4, m5 ; t55
+    paddsw               m4, m5     ; t48
+    psubsw               m5, m7, m6 ; t56
+    paddsw               m7, m6     ; t63
     ITX_MULSUB_2W         5,  8,  6,  9, 15, 11, 12 ; t39a, t56a
     ITX_MULSUB_2W         2,  1,  6,  9, 15, 12, 13 ; t40a, t55a
-    psubw                m6, m0, m3 ; t47a
-    paddw                m0, m3     ; t32a
-    psubw                m3, m7, m4 ; t48a
-    paddw                m7, m4     ; t63a
-    psubw                m4, m5, m2 ; t40
-    paddw                m5, m2     ; t39
-    psubw                m2, m8, m1 ; t55
-    paddw                m8, m1     ; t56
+    psubsw               m6, m0, m3 ; t47a
+    paddsw               m0, m3     ; t32a
+    psubsw               m3, m7, m4 ; t48a
+    paddsw               m7, m4     ; t63a
+    psubsw               m4, m5, m2 ; t40
+    paddsw               m5, m2     ; t39
+    psubsw               m2, m8, m1 ; t55
+    paddsw               m8, m1     ; t56
     psubw                m1, m2, m4 ; t40a
     paddw                m2, m4     ; t55a
     psubw                m4, m3, m6 ; t47
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -135,8 +135,8 @@
     pmulhrsw             m0, [qw_2896x8]     ;high: t1 ;low: t0
 %endif
 
-    psubw                m1, m0, m2          ;high: out2 ;low: out3
-    paddw                m0, m2              ;high: out1 ;low: out0
+    psubsw               m1, m0, m2          ;high: out2 ;low: out3
+    paddsw               m0, m2              ;high: out1 ;low: out0
 %endmacro
 
 %macro IADST4_1D_PACKED 0