ref: ef64567e1d6c5444c92097200680769b67a37da4
parent: 3a3af969e7b694a02d4ba51c7bf7ab0258724292
author: Henrik Gramner <gramner@twoorioles.com>
date: Tue Jan 14 16:53:12 EST 2020
x86: Fix overflows in SSSE3 idct
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -33,9 +33,14 @@
deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
-%macro COEF_PAIR 2
-pw_%1_m%2: times 4 dw %1, -%2
-pw_%2_%1: times 4 dw %2, %1
+%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
+pw_%1_m%2: times 4 dw %1, -%2
+%if %3 != 2
+pw_%2_%1: times 4 dw %2, %1
+%endif
+%if %3
+pw_m%1_m%2: times 4 dw -%1, -%2
+%endif
%endmacro
;adst4
@@ -55,17 +60,21 @@
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
-COEF_PAIR 3784, 1567
+COEF_PAIR 3784, 1567, 1
COEF_PAIR 995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
-COEF_PAIR 4017, 799
+COEF_PAIR 4017, 799, 1
COEF_PAIR 201, 4091
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 4052, 601
-COEF_PAIR 2276, 3406
+COEF_PAIR 2276, 3406, 1
+COEF_PAIR 4076, 401, 2
+COEF_PAIR 2598, 3166, 2
+COEF_PAIR 3612, 1931, 2
+COEF_PAIR 1189, 3920, 2
pd_2048: times 4 dd 2048
pw_2048: times 8 dw 2048
@@ -2114,15 +2123,13 @@
ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
psubsw m%6, m%1, m%4 ;low: t9 high: t14
paddsw m%1, m%4 ;low: t8 high: t15
- psubsw m%3, m%5, m%2 ;low: t10 high: t13
+ psubsw m%4, m%5, m%2 ;low: t10 high: t13
paddsw m%5, m%2 ;low: t11 high: t12
mova m%2, [o(deint_shuf2)]
pshufb m%6, m%2
- pshufb m%3, [o(deint_shuf1)]
- pxor m%4, m%4
- psubw m%4, m%3 ;packed -t10 -t13
+ pshufb m%4, m%2
ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
- ITX_MUL2X_PACK %4, %3, %7, 3784, 1567 ;low: t10a high: t13a
+ ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a
psubsw m%3, m%1, m%5 ;low: t11a high: t12a
paddsw m%1, m%5 ;low: t8a high: t15a
psubsw m%5, m%6, m%4 ;low: t10 high: t13
@@ -2973,20 +2980,18 @@
mova [rsp+gprsize*2+16*2], m7
ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a
ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a
- pxor m4, m4
- psubsw m7, m2, m3 ;t10
+ psubsw m4, m2, m3 ;t10
paddsw m2, m3 ;t11
psubsw m3, m1, m5 ;t13
paddsw m1, m5 ;t12
- psubw m4, m7
- ITX_MULSUB_2W 4, 3, 7, 5, 6, 1567, 3784 ;t10a, t13a
+ ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a
mova m7, [rsp+gprsize*2+32*5]
psubsw m6, m0, m2 ;t11a
paddsw m0, m2 ;t8a
- paddsw m2, m7, m4 ;t9
- psubsw m7, m4 ;t10
+ paddsw m2, m7, m3 ;t9
+ psubsw m7, m3 ;t10
mova m5, [rsp+gprsize*2+16*0]
- psubsw m4, m5, m0 ;out8
+ psubsw m3, m5, m0 ;out8
paddsw m0, m5 ;out7
mova [rsp+gprsize*2+32*5], m0
mova m5, [rsp+gprsize*2+16*9]
@@ -2996,9 +3001,9 @@
mova [rsp+gprsize*2+16*9], m2
mova m0, [rsp+gprsize*2+16*1]
mova m2, [rsp+gprsize*2+16*2]
- mova [rsp+gprsize*2+16*1], m4
- psubsw m5, m0, m3 ;t13
- paddsw m0, m3 ;t14
+ mova [rsp+gprsize*2+16*1], m3
+ psubsw m5, m0, m4 ;t13
+ paddsw m0, m4 ;t14
mova m3, [o(pd_2048)]
psubsw m4, m2, m1 ;t12a
paddsw m1, m2 ;t15a
@@ -4139,11 +4144,9 @@
pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19
mova [rsp+gprsize*2+16*22], m1 ;t19
mova [rsp+gprsize*2+16*31], m2 ;t28
- pxor m0, m0
- psubw m0, m1
- ITX_MULSUB_2W 0, 2, 1, 3, 7, 799, 4017 ;t18a, t29a
- mova [rsp+gprsize*2+16*21], m0 ;t18a
- mova [rsp+gprsize*2+16*32], m2 ;t29a
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m2 ;t18a
+ mova [rsp+gprsize*2+16*32], m1 ;t29a
mova m0, [rsp+gprsize*2+16*23] ;in5
pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27
pmulhrsw m0, [o(pw_995x8)] ;t20, t21
@@ -4187,13 +4190,11 @@
paddsw m0, m1 ;t19
psubsw m5, m2, m3 ;t29
paddsw m3, m2 ;t28
- pxor m2, m2
- psubw m2, m4
- ITX_MULSUB_2W 2, 5, 1, 4, 7, 799, 4017 ;t18a, t29a
- mova [rsp+gprsize*2+16*21], m2 ;t18a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
mova [rsp+gprsize*2+16*22], m0 ;t19
mova [rsp+gprsize*2+16*31], m3 ;t28
- mova [rsp+gprsize*2+16*32], m5 ;t29a
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
mova m0, [rsp+gprsize*2+16*23] ;in5
mova m1, [rsp+gprsize*2+16*24] ;in11
pmulhrsw m3, m0, [o(pw_3973x8)]
@@ -4245,13 +4246,11 @@
paddsw m0, m2 ;t19
psubsw m5, m1, m3 ;t29
paddsw m3, m1 ;t28
- pxor m2, m2
- psubw m2, m4 ;-t18
- ITX_MULSUB_2W 2, 5, 1, 4, 7, 799, 4017 ;t18a, t29a
- mova [rsp+gprsize*2+16*21], m2 ;t18a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
mova [rsp+gprsize*2+16*22], m0 ;t19
mova [rsp+gprsize*2+16*31], m3 ;t28
- mova [rsp+gprsize*2+16*32], m5 ;t29a
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
mova m0, [rsp+gprsize*2+16*23] ;in5
mova m1, [rsp+gprsize*2+16*24] ;in11
mova m2, [rsp+gprsize*2+16*29] ;in21
@@ -4279,34 +4278,29 @@
paddsw m0, m2 ;t23
psubsw m5, m1, m3 ;t25
paddsw m3, m1 ;t24
- pxor m6, m6
- psubw m2, m6, m4
- ITX_MULSUB_2W 2, 5, 1, 4, 7, 3406, 2276 ;t22a, t25a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a
+ mova m2, [rsp+gprsize*2+16*24] ;t21a
+ psubsw m1, m5, m2 ;t21
+ paddsw m5, m2 ;t22
+ mova [rsp+gprsize*2+16*25], m5 ;t22
+ mova m2, [rsp+gprsize*2+16*29] ;t26a
+ psubsw m5, m4, m2 ;t26
+ paddsw m4, m2 ;t25
+ mova [rsp+gprsize*2+16*28], m4 ;t25
+ ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m1 ;t26a
- mova m4, [rsp+gprsize*2+16*24] ;t21a
- psubsw m1, m2, m4 ;t21
- paddsw m2, m4 ;t22
- psubw m4, m6, m1 ;-t21
- mova [rsp+gprsize*2+16*25], m2 ;t22
- mova m1, [rsp+gprsize*2+16*29] ;t26a
- psubsw m2, m5, m1 ;t26
- paddsw m5, m1 ;t25
- mova [rsp+gprsize*2+16*28], m5 ;t25
- ITX_MULSUB_2W 4, 2, 1, 5, 7, 1567, 3784 ;t21a, t26a
- mova [rsp+gprsize*2+16*24], m4 ;t21a
- mova [rsp+gprsize*2+16*29], m2 ;t26a
-
mova m1, [rsp+gprsize*2+16*23] ;t20
mova m5, [rsp+gprsize*2+16*30] ;t27
psubsw m2, m0, m1 ;t20a
paddsw m0, m1 ;t23a
- psubsw m4, m3, m5 ;t27a
+ psubsw m6, m3, m5 ;t27a
paddsw m3, m5 ;t24a
- psubw m6, m2 ;-t20a
- ITX_MULSUB_2W 6, 4, 1, 5, 7, 1567, 3784 ;t20, t27
+ ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27
mova [rsp+gprsize*2+16*26], m0 ;t23a
mova [rsp+gprsize*2+16*27], m3 ;t24a
- mova [rsp+gprsize*2+16*30], m4 ;t27
+ mova [rsp+gprsize*2+16*30], m2 ;t27
mova m0, [rsp+gprsize*2+16*20] ;t17a
mova m1, [rsp+gprsize*2+16*21] ;t18a
@@ -5706,11 +5700,9 @@
pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35
mova [rsp+gprsize*2+16*38], m1 ;t35
mova [rsp+gprsize*2+16*63], m2 ;t60
- pxor m6, m6
- psubw m3, m6, m1
- ITX_MULSUB_2W 3, 2, 0, 1, 7, 401, 4076 ;t34a, t61a
- mova [rsp+gprsize*2+16*37], m3 ;t34a
- mova [rsp+gprsize*2+16*64], m2 ;t61a
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m2 ;t34a
+ mova [rsp+gprsize*2+16*64], m1 ;t61a
mova m0, [rsp+gprsize*2+16*39] ;in9
pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59
@@ -5726,10 +5718,9 @@
pmulhrsw m1, [o(pw_m700x8)] ;t38,t39
mova [rsp+gprsize*2+16*42], m1 ;t39
mova [rsp+gprsize*2+16*59], m2 ;t56
- psubw m3, m6, m1
- ITX_MULSUB_2W 3, 2, 0, 1, 7, 3166, 2598 ;t38a, t57a
- mova [rsp+gprsize*2+16*41], m3 ;t38a
- mova [rsp+gprsize*2+16*60], m2 ;t57a
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m2 ;t38a
+ mova [rsp+gprsize*2+16*60], m1 ;t57a
mova m0, [rsp+gprsize*2+16*43] ;in5
pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55
@@ -5745,10 +5736,9 @@
pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43
mova [rsp+gprsize*2+16*46], m1 ;t43
mova [rsp+gprsize*2+16*55], m2 ;t52
- psubw m3, m6, m1
- ITX_MULSUB_2W 3, 2, 0, 1, 7, 1931, 3612 ;t42a, t53a
- mova [rsp+gprsize*2+16*45], m3 ;t42a
- mova [rsp+gprsize*2+16*56], m2 ;t53a
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m2 ;t42a
+ mova [rsp+gprsize*2+16*56], m1 ;t53a
mova m0, [rsp+gprsize*2+16*47] ;in13
pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51
@@ -5796,13 +5786,11 @@
paddsw m0, m1 ;t35
psubsw m5, m2, m3 ;t61
paddsw m3, m2 ;t60
- pxor m6, m6
- psubw m2, m6, m4
- ITX_MULSUB_2W 2, 5, 1, 4, 7, 401, 4076 ;t34a, t61a
- mova [rsp+gprsize*2+16*37], m2 ;t34a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m5 ;t34a
mova [rsp+gprsize*2+16*38], m0 ;t35
mova [rsp+gprsize*2+16*63], m3 ;t60
- mova [rsp+gprsize*2+16*64], m5 ;t61a
+ mova [rsp+gprsize*2+16*64], m4 ;t61a
mova m0, [rsp+gprsize*2+16*39] ;in9
mova m1, [rsp+gprsize*2+16*61] ;in23
@@ -5830,12 +5818,11 @@
paddsw m0, m1 ;t39
psubsw m5, m2, m3 ;t57
paddsw m3, m2 ;t56
- psubw m2, m6, m4
- ITX_MULSUB_2W 2, 5, 1, 4, 7, 3166, 2598 ;t38a, t57a
- mova [rsp+gprsize*2+16*41], m2 ;t38a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m5 ;t38a
mova [rsp+gprsize*2+16*42], m0 ;t39
mova [rsp+gprsize*2+16*59], m3 ;t56
- mova [rsp+gprsize*2+16*60], m5 ;t57a
+ mova [rsp+gprsize*2+16*60], m4 ;t57a
mova m0, [rsp+gprsize*2+16*43] ;in5
mova m1, [rsp+gprsize*2+16*57] ;in27
@@ -5863,12 +5850,11 @@
paddsw m0, m1 ;t43
psubsw m5, m2, m3 ;t53
paddsw m3, m2 ;t52
- psubw m2, m6, m4
- ITX_MULSUB_2W 2, 5, 1, 4, 7, 1931, 3612 ;t42a, t53a
- mova [rsp+gprsize*2+16*45], m2 ;t42a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
mova [rsp+gprsize*2+16*46], m0 ;t43
mova [rsp+gprsize*2+16*55], m3 ;t52
- mova [rsp+gprsize*2+16*56], m5 ;t53a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
mova m0, [rsp+gprsize*2+16*47] ;in13
mova m1, [rsp+gprsize*2+16*53] ;in19
@@ -5899,35 +5885,29 @@
ALIGN function_align
.main2:
- pxor m2, m2
- psubw m2, m5
- ITX_MULSUB_2W 2, 4, 1, 5, 7, 3920, 1189 ;t46a, t49a
-
+ ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a
mova m1, [rsp+gprsize*2+16*54] ;t51
- psubsw m5, m0, m6 ;t44a
+ psubsw m2, m0, m6 ;t44a
paddsw m0, m6 ;t47a
psubsw m6, m3, m1 ;t51a
paddsw m3, m1 ;t48a
mova [rsp+gprsize*2+16*50], m0 ;t47a
mova [rsp+gprsize*2+16*51], m3 ;t48a
- pxor m1, m1
- psubw m3, m1, m5
- ITX_MULSUB_2W 3, 6, 0, 5, 7, 3406, 2276 ;t44, t51
- mova [rsp+gprsize*2+16*47], m3 ;t44
- mova [rsp+gprsize*2+16*54], m6 ;t51
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51
+ mova [rsp+gprsize*2+16*47], m6 ;t44
+ mova [rsp+gprsize*2+16*54], m2 ;t51
mova m0, [rsp+gprsize*2+16*48] ;t45a
mova m3, [rsp+gprsize*2+16*53] ;t50a
- psubsw m5, m2, m0 ;t45
- paddsw m2, m0 ;t46
- psubsw m6, m4, m3 ;t50
- paddsw m4, m3 ;t49
- psubw m1, m5
- ITX_MULSUB_2W 1, 6, 0, 3, 7, 3406, 2276 ;t45a, t50a
- mova [rsp+gprsize*2+16*48], m1 ;t45a
- mova [rsp+gprsize*2+16*49], m2 ;t46
- mova [rsp+gprsize*2+16*52], m4 ;t49
- mova [rsp+gprsize*2+16*53], m6 ;t50a
+ psubsw m2, m4, m0 ;t45
+ paddsw m4, m0 ;t46
+ psubsw m6, m5, m3 ;t50
+ paddsw m5, m3 ;t49
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m6 ;t45a
+ mova [rsp+gprsize*2+16*49], m4 ;t46
+ mova [rsp+gprsize*2+16*52], m5 ;t49
+ mova [rsp+gprsize*2+16*53], m2 ;t50a
mova m0, [rsp+gprsize*2+16*43] ;t40
mova m2, [rsp+gprsize*2+16*46] ;t43
@@ -5965,12 +5945,10 @@
paddsw m0, m2 ;t38
psubsw m5, m1, m3 ;t58
paddsw m1, m3 ;t57
- pxor m6, m6
- psubw m3, m6, m4
- ITX_MULSUB_2W 3, 5, 2, 4, 7, 799, 4017 ;t37a, t58a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a
mova [rsp+gprsize*2+16*41], m0 ;t38
- mova [rsp+gprsize*2+16*40], m3 ;t37a
- mova [rsp+gprsize*2+16*61], m5 ;t58a
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
mova [rsp+gprsize*2+16*60], m1 ;t57
mova m0, [rsp+gprsize*2+16*42] ;t39
@@ -5981,11 +5959,10 @@
paddsw m0, m2 ;t39a
psubsw m5, m1, m3 ;t59a
paddsw m1, m3 ;t56a
- psubw m3, m6, m4
- ITX_MULSUB_2W 3, 5, 2, 4, 7, 799, 4017 ;t36, t59
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59
mova [rsp+gprsize*2+16*42], m0 ;t39a
- mova [rsp+gprsize*2+16*39], m3 ;t36
- mova [rsp+gprsize*2+16*62], m5 ;t59
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
mova [rsp+gprsize*2+16*59], m1 ;t56a
mova m0, [rsp+gprsize*2+16*35] ;t32
@@ -6072,12 +6049,10 @@
paddsw m0, m2 ;t47
psubsw m5, m1, m3 ;t55
paddsw m1, m3 ;t48
- pxor m6, m6
- psubw m3, m6, m4
- ITX_MULSUB_2W 3, 5, 2, 4, 7, 1567, 3784 ;t40a, t55a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a
mova [rsp+gprsize*2+16*50], m0 ;t47
- mova [rsp+gprsize*2+16*43], m3 ;t40a
- mova [rsp+gprsize*2+16*58], m5 ;t55a
+ mova [rsp+gprsize*2+16*43], m5 ;t40a
+ mova [rsp+gprsize*2+16*58], m4 ;t55a
mova [rsp+gprsize*2+16*51], m1 ;t48
mova m0, [rsp+gprsize*2+16*49] ;t46
@@ -6088,11 +6063,10 @@
paddsw m0, m2 ;t46a
psubsw m5, m1, m3 ;t54a
paddsw m1, m3 ;t49a
- psubw m3, m6, m4
- ITX_MULSUB_2W 3, 5, 2, 4, 7, 1567, 3784 ;t41, t54
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54
mova [rsp+gprsize*2+16*49], m0 ;t46a
- mova [rsp+gprsize*2+16*44], m3 ;t41
- mova [rsp+gprsize*2+16*57], m5 ;t54
+ mova [rsp+gprsize*2+16*44], m5 ;t41
+ mova [rsp+gprsize*2+16*57], m4 ;t54
mova [rsp+gprsize*2+16*52], m1 ;t49a
mova m0, [rsp+gprsize*2+16*48] ;t45a
@@ -6103,23 +6077,21 @@
paddsw m0, m2 ;t45
psubsw m5, m1, m3 ;t53
paddsw m1, m3 ;t50
- psubw m3, m6, m4
- ITX_MULSUB_2W 3, 5, 2, 4, 7, 1567, 3784 ;t42a, t53a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a
mova [rsp+gprsize*2+16*48], m0 ;t45
- mova [rsp+gprsize*2+16*45], m3 ;t42a
- mova [rsp+gprsize*2+16*56], m5 ;t53a
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
mova [rsp+gprsize*2+16*53], m1 ;t50
mova m0, [rsp+gprsize*2+16*47] ;t44
mova m2, [rsp+gprsize*2+16*46] ;t43
- mova m5, [rsp+gprsize*2+16*55] ;t52
+ mova m3, [rsp+gprsize*2+16*55] ;t52
mova m1, [rsp+gprsize*2+16*54] ;t51
- psubsw m3, m0, m2 ;t43a
+ psubsw m4, m0, m2 ;t43a
paddsw m0, m2 ;t44a
- psubsw m4, m1, m5 ;t52a
- paddsw m1, m5 ;t51a
- psubw m5, m6, m3
- ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t43, t52
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t51a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52
mova m2, [rsp+gprsize*2+16*38] ;t35a
mova m3, [rsp+gprsize*2+16*31] ;tmp[28]