shithub: dav1d

Download patch

ref: 1703f21fb708f3e75ec8889c2a7592652d1ecfbc
parent: bd8ce19eefcdf93860c88a40089116d13fc1242f
author: Liwei Wang <liwei@multicorewareinc.com>
date: Tue Dec 18 05:36:30 EST 2018

Add SSSE3 implementation for the 4x8 and 8x4 blocks in itx

Cycle times:
inv_txfm_add_4x8_adst_adst_0_8bpc_c: 1167.6
inv_txfm_add_4x8_adst_adst_0_8bpc_ssse3: 114.6
inv_txfm_add_4x8_adst_adst_1_8bpc_c: 1167.2
inv_txfm_add_4x8_adst_adst_1_8bpc_ssse3: 114.1
inv_txfm_add_4x8_adst_dct_0_8bpc_c: 1174.7
inv_txfm_add_4x8_adst_dct_0_8bpc_ssse3: 34.8
inv_txfm_add_4x8_adst_dct_1_8bpc_c: 1158.0
inv_txfm_add_4x8_adst_dct_1_8bpc_ssse3: 101.0
inv_txfm_add_4x8_adst_flipadst_0_8bpc_c: 1150.9
inv_txfm_add_4x8_adst_flipadst_0_8bpc_ssse3: 115.8
inv_txfm_add_4x8_adst_flipadst_1_8bpc_c: 1157.6
inv_txfm_add_4x8_adst_flipadst_1_8bpc_ssse3: 115.8
inv_txfm_add_4x8_adst_identity_0_8bpc_c: 848.4
inv_txfm_add_4x8_adst_identity_0_8bpc_ssse3: 59.1
inv_txfm_add_4x8_adst_identity_1_8bpc_c: 850.1
inv_txfm_add_4x8_adst_identity_1_8bpc_ssse3: 59.1
inv_txfm_add_4x8_dct_adst_0_8bpc_c: 1205.6
inv_txfm_add_4x8_dct_adst_0_8bpc_ssse3: 107.0
inv_txfm_add_4x8_dct_adst_1_8bpc_c: 1183.7
inv_txfm_add_4x8_dct_adst_1_8bpc_ssse3: 107.0
inv_txfm_add_4x8_dct_dct_0_8bpc_c: 1227.0
inv_txfm_add_4x8_dct_dct_0_8bpc_ssse3: 34.6
inv_txfm_add_4x8_dct_dct_1_8bpc_c: 1229.7
inv_txfm_add_4x8_dct_dct_1_8bpc_ssse3: 96.1
inv_txfm_add_4x8_dct_flipadst_0_8bpc_c: 1188.2
inv_txfm_add_4x8_dct_flipadst_0_8bpc_ssse3: 109.3
inv_txfm_add_4x8_dct_flipadst_1_8bpc_c: 1192.7
inv_txfm_add_4x8_dct_flipadst_1_8bpc_ssse3: 109.9
inv_txfm_add_4x8_dct_identity_0_8bpc_c: 878.4
inv_txfm_add_4x8_dct_identity_0_8bpc_ssse3: 31.9
inv_txfm_add_4x8_dct_identity_1_8bpc_c: 879.0
inv_txfm_add_4x8_dct_identity_1_8bpc_ssse3: 54.8
inv_txfm_add_4x8_flipadst_adst_0_8bpc_c: 1181.8
inv_txfm_add_4x8_flipadst_adst_0_8bpc_ssse3: 114.7
inv_txfm_add_4x8_flipadst_adst_1_8bpc_c: 1203.0
inv_txfm_add_4x8_flipadst_adst_1_8bpc_ssse3: 114.5
inv_txfm_add_4x8_flipadst_dct_0_8bpc_c: 1203.6
inv_txfm_add_4x8_flipadst_dct_0_8bpc_ssse3: 34.1
inv_txfm_add_4x8_flipadst_dct_1_8bpc_c: 1204.4
inv_txfm_add_4x8_flipadst_dct_1_8bpc_ssse3: 100.2
inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_c: 1180.6
inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_ssse3: 117.1
inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_c: 1178.7
inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_ssse3: 116.8
inv_txfm_add_4x8_flipadst_identity_0_8bpc_c: 871.3
inv_txfm_add_4x8_flipadst_identity_0_8bpc_ssse3: 69.0
inv_txfm_add_4x8_flipadst_identity_1_8bpc_c: 872.3
inv_txfm_add_4x8_flipadst_identity_1_8bpc_ssse3: 70.0
inv_txfm_add_4x8_identity_adst_0_8bpc_c: 1125.2
inv_txfm_add_4x8_identity_adst_0_8bpc_ssse3: 98.7
inv_txfm_add_4x8_identity_adst_1_8bpc_c: 1092.6
inv_txfm_add_4x8_identity_adst_1_8bpc_ssse3: 99.6
inv_txfm_add_4x8_identity_dct_0_8bpc_c: 1139.4
inv_txfm_add_4x8_identity_dct_0_8bpc_ssse3: 38.8
inv_txfm_add_4x8_identity_dct_1_8bpc_c: 1111.0
inv_txfm_add_4x8_identity_dct_1_8bpc_ssse3: 84.1
inv_txfm_add_4x8_identity_flipadst_0_8bpc_c: 1112.4
inv_txfm_add_4x8_identity_flipadst_0_8bpc_ssse3: 100.7
inv_txfm_add_4x8_identity_flipadst_1_8bpc_c: 1098.7
inv_txfm_add_4x8_identity_flipadst_1_8bpc_ssse3: 100.8
inv_txfm_add_4x8_identity_identity_0_8bpc_c: 791.6
inv_txfm_add_4x8_identity_identity_0_8bpc_ssse3: 43.9
inv_txfm_add_4x8_identity_identity_1_8bpc_c: 797.0
inv_txfm_add_4x8_identity_identity_1_8bpc_ssse3: 43.8
inv_txfm_add_8x4_adst_adst_0_8bpc_c: 1102.8
inv_txfm_add_8x4_adst_adst_0_8bpc_ssse3: 108.7
inv_txfm_add_8x4_adst_adst_1_8bpc_c: 1101.8
inv_txfm_add_8x4_adst_adst_1_8bpc_ssse3: 108.9
inv_txfm_add_8x4_adst_dct_0_8bpc_c: 1146.9
inv_txfm_add_8x4_adst_dct_0_8bpc_ssse3: 98.7
inv_txfm_add_8x4_adst_dct_1_8bpc_c: 1157.9
inv_txfm_add_8x4_adst_dct_1_8bpc_ssse3: 98.9
inv_txfm_add_8x4_adst_flipadst_0_8bpc_c: 1144.6
inv_txfm_add_8x4_adst_flipadst_0_8bpc_ssse3: 111.4
inv_txfm_add_8x4_adst_flipadst_1_8bpc_c: 1128.2
inv_txfm_add_8x4_adst_flipadst_1_8bpc_ssse3: 112.4
inv_txfm_add_8x4_adst_identity_0_8bpc_c: 1051.1
inv_txfm_add_8x4_adst_identity_0_8bpc_ssse3: 87.1
inv_txfm_add_8x4_adst_identity_1_8bpc_c: 1059.2
inv_txfm_add_8x4_adst_identity_1_8bpc_ssse3: 87.7
inv_txfm_add_8x4_dct_adst_0_8bpc_c: 1130.2
inv_txfm_add_8x4_dct_adst_0_8bpc_ssse3: 29.0
inv_txfm_add_8x4_dct_adst_1_8bpc_c: 1130.1
inv_txfm_add_8x4_dct_adst_1_8bpc_ssse3: 89.2
inv_txfm_add_8x4_dct_dct_0_8bpc_c: 1186.0
inv_txfm_add_8x4_dct_dct_0_8bpc_ssse3: 26.3
inv_txfm_add_8x4_dct_dct_1_8bpc_c: 1172.2
inv_txfm_add_8x4_dct_dct_1_8bpc_ssse3: 78.8
inv_txfm_add_8x4_dct_flipadst_0_8bpc_c: 1154.7
inv_txfm_add_8x4_dct_flipadst_0_8bpc_ssse3: 29.1
inv_txfm_add_8x4_dct_flipadst_1_8bpc_c: 1150.2
inv_txfm_add_8x4_dct_flipadst_1_8bpc_ssse3: 92.2
inv_txfm_add_8x4_dct_identity_0_8bpc_c: 1078.7
inv_txfm_add_8x4_dct_identity_0_8bpc_ssse3: 29.2
inv_txfm_add_8x4_dct_identity_1_8bpc_c: 1090.1
inv_txfm_add_8x4_dct_identity_1_8bpc_ssse3: 72.2
inv_txfm_add_8x4_flipadst_adst_0_8bpc_c: 1111.6
inv_txfm_add_8x4_flipadst_adst_0_8bpc_ssse3: 108.6
inv_txfm_add_8x4_flipadst_adst_1_8bpc_c: 1112.1
inv_txfm_add_8x4_flipadst_adst_1_8bpc_ssse3: 107.6
inv_txfm_add_8x4_flipadst_dct_0_8bpc_c: 1163.0
inv_txfm_add_8x4_flipadst_dct_0_8bpc_ssse3: 98.3
inv_txfm_add_8x4_flipadst_dct_1_8bpc_c: 1160.0
inv_txfm_add_8x4_flipadst_dct_1_8bpc_ssse3: 99.6
inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_c: 1137.9
inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_ssse3: 112.0
inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_c: 1140.0
inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_ssse3: 112.0
inv_txfm_add_8x4_flipadst_identity_0_8bpc_c: 1057.2
inv_txfm_add_8x4_flipadst_identity_0_8bpc_ssse3: 88.1
inv_txfm_add_8x4_flipadst_identity_1_8bpc_c: 1058.3
inv_txfm_add_8x4_flipadst_identity_1_8bpc_ssse3: 87.1
inv_txfm_add_8x4_identity_adst_0_8bpc_c: 794.0
inv_txfm_add_8x4_identity_adst_0_8bpc_ssse3: 60.6
inv_txfm_add_8x4_identity_adst_1_8bpc_c: 793.4
inv_txfm_add_8x4_identity_adst_1_8bpc_ssse3: 60.6
inv_txfm_add_8x4_identity_dct_0_8bpc_c: 838.4
inv_txfm_add_8x4_identity_dct_0_8bpc_ssse3: 27.4
inv_txfm_add_8x4_identity_dct_1_8bpc_c: 838.5
inv_txfm_add_8x4_identity_dct_1_8bpc_ssse3: 52.0
inv_txfm_add_8x4_identity_flipadst_0_8bpc_c: 825.3
inv_txfm_add_8x4_identity_flipadst_0_8bpc_ssse3: 66.7
inv_txfm_add_8x4_identity_flipadst_1_8bpc_c: 831.7
inv_txfm_add_8x4_identity_flipadst_1_8bpc_ssse3: 66.7
inv_txfm_add_8x4_identity_identity_0_8bpc_c: 768.6
inv_txfm_add_8x4_identity_identity_0_8bpc_ssse3: 40.0
inv_txfm_add_8x4_identity_identity_1_8bpc_c: 743.3
inv_txfm_add_8x4_identity_identity_1_8bpc_ssse3: 39.9

--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -78,6 +78,8 @@
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
 
 decl_itx17_fns(4, 4, ssse3);
+decl_itx16_fns(4, 8, ssse3);
+decl_itx16_fns(8, 4, ssse3);
 
 void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -121,7 +123,9 @@
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH == 8
-    assign_itx17_fn(, 4, 4, ssse3);
+    assign_itx17_fn(,  4, 4, ssse3);
+    assign_itx16_fn(R, 4, 8, ssse3);
+    assign_itx16_fn(R, 8, 4, ssse3);
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -29,22 +29,38 @@
 
 SECTION_RODATA 16
 
-deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
+deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
 
-pw_2896x8:      times 8 dw  2896*8
-pw_1567_m3784:  times 4 dw  1567, -3784
-pw_3784_1567:   times 4 dw  3784,  1567
+deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
+deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
 
+%macro COEF_PAIR 2 ; coef1, coef2: emit both word-pair tables read by the pmaddwd multiplies
+pw_%1_m%2:  times 4 dw   %1, -%2 ; 4x (coef1, -coef2)
+pw_%2_%1:   times 4 dw   %2,  %1 ; 4x (coef2,  coef1)
+%endmacro
+
+;adst4
 pw_1321_3803:   times 4 dw  1321,  3803
 pw_2482_m1321:  times 4 dw  2482, -1321
 pw_3344_2482:   times 4 dw  3344,  2482
 pw_3344_m3803:  times 4 dw  3344, -3803
 pw_m6688_m3803: times 4 dw -6688, -3803
-pw_3344x8:      times 8 dw  3344*8
-pw_5793x4:      times 8 dw  5793*4
 
+COEF_PAIR 1567, 3784
+COEF_PAIR  799, 4017
+COEF_PAIR 3406, 2276
+COEF_PAIR  401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 3784, 1567
+
 pd_2048:        times 4 dd  2048
 pw_2048:        times 8 dw  2048
+pw_4096:        times 8 dw  4096
+pw_2896x8:      times 8 dw  2896*8
+pw_3344x8:      times 8 dw  3344*8
+pw_5793x4:      times 8 dw  5793*4
 
 iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
 iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
@@ -61,14 +77,10 @@
 %define o(x) r5-$$+x ; PIC
 %endif
 
-%macro ITX4_END 4-5 2048 ; row[1-4], rnd
-%if %5
-    mova                 m2, [o(pw_%5)]
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m2
-%endif
+%macro WRITE_4X4 9  ;src[1-2], tmp[1-3], row[1-4]
     lea                  r2, [dstq+strideq*2]
 %assign %%i 1
+%rotate 5
 %rep 4
     %if %1 & 2
         CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
@@ -79,33 +91,43 @@
     %rotate 1
 %endrep
 
-    movd                 m2, [%%row_adr1]       ;dst0
-    movd                 m4, [%%row_adr2]       ;dst1
-    punpckldq            m2, m4                 ;high: dst1 :low: dst0
-    movd                 m3, [%%row_adr3]       ;dst2
-    movd                 m4, [%%row_adr4]       ;dst3
-    punpckldq            m3, m4                 ;high: dst3 :low: dst2
+    movd                 m%3, [%%row_adr1]        ;dst0
+    movd                 m%5, [%%row_adr2]        ;dst1
+    punpckldq            m%3, m%5                 ;high: dst1 :low: dst0
+    movd                 m%4, [%%row_adr3]        ;dst2
+    movd                 m%5, [%%row_adr4]        ;dst3
+    punpckldq            m%4, m%5                 ;high: dst3 :low: dst2
 
-    pxor                 m4, m4
-    punpcklbw            m2, m4                 ;extend byte to word
-    punpcklbw            m3, m4                 ;extend byte to word
+    pxor                 m%5, m%5
+    punpcklbw            m%3, m%5                 ;extend byte to word
+    punpcklbw            m%4, m%5                 ;extend byte to word
 
-    paddw                m0, m2                 ;high: dst1 + out1 ;low: dst0 + out0
-    paddw                m1, m3                 ;high: dst3 + out3 ;low: dst2 + out2
+    paddw                m%1, m%3                 ;high: dst1 + out1 ;low: dst0 + out0
+    paddw                m%2, m%4                 ;high: dst3 + out3 ;low: dst2 + out2
 
-    packuswb             m0, m1                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+    packuswb             m%1, m%2                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
 
-    movd       [%%row_adr1], m0                 ;store dst0 + out0
-    pshuflw              m1, m0, q1032
-    movd       [%%row_adr2], m1                 ;store dst1 + out1
-    punpckhqdq           m0, m0
-    movd       [%%row_adr3], m0                 ;store dst2 + out2
-    psrlq                m0, 32
-    movd       [%%row_adr4], m0                 ;store dst3 + out3
+    movd       [%%row_adr1], m%1                  ;store dst0 + out0
+    pshuflw              m%2, m%1, q1032
+    movd       [%%row_adr2], m%2                  ;store dst1 + out1
+    punpckhqdq           m%1, m%1
+    movd       [%%row_adr3], m%1                  ;store dst2 + out2
+    psrlq                m%1, 32
+    movd       [%%row_adr4], m%1                  ;store dst3 + out3
+%endmacro
 
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd (rnd = 0 skips the final pmulhrsw scaling)
+%if %5
+    mova                 m2, [o(pw_%5)]           ;scale constant selected by rnd
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+%endif
+
+    WRITE_4X4            0, 1, 2, 3, 4, %1, %2, %3, %4 ;add m0/m1 to the dst rows, m2-m4 are scratch
     ret
 %endmacro
 
+
 ; flags: 1 = swap, 2: coef_regs
 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
 %if %6 & 2
@@ -113,7 +135,7 @@
     pmaddwd              m%1, m%5
 %elif %6 & 1
     pmaddwd              m%2, m%1, [o(pw_%5_%4)]
-    pmaddwd              m%1, [pw_%4_m%5]
+    pmaddwd              m%1, [o(pw_%4_m%5)]
 %else
     pmaddwd              m%2, m%1, [o(pw_%4_m%5)]
     pmaddwd              m%1, [o(pw_%5_%4)]
@@ -126,24 +148,25 @@
 %endmacro
 
 %macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
-    punpckhwd            m2, m0, m1           ;unpacked in1 in3
+    punpckhwd            m2, m0, m1            ;unpacked in1 in3
     psubw                m3, m0, m1
     paddw                m0, m1
-    punpcklqdq           m0, m3               ;high: in0-in2 ;low: in0+in2
+    punpcklqdq           m0, m3                ;high: in0-in2 ;low: in0+in2
 
     mova                 m3, [o(pd_2048)]
-    ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
+    ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
 
 %if %0 == 1
     pmulhrsw             m0, m%1
 %else
-    pmulhrsw             m0, [o(pw_2896x8)]  ;high: t1 ;low: t0
+    pmulhrsw             m0, [o(pw_2896x8)]    ;high: t1 ;low: t0
 %endif
 
-    psubsw               m1, m0, m2          ;high: out2 ;low: out3
-    paddsw               m0, m2              ;high: out1 ;low: out0
+    psubsw               m1, m0, m2            ;high: out2 ;low: out3
+    paddsw               m0, m2                ;high: out1 ;low: out0
 %endmacro
 
+
 %macro IADST4_1D_PACKED 0
     punpcklwd            m2, m0, m1                ;unpacked in0 in2
     punpckhwd            m3, m0, m1                ;unpacked in1 in3
@@ -172,8 +195,8 @@
     packssdw             m2, m2                    ;high: out3 ;low: out3
 %endmacro
 
-%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
-cglobal inv_txfm_add_%1_%2_%4, 4, 6, 0, dst, stride, coeff, eob, tx2
+%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
     %undef cmp
 %if ARCH_X86_32
     LEA                    r5, $$
@@ -193,7 +216,7 @@
 %endmacro
 
 %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x4
+    INV_TXFM_FN          %1, %2, %3, 4x4, 6
 %ifidn %1_%2, dct_identity
     mova                 m0, [o(pw_2896x8)]
     pmulhrsw             m0, [coeffq]
@@ -254,7 +277,7 @@
 INV_TXFM_4X4_FN dct, flipadst, 0
 INV_TXFM_4X4_FN dct, identity, 3
 
-cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
+cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
     mova                 m1, [coeffq+16*1]      ;high: in3 ;low in2
 
@@ -281,7 +304,7 @@
 INV_TXFM_4X4_FN adst, flipadst, 0
 INV_TXFM_4X4_FN adst, identity
 
-cglobal iadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
     call .main
@@ -313,7 +336,7 @@
 INV_TXFM_4X4_FN flipadst, flipadst, 0
 INV_TXFM_4X4_FN flipadst, identity
 
-cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
     call m(iadst_4x4_internal).main
@@ -341,7 +364,7 @@
 INV_TXFM_4X4_FN identity, flipadst
 INV_TXFM_4X4_FN identity, identity
 
-cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
     mova                 m2, [o(pw_5793x4)]
@@ -398,3 +421,692 @@
 
     shufpd               m0, m2, 0x01
     ITX4_END              0, 3, 2, 1, 0
+
+
+%macro IDCT8_1D_PACKED 0 ; in: m0-m3 = in0|in1, in2|in3, in4|in5, in6|in7 (low|high); clobbers m4-m6
+    mova                 m6, [o(pd_2048)]
+    punpckhwd            m5, m0, m3                 ;unpacked in1 in7
+    punpckhwd            m4, m2, m1                 ;unpacked in5 in3
+    punpcklwd            m1, m3                     ;unpacked in2 in6
+    psubw                m3, m0, m2
+    paddw                m0, m2
+    punpcklqdq           m0, m3                     ;low: in0+in4 high: in0-in4
+    ITX_MUL2X_PACK        5, 2, 6,  799, 4017, 1    ;low: t4a high: t7a
+    ITX_MUL2X_PACK        4, 2, 6, 3406, 2276, 1    ;low: t5a high: t6a
+    ITX_MUL2X_PACK        1, 2, 6, 1567, 3784       ;low: t3  high: t2
+    mova                 m6, [o(pw_2896x8)]
+    psubsw               m2, m5, m4                 ;low: t4a-t5a high: t7a-t6a
+    paddsw               m5, m4                     ;low: t4  high: t7
+    punpckhqdq           m4, m2, m2                 ;low: t6a high: t6a
+    psubw                m3, m4, m2                 ;low: t6a - t5a
+    paddw                m4, m2                     ;low: t6a + t5a
+    punpcklqdq           m4, m3                     ;low: t6a + t5a high: t6a - t5a
+    pmulhrsw             m0, m6                     ;low: t0   high: t1
+    pmulhrsw             m4, m6                     ;low: t6   high: t5
+    shufps               m2, m5, m4, q1032          ;low: t7   high: t6
+    shufps               m5, m4, q3210              ;low: t4   high: t5
+    psubsw               m4, m0, m1                 ;low: tmp3 high: tmp2
+    paddsw               m0, m1                     ;low: tmp0 high: tmp1
+    psubsw               m3, m0, m2                 ;low: out7 high: out6
+    paddsw               m0, m2                     ;low: out0 high: out1
+    psubsw               m2, m4, m5                 ;low: out4 high: out5
+    paddsw               m1, m4, m5                 ;low: out3 high: out2
+%endmacro
+
+;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2] (coef < 8 means a register number, else a pw_ table name)
+    punpckhwd           m%3, m%1, m%2            ;interleave src1/src2, upper four words
+    punpcklwd           m%1, m%2                 ;interleave src1/src2, lower four words
+%if %7 < 8
+    pmaddwd             m%2, m%7, m%1            ;coef2 register already holds the word pairs
+    pmaddwd             m%4, m%7, m%3
+%else
+    mova                m%2, [o(pw_%7_%6)]       ;(coef2, coef1) pairs
+    pmaddwd             m%4, m%3, m%2
+    pmaddwd             m%2, m%1
+%endif
+    paddd               m%4, m%5
+    paddd               m%2, m%5
+    psrad               m%4, 12
+    psrad               m%2, 12
+    packssdw            m%2, m%4                 ;dst2
+%if %7 < 8
+    pmaddwd             m%3, m%6
+    pmaddwd             m%1, m%6
+%else
+    mova                m%4, [o(pw_%6_m%7)]      ;(coef1, -coef2) pairs
+    pmaddwd             m%3, m%4
+    pmaddwd             m%1, m%4
+%endif
+    paddd               m%3, m%5
+    paddd               m%1, m%5
+    psrad               m%3, 12
+    psrad               m%1, 12
+    packssdw            m%1, m%3                 ;dst1
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 (out0-out3 are left in src[1-4])
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784   ;t2, t3
+    mova                m%6, [o(pw_2896x8)]
+    paddw               m%5, m%1, m%3                     ;in0 + in2
+    psubw               m%1, m%3                          ;in0 - in2
+    pmulhrsw            m%1, m%6                          ;t1
+    pmulhrsw            m%5, m%6                          ;t0
+    psubsw              m%3, m%1, m%2                     ;out2
+    paddsw              m%2, m%1                          ;out1
+    paddsw              m%1, m%5, m%4                     ;out0
+    psubsw              m%5, m%4                          ;out3
+    mova                m%4, m%5                          ;move out3 from tmp1 into src4
+%endmacro
+
+%macro IADST4_1D 0 ; in: m0-m3 = in0-in3, out: m0-m3 = out0-out3; clobbers m4-m7
+    mova                 m4, m2
+    psubw                m2, m0, m4
+    paddw                m2, m3                        ;low: in0 - in2 + in3
+
+    punpckhwd            m6, m0, m4                    ;unpacked in0 in2
+    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
+    punpcklwd            m0, m4                        ;unpacked in0 in2
+    punpcklwd            m1, m3                        ;unpacked in1 in3
+
+    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m3, m4                        ;t0 + t3
+
+    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+    pmulhrsw             m2, [o(pw_3344x8)]            ;out2
+    mova                 m4, [o(pd_2048)]
+    paddd                m0, m4
+    paddd                m4, m3                        ;t0 + t3 + 2048
+    paddd                m5, m0                        ;t1 + t3 + 2048
+    paddd                m3, m0
+    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m3, 12                        ;out3
+    packssdw             m0, m4, m5                    ;low: out0  high: out1
+
+    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m1, m4                        ;t0 + t3
+    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+
+    mova                 m4, [o(pd_2048)]
+    paddd                m6, m4
+    paddd                m4, m1                        ;t0 + t3 + 2048
+    paddd                m5, m6                        ;t1 + t3 + 2048
+    paddd                m1, m6
+    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m1, 12                        ;out3
+    packssdw             m3, m1                        ;out3
+    packssdw             m4, m5                        ;low: out0  high: out1
+
+    punpckhqdq           m1, m0, m4                    ;out1
+    punpcklqdq           m0, m4                        ;out0
+%endmacro
+
+%macro IADST8_1D_PACKED 0 ; in: m0-m3 = in1|in0, in3|in2, in4|in5, in6|in7 (low|high); clobbers m4-m6
+    mova                 m6, [o(pd_2048)]
+    punpckhwd            m4, m3, m0                ;unpacked in7 in0
+    punpckhwd            m5, m2, m1                ;unpacked in5 in2
+    punpcklwd            m1, m2                    ;unpacked in3 in4
+    punpcklwd            m0, m3                    ;unpacked in1 in6
+    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
+    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
+    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
+    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
+
+    psubsw               m3, m4, m1                ;low:  t4    high:  t5
+    paddsw               m4, m1                    ;low:  t0    high:  t1
+    psubsw               m2, m5, m0                ;low:  t6    high:  t7
+    paddsw               m5, m0                    ;low:  t2    high:  t3
+
+    shufps               m1, m3, m2, q1032         ;low:  t5    high:  t6
+    punpckhwd            m2, m1
+    punpcklwd            m3, m1
+    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
+    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
+
+    psubsw               m1, m4, m5                ;low:  t2    high:  t3
+    paddsw               m4, m5                    ;low:  out0  high: -out7
+    psubsw               m5, m3, m2                ;low:  t7    high:  t6
+    paddsw               m3, m2                    ;low:  out6  high: -out1
+    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
+    shufps               m3, m4, q3210             ;low:  out6  high: -out7
+
+    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
+    shufps               m1, m5, q3210             ;low:  t2    high:  t6
+    mova                 m5, [o(pw_2896x8)]
+    psubw                m2, m1, m4                ;low:  t2-t3 high:  t6-t7
+    paddw                m1, m4                    ;low:  t2+t3 high:  t6+t7
+    pmulhrsw             m2, m5                    ;low:  out4  high: -out5
+    shufps               m1, m1, q1032             ;swap halves: t6+t7 low, t2+t3 high
+    pmulhrsw             m1, m5                    ;low:  out2  high: -out3
+%endmacro
+
+%macro WRITE_4X8 4 ;row[1-4]; adds m0-m3 to eight 4-pixel dst rows, m4-m6 are scratch, dstq is advanced
+    WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4   ;first four rows from m0/m1
+    lea                dstq, [dstq+strideq*4]             ;step dst down four rows
+    WRITE_4X4             2, 3, 4, 5, 6, %1, %2, %3, %4   ;next four rows from m2/m3
+%endmacro
+
+%macro INV_4X8 0 ; interleaves the words, then the dwords, of m0-m3 in place; m4 is scratch
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    punpckhdq            m1, m0, m2                  ;low: in2 high: in3
+    punpckldq            m0, m2                      ;low: in0 high: in1
+    punpckldq            m2, m3, m4                  ;low: in4 high: in5
+    punpckhdq            m3, m4                      ;low: in6 high: in7
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh (negative = no fast path emitted below)
+    INV_TXFM_FN          %1, %2, %3, 4x8, 8
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1, [coeffq]           ;first eight coefficients * 2896/4096
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, [o(pw_4096)]
+    punpckhwd            m2, m0, m0                 ;duplicate each word: values 4-7
+    punpcklwd            m0, m0                     ;duplicate each word: values 0-3
+    punpckhdq            m1, m0, m0                 ;value 2 x4 | value 3 x4
+    punpckldq            m0, m0                     ;value 0 x4 | value 1 x4
+    punpckhdq            m3, m2, m2                 ;value 6 x4 | value 7 x4
+    punpckldq            m2, m2                     ;value 4 x4 | value 5 x4
+    call m(iadst_4x8_internal).end3                 ;zero the coefficients, then add to dst
+    RET
+%elifidn %1_%2, identity_dct
+    movd                 m0, [coeffq+16*0]
+    punpcklwd            m0, [coeffq+16*1]
+    movd                 m1, [coeffq+16*2]
+    punpcklwd            m1, [coeffq+16*3]
+    mova                 m2, [o(pw_2896x8)]
+    punpckldq            m0, m1                     ;first word of each 16-byte coefficient row
+    pmulhrsw             m0, m2
+    paddw                m0, m0
+    pmulhrsw             m0, [o(pw_5793x4)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m0, [o(pw_2048)]
+    punpcklqdq           m0, m0                     ;same four values in both halves
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    call m(iadst_4x8_internal).end3                 ;zero the coefficients, then add to dst
+    RET
+%elifidn %1_%2, dct_dct
+    pshuflw              m0, [coeffq], q0000        ;broadcast coefficient 0 across the low qword
+    punpcklqdq           m0, m0                     ;... and across the whole register
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mov           [coeffq], eobd                    ;overwrites coefficient 0 with eob (presumably 0 on this path - confirm)
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, [o(pw_2048)]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    call m(iadst_4x8_internal).end4                 ;add to dst without the coefficient clear
+    RET
+%else ; adst_dct / flipadst_dct
+    pshuflw              m0, [coeffq], q0000        ;broadcast coefficient 0 across the low qword
+    punpcklqdq           m0, m0                     ;... and across the whole register
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+%ifidn %1, adst
+    pmulhrsw             m0, [o(iadst4_dconly1a)]
+%else ; flipadst
+    pmulhrsw             m0, [o(iadst4_dconly1b)]   ;same table as 1a with the four entries reversed
+%endif
+    mov            [coeffq], eobd                   ;overwrites coefficient 0 with eob (presumably 0 on this path - confirm)
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, [o(pw_2048)]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    call m(iadst_4x8_internal).end4                 ;add to dst without the coefficient clear
+    RET
+%endif
+%endif
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct,      0
+INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]      ;load coefficients scaled by 2896/4096
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(idct_8x4_internal).main                  ;transform body is defined with the 8x4 kernels (not shown here)
+    call m(iadst_4x8_internal).inversion            ;INV_4X8 reordering of m0-m3
+    jmp                tx2q                         ;presumably the second-pass entry chosen by the caller - confirm
+
+.pass2:
+    call .main
+    shufps               m1, m1, q1032              ;swap halves: out3|out2 -> out2|out3
+    shufps               m3, m3, q1032              ;swap halves: out7|out6 -> out6|out7
+    mova                 m4, [o(pw_2048)]
+    jmp m(iadst_4x8_internal).end2                  ;scale, clear coefficients, add to dst
+
+ALIGN function_align
+.main:
+    IDCT8_1D_PACKED
+    ret
+
+
+INV_TXFM_4X8_FN adst, dct,      0
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]      ;load coefficients scaled by 2896/4096
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(iadst_8x4_internal).main                 ;transform body is defined with the 8x4 kernels (not shown here)
+    call .inversion
+    jmp                tx2q                         ;presumably the second-pass entry chosen by the caller - confirm
+
+.pass2:
+    shufps               m0, m0, q1032              ;swap halves into the in1|in0 order IADST8_1D_PACKED reads
+    shufps               m1, m1, q1032              ;swap halves into the in3|in2 order
+    call .main
+    mova                 m4, [o(pw_2048)]
+    pxor                 m5, m5
+    psubw                m5, m4                     ;m5 = -2048
+
+.end:
+    punpcklqdq           m4, m5                     ;low half scales by m4, high half by m5
+
+.end2:
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+
+.end3:
+    pxor                 m5, m5
+    mova      [coeffq+16*0], m5                     ;zero the 64-byte coefficient block
+    mova      [coeffq+16*1], m5
+    mova      [coeffq+16*2], m5
+    mova      [coeffq+16*3], m5
+
+.end4:
+    WRITE_4X8             0, 1, 2, 3
+    RET
+
+ALIGN function_align
+.main:
+    IADST8_1D_PACKED
+    ret
+
+ALIGN function_align
+.inversion:
+    INV_4X8
+    ret
+
+INV_TXFM_4X8_FN flipadst, dct,      0
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]      ;load coefficients scaled by 2896/4096
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(iadst_8x4_internal).main                 ;transform body is defined with the 8x4 kernels (not shown here)
+
+    punpcklwd            m4, m3, m2                  ;same interleave as INV_4X8 with the source registers reversed
+    punpckhwd            m3, m2
+    punpcklwd            m5, m1, m0
+    punpckhwd            m1, m0
+    punpckldq            m2, m3, m1                  ;low: in4 high: in5
+    punpckhdq            m3, m1                      ;low: in6 high: in7
+    punpckldq            m0, m4, m5                  ;low: in0 high: in1
+    punpckhdq            m1, m4, m5                  ;low: in2 high: in3
+    jmp                tx2q                         ;presumably the second-pass entry chosen by the caller - confirm
+
+.pass2:
+    shufps               m0, m0, q1032              ;swap halves into the in1|in0 order IADST8_1D_PACKED reads
+    shufps               m1, m1, q1032              ;swap halves into the in3|in2 order
+    call m(iadst_4x8_internal).main
+
+    mova                 m4, m0
+    mova                 m5, m1
+    pshufd               m0, m3, q1032              ;reverse register order and swap halves
+    pshufd               m1, m2, q1032
+    pshufd               m2, m5, q1032
+    pshufd               m3, m4, q1032
+    mova                 m5, [o(pw_2048)]
+    pxor                 m4, m4
+    psubw                m4, m5                     ;m4 = -2048, so .end scales low by -2048 and high by +2048
+    jmp m(iadst_4x8_internal).end
+
+INV_TXFM_4X8_FN identity, dct,      3
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]      ;load coefficients scaled by 2896/4096
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    mova                 m5, [o(pw_5793x4)]
+    paddw                m0, m0                     ;doubling plus pmulhrsw by 5793*4 gives x * 5793/4096
+    paddw                m1, m1
+    paddw                m2, m2
+    paddw                m3, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    pmulhrsw             m3, m5
+
+    call m(iadst_4x8_internal).inversion            ;INV_4X8 reordering of m0-m3
+    jmp                tx2q                         ;presumably the second-pass entry chosen by the caller - confirm
+
+.pass2:
+    mova                 m4, [o(pw_4096)]           ;the only arithmetic on this path is the .end2 scaling
+    jmp m(iadst_4x8_internal).end2
+
+
+%macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]; adds two rows of eight residuals to dst with unsigned saturation
+    movq                 m%3, [dstq   ]           ;eight dst pixels of row 0
+    movq                 m%4, [dstq+strideq]      ;eight dst pixels of row 1
+    pxor                 m%5, m%5
+    punpcklbw            m%3, m%5                 ;extend byte to word
+    punpcklbw            m%4, m%5                 ;extend byte to word
+    paddw                m%3, m%1
+    paddw                m%4, m%2
+    packuswb             m%3, m%4                 ;low: row 0 bytes high: row 1 bytes
+    movq           [dstq   ], m%3
+    punpckhqdq           m%3, m%3                 ;bring row 1 down to the low qword
+    movq      [dstq+strideq], m%3
+%endmacro
+
+%macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]: add four rows of 8 words to dst; dstq is left advanced by 2*strideq
+    WRITE_8X2             %1, %2, %5, %6, %7      ;rows 0-1 (forward the macro arguments, not fixed registers)
+    lea                dstq, [dstq+strideq*2]     ;step down two rows
+    WRITE_8X2             %3, %4, %5, %6, %7      ;rows 2-3, reusing the same three temporaries
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh (default -1: no fast path is emitted)
+    INV_TXFM_FN          %1, %2, %3, 8x4, 8          ; generic entry macro, defined outside this chunk
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    mova                 m0, [o(pw_2896x8)]          ; dct_identity: only the first coefficient vector is read
+    pmulhrsw             m1, m0, [coeffq]
+    pmulhrsw             m1, m0                      ; second multiply by the same constant
+    paddw                m1, m1                      ; double, then multiply by pw_5793x4
+    pmulhrsw             m1, [o(pw_5793x4)]
+    pmulhrsw             m1, [o(pw_2048)]
+    punpcklwd            m1, m1                      ; duplicate each of the low 4 words
+    punpckhdq            m2, m1, m1                  ; dword unpacks below finish the per-row broadcast
+    punpckldq            m1, m1
+    punpckhdq            m3, m2, m2                  ; m3 = word 3 in all 8 lanes
+    punpckldq            m2, m2                      ; m2 = word 2 in all 8 lanes
+    punpckldq            m0, m1, m1                  ; m0 = word 0 in all 8 lanes
+    punpckhdq            m1, m1                      ; m1 = word 1 in all 8 lanes
+%elifidn %1_%2, identity_dct
+    mova                 m0, [coeffq+16*0]           ; identity_dct: read all four coefficient vectors
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    punpckhwd            m4, m0, m1                  ; the unpacks gather words 0 and 4 of each vector
+    punpcklwd            m0, m1
+    punpckhwd            m5, m2, m3
+    punpcklwd            m2, m3
+    punpcklwd            m0, m4
+    punpcklwd            m2, m5
+    punpcklqdq           m0, m2                      ; m0 = words 0,4 of vector 0, then 1, 2, 3
+    mova                 m4, [o(pw_2896x8)]
+    pmulhrsw             m0, m4
+    paddw                m0, m0                      ; doubling between the two pw_2896x8 multiplies
+    pmulhrsw             m0, m4
+    pmulhrsw             m0, [o(pw_2048)]
+    mova                 m1, m0                      ; all four output rows are identical
+    mova                 m2, m0
+    mova                 m3, m0
+%else ; remaining pairings use only the first coefficient word
+    pshuflw              m0, [coeffq], q0000         ; broadcast word 0 across the low qword
+    punpcklqdq           m0, m0                      ; ...and across the whole register
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m1
+%ifidn %2, dct
+    mova                 m2, [o(pw_2048)]
+    pmulhrsw             m0, m1                      ; third pw_2896x8 multiply, dct second pass only
+    pmulhrsw             m0, m2
+    mova                 m1, m0                      ; all four output rows are identical
+    mova                 m2, m0
+    mova                 m3, m0
+%else ; adst / flipadst
+    pmulhrsw             m2, m0, [o(iadst4_dconly2b)] ; per-lane constants defined outside this chunk
+    pmulhrsw             m0, [o(iadst4_dconly2a)]
+    mova                 m1, [o(pw_2048)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m2, m1
+%ifidn %2, adst
+    punpckhqdq           m1, m0, m0                  ; rows 0..3 = low(m0), high(m0), low(m2), high(m2)
+    punpcklqdq           m0, m0
+    punpckhqdq           m3, m2, m2
+    punpcklqdq           m2, m2
+%else ; flipadst
+    mova                 m3, m0                      ; keep the 2a product; m0 is overwritten next
+    punpckhqdq           m0, m2, m2                  ; same four qwords as adst, rows in reverse order
+    punpcklqdq           m1, m2, m2
+    punpckhqdq           m2, m3, m3
+    punpcklqdq           m3, m3
+%endif ; adst / flipadst row order
+%endif ; dct / adst+flipadst second pass
+%endif ; pairing selection
+    call m(iadst_8x4_internal).end2                  ; clears 64 bytes at coeffq and adds m0-m3 to dst
+    RET
+%endif ; fast_thresh >= 0
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct,      0
+INV_TXFM_8X4_FN dct, adst,     0
+INV_TXFM_8X4_FN dct, flipadst, 0
+INV_TXFM_8X4_FN dct, identity, 3
+; dct first pass for 8x4, ending in jmp tx2q; .pass2 is the dct second pass, .main wraps IDCT4_1D
+cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]          ;one multiplier shared by the four loads below
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]       ;m3 last: it carried the multiplier until here
+
+    call m(idct_4x8_internal).main                   ;first-pass kernel, defined outside this chunk
+
+    mova                 m4, [o(deint_shuf1)]        ;two byte-shuffle masks, applied alternately to m0-m3
+    mova                 m5, [o(deint_shuf2)]
+    pshufb               m0, m4
+    pshufb               m1, m5
+    pshufb               m2, m4
+    pshufb               m3, m5
+    punpckhdq            m4, m0, m1                  ;masks are dead now; m4/m5 become temporaries
+    punpckldq            m0, m1
+    punpckhdq            m5, m2, m3
+    punpckldq            m2, m3
+    punpckhqdq           m1, m0, m2                      ;in1
+    punpcklqdq           m0, m2                          ;in0
+    punpckhqdq           m3, m4, m5                      ;in3
+    punpcklqdq           m2 ,m4, m5                      ;in2
+    jmp                tx2q                          ;m0-m3 = in0-in3 for the second pass
+
+.pass2:
+    call .main
+    jmp m(iadst_8x4_internal).end                    ;shared tail: pw_2048 scale, clear coeffs, add to dst
+
+ALIGN function_align
+.main:
+    mova                 m6, [o(pd_2048)]            ;passed to IDCT4_1D as its 7th argument
+    IDCT4_1D             0, 1, 2, 3, 4, 5, 6
+    ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+; adst first pass for 8x4; also hosts the .end/.end2 store tail and .main that other 8x4 code jumps to
+cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]          ;one multiplier shared by the four loads below
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]       ;m3 last: it carried the multiplier until here
+
+    shufps               m0, m0, q1032               ;swap the 64-bit halves of m0 and m1 before .main
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main                  ;first-pass kernel, defined outside this chunk
+
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    pxor                 m5, m5                      ;zero, to negate the two high-half interleaves
+    psubw                m3, m5, m1                  ;m3 = -m1
+    psubw                m5, m4                      ;m5 = -m4
+    punpckhdq            m4, m5, m3
+    punpckldq            m5, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckhwd            m1, m0, m5      ;in1
+    punpcklwd            m0, m5          ;in0
+    punpcklwd            m2, m3, m4      ;in2
+    punpckhwd            m3, m4          ;in3
+    jmp              tx2q                            ;m0-m3 = in0-in3 for the second pass
+
+.pass2:
+    call .main
+
+.end:
+    mova                 m4, [o(pw_2048)]            ;(x+8)>>4 if pw_2048 holds words of 2048, as named
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+
+.end2:
+    pxor                 m6, m6                      ;zero the 4x16 coefficient bytes at coeffq
+    mova      [coeffq+16*0], m6
+    mova      [coeffq+16*1], m6
+    mova      [coeffq+16*2], m6
+    mova      [coeffq+16*3], m6
+    WRITE_8X4             0, 1, 2, 3, 4, 5, 6        ;add rows m0-m3 to dst; m4-m6 are scratch
+    RET
+
+ALIGN function_align
+.main:
+    IADST4_1D                                        ;macro defined outside this chunk
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+; flipadst first pass for 8x4: same kernel call as iadst_8x4_internal, unpacked with m3..m0 in reverse
+cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]          ;one multiplier shared by the four loads below
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]       ;m3 last: it carried the multiplier until here
+
+    shufps               m0, m0, q1032               ;swap the 64-bit halves of m0 and m1 before .main
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main                  ;first-pass kernel, defined outside this chunk
+
+    punpckhwd            m5, m3, m2                  ;operands reversed relative to iadst_8x4_internal
+    punpcklwd            m3, m2
+    punpckhwd            m2, m1, m0
+    punpcklwd            m1, m0
+
+    pxor                 m0, m0                      ;zero, to negate the two high-half interleaves
+    psubw                m4, m0, m2                  ;m4 = -m2
+    psubw                m0, m5                      ;m0 = -m5
+    punpckhdq            m2, m0, m4
+    punpckldq            m0, m4
+    punpckhdq            m4, m3, m1
+    punpckldq            m3, m1
+    punpckhwd            m1, m0, m3      ;in1
+    punpcklwd            m0, m3          ;in0
+    punpckhwd            m3, m2, m4      ;in3
+    punpcklwd            m2, m4          ;in2
+    jmp                  tx2q                        ;m0-m3 = in0-in3 for the second pass
+
+.pass2:
+    call m(iadst_8x4_internal).main                  ;same IADST4_1D kernel as the adst second pass
+    mova                 m4, m0                      ;reverse row order: m0<->m3, m1<->m2 via m4/m5
+    mova                 m5, m1
+    mova                 m0, m3
+    mova                 m1, m2
+    mova                 m2, m5
+    mova                 m3, m4
+    jmp m(iadst_8x4_internal).end                    ;shared tail: pw_2048 scale, clear coeffs, add to dst
+
+INV_TXFM_8X4_FN identity, dct,      7
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+; identity first pass for 8x4: scale, double, and reorder the four vectors; .pass2 is the identity second pass
+cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]          ;one multiplier shared by the four loads below
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]       ;m3 last: it carried the multiplier until here
+    paddw                m0, m0                      ;first-pass identity here is a plain doubling
+    paddw                m1, m1
+    paddw                m2, m2
+    paddw                m3, m3
+
+    punpckhwd            m4, m0, m1                  ;word, dword, then word unpacks produce in0-in3
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m5, m4, m1
+    punpckldq            m4, m1
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckhwd            m1, m0, m4      ;in1
+    punpcklwd            m0, m4          ;in0
+    punpcklwd            m2, m3, m5      ;in2
+    punpckhwd            m3, m5          ;in3
+    jmp                tx2q                          ;m0-m3 = in0-in3 for the second pass
+
+.pass2:
+    mova                 m4, [o(pw_5793x4)]
+    paddw                m0, m0                      ;double every row before the pw_5793x4 multiply
+    paddw                m1, m1
+    paddw                m2, m2
+    paddw                m3, m3
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+    jmp m(iadst_8x4_internal).end                    ;shared tail: pw_2048 scale, clear coeffs, add to dst