shithub: dav1d

Download patch

ref: e0b88bd2b2c97a2695edcc498485e1cb3003e7f1
parent: 98ed9be69b08f5438cce7e696b2c8eadfb3ce905
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Apr 20 19:35:30 EDT 2020

x86: Remove identity/adst itx fast paths

Testing shows that those code paths are essentially never executed
with real-world bitstreams, so they just add redundant branches,
increase code size, and add complexity for no actual benefit.

--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -27,15 +27,10 @@
 
 %if ARCH_X86_64
 
-SECTION_RODATA 32
+SECTION_RODATA 16
 
 ; Note: The order of (at least some of) those constants matter!
 
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
-iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
-iadst4_dconly1a: dw 10568, 19856, 26752, 30424
-iadst4_dconly1b: dw 30424, 26752, 19856, 10568
-
 deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
 
 %macro COEF_PAIR 2
@@ -132,7 +127,7 @@
 ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
 ; single rip-relative lea and then address things relative from that with
 ; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
-%define o_base iadst4_dconly2a + 128
+%define o_base deint_shuf + 128
 %define o(x) (rax - (o_base) + (x))
 
 %macro REPX 2-*
@@ -363,18 +358,14 @@
     vpblendd             m0, m0, m2, 0x03
     ITX4_END              3, 0, 2, 1, 0
 
-%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2
-    %undef cmp
-    %define %%p1 m(i%1_%4_internal)
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2
+    %define %%p1 m(i%1_%3_internal)
     lea                 rax, [o_base]
     ; Jump to the 1st txfm function if we're not taking the fast path, which
     ; in turn performs an indirect jump to the 2nd txfm function.
-    lea tx2q, [m(i%2_%4_internal).pass2]
-%if %3 > 0
-    cmp                eobd, %3
-    jg %%p1
-%elif %3 == 0
+    lea                tx2q, [m(i%2_%3_internal).pass2]
+%ifidn %1_%2, dct_dct
     test               eobd, eobd
     jnz %%p1
 %else
@@ -385,55 +376,17 @@
 %endif
 %endmacro
 
-%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x4
-%ifidn %1_%2, dct_identity
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [cq]
-    vpbroadcastd         m1, [o(pw_1697x8)]
-    pmulhrsw             m1, m0
-    paddsw               m0, m1
-    punpcklwd            m0, m0
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    jmp m(iadst_4x4_internal).end
-%elifidn %1_%2, identity_dct
-    mova                 m0, [cq+16*0]
-    packusdw             m0, [cq+16*1]
-    vpbroadcastd         m1, [o(pw_1697x8)]
-    vpbroadcastd         m2, [o(pw_2896x8)]
-    packusdw             m0, m0
-    pmulhrsw             m1, m0
-    paddsw               m0, m1
-    pmulhrsw             m0, m2
-    mova                 m1, m0
-    jmp m(iadst_4x4_internal).end
-%elif %3 >= 0
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x4
+%ifidn %1_%2, dct_dct
     vpbroadcastw         m0, [cq]
-%ifidn %1, dct
     vpbroadcastd         m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
-%elifidn %1, adst
-    movddup              m1, [o(iadst4_dconly1a)]
-    pmulhrsw             m0, m1
-%elifidn %1, flipadst
-    movddup              m1, [o(iadst4_dconly1b)]
-    pmulhrsw             m0, m1
-%endif
     mov                [cq], eobd ; 0
-%ifidn %2, dct
-%ifnidn %1, dct
-    vpbroadcastd         m1, [o(pw_2896x8)]
-%endif
     pmulhrsw             m0, m1
     mova                 m1, m0
     jmp m(iadst_4x4_internal).end2
-%else ; adst / flipadst
-    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    jmp m(i%2_4x4_internal).end2
 %endif
-%endif
 %endmacro
 
 %macro IDCT4_1D_PACKED 0
@@ -477,10 +430,10 @@
     packssdw             m1, m2 ; out2 out3
 %endmacro
 
-INV_TXFM_4X4_FN dct, dct,      0
-INV_TXFM_4X4_FN dct, adst,     0
-INV_TXFM_4X4_FN dct, flipadst, 0
-INV_TXFM_4X4_FN dct, identity, 3
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
 
 cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     mova                 m0, [cq+16*0]
@@ -499,9 +452,9 @@
     mova          [cq+16*1], m2
     ITX4_END              0, 1, 3, 2
 
-INV_TXFM_4X4_FN adst, dct,      0
-INV_TXFM_4X4_FN adst, adst,     0
-INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
 INV_TXFM_4X4_FN adst, identity
 
 cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
@@ -526,9 +479,9 @@
     IADST4_1D_PACKED
     ret
 
-INV_TXFM_4X4_FN flipadst, dct,      0
-INV_TXFM_4X4_FN flipadst, adst,     0
-INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
 INV_TXFM_4X4_FN flipadst, identity
 
 cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
@@ -549,7 +502,7 @@
 .end2:
     ITX4_END              3, 2, 1, 0
 
-INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, dct
 INV_TXFM_4X4_FN identity, adst
 INV_TXFM_4X4_FN identity, flipadst
 INV_TXFM_4X4_FN identity, identity
@@ -600,38 +553,9 @@
     pextrd [r2  +r3       ], xm5, 3
 %endmacro
 
-%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm0, [o(pw_2896x8)]
-    pmulhrsw            xm1, xm0, [cq]
-    vpbroadcastd        xm2, [o(pw_4096)]
-    pmulhrsw            xm1, xm0
-    pmulhrsw            xm1, xm2
-    vpermq               m1, m1, q1100
-    punpcklwd            m1, m1
-    punpckldq            m0, m1, m1
-    punpckhdq            m1, m1
-    jmp m(iadst_4x8_internal).end3
-%elifidn %1_%2, identity_dct
-    movd                xm0, [cq+16*0]
-    punpcklwd           xm0, [cq+16*1]
-    movd                xm1, [cq+16*2]
-    punpcklwd           xm1, [cq+16*3]
-    vpbroadcastd        xm2, [o(pw_2896x8)]
-    vpbroadcastd        xm3, [o(pw_1697x8)]
-    vpbroadcastd        xm4, [o(pw_2048)]
-    punpckldq           xm0, xm1
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm3, xm0
-    paddsw              xm0, xm3
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm0, xm4
-    vpbroadcastq         m0, xm0
-    mova                 m1, m0
-    jmp m(iadst_4x8_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x8
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     movd                xm2, [o(pw_2048)]
@@ -641,25 +565,8 @@
     pmulhrsw            xm0, xm2
     vpbroadcastw         m0, xm0
     mova                 m1, m0
-    jmp m(iadst_4x8_internal).end4
-%else ; adst_dct / flipadst_dct
-    vpbroadcastw        xm0, [cq]
-    vpbroadcastd        xm1, [o(pw_2896x8)]
-    pmulhrsw            xm0, xm1
-    pmulhrsw            xm0, [o(iadst4_dconly1a)]
-    vpbroadcastd        xm2, [o(pw_2048)]
-    mov                [cq], eobd
-    pmulhrsw            xm0, xm1
-    pmulhrsw            xm0, xm2
-%ifidn %1, adst
-    vpbroadcastq         m0, xm0
-%else ; flipadst
-    vpermq               m0, m0, q1111
+    jmp m(iadst_4x8_internal).end3
 %endif
-    mova                 m1, m0
-    jmp m(iadst_4x8_internal).end4
-%endif
-%endif
 %endmacro
 
 %macro IDCT8_1D_PACKED 0
@@ -772,10 +679,10 @@
 %endmacro
 
 INIT_YMM avx2
-INV_TXFM_4X8_FN dct, dct,      0
-INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, dct
 INV_TXFM_4X8_FN dct, adst
 INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
 
 cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120
@@ -804,7 +711,7 @@
     WRAP_XMM IDCT8_1D_PACKED
     ret
 
-INV_TXFM_4X8_FN adst, dct,      0
+INV_TXFM_4X8_FN adst, dct
 INV_TXFM_4X8_FN adst, adst
 INV_TXFM_4X8_FN adst, flipadst
 INV_TXFM_4X8_FN adst, identity
@@ -838,11 +745,10 @@
     pmulhrsw             m0, m4
     pmulhrsw             m1, m4
     WIN64_RESTORE_XMM
-.end3:
     pxor                 m2, m2
     mova          [cq+32*0], m2
     mova          [cq+32*1], m2
-.end4:
+.end3:
     lea                  r2, [dstq+strideq*4]
     lea                  r3, [strideq*3]
     WRITE_4X8             0, 1
@@ -856,7 +762,7 @@
     WRAP_XMM IADST8_1D_PACKED 2
     ret
 
-INV_TXFM_4X8_FN flipadst, dct,      0
+INV_TXFM_4X8_FN flipadst, dct
 INV_TXFM_4X8_FN flipadst, adst
 INV_TXFM_4X8_FN flipadst, flipadst
 INV_TXFM_4X8_FN flipadst, identity
@@ -888,7 +794,7 @@
     pshufd               m1, m2, q1032
     jmp m(iadst_4x8_internal).end
 
-INV_TXFM_4X8_FN identity, dct,      3
+INV_TXFM_4X8_FN identity, dct
 INV_TXFM_4X8_FN identity, adst
 INV_TXFM_4X8_FN identity, flipadst
 INV_TXFM_4X8_FN identity, identity
@@ -913,49 +819,9 @@
     vpbroadcastd         m4, [o(pw_4096)]
     jmp m(iadst_4x8_internal).end2
 
-%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x16
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [cq]
-    vpbroadcastd         m1, [o(pw_16384)]
-    vpbroadcastd         m2, [o(pw_1697x16)]
-    vpbroadcastd         m3, [o(pw_2048)]
-    pmulhrsw             m0, m1
-    pmulhrsw             m2, m0
-    paddsw               m0, m0
-    paddsw               m0, m2
-    pmulhrsw             m3, m0
-    punpcklwd            m1, m3, m3
-    punpckhwd            m3, m3
-    punpckldq            m0, m1, m1
-    punpckhdq            m1, m1
-    punpckldq            m2, m3, m3
-    punpckhdq            m3, m3
-    jmp m(iadst_4x16_internal).end3
-%elifidn %1_%2, identity_dct
-    movd                xm0, [cq+32*0]
-    punpcklwd           xm0, [cq+32*1]
-    movd                xm1, [cq+32*2]
-    punpcklwd           xm1, [cq+32*3]
-    vpbroadcastd        xm2, [o(pw_1697x8)]
-    vpbroadcastd        xm3, [o(pw_2896x8)]
-    vpbroadcastd        xm4, [o(pw_2048)]
-    punpckldq           xm0, xm1
-    pcmpeqw             xm1, xm1
-    pmulhrsw            xm2, xm0
-    pcmpeqw             xm1, xm0
-    pxor                xm0, xm1
-    pavgw               xm0, xm2
-    pmulhrsw            xm0, xm3
-    pmulhrsw            xm0, xm4
-    vpbroadcastq         m0, xm0
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    jmp m(iadst_4x16_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x16
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     movd                xm2, [o(pw_16384)]
@@ -968,28 +834,8 @@
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-    jmp m(iadst_4x16_internal).end4
-%else ; adst_dct / flipadst_dct
-    vpbroadcastw        xm0, [cq]
-    pmulhrsw            xm0, [o(iadst4_dconly1a)]
-    vpbroadcastd        xm1, [o(pw_16384)]
-    vpbroadcastd        xm2, [o(pw_2896x8)]
-    mov                [cq], eobd
-    pmulhrsw            xm0, xm1
-    psrlw               xm1, 3 ; pw_2048
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm0, xm1
-%ifidn %1, adst
-    vpbroadcastq         m0, xm0
-%else ; flipadst
-    vpermq               m0, m0, q1111
+    jmp m(iadst_4x16_internal).end3
 %endif
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    jmp m(iadst_4x16_internal).end4
-%endif
-%endif
 %endmacro
 
 %macro IDCT16_1D_PACKED 0
@@ -1061,10 +907,10 @@
     paddsw               m3, m8     ; out7  out6
 %endmacro
 
-INV_TXFM_4X16_FN dct, dct,      0
-INV_TXFM_4X16_FN dct, identity, 15
+INV_TXFM_4X16_FN dct, dct
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
 
 cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                 m0, [cq+32*0]
@@ -1102,7 +948,7 @@
     WRAP_XMM IDCT16_1D_PACKED
     ret
 
-INV_TXFM_4X16_FN adst, dct,      0
+INV_TXFM_4X16_FN adst, dct
 INV_TXFM_4X16_FN adst, adst
 INV_TXFM_4X16_FN adst, flipadst
 INV_TXFM_4X16_FN adst, identity
@@ -1147,13 +993,12 @@
 .end2:
     REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
     WIN64_RESTORE_XMM
-.end3:
     pxor                 m4, m4
     mova          [cq+32*0], m4
     mova          [cq+32*1], m4
     mova          [cq+32*2], m4
     mova          [cq+32*3], m4
-.end4:
+.end3:
     lea                  r2, [dstq+strideq*8]
     lea                  r3, [strideq*3]
     WRITE_4X8             0, 1
@@ -1232,7 +1077,7 @@
     packssdw             m1, m4     ; -out7   out4   out6  -out5
     ret
 
-INV_TXFM_4X16_FN flipadst, dct,      0
+INV_TXFM_4X16_FN flipadst, dct
 INV_TXFM_4X16_FN flipadst, adst
 INV_TXFM_4X16_FN flipadst, flipadst
 INV_TXFM_4X16_FN flipadst, identity
@@ -1274,7 +1119,7 @@
     psubw                m5, m7, m6
     jmp m(iadst_4x16_internal).end
 
-INV_TXFM_4X16_FN identity, dct,      3
+INV_TXFM_4X16_FN identity, dct
 INV_TXFM_4X16_FN identity, adst
 INV_TXFM_4X16_FN identity, flipadst
 INV_TXFM_4X16_FN identity, identity
@@ -1350,69 +1195,25 @@
     movhps        [dstq+%7], xm%4
 %endmacro
 
-%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x4
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm0, [o(pw_2896x8)]
-    pmulhrsw            xm1, xm0, [cq]
-    vpbroadcastd        xm2, [o(pw_1697x8)]
-    vpbroadcastd        xm3, [o(pw_2048)]
-    pmulhrsw            xm1, xm0
-    pmulhrsw            xm2, xm1
-    paddsw              xm1, xm2
-    pmulhrsw            xm1, xm3
-    punpcklwd           xm1, xm1
-    punpckldq           xm0, xm1, xm1
-    punpckhdq           xm1, xm1
-    vpermq               m0, m0, q1100
-    vpermq               m1, m1, q1100
-%elifidn %1_%2, identity_dct
-    mova                xm0, [cq+16*0]
-    packusdw            xm0, [cq+16*1]
-    mova                xm1, [cq+16*2]
-    packusdw            xm1, [cq+16*3]
-    vpbroadcastd        xm2, [o(pw_2896x8)]
-    vpbroadcastd        xm3, [o(pw_2048)]
-    packusdw            xm0, xm1
-    pmulhrsw            xm0, xm2
-    paddsw              xm0, xm0
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm0, xm3
-    vinserti128          m0, m0, xm0, 1
-    mova                 m1, m0
-%else
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x4
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     pmulhrsw            xm0, xm1
-%ifidn %2, dct
     movd                xm2, [o(pw_2048)]
     pmulhrsw            xm0, xm1
     pmulhrsw            xm0, xm2
     vpbroadcastw         m0, xm0
     mova                 m1, m0
-%else ; adst / flipadst
-    vpbroadcastw         m0, xm0
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    vpbroadcastd         m1, [o(pw_2048)]
-    pmulhrsw             m1, m0
-%ifidn %2, adst
-    vpermq               m0, m1, q1100
-    vpermq               m1, m1, q3322
-%else ; flipadst
-    vpermq               m0, m1, q2233
-    vpermq               m1, m1, q0011
-%endif
-%endif
-%endif
     jmp m(iadst_8x4_internal).end3
 %endif
 %endmacro
 
-INV_TXFM_8X4_FN dct, dct,      0
-INV_TXFM_8X4_FN dct, adst,     0
-INV_TXFM_8X4_FN dct, flipadst, 0
-INV_TXFM_8X4_FN dct, identity, 3
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
 
 cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd        xm3, [o(pw_2896x8)]
@@ -1510,7 +1311,7 @@
     vpermq               m0, m2, q2031
     jmp m(iadst_8x4_internal).end2
 
-INV_TXFM_8X4_FN identity, dct,      7
+INV_TXFM_8X4_FN identity, dct
 INV_TXFM_8X4_FN identity, adst
 INV_TXFM_8X4_FN identity, flipadst
 INV_TXFM_8X4_FN identity, identity
@@ -1538,25 +1339,9 @@
     paddsw               m1, m3
     jmp m(iadst_8x4_internal).end
 
-%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x8
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm0, [o(pw_2896x8)]
-    pmulhrsw            xm0, [cq]
-    vpbroadcastd        xm1, [o(pw_16384)]
-    pmulhrsw            xm0, xm1
-    psrlw               xm1, 2 ; pw_4096
-    pmulhrsw            xm0, xm1
-    pshufb              xm0, [o(deint_shuf)]
-    vpermq               m3, m0, q1100
-    punpcklwd            m3, m3
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    jmp m(iadst_8x8_internal).end4
-%elif %3 >= 0
-%ifidn %1, dct
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x8
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     movd                xm2, [o(pw_16384)]
@@ -1576,33 +1361,13 @@
     dec                 r2d
     jg .loop
     RET
-%else ; identity
-    mova                 m0, [cq+32*0]
-    punpcklwd            m0, [cq+32*1]
-    mova                 m1, [cq+32*2]
-    punpcklwd            m1, [cq+32*3]
-    vpbroadcastd         m2, [o(pw_2896x8)]
-    vpbroadcastd         m3, [o(pw_2048)]
-    pxor                 m4, m4
-    mova          [cq+32*0], m4
-    mova          [cq+32*1], m4
-    mova          [cq+32*2], m4
-    mova          [cq+32*3], m4
-    punpckldq            m0, m1
-    vpermq               m1, m0, q3232
-    vpermq               m0, m0, q1010
-    punpcklwd            m0, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m0, m3
-    jmp m(inv_txfm_add_dct_dct_8x8).end
 %endif
-%endif
 %endmacro
 
-INV_TXFM_8X8_FN dct, dct,      0
-INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, dct
 INV_TXFM_8X8_FN dct, adst
 INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
 
 cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120 ; 0 1
@@ -1749,7 +1514,7 @@
     pmulhrsw             m0, m5, m4
     jmp m(iadst_8x8_internal).end3
 
-INV_TXFM_8X8_FN identity, dct,      7
+INV_TXFM_8X8_FN identity, dct
 INV_TXFM_8X8_FN identity, adst
 INV_TXFM_8X8_FN identity, flipadst
 INV_TXFM_8X8_FN identity, identity
@@ -1776,8 +1541,8 @@
     vpbroadcastd         m4, [o(pw_4096)]
     jmp m(iadst_8x8_internal).end
 
-%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x16
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x16
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -1791,66 +1556,6 @@
     vpbroadcastw         m0, xm0
     mov                 r2d, 4
     jmp m(inv_txfm_add_dct_dct_8x8).end2
-%elifidn %1_%2, dct_identity
-    WIN64_SPILL_XMM      13
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    pmulhrsw             m7, m0, [cq]
-    vpbroadcastd         m1, [o(pw_16384)]
-    vpbroadcastd         m2, [o(pw_1697x16)]
-    pxor                 m3, m3
-    mova               [cq], m3
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 3 ; pw_2048
-    pmulhrsw             m2, m7
-    paddsw               m7, m7
-    paddsw               m7, m2
-    pmulhrsw             m7, m1
-    punpcklwd            m5, m7, m7
-    punpckhwd            m7, m7
-    punpcklwd            m4, m5, m5
-    punpckhwd            m5, m5
-    punpcklwd            m6, m7, m7
-    punpckhwd            m7, m7
-    vpermq               m0, m4, q1100
-    vpermq               m1, m5, q1100
-    vpermq               m2, m6, q1100
-    vpermq               m3, m7, q1100
-    vpermq               m4, m4, q3322
-    vpermq               m5, m5, q3322
-    vpermq               m6, m6, q3322
-    vpermq               m7, m7, q3322
-    jmp m(idct_8x16_internal).end4
-%elifidn %1_%2, identity_dct
-    movd                xm0, [cq+32*0]
-    punpcklwd           xm0, [cq+32*1]
-    movd                xm2, [cq+32*2]
-    punpcklwd           xm2, [cq+32*3]
-    add                  cq, 32*4
-    movd                xm1, [cq+32*0]
-    punpcklwd           xm1, [cq+32*1]
-    movd                xm3, [cq+32*2]
-    punpcklwd           xm3, [cq+32*3]
-    vpbroadcastd        xm4, [o(pw_2896x8)]
-    vpbroadcastd        xm5, [o(pw_2048)]
-    xor                 eax, eax
-    mov           [cq-32*4], eax
-    mov           [cq-32*3], eax
-    mov           [cq-32*2], eax
-    mov           [cq-32*1], eax
-    punpckldq           xm0, xm2
-    punpckldq           xm1, xm3
-    punpcklqdq          xm0, xm1
-    pmulhrsw            xm0, xm4
-    pmulhrsw            xm0, xm4
-    pmulhrsw            xm0, xm5
-    mov           [cq+32*0], eax
-    mov           [cq+32*1], eax
-    mov           [cq+32*2], eax
-    mov           [cq+32*3], eax
-    vinserti128          m0, m0, xm0, 1
-    mov                 r2d, 4
-    jmp m(inv_txfm_add_dct_dct_8x8).end2
 %endif
 %endmacro
 
@@ -1867,10 +1572,10 @@
     pmulhrsw             m4,     [cq+32*0]
 %endmacro
 
-INV_TXFM_8X16_FN dct, dct,      0
-INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, dct
 INV_TXFM_8X16_FN dct, adst
 INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
 
 cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_8X16_LOAD_COEFS
@@ -1915,7 +1620,6 @@
 .end3:
     pxor                 m8, m8
     REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
-.end4:
     lea                  r3, [strideq*3]
     WRITE_8X4             0, 1, 8, 9
     lea                dstq, [dstq+strideq*4]
@@ -2120,7 +1824,7 @@
     pmulhrsw             m7, m9, m8
     jmp m(idct_8x16_internal).end3
 
-INV_TXFM_8X16_FN identity, dct,      7
+INV_TXFM_8X16_FN identity, dct
 INV_TXFM_8X16_FN identity, adst
 INV_TXFM_8X16_FN identity, flipadst
 INV_TXFM_8X16_FN identity, identity
@@ -2197,64 +1901,11 @@
     vextracti128  [dstq+%6], m%3, 1
 %endmacro
 
-%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x4
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm3, [o(pw_2896x8)]
-    pmulhrsw            xm3, [cq]
-    vpbroadcastd        xm0, [o(pw_16384)]
-    vpbroadcastd        xm1, [o(pw_1697x8)]
-    pmulhrsw            xm3, xm0
-    psrlw               xm0, 3 ; pw_2048
-    pmulhrsw            xm1, xm3
-    paddsw              xm3, xm1
-    pmulhrsw            xm3, xm0
-    punpcklwd           xm3, xm3
-    punpckldq           xm1, xm3, xm3
-    punpckhdq           xm3, xm3
-    vpbroadcastq         m0, xm1
-    vpermq               m1, m1, q1111
-    vpbroadcastq         m2, xm3
-    vpermq               m3, m3, q1111
-    jmp m(iadst_16x4_internal).end2
-%elifidn %1_%2, identity_dct
-    mova                xm0,     [cq+16*0]
-    mova                xm2,     [cq+16*1]
-    vinserti128          m0, m0, [cq+16*4], 1
-    vinserti128          m2, m2, [cq+16*5], 1
-    mova                xm1,     [cq+16*2]
-    mova                xm3,     [cq+16*3]
-    vinserti128          m1, m1, [cq+16*6], 1
-    vinserti128          m3, m3, [cq+16*7], 1
-    vpbroadcastd         m4, [o(pw_1697x16)]
-    vpbroadcastd         m5, [o(pw_16384)]
-    packusdw             m0, m2
-    packusdw             m1, m3
-    packusdw             m0, m1
-    vpbroadcastd         m1, [o(pw_2896x8)]
-    pmulhrsw             m4, m0
-    pmulhrsw             m4, m5
-    paddsw               m0, m4
-    psrlw                m5, 3 ; pw_2048
-    pmulhrsw             m0, m1
-    pmulhrsw             m0, m5
-    mov                 r3d, 2
-.end:
-    pxor                 m3, m3
-.end_loop:
-    mova          [cq+32*0], m3
-    mova          [cq+32*1], m3
-    add                  cq, 32*2
-    WRITE_16X2            0, 0, 1, 2, strideq*0, strideq*1
-    lea                dstq, [dstq+strideq*2]
-    dec                 r3d
-    jg .end_loop
-    RET
-%else
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x4
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
-%ifidn %2, dct
     movd                xm2, [o(pw_16384)]
     mov                [cq], eobd
     mov                 r2d, 2
@@ -2279,35 +1930,13 @@
     dec                 r2d
     jg .dconly_loop
     RET
-%else ; adst / flipadst
-    movd                xm2, [o(pw_16384)]
-    pmulhrsw            xm0, xm2
-    vpbroadcastw         m0, xm0
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    vpbroadcastd         m3, [o(pw_2048)]
-    mov                [cq], eobd
-    pmulhrsw             m3, m0
-%ifidn %2, adst
-    vpbroadcastq         m0, xm3
-    vpermq               m1, m3, q1111
-    vpermq               m2, m3, q2222
-    vpermq               m3, m3, q3333
-%else ; flipadst
-    vpermq               m0, m3, q3333
-    vpermq               m1, m3, q2222
-    vpermq               m2, m3, q1111
-    vpbroadcastq         m3, xm3
 %endif
-    jmp m(iadst_16x4_internal).end3
-%endif
-%endif
-%endif
 %endmacro
 
-INV_TXFM_16X4_FN dct, dct,      0
-INV_TXFM_16X4_FN dct, adst,     0
-INV_TXFM_16X4_FN dct, flipadst, 0
-INV_TXFM_16X4_FN dct, identity, 3
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
 
 cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                xm0, [cq+16*0]
@@ -2481,7 +2110,7 @@
     WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
     RET
 
-INV_TXFM_16X4_FN identity, dct,      15
+INV_TXFM_16X4_FN identity, dct
 INV_TXFM_16X4_FN identity, adst
 INV_TXFM_16X4_FN identity, flipadst
 INV_TXFM_16X4_FN identity, identity
@@ -2531,8 +2160,8 @@
     paddsw               m3, m7
     jmp m(iadst_16x4_internal).end
 
-%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x8
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x8
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -2541,59 +2170,6 @@
     pmulhrsw            xm0, xm1
     mov                 r2d, 4
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
-%elifidn %1_%2, dct_identity
-    WIN64_SPILL_XMM      13
-    vbroadcasti128       m7, [cq]
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    vpbroadcastd         m1, [o(pw_16384)]
-    pxor                xm2, xm2
-    mova               [cq], xm2
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 2 ; pw_4096
-    pmulhrsw             m7, m1
-    punpcklwd            m3, m7, m7
-    punpckhwd            m7, m7
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    pshufd               m4, m7, q0000
-    pshufd               m5, m7, q1111
-    pshufd               m6, m7, q2222
-    pshufd               m7, m7, q3333
-    lea                  r3, [strideq*3]
-    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
-    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
-    jmp m(idct_16x8_internal).end4
-%elifidn %1_%2, identity_dct
-    mova                 m0, [cq+32*0]
-    packusdw             m0, [cq+32*1]
-    mova                 m2, [cq+32*2]
-    packusdw             m2, [cq+32*3]
-    mova                 m1, [cq+32*4]
-    packusdw             m1, [cq+32*5]
-    mova                 m3, [cq+32*6]
-    packusdw             m3, [cq+32*7]
-    vpbroadcastd         m4, [o(pw_2896x8)]
-    vpbroadcastd         m5, [o(pw_1697x16)]
-    packusdw             m0, m2
-    packusdw             m1, m3
-    vpbroadcastd         m2, [o(pw_16384)]
-    packusdw             m0, m1
-    vpermq               m1, m0, q3322
-    vpermq               m0, m0, q1100
-    punpcklwd            m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m5, m0
-    pmulhrsw             m5, m2
-    paddsw               m0, m5
-    psrlw                m2, 3 ; pw_2048
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m2
-    mov                 r3d, 4
-    jmp m(inv_txfm_add_identity_dct_16x4).end
 %endif
 %endmacro
 
@@ -2611,10 +2187,10 @@
     REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
 %endmacro
 
-INV_TXFM_16X8_FN dct, dct,      0
-INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, dct
 INV_TXFM_16X8_FN dct, adst
 INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
 
 cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_16X8_LOAD_COEFS 3120
@@ -2837,7 +2413,7 @@
     WRITE_16X2            1, 2, 0, 1, strideq*2, r3
     jmp m(idct_16x8_internal).end3
 
-INV_TXFM_16X8_FN identity, dct,      15
+INV_TXFM_16X8_FN identity, dct
 INV_TXFM_16X8_FN identity, adst
 INV_TXFM_16X8_FN identity, flipadst
 INV_TXFM_16X8_FN identity, identity
@@ -2896,8 +2472,8 @@
 
 %define o_base pw_5 + 128
 
-%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x16
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x16
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -2905,72 +2481,6 @@
     mov                [cq], eobd
     mov                 r2d, 8
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
-%elifidn %1_%2, dct_identity
-    WIN64_SPILL_XMM       7
-    vpbroadcastd         m3, [o(pw_2896x8)]
-    pmulhrsw             m3, [cq]
-    vpbroadcastd         m0, [o(pw_8192)]
-    vpbroadcastd         m1, [o(pw_1697x16)]
-    vpbroadcastw         m4, [o(deint_shuf)] ; pb_0_1
-    pcmpeqb              m5, m5
-    pxor                 m6, m6
-    mova               [cq], m6
-    paddb                m5, m5 ; pb_m2
-    pmulhrsw             m3, m0
-    psrlw                m0, 2  ; pw_2048
-    IDTX16                3, 1, 1
-    pmulhrsw             m3, m0
-    mov                 r3d, 8
-.loop:
-    mova                xm1, [dstq]
-    vinserti128          m1, m1, [dstq+strideq*8], 1
-    pshufb               m0, m3, m4
-    psubb                m4, m5 ; += 2
-    punpckhbw            m2, m1, m6
-    punpcklbw            m1, m6
-    paddw                m2, m0
-    paddw                m1, m0
-    packuswb             m1, m2
-    mova             [dstq], xm1
-    vextracti128 [dstq+strideq*8], m1, 1
-    add                dstq, strideq
-    dec                 r3d
-    jg .loop
-    RET
-%elifidn %1_%2, identity_dct
-    movd                xm0,     [cq+32*0 ]
-    movd                xm2,     [cq+32*1 ]
-    movd                xm1,     [cq+32*2 ]
-    movd                xm3,     [cq+32*3 ]
-    vinserti128          m0, m0, [cq+32*8 ], 1
-    vinserti128          m2, m2, [cq+32*9 ], 1
-    vinserti128          m1, m1, [cq+32*10], 1
-    vinserti128          m3, m3, [cq+32*11], 1
-    punpcklwd            m0, m2
-    punpcklwd            m1, m3
-    punpckldq            m0, m1
-    movd                xm1,     [cq+32*4 ]
-    movd                xm3,     [cq+32*5 ]
-    movd                xm2,     [cq+32*6 ]
-    movd                xm4,     [cq+32*7 ]
-    vinserti128          m1, m1, [cq+32*12], 1
-    vinserti128          m3, m3, [cq+32*13], 1
-    vinserti128          m2, m2, [cq+32*14], 1
-    vinserti128          m4, m4, [cq+32*15], 1
-    punpcklwd            m1, m3
-    vpbroadcastd         m3, [o(pw_1697x16)]
-    punpcklwd            m2, m4
-    vpbroadcastd         m4, [o(pw_2896x8)]
-    punpckldq            m1, m2
-    vpbroadcastd         m2, [o(pw_2048)]
-    punpcklqdq           m0, m1
-    pmulhrsw             m3, m0
-    psraw                m3, 1
-    pavgw                m0, m3
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m2
-    mov                 r3d, 8
-    jmp m(inv_txfm_add_identity_dct_16x4).end
 %endif
 %endmacro
 
@@ -2995,10 +2505,10 @@
     mova              [rsp], m15
 %endmacro
 
-INV_TXFM_16X16_FN dct, dct,      0
-INV_TXFM_16X16_FN dct, identity, 15
+INV_TXFM_16X16_FN dct, dct
 INV_TXFM_16X16_FN dct, adst
 INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
 
 cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
@@ -3395,7 +2905,7 @@
     pavgw               m%1, m%2 ; signs are guaranteed to be equal
 %endmacro
 
-INV_TXFM_16X16_FN identity, dct,      15
+INV_TXFM_16X16_FN identity, dct
 INV_TXFM_16X16_FN identity, identity
 
 cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
@@ -3456,7 +2966,7 @@
     paddsw              m15, m1
     jmp m(idct_16x16_internal).end
 
-%define o_base iadst4_dconly2a + 128
+%define o_base deint_shuf + 128
 
 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
 %if %3
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -139,11 +139,6 @@
 pw_4085x8:      times 8 dw  4085*8
 pw_m301x8:      times 8 dw  -301*8
 
-iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
-iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
-iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
-
 SECTION .text
 
 %macro REPX 2-*
@@ -243,31 +238,24 @@
     paddsw               m0, m2                ;high: out1 ;low: out0
 %endmacro
 
-%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
-cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
-    %undef cmp
-    %define %%p1 m(i%1_%4_internal)
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2
+    %define %%p1 m(i%1_%3_internal)
 %if ARCH_X86_32
     LEA                    r5, $$
 %endif
 %if has_epilogue
-%if %3 > 0
-    cmp                  eobd, %3
-    jle %%end
-%elif %3 == 0
+%ifidn %1_%2, dct_dct
     test                 eobd, eobd
     jz %%end
 %endif
-    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
+    lea                  tx2q, [o(m(i%2_%3_internal).pass2)]
     call %%p1
     RET
 %%end:
 %else
-    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
-%if %3 > 0
-    cmp                  eobd, %3
-    jg %%p1
-%elif %3 == 0
+    lea                  tx2q, [o(m(i%2_%3_internal).pass2)]
+%ifidn %1_%2, dct_dct
     test                 eobd, eobd
     jnz %%p1
 %else
@@ -278,63 +266,26 @@
 %endif
 %endmacro
 
-%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x4, 6
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [coeffq]
-    pmulhrsw             m1, m0, [o(pw_1697x8)]
-    paddsw               m0, m1
-    punpcklwd            m0, m0
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    TAIL_CALL m(iadst_4x4_internal).end
-%elifidn %1_%2, identity_dct
-    mova                 m1, [coeffq+16*0]
-    mova                 m2, [coeffq+16*1]
-    punpcklwd            m0, m1, m2
-    punpckhwd            m1, m2
-    punpcklwd            m0, m1
-    punpcklqdq           m0, m0
-    pmulhrsw             m1, m0, [o(pw_1697x8)]
-    paddsw               m0, m1
-    pmulhrsw             m0, [o(pw_2896x8)]
-    mova                 m1, m0
-    TAIL_CALL m(iadst_4x4_internal).end
-%elif %3 >= 0
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
-%ifidn %1, dct
     mova                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
-%elifidn %1, adst
-    pmulhrsw             m0, [o(iadst4_dconly1a)]
-%elifidn %1, flipadst
-    pmulhrsw             m0, [o(iadst4_dconly1b)]
-%endif
     mov            [coeffq], eobd                ;0
-%ifidn %2, dct
-%ifnidn %1, dct
-    pmulhrsw             m0, [o(pw_2896x8)]
-%else
     pmulhrsw             m0, m1
-%endif
     mova                 m1, m0
     TAIL_CALL m(iadst_4x4_internal).end2
-%else ; adst / flipadst
-    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    TAIL_CALL m(i%2_4x4_internal).end2
 %endif
-%endif
 %endmacro
 
 INIT_XMM ssse3
 
-INV_TXFM_4X4_FN dct, dct,      0
-INV_TXFM_4X4_FN dct, adst,     0
-INV_TXFM_4X4_FN dct, flipadst, 0
-INV_TXFM_4X4_FN dct, identity, 3
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
 
 cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
@@ -358,9 +309,9 @@
 
     ITX4_END     0, 1, 3, 2
 
-INV_TXFM_4X4_FN adst, dct,      0
-INV_TXFM_4X4_FN adst, adst,     0
-INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
 INV_TXFM_4X4_FN adst, identity
 
 cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
@@ -410,9 +361,9 @@
     packssdw             m1, m2                    ;high: out3 ;low: out3
     ret
 
-INV_TXFM_4X4_FN flipadst, dct,      0
-INV_TXFM_4X4_FN flipadst, adst,     0
-INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
 INV_TXFM_4X4_FN flipadst, identity
 
 cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
@@ -436,7 +387,7 @@
 .end2:
     ITX4_END              3, 2, 1, 0
 
-INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, dct
 INV_TXFM_4X4_FN identity, adst
 INV_TXFM_4X4_FN identity, flipadst
 INV_TXFM_4X4_FN identity, identity
@@ -595,39 +546,9 @@
     punpckhdq            m3, m4                      ;low: in6 high: in7
 %endmacro
 
-%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x8, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m1, [o(pw_2896x8)]
-    pmulhrsw             m0, m1, [coeffq]
-    pmulhrsw             m0, m1
-    pmulhrsw             m0, [o(pw_4096)]
-    punpckhwd            m2, m0, m0
-    punpcklwd            m0, m0
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    punpckhdq            m3, m2, m2
-    punpckldq            m2, m2
-    TAIL_CALL m(iadst_4x8_internal).end3
-%elifidn %1_%2, identity_dct
-    movd                 m0, [coeffq+16*0]
-    punpcklwd            m0, [coeffq+16*1]
-    movd                 m1, [coeffq+16*2]
-    punpcklwd            m1, [coeffq+16*3]
-    mova                 m2, [o(pw_2896x8)]
-    punpckldq            m0, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m0, [o(pw_1697x8)]
-    paddsw               m0, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m0, [o(pw_2048)]
-    punpcklqdq           m0, m0
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    TAIL_CALL m(iadst_4x8_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
     mova                 m1, [o(pw_2896x8)]
@@ -639,32 +560,14 @@
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-    TAIL_CALL m(iadst_4x8_internal).end4
-%else ; adst_dct / flipadst_dct
-    pshuflw              m0, [coeffq], q0000
-    punpcklqdq           m0, m0
-    mova                 m1, [o(pw_2896x8)]
-    pmulhrsw             m0, m1
-%ifidn %1, adst
-    pmulhrsw             m0, [o(iadst4_dconly1a)]
-%else ; flipadst
-    pmulhrsw             m0, [o(iadst4_dconly1b)]
+    TAIL_CALL m(iadst_4x8_internal).end3
 %endif
-    mov            [coeffq], eobd
-    pmulhrsw             m0, m1
-    pmulhrsw             m0, [o(pw_2048)]
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    TAIL_CALL m(iadst_4x8_internal).end4
-%endif
-%endif
 %endmacro
 
-INV_TXFM_4X8_FN dct, dct,      0
-INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, dct
 INV_TXFM_4X8_FN dct, adst
 INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
 
 cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m3, [o(pw_2896x8)]
@@ -690,7 +593,7 @@
     ret
 
 
-INV_TXFM_4X8_FN adst, dct,      0
+INV_TXFM_4X8_FN adst, dct
 INV_TXFM_4X8_FN adst, adst
 INV_TXFM_4X8_FN adst, flipadst
 INV_TXFM_4X8_FN adst, identity
@@ -725,8 +628,6 @@
     pmulhrsw             m1, m4
     pmulhrsw             m2, m4
     pmulhrsw             m3, m4
-
-.end3:
     pxor                 m5, m5
     mova      [coeffq+16*0], m5
     mova      [coeffq+16*1], m5
@@ -733,7 +634,7 @@
     mova      [coeffq+16*2], m5
     mova      [coeffq+16*3], m5
 
-.end4:
+.end3:
     WRITE_4X8             0, 1, 2, 3
     RET
 
@@ -783,7 +684,7 @@
     packssdw             m2, m4                    ;low:  out4  high: -out5
     ret
 
-INV_TXFM_4X8_FN flipadst, dct,      0
+INV_TXFM_4X8_FN flipadst, dct
 INV_TXFM_4X8_FN flipadst, adst
 INV_TXFM_4X8_FN flipadst, flipadst
 INV_TXFM_4X8_FN flipadst, identity
@@ -824,7 +725,7 @@
     psubw                m4, m5
     jmp m(iadst_4x8_internal).end
 
-INV_TXFM_4X8_FN identity, dct,      3
+INV_TXFM_4X8_FN identity, dct
 INV_TXFM_4X8_FN identity, adst
 INV_TXFM_4X8_FN identity, flipadst
 INV_TXFM_4X8_FN identity, identity
@@ -881,50 +782,14 @@
     WRITE_8X2             %3, %4, %5, %6, %7
 %endmacro
 
-%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x4, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m1, m0, [coeffq]
-    pmulhrsw             m1, m0
-    pmulhrsw             m0, m1, [o(pw_1697x8)]
-    paddsw               m1, m0
-    pmulhrsw             m1, [o(pw_2048)]
-    punpcklwd            m1, m1
-    punpckhdq            m2, m1, m1
-    punpckldq            m1, m1
-    punpckhdq            m3, m2, m2
-    punpckldq            m2, m2
-    punpckldq            m0, m1, m1
-    punpckhdq            m1, m1
-%elifidn %1_%2, identity_dct
-    mova                 m0, [coeffq+16*0]
-    mova                 m1, [coeffq+16*1]
-    mova                 m2, [coeffq+16*2]
-    mova                 m3, [coeffq+16*3]
-    punpckhwd            m4, m0, m1
-    punpcklwd            m0, m1
-    punpckhwd            m5, m2, m3
-    punpcklwd            m2, m3
-    punpcklwd            m0, m4
-    punpcklwd            m2, m5
-    punpcklqdq           m0, m2
-    mova                 m4, [o(pw_2896x8)]
-    pmulhrsw             m0, m4
-    paddsw               m0, m0
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, [o(pw_2048)]
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-%else
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
     mova                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
     pmulhrsw             m0, m1
-%ifidn %2, dct
     mova                 m2, [o(pw_2048)]
     pmulhrsw             m0, m1
     pmulhrsw             m0, m2
@@ -931,34 +796,14 @@
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-%else ; adst / flipadst
-    pmulhrsw             m2, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    mova                 m1, [o(pw_2048)]
-    pmulhrsw             m0, m1
-    pmulhrsw             m2, m1
-%ifidn %2, adst
-    punpckhqdq           m1, m0, m0
-    punpcklqdq           m0, m0
-    punpckhqdq           m3, m2, m2
-    punpcklqdq           m2, m2
-%else ; flipadst
-    mova                 m3, m0
-    punpckhqdq           m0, m2, m2
-    punpcklqdq           m1, m2, m2
-    punpckhqdq           m2, m3, m3
-    punpcklqdq           m3, m3
-%endif
-%endif
-%endif
     TAIL_CALL m(iadst_8x4_internal).end2
 %endif
 %endmacro
 
-INV_TXFM_8X4_FN dct, dct,      0
-INV_TXFM_8X4_FN dct, adst,     0
-INV_TXFM_8X4_FN dct, flipadst, 0
-INV_TXFM_8X4_FN dct, identity, 3
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
 
 cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m3, [o(pw_2896x8)]
@@ -1157,7 +1002,7 @@
     mova                 m3, m4
     jmp m(iadst_8x4_internal).end
 
-INV_TXFM_8X4_FN identity, dct,      7
+INV_TXFM_8X4_FN identity, dct
 INV_TXFM_8X4_FN identity, adst
 INV_TXFM_8X4_FN identity, flipadst
 INV_TXFM_8X4_FN identity, identity
@@ -1199,30 +1044,9 @@
     paddsw               m3, m7
     jmp m(iadst_8x4_internal).end
 
-%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x8, 8, 16*4
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [coeffq]
-    mova                 m1, [o(pw_16384)]
-    pmulhrsw             m0, m1
-    psrlw                m1, 2
-    pmulhrsw             m0, m1
-    punpckhwd            m7, m0, m0
-    punpcklwd            m0, m0
-    pshufd               m3, m0, q3333
-    pshufd               m2, m0, q2222
-    pshufd               m1, m0, q1111
-    pshufd               m0, m0, q0000
-    call m(iadst_8x4_internal).end2
-    pshufd               m3, m7, q3333
-    pshufd               m2, m7, q2222
-    pshufd               m1, m7, q1111
-    pshufd               m0, m7, q0000
-    lea                dstq, [dstq+strideq*2]
-    TAIL_CALL m(iadst_8x4_internal).end3
-%elif %3 >= 0
-%ifidn %1, dct
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklwd            m0, m0
     mova                 m1, [o(pw_2896x8)]
@@ -1244,25 +1068,7 @@
     jmp                tx2q
 .end3:
     RET
-%else ; identity
-    mova                 m0, [coeffq+16*0]
-    mova                 m1, [coeffq+16*1]
-    mova                 m2, [coeffq+16*2]
-    mova                 m3, [coeffq+16*3]
-    punpcklwd            m0, [coeffq+16*4]
-    punpcklwd            m1, [coeffq+16*5]
-    punpcklwd            m2, [coeffq+16*6]
-    punpcklwd            m3, [coeffq+16*7]
-    punpcklwd            m0, m2
-    punpcklwd            m1, m3
-    punpcklwd            m0, m1
-    pmulhrsw             m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [o(pw_2048)]
-    pxor                 m4, m4
-    REPX {mova [coeffq+16*x], m4}, 0,  1,  2,  3,  4,  5,  6,  7
-    jmp m(inv_txfm_add_dct_dct_8x8).end
 %endif
-%endif
 %endmacro
 
 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
@@ -1298,10 +1104,10 @@
     ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
 %endmacro
 
-INV_TXFM_8X8_FN dct, dct,      0
-INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, dct
 INV_TXFM_8X8_FN dct, adst
 INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
 
 cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS          coeffq, 16
@@ -1610,7 +1416,7 @@
     mova    [rsp+gprsize+16*0], m7
     jmp m(idct_8x8_internal).end3
 
-INV_TXFM_8X8_FN identity, dct,      7
+INV_TXFM_8X8_FN identity, dct
 INV_TXFM_8X8_FN identity, adst
 INV_TXFM_8X8_FN identity, flipadst
 INV_TXFM_8X8_FN identity, identity
@@ -1634,58 +1440,9 @@
     jmp m(idct_8x8_internal).end3
 
 
-%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x16, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    mova                 m1, m0
-    pmulhrsw             m0, [coeffq+16*0]
-    pmulhrsw             m1, [coeffq+16*1]
-    mova                 m2, [o(pw_16384)]
-    mova                 m3, [o(pw_1697x16)]
-    mova                 m4, [o(pw_2048)]
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m2
-    pmulhrsw             m2, m3, m0
-    pmulhrsw             m3, m1
-    paddsw               m0, m0
-    paddsw               m1, m1
-    paddsw               m0, m2
-    paddsw               m1, m3
-    pmulhrsw             m0, m4
-    pmulhrsw             m4, m1
-    punpckhwd            m2, m0, m0
-    punpcklwd            m0, m0
-    punpckhwd            m6, m4, m4
-    punpcklwd            m4, m4
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    punpckhdq            m3, m2, m2
-    punpckldq            m2, m2
-    punpckhdq            m5, m4, m4
-    punpckldq            m4, m4
-    punpckhdq            m7, m6, m6
-    punpckldq            m6, m6
-    mova      [coeffq+16*4], m4
-    TAIL_CALL m(iadst_4x16_internal).end2
-%elifidn %1_%2, identity_dct
-    movd                  m0, [coeffq+32*0]
-    punpcklwd             m0, [coeffq+32*1]
-    movd                  m1, [coeffq+32*2]
-    punpcklwd             m1, [coeffq+32*3]
-    punpckldq             m0, m1
-    pmulhrsw              m1, m0, [o(pw_1697x8)]
-    pcmpeqw               m2, m2
-    pcmpeqw               m2, m0
-    pxor                  m0, m2
-    pavgw                 m0, m1
-    pmulhrsw              m0, [o(pw_2896x8)]
-    pmulhrsw              m0, [o(pw_2048)]
-    punpcklqdq            m0, m0
-    pxor                  m1, m1
-    REPX     {mova [coeffq+32*x], m1}, 0,  1,  2,  3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
     pshuflw               m0, [coeffq], q0000
     punpcklwd             m0, m0
     mova                  m1, [o(pw_2896x8)]
@@ -1694,21 +1451,6 @@
     pmulhrsw              m0, [o(pw_16384)]
     pmulhrsw              m0, m1
     pmulhrsw              m0, [o(pw_2048)]
-%else ; adst_dct / flipadst_dct
-    pshuflw               m0, [coeffq], q0000
-    punpcklwd             m0, m0
-%ifidn %1, adst
-    pmulhrsw              m0, [o(iadst4_dconly1a)]
-%else ; flipadst
-    pmulhrsw              m0, [o(iadst4_dconly1b)]
-%endif
-    mova                  m1, [o(pw_16384)]
-    mov             [coeffq], eobd
-    pmulhrsw              m0, m1
-    psrlw                 m1, 3                ; pw_2048
-    pmulhrsw              m0, [o(pw_2896x8)]
-    pmulhrsw              m0, m1
-%endif
 .end:
     WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
     lea                dstq, [dstq+strideq*4]
@@ -1721,10 +1463,10 @@
 %endif
 %endmacro
 
-INV_TXFM_4X16_FN dct, dct,      0
-INV_TXFM_4X16_FN dct, identity, 15
+INV_TXFM_4X16_FN dct, dct
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
 
 cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     lea                  r3, [o(m(idct_4x8_internal).pass1)]
@@ -1790,7 +1532,7 @@
     REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
     ret
 
-INV_TXFM_4X16_FN adst, dct,      0
+INV_TXFM_4X16_FN adst, dct
 INV_TXFM_4X16_FN adst, adst
 INV_TXFM_4X16_FN adst, flipadst
 INV_TXFM_4X16_FN adst, identity
@@ -1858,7 +1600,7 @@
     ret
 
 
-INV_TXFM_4X16_FN flipadst, dct,      0
+INV_TXFM_4X16_FN flipadst, dct
 INV_TXFM_4X16_FN flipadst, adst
 INV_TXFM_4X16_FN flipadst, flipadst
 INV_TXFM_4X16_FN flipadst, identity
@@ -1888,7 +1630,7 @@
     jmp   m(iadst_4x16_internal).end1
 
 
-INV_TXFM_4X16_FN identity, dct,      3
+INV_TXFM_4X16_FN identity, dct
 INV_TXFM_4X16_FN identity, adst
 INV_TXFM_4X16_FN identity, flipadst
 INV_TXFM_4X16_FN identity, identity
@@ -1964,68 +1706,11 @@
     jmp m(iadst_4x16_internal).end2
 
 
-%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x4, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m3, [o(pw_2896x8)]
-    pmulhrsw             m3, [coeffq]
-    mova                 m0, [o(pw_16384)]
-    pmulhrsw             m3, m0
-    psrlw                m0, 3                ; pw_2048
-    pmulhrsw             m1, m3, [o(pw_1697x8)]
-    paddsw               m3, m1
-    pmulhrsw             m3, m0
-    punpcklwd            m3, m3
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end2
-    add              coeffq, 16*4
-    mov                dstq, tx2q
-    TAIL_CALL m(iadst_8x4_internal).end2
-%elifidn %1_%2, identity_dct
-    mova                 m4, [o(pw_1697x16)]
-    mova                 m5, [o(pw_16384)]
-    mova                 m6, [o(pw_2896x8)]
-    mov                 r3d, 2
-    psrlw                m7, m5, 3 ; pw_2048
-.main_loop:
-    mova                 m0, [coeffq+16*0]
-    mova                 m1, [coeffq+16*1]
-    punpckhwd            m2, m0, m1
-    punpcklwd            m0, m1
-    punpcklwd            m0, m2
-    mova                 m1, [coeffq+16*2]
-    mova                 m2, [coeffq+16*3]
-    punpckhwd            m3, m1, m2
-    punpcklwd            m1, m2
-    punpcklwd            m1, m3
-    punpcklqdq           m0, m1
-    pmulhrsw             m1, m4, m0
-    pmulhrsw             m1, m5
-    paddsw               m0, m1
-    pmulhrsw             m0, m6
-    pmulhrsw             m0, m7
-.end:
-    pxor                 m3, m3
-    mova      [coeffq+16*0], m3
-    mova      [coeffq+16*1], m3
-    mova      [coeffq+16*2], m3
-    mova      [coeffq+16*3], m3
-    add              coeffq, 16*4
-    lea                tx2q, [dstq+8]
-    WRITE_8X4            0, 0, 0, 0, 1, 2, 3
-    mov                dstq, tx2q
-    dec                 r3d
-    jg .main_loop
-    RET
-%else
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
     movd                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1, [coeffq]
-%ifidn %2, dct
     movd                 m2, [o(pw_16384)]
     mov            [coeffq], eobd
     mov                 r2d, 2
@@ -2059,36 +1744,7 @@
     jmp                tx2q
 .end:
     RET
-%else ; adst / flipadst
-    movd                 m2, [o(pw_16384)]
-    pmulhrsw             m0, m2
-    pshuflw              m0, m0, q0000
-    punpcklwd            m0, m0
-    mov            [coeffq], eobd
-    pmulhrsw             m2, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    mova                 m1, [o(pw_2048)]
-    pmulhrsw             m0, m1
-    pmulhrsw             m2, m1
-%ifidn %2, adst
-    punpckhqdq           m1, m0, m0
-    punpcklqdq           m0, m0
-    punpckhqdq           m3, m2, m2
-    punpcklqdq           m2, m2
-%else ; flipadst
-    mova                 m3, m0
-    punpckhqdq           m0, m2, m2
-    punpcklqdq           m1, m2, m2
-    punpckhqdq           m2, m3, m3
-    punpcklqdq           m3, m3
 %endif
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end3
-    mov                dstq, tx2q
-    TAIL_CALL m(iadst_8x4_internal).end3
-%endif
-%endif
-%endif
 %endmacro
 
 %macro LOAD_7ROWS 2 ;src, stride
@@ -2144,10 +1800,10 @@
     punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
 %endmacro
 
-INV_TXFM_16X4_FN dct, dct,      0
-INV_TXFM_16X4_FN dct, adst,     0
-INV_TXFM_16X4_FN dct, flipadst, 0
-INV_TXFM_16X4_FN dct, identity, 3
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
 
 cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_7ROWS        coeffq, 16
@@ -2464,7 +2120,7 @@
     jmp   m(idct_16x4_internal).pass2_end
 
 
-INV_TXFM_16X4_FN identity, dct,      15
+INV_TXFM_16X4_FN identity, dct
 INV_TXFM_16X4_FN identity, adst
 INV_TXFM_16X4_FN identity, flipadst
 INV_TXFM_16X4_FN identity, identity
@@ -2537,8 +2193,8 @@
     mova                 [%1+%2*7], m7
 %endmacro
 
-%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x16, 8, 16*16
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x16, 8, 16*16
 %ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklwd            m0, m0
@@ -2556,78 +2212,13 @@
     jmp m(inv_txfm_add_dct_dct_8x8).loop
 .end:
     RET
-%elifidn %1_%2, dct_identity
-    mov                 r3d, 2
-.loop:
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m7, m0, [coeffq]
-    mova                 m1, [o(pw_16384)]
-    pxor                 m2, m2
-    mova           [coeffq], m2
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 3          ; pw_2048
-    pmulhrsw             m0, m7, [o(pw_1697x16)]
-    paddsw               m7, m7
-    paddsw               m7, m0
-    pmulhrsw             m7, m1
-    punpcklwd            m0, m7, m7
-    punpckhwd            m7, m7
-    pshufd               m3, m0, q3333
-    pshufd               m2, m0, q2222
-    pshufd               m1, m0, q1111
-    pshufd               m0, m0, q0000
-    call m(iadst_8x4_internal).end3
-    pshufd               m3, m7, q3333
-    pshufd               m2, m7, q2222
-    pshufd               m1, m7, q1111
-    pshufd               m0, m7, q0000
-    lea                dstq, [dstq+strideq*2]
-    call m(iadst_8x4_internal).end3
-
-    add              coeffq, 16
-    lea                dstq, [dstq+strideq*2]
-    dec                 r3d
-    jg .loop
-    RET
-%elifidn %1_%2, identity_dct
-    movd                 m0, [coeffq+32*0]
-    punpcklwd            m0, [coeffq+32*1]
-    movd                 m2, [coeffq+32*2]
-    punpcklwd            m2, [coeffq+32*3]
-    add              coeffq, 32*4
-    movd                 m1, [coeffq+32*0]
-    punpcklwd            m1, [coeffq+32*1]
-    movd                 m3, [coeffq+32*2]
-    punpcklwd            m3, [coeffq+32*3]
-    mova                 m4, [o(pw_2896x8)]
-    xor                eobd, eobd
-    mov       [coeffq-32*4], eobd
-    mov       [coeffq-32*3], eobd
-    mov       [coeffq-32*2], eobd
-    mov       [coeffq-32*1], eobd
-    punpckldq            m0, m2
-    punpckldq            m1, m3
-    punpcklqdq           m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, [o(pw_2048)]
-    mov       [coeffq+32*0], eobd
-    mov       [coeffq+32*1], eobd
-    mov       [coeffq+32*2], eobd
-    mov       [coeffq+32*3], eobd
-    mov                 r3d, 4
-    lea                tx2q, [o(m(inv_txfm_add_identity_dct_8x16).end)]
-    jmp m(inv_txfm_add_dct_dct_8x8).loop
-.end:
-    RET
 %endif
 %endmacro
 
-INV_TXFM_8X16_FN dct, dct,      0
-INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, dct
 INV_TXFM_8X16_FN dct, adst
 INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
 
 cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     lea                    r3, [o(m(idct_8x8_internal).pass1)]
@@ -2790,7 +2381,7 @@
     jmp  m(iflipadst_8x8_internal).end
 
 
-INV_TXFM_8X16_FN identity, dct,      7
+INV_TXFM_8X16_FN identity, dct
 INV_TXFM_8X16_FN identity, adst
 INV_TXFM_8X16_FN identity, flipadst
 INV_TXFM_8X16_FN identity, identity
@@ -2837,8 +2428,8 @@
     jmp .end
 
 
-%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x8, 8, 16*16
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x8, 8, 16*16
 %ifidn %1_%2, dct_dct
     movd                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1, [coeffq]
@@ -2850,83 +2441,13 @@
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
 .end:
     RET
-%elifidn %1_%2, dct_identity
-    mova                 m7, [coeffq]
-    mova                 m0, [o(pw_2896x8)]
-    mova                 m1, [o(pw_16384)]
-    pxor                 m2, m2
-    mova           [coeffq], m2
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 2               ; pw_4096
-    pmulhrsw             m7, m1
-    punpcklwd            m3, m7, m7
-    punpckhwd            m7, m7
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    lea                  r3, [dstq+strideq*4]
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end2
-    add              coeffq, 16*4
-    mov                dstq, tx2q
-    call m(iadst_8x4_internal).end2
-    mov                dstq, r3
-    add              coeffq, 16*4
-    pshufd               m0, m7, q0000
-    pshufd               m1, m7, q1111
-    pshufd               m2, m7, q2222
-    pshufd               m3, m7, q3333
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end2
-    add              coeffq, 16*4
-    mov                dstq, tx2q
-    TAIL_CALL m(iadst_8x4_internal).end2
-%elifidn %1_%2, identity_dct
-    mova                 m4, [o(pw_2896x8)]
-    mova                 m5, [o(pw_1697x16)]
-    mova                 m6, [o(pw_16384)]
-    psrlw                m7, m6, 3 ; pw_2048
-    mov                 r3d, 2
-.main_loop:
-    mova                 m0, [coeffq+16*0]
-    punpcklwd            m0, [coeffq+16*1]
-    mova                 m1, [coeffq+16*2]
-    punpcklwd            m1, [coeffq+16*3]
-    punpckldq            m0, m1
-    mova                 m1, [coeffq+16*4]
-    punpcklwd            m1, [coeffq+16*5]
-    mova                 m2, [coeffq+16*6]
-    punpcklwd            m2, [coeffq+16*7]
-    punpckldq            m1, m2
-    punpcklqdq           m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m1, m5, m0
-    pmulhrsw             m1, m6
-    paddsw               m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m7
-.end:
-    pxor                 m1, m1
-    REPX {mova [coeffq+16*x], m1}, 0, 1, 2, 3, 4, 5, 6, 7
-    add              coeffq, 16*8
-    lea                tx2q, [dstq+8]
-    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
-    lea                dstq, [dstq+strideq*2]
-    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
-    mov                dstq, tx2q
-    dec                 r3d
-    jg .main_loop
-    RET
 %endif
 %endmacro
 
-INV_TXFM_16X8_FN dct, dct,      0
-INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, dct
 INV_TXFM_16X8_FN dct, adst
 INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
 
 cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    coeffq+16*0, 32, 1
@@ -3382,7 +2903,7 @@
     jmp m(iflipadst_8x8_internal).pass2_main
 
 
-INV_TXFM_16X8_FN identity, dct,      15
+INV_TXFM_16X8_FN identity, dct
 INV_TXFM_16X8_FN identity, adst
 INV_TXFM_16X8_FN identity, flipadst
 INV_TXFM_16X8_FN identity, identity
@@ -3463,8 +2984,8 @@
     jmp  m(iidentity_8x8_internal).end
 
 
-%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x16, 8, 16*16
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x16, 8, 16*16
 %ifidn %1_%2, dct_dct
     movd                   m1, [o(pw_2896x8)]
     pmulhrsw               m0, m1, [coeffq]
@@ -3475,104 +2996,13 @@
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
 .end:
     RET
-%elifidn %1_%2, dct_identity
-    mova                   m3, [o(pw_2896x8)]
-    pmulhrsw               m2, m3, [coeffq+16*0]
-    pmulhrsw               m3, [coeffq+16*1]
-    mova                   m0, [o(pw_8192)]
-    mova                   m1, [o(pw_1697x16)]
-    pshuflw                m4, [o(deint_shuf)], q0000 ;pb_0_1
-    punpcklwd              m4, m4
-    pcmpeqb                m5, m5
-    pxor                   m6, m6
-    mova        [coeffq+16*0], m6
-    mova        [coeffq+16*1], m6
-    paddb                  m5, m5                     ;pb_m2
-    pmulhrsw               m2, m0
-    pmulhrsw               m3, m0
-    psrlw                  m0, 2                      ;pw_2048
-    pmulhrsw               m7, m1, m2
-    pmulhrsw               m1, m3
-    paddsw                 m2, m2
-    paddsw                 m3, m3
-    paddsw                 m2, m7
-    paddsw                 m3, m1
-    pmulhrsw               m2, m0
-    pmulhrsw               m3, m0
-    mov                   r3d, 8
-.loop:
-    mova                   m1, [dstq]
-    pshufb                 m0, m2, m4
-    punpckhbw              m7, m1, m6
-    punpcklbw              m1, m6
-    paddw                  m7, m0
-    paddw                  m1, m0
-    packuswb               m1, m7
-    mova               [dstq], m1
-    mova                   m1, [dstq+strideq*8]
-    pshufb                 m0, m3, m4
-    psubb                  m4, m5 ; += 2
-    punpckhbw              m7, m1, m6
-    punpcklbw              m1, m6
-    paddw                  m7, m0
-    paddw                  m1, m0
-    packuswb               m1, m7
-    mova     [dstq+strideq*8], m1
-    add                  dstq, strideq
-    dec                   r3d
-    jg .loop
-    RET
-%elifidn %1_%2, identity_dct
-    mova                   m4, [o(pw_1697x16)]
-    mova                   m5, [o(pw_2896x8)]
-    mova                   m6, [o(pw_2048)]
-    xor                  eobd, eobd
-    lea                  tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end)]
-    lea                    r3, [dstq+8]
-    mov            [rsp+16*0], r3
-.main:
-    movd                   m0, [coeffq+32*0]
-    punpcklwd              m0, [coeffq+32*1]
-    movd                   m1, [coeffq+32*2]
-    punpcklwd              m1, [coeffq+32*3]
-    add                coeffq, 32*4
-    punpckldq              m0, m1
-    movd                   m1, [coeffq+32*0]
-    punpcklwd              m1, [coeffq+32*1]
-    movd                   m2, [coeffq+32*2]
-    punpcklwd              m2, [coeffq+32*3]
-    xor                  eobd, eobd
-    mov         [coeffq-32*4], eobd
-    mov         [coeffq-32*3], eobd
-    mov         [coeffq-32*2], eobd
-    mov         [coeffq-32*1], eobd
-    punpckldq              m1, m2
-    punpcklqdq             m0, m1
-    pmulhrsw               m1, m4, m0
-    psraw                  m1, 1
-    pavgw                  m0, m1
-    pmulhrsw               m0, m5
-    pmulhrsw               m0, m6
-    mov         [coeffq+32*0], eobd
-    mov         [coeffq+32*1], eobd
-    mov         [coeffq+32*2], eobd
-    mov         [coeffq+32*3], eobd
-    mov                   r3d, 4
-    jmp m(inv_txfm_add_dct_dct_8x8).loop
-.end:
-    lea                  tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end1)]
-    add                coeffq, 32*4
-    mov                  dstq, [rsp+16*0]
-    jmp .main
-.end1:
-    RET
 %endif
 %endmacro
 
-INV_TXFM_16X16_FN dct, dct,      0
-INV_TXFM_16X16_FN dct, identity, 15
+INV_TXFM_16X16_FN dct, dct      ; args: type1, type2; this pair matches the macro's "%ifidn %1_%2, dct_dct" branch
 INV_TXFM_16X16_FN dct, adst
 INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity ; NOTE(review): the visible macro body special-cases only dct_dct, so this pair should get just the INV_TXFM_FN expansion -- confirm
 
 cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*1, 64
@@ -3865,7 +3295,7 @@
     pavgw               m%1, m%2
 %endmacro
 
-INV_TXFM_16X16_FN identity, dct,      15
+INV_TXFM_16X16_FN identity, dct ; args: type1, type2; NOTE(review): the visible macro body special-cases only dct_dct, not this pair -- confirm
 INV_TXFM_16X16_FN identity, identity
 
 cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2