ref: 4ec4605bd06d42d057ea5014c08fd09b388c2d97
parent: 58caeed2fff90b0a93fc2b8ea2708c3f50485e22
author: Henrik Gramner <gramner@twoorioles.com>
date: Tue Oct 2 14:59:21 EDT 2018

x86-64: Inverse transforms AVX2

--- a/src/itx.c
+++ b/src/itx.c
@@ -226,4 +226,8 @@
     assign_itx_all_fn64(64, 16, R);
     assign_itx_all_fn64(64, 32, R);
     assign_itx_all_fn64(64, 64, );
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_itx_dsp_init_x86)(c);
+#endif
 }
--- a/src/itx.h
+++ b/src/itx.h
@@ -43,4 +43,7 @@
 void dav1d_itx_dsp_init_8bpc(Dav1dInvTxfmDSPContext *c);
 void dav1d_itx_dsp_init_10bpc(Dav1dInvTxfmDSPContext *c);
 
+void dav1d_itx_dsp_init_x86_8bpc(Dav1dInvTxfmDSPContext *c);
+void dav1d_itx_dsp_init_x86_10bpc(Dav1dInvTxfmDSPContext *c);
+
 #endif /* __DAV1D_SRC_ITX_H__ */
--- a/src/meson.build
+++ b/src/meson.build
@@ -98,6 +98,7 @@
         )
 
         libdav1d_tmpl_sources += files(
+            'x86/itx_init.c',
             'x86/mc_init.c',
         )
 
@@ -104,6 +105,7 @@
         # NASM source files
         libdav1d_sources_asm = files(
             'x86/cpuid.asm',
+            'x86/itx.asm',
             'x86/mc.asm',
         )
 
--- /dev/null
+++ b/src/x86/itx.asm
@@ -1,0 +1,5828 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64 && UNIX64 ; Fixme: Windows
+
+SECTION_RODATA 32
+
+; Note: The order of (at least some of) those constants matters!
+
+iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
+iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
+iadst4_dconly1a: dw 10568, 19856, 26752, 30424
+iadst4_dconly1b: dw 30424, 26752, 19856, 10568
+
+deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
+
+%macro COEF_PAIR 2
+pw_%1_%2:  dw  %1, %2
+pw_m%2_%1: dw -%2, %1
+%endmacro
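+; e.g. "COEF_PAIR 1567, 3784" expands to:
+;     pw_1567_3784:  dw  1567, 3784
+;     pw_m3784_1567: dw -3784, 1567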
+
+; ADST-only
+pw_3803_1321:   dw  3803,  1321
+pw_m1321_2482:  dw -1321,  2482
+pw_2482_3344:   dw  2482,  3344
+pw_m3803_3344:  dw -3803,  3344
+pw_m3803_m6688: dw -3803, -6688
+%define pw_3344x8 iadst4_dconly2b
+
+pw_5:      times 2 dw 5
+pw_2048:   times 2 dw 2048
+pw_4096:   times 2 dw 4096
+pw_8192:   times 2 dw 8192
+pw_16384:  times 2 dw 16384
+pw_2896x8: times 2 dw 2896*8
+pw_5793x4: times 2 dw 5793*4
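+
+; Note: pmulhrsw computes (x*y + 0x4000) >> 15, so multiplying by a
+; coefficient stored as coef*8 yields (x*coef + 2048) >> 12, i.e. the usual
+; 12-bit rounding without a separate add/shift step.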
+
+pd_2048: dd 2048
+
+COEF_PAIR 1567, 3784
+COEF_PAIR 3784, 1567
+COEF_PAIR  201, 4091
+COEF_PAIR  995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4052,  601
+COEF_PAIR  401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR  799, 4017
+COEF_PAIR 3406, 2276
+pw_m799_m4017:  dw  -799, -4017
+pw_m1567_m3784: dw -1567, -3784
+pw_m3406_m2276: dw -3406, -2276
+pw_m401_m4076:  dw  -401, -4076
+pw_m3166_m2598: dw -3166, -2598
+pw_m1931_m3612: dw -1931, -3612
+pw_m3920_m1189: dw -3920, -1189
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017,  799
+
+%macro COEF_X8 1-*
+%rep %0
+    dw %1*8, %1*8
+    %rotate 1
+%endrep
+%endmacro
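+; e.g. "COEF_X8 3703" emits "dw 3703*8, 3703*8": one coefficient pre-scaled
+; by 8 and duplicated so it can be fetched with a single vpbroadcastd.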
+
+pw_3703x8:  COEF_X8  3703
+pw_1751x8:  COEF_X8  1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8:  COEF_X8  3857
+pw_3973x8:  COEF_X8  3973
+pw_995x8:   COEF_X8   995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8:  COEF_X8  3513
+pw_3290x8:  COEF_X8  3290
+pw_2440x8:  COEF_X8  2440
+pw_m601x8:  COEF_X8  -601
+pw_4052x8:  COEF_X8  4052
+
+idct64_mul: COEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
+            COEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
+            COEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
+            COEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301
+
+%define o_idct64_offset idct64_mul - (o_base) - 8
+
+SECTION .text
+
+; Callee-saved registers have to be handled explicitly when jumping between
+; different functions, since RET can't automatically deal with them.
+ASSERT ARCH_X86_64 && WIN64 == 0
+
+; Code size reduction trickery: Instead of using rip-relative loads with
+; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
+; single rip-relative lea and then address things relative to it with
+; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
+%define o_base iadst4_dconly2a + 128
+%define o(x) (rax - (o_base) + (x))
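+; e.g. after "lea rax, [o_base]", "vpbroadcastd m0, [o(pw_2048)]" assembles to
+; a load from [rax + pw_2048 - o_base], which fits in a 1-byte displacement.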
+
+%macro REPX 2-*
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)
+%endrep
+%endmacro
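+; e.g. "REPX {pmulhrsw x, m5}, m0, m1, m2" expands to three pmulhrsw
+; instructions, one per listed register.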
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
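+; Butterfly rotation on packed input: each dword of dst/src holds a (lo, hi)
+; word pair, and the result packs (lo*coef1 + hi*coef2 + rnd) >> 12 into the
+; low half of each lane and (hi*coef1 - lo*coef2 + rnd) >> 12 into the high
+; half. The swap flag exchanges the two halves, the interleave flag keeps each
+; result pair side by side within its dword instead of packing the two sets
+; into separate halves, and coef_regs takes the coefficients from registers
+; rather than memory.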
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+%if %7 & 4
+    pmaddwd             m%2, m%5, m%1
+    pmaddwd             m%1, m%6
+%else
+%if %7 & 1
+    vpbroadcastd        m%2, [o(pw_%5_%6)]
+    vpbroadcastd        m%3, [o(pw_m%6_%5)]
+%else
+    vpbroadcastd        m%2, [o(pw_m%6_%5)]
+    vpbroadcastd        m%3, [o(pw_%5_%6)]
+%endif
+    pmaddwd             m%2, m%1
+    pmaddwd             m%1, m%3
+%endif
+    paddd               m%2, m%4
+    paddd               m%1, m%4
+%if %7 & 2
+    pslld               m%2, 4
+    psrld               m%1, 12
+    pblendw             m%1, m%2, 0xaa
+%else
+    psrad               m%2, 12
+    psrad               m%1, 12
+    packssdw            m%1, m%2
+%endif
+%endmacro
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
+%if %10 & 1
+    vpbroadcastd        m%3, [o(pw_%8_%9)]
+    vpbroadcastd        m%4, [o(pw_m%9_%8)]
+    vpbroadcastd       xm%2, [o(pw_%6_%7)]
+    vpblendd            m%2, m%2, m%3, 0xf0
+    vpbroadcastd       xm%3, [o(pw_m%7_%6)]
+%else
+    vpbroadcastd        m%3, [o(pw_m%9_%8)]
+    vpbroadcastd        m%4, [o(pw_%8_%9)]
+    vpbroadcastd       xm%2, [o(pw_m%7_%6)]
+    vpblendd            m%2, m%2, m%3, 0xf0
+    vpbroadcastd       xm%3, [o(pw_%6_%7)]
+%endif
+    vpblendd            m%3, m%3, m%4, 0xf0
+    ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
+    punpckhwd           m%3, m%2, m%1
+    punpcklwd           m%2, m%1
+%if %7 < 32
+    pmaddwd             m%1, m%7, m%2
+    pmaddwd             m%4, m%7, m%3
+%else
+    vpbroadcastd        m%1, [o(pw_m%7_%6)]
+    pmaddwd             m%4, m%3, m%1
+    pmaddwd             m%1, m%2
+%endif
+    paddd               m%4, m%5
+    paddd               m%1, m%5
+    psrad               m%4, 12
+    psrad               m%1, 12
+    packssdw            m%1, m%4
+%if %7 < 32
+    pmaddwd             m%3, m%6
+    pmaddwd             m%2, m%6
+%else
+    vpbroadcastd        m%4, [o(pw_%6_%7)]
+    pmaddwd             m%3, m%4
+    pmaddwd             m%2, m%4
+%endif
+    paddd               m%3, m%5
+    paddd               m%2, m%5
+    psrad               m%3, 12
+    psrad               m%2, 12
+    packssdw            m%2, m%3
+%endmacro
+
+%macro ITX_MULHRSW_SHL3 4 ; dst/src, tmp, coef[1-2]
+    vpbroadcastd        m%2, [pw_%3_%4]
+    psllw               m%2, 3
+    pmulhrsw            m%1, m%2
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
+    vpbroadcastd        m%6, [o(pw_2896x8)]
+    paddw               m%5, m%1, m%3
+    psubw               m%1, m%3
+    pmulhrsw            m%1, m%6 ; t1
+    pmulhrsw            m%5, m%6 ; t0
+    psubw               m%3, m%1, m%2
+    paddw               m%2, m%1
+    paddw               m%1, m%5, m%4
+    psubw               m%4, m%5, m%4
+%endmacro
+
+%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
+    ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
+    ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
+    paddw               m%9, m%2, m%6  ; t4
+    psubw               m%2, m%6       ; t5a
+    paddw              m%10, m%8, m%4  ; t7
+    psubw               m%8, m%4       ; t6a
+    vpbroadcastd        m%4, [o(pw_2896x8)]
+    psubw               m%6, m%1, m%5
+    paddw               m%1, m%5
+    psubw               m%5, m%8, m%2
+    paddw               m%8, m%2
+    pmulhrsw            m%1, m%4       ; t0
+    pmulhrsw            m%6, m%4       ; t1
+    pmulhrsw            m%8, m%4       ; t6
+    pmulhrsw            m%5, m%4       ; t5
+    psubw               m%4, m%1, m%7  ; dct4 out3
+    paddw               m%1, m%7       ; dct4 out0
+    paddw               m%7, m%6, m%3  ; dct4 out1
+    psubw               m%6, m%3       ; dct4 out2
+    paddw               m%2, m%7, m%8  ; out1
+    psubw               m%7, m%8       ; out6
+    psubw               m%8, m%1, m%10 ; out7
+    paddw               m%1, m%10      ; out0
+    paddw               m%3, m%6, m%5  ; out2
+    psubw               m%6, m%5       ; out5
+    psubw               m%5, m%4, m%9  ; out4
+    paddw               m%4, m%9       ; out3
+%endmacro
+
+; in1 = %1, in3  = %2, in5  = %3, in7  = %4
+; in9 = %5, in11 = %6, in13 = %7, in15 = %8
+%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
+    ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
+    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
+    ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
+    psubw               m%9, m%2, m%6 ; t13
+    paddw               m%6, m%2      ; t12
+    psubw               m%2, m%8, m%4 ; t14
+    paddw               m%8, m%4      ; t15
+    psubw               m%4, m%7, m%3 ; t10
+    paddw               m%3, m%7      ; t11
+    psubw               m%7, m%1, m%5 ; t9
+    paddw               m%1, m%5      ; t8
+    ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
+    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
+    vpbroadcastd       m%10, [o(pw_2896x8)]
+    psubw               m%5, m%2, m%9 ; t10
+    paddw               m%2, m%9      ; t9
+    psubw               m%9, m%1, m%3 ; t11a
+    paddw               m%1, m%3      ; t8a
+    psubw               m%3, m%7, m%4 ; t13
+    paddw               m%7, m%4      ; t14
+    psubw               m%4, m%8, m%6 ; t12a
+    paddw               m%8, m%6      ; t15a
+    paddw               m%6, m%3, m%5 ; t13a
+    psubw               m%3, m%5      ; t10a
+    paddw               m%5, m%4, m%9 ; t12
+    psubw               m%4, m%9      ; t11
+    REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
+%endmacro
+
+%macro WRAP_XMM 1+
+    INIT_XMM cpuname
+    %1
+    INIT_YMM cpuname
+%endmacro
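+; e.g. "WRAP_XMM IDCT8_1D_PACKED" temporarily switches to 128-bit registers so
+; a packed 1D macro can be reused on xmm data, then restores ymm mode.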
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+    vpbroadcastd         m2, [o(pw_%5)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+%endif
+    lea                  r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+    %if %1 & 2
+        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
+    %else
+        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+    %endif
+    %assign %%i %%i + 1
+    %rotate 1
+%endrep
+    movd                 m2, [%%row_adr1]
+    pinsrd               m2, [%%row_adr2], 1
+    movd                 m3, [%%row_adr3]
+    pinsrd               m3, [%%row_adr4], 1
+    pmovzxbw             m2, m2
+    pmovzxbw             m3, m3
+    paddw                m0, m2
+    paddw                m1, m3
+    packuswb             m0, m1
+    movd       [%%row_adr1], m0
+    pextrd     [%%row_adr2], m0, 1
+    pextrd     [%%row_adr3], m0, 2
+    pextrd     [%%row_adr4], m0, 3
+    ret
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+    punpckhqdq           m3, m0, m1 ; in1 in3
+    punpcklqdq           m0, m1     ; in0 in2
+    psubw                m2, m0, m3
+    paddw                m0, m3
+    punpckhqdq           m2, m2     ; t2 t2
+    punpcklqdq           m0, m0     ; t0 t0
+    psubw                m1, m0, m2
+    psraw                m1, 1
+    psubw                m1, m3     ; t1 t3
+    psubw                m0, m1     ; ____ out0
+    paddw                m2, m1     ; out3 ____
+%endmacro
+
+INIT_XMM avx2
+cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+    psraw                m0, 2
+    psraw                m1, 2
+    IWHT4_1D_PACKED
+    punpckhwd            m0, m1
+    punpcklwd            m3, m1, m2
+    punpckhdq            m1, m0, m3
+    punpckldq            m0, m3
+    IWHT4_1D_PACKED
+    vpblendd             m0, m0, m2, 0x03
+    ITX4_END              3, 0, 2, 1, 0
+
+%macro INV_TXFM_FN 5 ; type1, type2, fast_thresh, size, num_mmregs
+cglobal inv_txfm_add_%1_%2_%4, 4, 5, %5, dst, stride, c, eob, tx2
+    %undef cmp
+    %define %%p1 m(i%1_%4_internal)
+    lea                 rax, [o_base]
+    ; Jump to the 1st txfm function if we're not taking the fast path, which
+    ; in turn performs an indirect jump to the 2nd txfm function.
+    lea tx2q, [m(i%2_%4_internal).pass2]
+%if %3 > 0
+    cmp                eobd, %3
+    jg %%p1
+%elif %3 == 0
+    test               eobd, eobd
+    jnz %%p1
+%else
+    ; jump to the 1st txfm function unless it's located directly after this
+    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
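+; The code that follows an INV_TXFM_FN invocation (in the per-size macros
+; below) is the fall-through fast path taken when eob is at or below the
+; threshold, typically a dc-only or single-line shortcut.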
+%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 4x4, 6
+%ifidn %1_%2, dct_identity
+    vpbroadcastd         m0, [o(pw_2896x8)]
+    pmulhrsw             m0, [cq]
+    vpbroadcastd         m1, [o(pw_5793x4)]
+    paddw                m0, m0
+    pmulhrsw             m0, m1
+    punpcklwd            m0, m0
+    punpckhdq            m1, m0, m0
+    punpckldq            m0, m0
+    jmp m(iadst_4x4_internal).end
+%elifidn %1_%2, identity_dct
+    mova                 m0, [cq+16*0]
+    packusdw             m0, [cq+16*1]
+    vpbroadcastd         m2, [o(pw_5793x4)]
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    packusdw             m0, m0
+    paddw                m0, m0
+    pmulhrsw             m0, m2
+    pmulhrsw             m0, m3
+    mova                 m1, m0
+    jmp m(iadst_4x4_internal).end
+%elif %3 >= 0
+    vpbroadcastw         m0, [cq]
+%ifidn %1, dct
+    vpbroadcastd         m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+%elifidn %1, adst
+    movddup              m1, [o(iadst4_dconly1a)]
+    pmulhrsw             m0, m1
+%elifidn %1, flipadst
+    movddup              m1, [o(iadst4_dconly1b)]
+    pmulhrsw             m0, m1
+%endif
+    mov                [cq], eobd ; 0
+%ifidn %2, dct
+%ifnidn %1, dct
+    vpbroadcastd         m1, [o(pw_2896x8)]
+%endif
+    pmulhrsw             m0, m1
+    mova                 m1, m0
+    jmp m(iadst_4x4_internal).end2
+%else ; adst / flipadst
+    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
+    pmulhrsw             m0, [o(iadst4_dconly2a)]
+    jmp m(i%2_4x4_internal).end2
+%endif
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0-1 ; pw_2896x8
+    vpbroadcastd         m4, [o(pd_2048)]
+    punpckhwd            m2, m1, m0
+    psubw                m3, m0, m1
+    paddw                m0, m1
+    punpcklqdq           m0, m3
+    ITX_MUL2X_PACK        2, 1, 3, 4, 1567, 3784
+%if %0 == 1
+    pmulhrsw             m0, m%1
+%else
+    vpbroadcastd         m4, [o(pw_2896x8)]
+    pmulhrsw             m0, m4     ; t0 t1
+%endif
+    psubw                m1, m0, m2 ; out3 out2
+    paddw                m0, m2     ; out0 out1
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+    punpcklwd            m2, m1, m0
+    punpckhwd            m3, m1, m0
+    psubw                m0, m1
+    punpckhqdq           m1, m1
+    paddw                m1, m0 ; in0 - in2 + in3
+    vpbroadcastd         m0, [o(pw_3803_1321)]
+    vpbroadcastd         m4, [o(pw_m1321_2482)]
+    pmaddwd              m0, m2
+    pmaddwd              m2, m4
+    vpbroadcastd         m4, [o(pw_2482_3344)]
+    vpbroadcastd         m5, [o(pw_m3803_3344)]
+    pmaddwd              m4, m3
+    pmaddwd              m5, m3
+    paddd                m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
+    vpbroadcastd         m0, [o(pw_m3803_m6688)]
+    pmaddwd              m3, m0
+    vpbroadcastd         m0, [o(pw_3344x8)]
+    pmulhrsw             m1, m0 ; out2 ____
+    vpbroadcastd         m0, [o(pd_2048)]
+    paddd                m2, m0
+    paddd                m0, m4
+    paddd                m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
+    paddd                m2, m4
+    paddd                m2, m3
+    psrad                m0, 12
+    psrad                m5, 12
+    psrad                m2, 12
+    packssdw             m0, m5 ; out0 out1
+    packssdw             m2, m2 ; out3 out3
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct,      0
+INV_TXFM_4X4_FN dct, adst,     0
+INV_TXFM_4X4_FN dct, flipadst, 0
+INV_TXFM_4X4_FN dct, identity, 3
+
+cglobal idct_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    IDCT4_1D_PACKED
+    mova                 m2, [o(deint_shuf)]
+    shufps               m3, m0, m1, q1331
+    shufps               m0, m0, m1, q0220
+    pshufb               m0, m2
+    pshufb               m1, m3, m2
+    jmp                tx2q
+.pass2:
+    IDCT4_1D_PACKED
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+    ITX4_END              0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct,      0
+INV_TXFM_4X4_FN adst, adst,     0
+INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    call .main
+    punpckhwd            m3, m0, m2
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m3
+    punpcklwd            m0, m3
+    jmp                tx2q
+.pass2:
+    call .main
+    vpblendd             m1, m1, m2, 0x0c ; out2 out3
+.end:
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+.end2:
+    ITX4_END              0, 1, 2, 3
+ALIGN function_align
+.main:
+    IADST4_1D_PACKED
+    ret
+
+INV_TXFM_4X4_FN flipadst, dct,      0
+INV_TXFM_4X4_FN flipadst, adst,     0
+INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    call m(iadst_4x4_internal).main
+    punpcklwd            m1, m0
+    punpckhwd            m2, m0
+    punpcklwd            m0, m2, m1
+    punpckhwd            m1, m2, m1
+    jmp                tx2q
+.pass2:
+    call m(iadst_4x4_internal).main
+    vpblendd             m1, m1, m2, 0x0c ; out2 out3
+.end:
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+.end2:
+    ITX4_END              3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    vpbroadcastd         m2, [o(pw_5793x4)]
+    paddw                m0, m0
+    paddw                m1, m1
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    punpckhwd            m2, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m2
+    punpcklwd            m0, m2
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m2, [o(pw_5793x4)]
+    paddw                m0, m0
+    paddw                m1, m1
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    jmp m(iadst_4x4_internal).end
+
+%macro WRITE_4X8 2 ; coefs[1-2]
+    movd                xm4, [dstq+strideq*0]
+    pinsrd              xm4, [dstq+strideq*1], 1
+    movd                xm5, [dstq+strideq*2]
+    pinsrd              xm5, [dstq+r3       ], 1
+    pinsrd              xm4, [r2  +strideq*0], 2
+    pinsrd              xm4, [r2  +strideq*1], 3
+    pinsrd              xm5, [r2  +strideq*2], 2
+    pinsrd              xm5, [r2  +r3       ], 3
+    pmovzxbw             m4, xm4
+    pmovzxbw             m5, xm5
+    paddw                m4, m%1
+    paddw                m5, m%2
+    packuswb             m4, m5
+    vextracti128        xm5, m4, 1
+    movd   [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    pextrd [dstq+strideq*2], xm4, 2
+    pextrd [dstq+r3       ], xm4, 3
+    movd   [r2  +strideq*0], xm5
+    pextrd [r2  +strideq*1], xm5, 1
+    pextrd [r2  +strideq*2], xm5, 2
+    pextrd [r2  +r3       ], xm5, 3
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 4x8, 7
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    vpbroadcastd        xm0, [o(pw_2896x8)]
+    pmulhrsw            xm1, xm0, [cq]
+    vpbroadcastd        xm2, [o(pw_4096)]
+    pmulhrsw            xm1, xm0
+    pmulhrsw            xm1, xm2
+    vpermq               m1, m1, q1100
+    punpcklwd            m1, m1
+    punpckldq            m0, m1, m1
+    punpckhdq            m1, m1
+    jmp m(iadst_4x8_internal).end3
+%elifidn %1_%2, identity_dct
+    movd                xm0, [cq+16*0]
+    punpcklwd           xm0, [cq+16*1]
+    movd                xm1, [cq+16*2]
+    punpcklwd           xm1, [cq+16*3]
+    vpbroadcastd        xm2, [o(pw_2896x8)]
+    vpbroadcastd        xm3, [o(pw_5793x4)]
+    vpbroadcastd        xm4, [o(pw_2048)]
+    punpckldq           xm0, xm1
+    pmulhrsw            xm0, xm2
+    paddw               xm0, xm0
+    pmulhrsw            xm0, xm3
+    pmulhrsw            xm0, xm2
+    pmulhrsw            xm0, xm4
+    vpbroadcastq         m0, xm0
+    mova                 m1, m0
+    jmp m(iadst_4x8_internal).end3
+%elifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_2048)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mova                 m1, m0
+    jmp m(iadst_4x8_internal).end4
+%else ; adst_dct / flipadst_dct
+    vpbroadcastw        xm0, [cq]
+    vpbroadcastd        xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, [o(iadst4_dconly1a)]
+    vpbroadcastd        xm2, [o(pw_2048)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+%ifidn %1, adst
+    vpbroadcastq         m0, xm0
+%else ; flipadst
+    vpermq               m0, m0, q1111
+%endif
+    mova                 m1, m0
+    jmp m(iadst_4x8_internal).end4
+%endif
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+    vpbroadcastd         m6, [o(pd_2048)]
+    punpckhwd            m5, m3, m0 ; in7 in1
+    punpckhwd            m4, m1, m2 ; in3 in5
+    punpcklwd            m3, m1     ; in2 in6
+    psubw                m1, m0, m2
+    paddw                m0, m2
+    punpcklqdq           m0, m1     ; in0+in4 in0-in4
+    ITX_MUL2X_PACK        5, 1, 2, 6,  799, 4017, 1 ; t4a t7a
+    ITX_MUL2X_PACK        4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
+    ITX_MUL2X_PACK        3, 1, 2, 6, 1567, 3784    ; t3 t2
+    vpbroadcastd         m6, [o(pw_2896x8)]
+    psubw                m2, m5, m4 ; t4 t7
+    paddw                m5, m4     ; t5a t6a
+    pshufd               m4, m2, q1032
+    psubw                m1, m2, m4
+    paddw                m4, m2
+    vpblendd             m4, m4, m1, 0xcc
+    pmulhrsw             m0, m6     ; t0 t1
+    pmulhrsw             m4, m6     ; t6 t5
+    psubw                m1, m0, m3 ; tmp3 tmp2
+    paddw                m0, m3     ; tmp0 tmp1
+    shufps               m2, m5, m4, q1032 ; t7 t6
+    vpblendd             m5, m5, m4, 0xcc  ; t4 t5
+    psubw                m3, m0, m2 ; out7 out6
+    paddw                m0, m2     ; out0 out1
+    psubw                m2, m1, m5 ; out4 out5
+    paddw                m1, m5     ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 0
+    vpbroadcastd         m6, [o(pd_2048)]
+    punpckhwd            m0, m4, m3 ; 0 7
+    punpckhwd            m1, m5, m2 ; 2 5
+    punpcklwd            m2, m5     ; 4 3
+    punpcklwd            m3, m4     ; 6 1
+    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
+    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
+    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
+    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
+    psubw                m4, m0, m2 ; t4 t5
+    paddw                m0, m2     ; t0 t1
+    psubw                m5, m1, m3 ; t6 t7
+    paddw                m1, m3     ; t2 t3
+    shufps               m2, m5, m4, q1032
+    punpckhwd            m4, m2
+    punpcklwd            m5, m2
+    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
+    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
+    psubw                m2, m0, m1        ; t2 t3
+    paddw                m0, m1            ; out0 -out7
+    psubw                m1, m4, m5        ; t7 t6
+    paddw                m4, m5            ; out6 -out1
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    vpblendd             m3, m0, m4, 0x33  ; out6 -out7
+    vpblendd             m0, m0, m4, 0xcc  ; out0 -out1
+    shufps               m4, m2, m1, q1032 ; t3 t7
+    vpblendd             m1, m2, m1, 0xcc  ; t2 t6
+    psubw                m2, m1, m4        ; t2-t3 t6-t7
+    paddw                m1, m4            ; t2+t3 t6+t7
+    pmulhrsw             m2, m5            ; out4 -out5
+    pshufd               m1, m1, q1032
+    pmulhrsw             m1, m5            ; out2 -out3
+%endmacro
+
+INIT_YMM avx2
+INV_TXFM_4X8_FN dct, dct,      0
+INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120
+    vpermq               m1, [cq+32*1], q3120
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    IDCT4_1D_PACKED       5
+    vbroadcasti128       m2, [o(deint_shuf)]
+    shufps               m3, m0, m1, q1331
+    shufps               m0, m0, m1, q0220
+    pshufb               m0, m2
+    pshufb               m1, m3, m2
+    jmp                tx2q
+.pass2:
+    vextracti128        xm2, m0, 1
+    vextracti128        xm3, m1, 1
+    call .main
+    vpbroadcastd         m4, [o(pw_2048)]
+    vinserti128          m0, m0, xm2, 1
+    vinserti128          m1, m1, xm3, 1
+    pshufd               m1, m1, q1032
+    jmp m(iadst_4x8_internal).end2
+ALIGN function_align
+.main:
+    WRAP_XMM IDCT8_1D_PACKED
+    ret
+
+INV_TXFM_4X8_FN adst, dct,      0
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120
+    vpermq               m1, [cq+32*1], q3120
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    call m(iadst_8x4_internal).main
+    punpckhwd            m3, m0, m2
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m3
+    punpcklwd            m0, m3
+    jmp                tx2q
+.pass2:
+    vextracti128        xm2, m0, 1
+    vextracti128        xm3, m1, 1
+    pshufd              xm4, xm0, q1032
+    pshufd              xm5, xm1, q1032
+    call .main
+    vpbroadcastd         m4, [o(pw_2048)]
+    vinserti128          m0, m0, xm2, 1
+    vinserti128          m1, m1, xm3, 1
+    pxor                 m5, m5
+    psubw                m5, m4
+.end:
+    vpblendd             m4, m4, m5, 0xcc
+.end2:
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+.end3:
+    pxor                 m2, m2
+    mova          [cq+32*0], m2
+    mova          [cq+32*1], m2
+.end4:
+    lea                  r2, [dstq+strideq*4]
+    lea                  r3, [strideq*3]
+    WRITE_4X8             0, 1
+    RET
+ALIGN function_align
+.main:
+    WRAP_XMM IADST8_1D_PACKED
+    ret
+
+INV_TXFM_4X8_FN flipadst, dct,      0
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120
+    vpermq               m1, [cq+32*1], q3120
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    call m(iadst_8x4_internal).main
+    punpcklwd            m3, m1, m0
+    punpckhwd            m1, m2, m0
+    punpcklwd            m0, m1, m3
+    punpckhwd            m1, m3
+    jmp                tx2q
+.pass2:
+    vextracti128        xm2, m0, 1
+    vextracti128        xm3, m1, 1
+    pshufd              xm4, xm0, q1032
+    pshufd              xm5, xm1, q1032
+    call m(iadst_4x8_internal).main
+    vpbroadcastd         m5, [o(pw_2048)]
+    vinserti128          m3, m3, xm1, 1
+    vinserti128          m2, m2, xm0, 1
+    pxor                 m4, m4
+    psubw                m4, m5
+    pshufd               m0, m3, q1032
+    pshufd               m1, m2, q1032
+    jmp m(iadst_4x8_internal).end
+
+INV_TXFM_4X8_FN identity, dct,      3
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m2, [cq+32*0], q3120
+    vpermq               m0, [cq+32*1], q3120
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    vpbroadcastd         m4, [o(pw_5793x4)]
+    punpcklwd            m1, m2, m0
+    punpckhwd            m2, m0
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
+    paddw                m0, m0
+    paddw                m1, m1
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m4, [o(pw_4096)]
+    jmp m(iadst_4x8_internal).end2
+
+%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 4x16, 11
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    vpbroadcastd         m0, [o(pw_2896x8)]
+    pmulhrsw             m0, [cq]
+    vpbroadcastd         m1, [o(pw_16384)]
+    vpbroadcastd         m2, [o(pw_5793x4)]
+    vpbroadcastd         m3, [o(pw_2048)]
+    pmulhrsw             m0, m1
+    psllw                m0, 2
+    pmulhrsw             m0, m2
+    pmulhrsw             m3, m0
+    punpcklwd            m1, m3, m3
+    punpckhwd            m3, m3
+    punpckldq            m0, m1, m1
+    punpckhdq            m1, m1
+    punpckldq            m2, m3, m3
+    punpckhdq            m3, m3
+    jmp m(iadst_4x16_internal).end3
+%elifidn %1_%2, identity_dct
+    movd                xm0, [cq+32*0]
+    punpcklwd           xm0, [cq+32*1]
+    movd                xm1, [cq+32*2]
+    punpcklwd           xm1, [cq+32*3]
+    vpbroadcastd        xm2, [o(pw_5793x4)]
+    vpbroadcastd        xm3, [o(pw_16384)]
+    vpbroadcastd        xm4, [o(pw_2896x8)]
+    punpckldq           xm0, xm1
+    paddw               xm0, xm0
+    pmulhrsw            xm0, xm2
+    pmulhrsw            xm0, xm3
+    psrlw               xm3, 3 ; pw_2048
+    pmulhrsw            xm0, xm4
+    pmulhrsw            xm0, xm3
+    vpbroadcastq         m0, xm0
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    jmp m(iadst_4x16_internal).end3
+%elifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    movd                xm3, [o(pw_2048)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm2
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm3
+    vpbroadcastw         m0, xm0
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    jmp m(iadst_4x16_internal).end4
+%else ; adst_dct / flipadst_dct
+    vpbroadcastw        xm0, [cq]
+    pmulhrsw            xm0, [o(iadst4_dconly1a)]
+    vpbroadcastd        xm1, [o(pw_16384)]
+    vpbroadcastd        xm2, [o(pw_2896x8)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    psrlw               xm1, 3 ; pw_2048
+    pmulhrsw            xm0, xm2
+    pmulhrsw            xm0, xm1
+%ifidn %1, adst
+    vpbroadcastq         m0, xm0
+%else ; flipadst
+    vpermq               m0, m0, q1111
+%endif
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    jmp m(iadst_4x16_internal).end4
+%endif
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+    vpbroadcastd        m10, [o(pd_2048)]
+.main2:
+    punpckhwd            m8, m7, m0 ; dct16 in15 in1
+    paddw                m9, m0, m4
+    psubw                m0, m4
+    punpcklqdq           m9, m0     ; dct4  in0+in2 in0-in2
+    punpckhwd            m0, m3, m4 ; dct16 in7  in9
+    punpcklwd            m7, m1     ; dct8  in7  in1
+    punpckhwd            m1, m6     ; dct16 in3  in13
+    punpcklwd            m3, m5     ; dct8  in3  in5
+    punpckhwd            m5, m2     ; dct16 in11 in5
+    punpcklwd            m6, m2     ; dct4  in3  in1
+    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
+    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
+    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
+    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
+    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 1 ; t4a  t7a
+    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 1 ; t5a  t6a
+    ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
+    psubw                m2, m8, m0 ; t9  t14
+    paddw                m8, m0     ; t8  t15
+    psubw                m0, m1, m5 ; t10 t13
+    paddw                m1, m5     ; t11 t12
+%if mmsize > 16
+    vbroadcasti128       m5, [o(deint_shuf)]
+%else
+    mova                 m5, [o(deint_shuf)]
+%endif
+    pshufb               m8, m5
+    pshufb               m1, m5
+    vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
+    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 4   ; t9a  t14a
+    vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
+    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 4   ; t10a t13a
+    psubw                m5, m7, m3 ; t5a t6a
+    paddw                m7, m3     ; t4  t7
+    psubw                m4, m8, m1 ; t11a t12a
+    paddw                m8, m1     ; t8a  t15a
+    paddw                m1, m2, m0 ; t9   t14
+    psubw                m2, m0     ; t10  t13
+    punpckhqdq           m0, m8, m1 ; t15a t14
+    punpcklqdq           m8, m1     ; t8a  t9
+    pshufd               m3, m5, q1032
+    psubw                m1, m5, m3
+    paddw                m3, m5
+    vpblendd             m3, m3, m1, 0xcc ; t6 t5
+    vpbroadcastd         m1, [o(pw_2896x8)]
+    punpckhqdq           m5, m4, m2 ; t12a t13
+    punpcklqdq           m2, m4, m2 ; t11a t10
+    psubw                m4, m5, m2
+    paddw                m5, m2
+    pmulhrsw             m9, m1     ; t0   t1
+    pmulhrsw             m3, m1     ; t6   t5
+    pmulhrsw             m4, m1     ; t11  t10a
+    pmulhrsw             m5, m1     ; t12  t13a
+    shufps               m2, m7, m3, q1032 ; t7 t6
+    vpblendd             m7, m7, m3, 0xcc  ; t4 t5
+    psubw                m1, m9, m6 ; dct4 out3 out2
+    paddw                m9, m6     ; dct4 out0 out1
+    psubw                m3, m9, m2 ; dct8 out7 out6
+    paddw                m9, m2     ; dct8 out0 out1
+    psubw                m2, m1, m7 ; dct8 out4 out5
+    paddw                m1, m7     ; dct8 out3 out2
+    psubw                m7, m9, m0 ; out15 out14
+    paddw                m0, m9     ; out0  out1
+    psubw                m6, m1, m5 ; out12 out13
+    paddw                m1, m5     ; out3  out2
+    psubw                m5, m2, m4 ; out11 out10
+    paddw                m2, m4     ; out4  out5
+    psubw                m4, m3, m8 ; out8  out9
+    paddw                m3, m8     ; out7  out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct,      0
+INV_TXFM_4X16_FN dct, identity, 15
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    call m(idct_16x4_internal).main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    REPX   {pmulhrsw x, m5}, m0, m4, m2, m3
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    vextracti128        xm4, m0, 1
+    vextracti128        xm5, m1, 1
+    vextracti128        xm6, m2, 1
+    vextracti128        xm7, m3, 1
+    call .main
+    vinserti128          m0, m0, xm4, 1
+    vinserti128          m1, m1, xm5, 1
+    vpbroadcastd         m5, [o(pw_2048)]
+    vinserti128          m2, m2, xm6, 1
+    vinserti128          m3, m3, xm7, 1
+    pshufd               m1, m1, q1032
+    pshufd               m3, m3, q1032
+    jmp m(iadst_4x16_internal).end2
+ALIGN function_align
+.main:
+    WRAP_XMM IDCT16_1D_PACKED
+    ret
+
+INV_TXFM_4X16_FN adst, dct,      0
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    call m(iadst_16x4_internal).main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    REPX   {pmulhrsw x, m5}, m4, m2, m3, m0
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    call .main
+    pshufd               m1, m1, q1032
+    vpbroadcastd         m5, [o(pw_2048)]
+    vpblendd             m4, m1, m0, 0x33
+    vpblendd             m0, m0, m2, 0x33
+    vpblendd             m2, m2, m3, 0x33
+    vpblendd             m3, m3, m1, 0x33
+    vpermq               m0, m0, q2031
+    vpermq               m1, m2, q1302
+    vpermq               m2, m3, q3120
+    vpermq               m3, m4, q0213
+    psubw                m6, m7, m5
+.end:
+    vpblendd             m5, m5, m6, 0xcc
+.end2:
+    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
+.end3:
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+.end4:
+    lea                  r2, [dstq+strideq*8]
+    lea                  r3, [strideq*3]
+    WRITE_4X8             0, 1
+    lea                dstq, [dstq+strideq*4]
+    lea                  r2, [r2  +strideq*4]
+    WRITE_4X8             2, 3
+    RET
+ALIGN function_align
+.main:
+    vpblendd             m4, m1, m0, 0xcc
+    vpblendd             m1, m1, m0, 0x33
+    vpblendd             m5, m2, m3, 0xcc
+    vpblendd             m2, m2, m3, 0x33
+    vperm2i128           m3, m5, m2, 0x31
+    vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
+    vperm2i128           m4, m1, m4, 0x31
+    vinserti128          m1, m5, xm2, 1 ; in4  in7  in6  in5
+    pshufd               m3, m3, q1032  ; in12 in15 in13 in14
+    pshufd               m2, m4, q1032  ; in11 in8  in9  in10
+.main2:
+    vpbroadcastd         m8, [o(pd_2048)]
+    pxor                 m7, m7
+    punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
+    punpcklwd            m0, m3     ; in0  in15 in2  in13
+    punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
+    punpcklwd            m1, m2     ; in4  in11 in6  in9
+    ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
+    ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
+    ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
+    ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
+    psubw                m2, m0, m3 ; t9a  t8a  t11a t10a
+    paddw                m0, m3     ; t1a  t0a  t3a  t2a
+    psubw                m3, m1, m4 ; t13a t12a t15a t14a
+    paddw                m1, m4     ; t5a  t4a  t7a  t6a
+    ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
+    psubw                m6, m7, m5
+    ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
+    vpbroadcastd         m6, [o(pw_m3784_1567)]
+    vpbroadcastd         m5, [o(pw_1567_3784)]
+    psubw                m4, m0, m1 ; t5   t4   t7   t6
+    paddw                m0, m1     ; t1   t0   t3   t2
+    psubw                m1, m2, m3 ; t13a t12a t15a t14a
+    paddw                m2, m3     ; t9a  t8a  t11a t10a
+    psubw                m3, m7, m6
+    vpblendd             m6, m6, m3, 0xf0
+    ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
+    ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
+    vbroadcasti128       m5, [o(deint_shuf)]
+    pshufb               m0, m5
+    pshufb               m2, m5
+    vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
+    vinserti128          m0, m0, xm2, 1    ; t1   t0   t9a  t8a
+    vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
+    vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
+    psubw                m1, m0, m3        ; t3a t2a t11 t10
+    paddw                m0, m3     ; -out15  out0   out14 -out1
+    paddw                m3, m4, m2 ; -out3   out12  out2  -out13
+    psubw                m4, m2            ; t6 t7 t14a t15a
+    shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
+    vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
+    paddw                m1, m2, m4
+    psubw                m2, m4
+    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
+    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
+    ret
+
+INV_TXFM_4X16_FN flipadst, dct,      0
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    call m(iadst_16x4_internal).main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpcklwd            m4, m1, m0
+    punpckhwd            m1, m0
+    punpcklwd            m0, m3, m2
+    punpckhwd            m3, m2
+    REPX   {pmulhrsw x, m5}, m4, m1, m0, m3
+    punpckldq            m2, m3, m1
+    punpckhdq            m3, m1
+    punpckhdq            m1, m0, m4
+    punpckldq            m0, m4
+    jmp                tx2q
+.pass2:
+    call m(iadst_4x16_internal).main
+    pshufd               m1, m1, q1032
+    vpbroadcastd         m6, [o(pw_2048)]
+    vpblendd             m4, m0, m2, 0x33
+    vpblendd             m0, m0, m1, 0xcc
+    vpblendd             m1, m1, m3, 0xcc
+    vpblendd             m2, m2, m3, 0x33
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q0213
+    vpermq               m2, m2, q2031
+    vpermq               m3, m4, q1302
+    psubw                m5, m7, m6
+    jmp m(iadst_4x16_internal).end
+
+INV_TXFM_4X16_FN identity, dct,      3
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                 m3, [cq+32*0]
+    mova                 m2, [cq+32*1]
+    mova                 m4, [cq+32*2]
+    mova                 m0, [cq+32*3]
+    vpbroadcastd         m5, [o(pw_5793x4)]
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    REPX   {paddw    x, x }, m1, m2, m3, m4
+    REPX   {pmulhrsw x, m5}, m1, m2, m3, m4
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m4, [o(pw_5793x4)]
+    vpbroadcastd         m5, [o(pw_2048)]
+    REPX   {psllw    x, 2 }, m0, m1, m2, m3
+    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
+    jmp m(iadst_4x16_internal).end2
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
+    movq               xm%3, [dstq   ]
+    movhps             xm%3, [dstq+%5]
+    movq               xm%4, [dstq+%6]
+    movhps             xm%4, [dstq+%7]
+    pmovzxbw            m%3, xm%3
+    pmovzxbw            m%4, xm%4
+%ifnum %1
+    paddw               m%3, m%1
+%else
+    paddw               m%3, %1
+%endif
+%ifnum %2
+    paddw               m%4, m%2
+%else
+    paddw               m%4, %2
+%endif
+    packuswb            m%3, m%4
+    vextracti128       xm%4, m%3, 1
+    movq          [dstq   ], xm%3
+    movhps        [dstq+%6], xm%3
+    movq          [dstq+%5], xm%4
+    movhps        [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 8x4, 7
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    vpbroadcastd        xm0, [o(pw_2896x8)]
+    pmulhrsw            xm1, xm0, [cq]
+    vpbroadcastd        xm2, [o(pw_5793x4)]
+    vpbroadcastd        xm3, [o(pw_2048)]
+    pmulhrsw            xm1, xm0
+    paddw               xm1, xm1
+    pmulhrsw            xm1, xm2
+    pmulhrsw            xm1, xm3
+    punpcklwd           xm1, xm1
+    punpckldq           xm0, xm1, xm1
+    punpckhdq           xm1, xm1
+    vpermq               m0, m0, q1100
+    vpermq               m1, m1, q1100
+%elifidn %1_%2, identity_dct
+    mova                xm0, [cq+16*0]
+    packusdw            xm0, [cq+16*1]
+    mova                xm1, [cq+16*2]
+    packusdw            xm1, [cq+16*3]
+    vpbroadcastd        xm2, [o(pw_2896x8)]
+    vpbroadcastd        xm3, [o(pw_2048)]
+    packusdw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    paddw               xm0, xm0
+    pmulhrsw            xm0, xm2
+    pmulhrsw            xm0, xm3
+    vinserti128          m0, m0, xm0, 1
+    mova                 m1, m0
+%else
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    pmulhrsw            xm0, xm1
+%ifidn %2, dct
+    movd                xm2, [o(pw_2048)]
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mova                 m1, m0
+%else ; adst / flipadst
+    vpbroadcastw         m0, xm0
+    pmulhrsw             m0, [o(iadst4_dconly2a)]
+    vpbroadcastd         m1, [o(pw_2048)]
+    pmulhrsw             m1, m0
+%ifidn %2, adst
+    vpermq               m0, m1, q1100
+    vpermq               m1, m1, q3322
+%else ; flipadst
+    vpermq               m0, m1, q2233
+    vpermq               m1, m1, q0011
+%endif
+%endif
+%endif
+    jmp m(iadst_8x4_internal).end3
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct,      0
+INV_TXFM_8X4_FN dct, adst,     0
+INV_TXFM_8X4_FN dct, flipadst, 0
+INV_TXFM_8X4_FN dct, identity, 3
+
+cglobal idct_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpbroadcastd        xm3, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm3, [cq+16*0]
+    pmulhrsw            xm1, xm3, [cq+16*1]
+    pmulhrsw            xm2, xm3, [cq+16*2]
+    pmulhrsw            xm3,      [cq+16*3]
+    call m(idct_4x8_internal).main
+    vbroadcasti128       m4, [o(deint_shuf)]
+    vinserti128          m3, m1, xm3, 1
+    vinserti128          m1, m0, xm2, 1
+    shufps               m0, m1, m3, q0220
+    shufps               m1, m1, m3, q1331
+    pshufb               m0, m4
+    pshufb               m1, m4
+    jmp                tx2q
+.pass2:
+    IDCT4_1D_PACKED
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    jmp m(iadst_8x4_internal).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpbroadcastd        xm0, [o(pw_2896x8)]
+    pshufd              xm4,      [cq+16*0], q1032
+    pmulhrsw            xm3, xm0, [cq+16*3]
+    pshufd              xm5,      [cq+16*1], q1032
+    pmulhrsw            xm2, xm0, [cq+16*2]
+    pmulhrsw            xm4, xm0
+    pmulhrsw            xm5, xm0
+    call m(iadst_4x8_internal).main
+    vinserti128        m0, m0, xm2, 1
+    vinserti128        m1, m1, xm3, 1
+    punpckhwd          m2, m0, m1
+    punpcklwd          m0, m1
+    pxor               m3, m3
+    psubw              m3, m2
+    punpckhwd          m1, m0, m3
+    punpcklwd          m0, m3
+    jmp              tx2q
+.pass2:
+    call .main
+    vpblendd             m1, m1, m2, 0xcc
+.end:
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q3120
+.end2:
+    vpbroadcastd         m2, [o(pw_2048)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+.end3:
+    pxor                 m2, m2
+    mova          [cq+32*0], m2
+    mova          [cq+32*1], m2
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0, 1, 4, 5
+    RET
+ALIGN function_align
+.main:
+    IADST4_1D_PACKED
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpbroadcastd        xm0, [o(pw_2896x8)]
+    pshufd              xm4,      [cq+16*0], q1032
+    pmulhrsw            xm3, xm0, [cq+16*3]
+    pshufd              xm5,      [cq+16*1], q1032
+    pmulhrsw            xm2, xm0, [cq+16*2]
+    pmulhrsw            xm4, xm0
+    pmulhrsw            xm5, xm0
+    call m(iadst_4x8_internal).main
+    vinserti128          m3, m3, xm1, 1
+    vinserti128          m2, m2, xm0, 1
+    punpckhwd            m1, m3, m2
+    punpcklwd            m3, m2
+    pxor                 m0, m0
+    psubw                m0, m1
+    punpckhwd            m1, m0, m3
+    punpcklwd            m0, m3
+    jmp                tx2q
+.pass2:
+    call m(iadst_8x4_internal).main
+    vpblendd             m2, m2, m1, 0x33
+    vpermq               m1, m0, q2031
+    vpermq               m0, m2, q2031
+    jmp m(iadst_8x4_internal).end2
+
+INV_TXFM_8X4_FN identity, dct,      7
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                xm2,     [cq+16*0]
+    mova                xm0,     [cq+16*1]
+    vinserti128          m2, m2, [cq+16*2], 1
+    vinserti128          m0, m0, [cq+16*3], 1
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    punpcklwd            m1, m2, m0
+    punpckhwd            m2, m0
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
+    paddw                m0, m0
+    paddw                m1, m1
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m2, [o(pw_5793x4)]
+    paddw                m0, m0
+    paddw                m1, m1
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    jmp m(iadst_8x4_internal).end
+
+%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 8x8, 7
+%ifidn %1_%2, dct_identity
+    vpbroadcastd        xm0, [o(pw_2896x8)]
+    pmulhrsw            xm0, [cq]
+    vpbroadcastd        xm1, [o(pw_16384)]
+    pmulhrsw            xm0, xm1
+    psrlw               xm1, 2 ; pw_4096
+    pmulhrsw            xm0, xm1
+    pshufb              xm0, [o(deint_shuf)]
+    vpermq               m3, m0, q1100
+    punpcklwd            m3, m3
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    jmp m(iadst_8x8_internal).end4
+%elif %3 >= 0
+%ifidn %1, dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm2
+    psrlw               xm2, 3 ; pw_2048
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+.end:
+    mov                 r2d, 2
+.end2:
+    lea                  r3, [strideq*3]
+.loop:
+    WRITE_8X4             0, 0, 1, 2
+    lea                dstq, [dstq+strideq*4]
+    dec                 r2d
+    jg .loop
+    RET
+%else ; identity
+    mova                 m0, [cq+32*0]
+    punpcklwd            m0, [cq+32*1]
+    mova                 m1, [cq+32*2]
+    punpcklwd            m1, [cq+32*3]
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    vpbroadcastd         m3, [o(pw_2048)]
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+    punpckldq            m0, m1
+    vpermq               m1, m0, q3232
+    vpermq               m0, m0, q1010
+    punpcklwd            m0, m1
+    pmulhrsw             m0, m2
+    pmulhrsw             m0, m3
+    jmp m(inv_txfm_add_dct_dct_8x8).end
+%endif
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct,      0
+INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120 ; 0 1
+    vpermq               m3, [cq+32*3], q3120 ; 6 7
+    vpermq               m2, [cq+32*2], q3120 ; 4 5
+    vpermq               m1, [cq+32*1], q3120 ; 2 3
+    call .main
+    shufps               m4, m0, m1, q0220
+    shufps               m5, m0, m1, q1331
+    shufps               m1, m2, m3, q0220
+    shufps               m3, m2, m3, q1331
+    vbroadcasti128       m0, [o(deint_shuf)]
+    vpbroadcastd         m2, [o(pw_16384)]
+    REPX   {pshufb   x, m0}, m4, m5, m1, m3
+    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
+    vinserti128          m0, m4, xm1, 1
+    vperm2i128           m2, m4, m1, 0x31
+    vinserti128          m1, m5, xm3, 1
+    vperm2i128           m3, m5, m3, 0x31
+    jmp                tx2q
+.pass2:
+    call .main
+    vpbroadcastd         m4, [o(pw_2048)]
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q2031
+    jmp m(iadst_8x8_internal).end2
+ALIGN function_align
+.main:
+    IDCT8_1D_PACKED
+    ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m4, [cq+32*0], q1302 ; 1 0
+    vpermq               m3, [cq+32*3], q3120 ; 6 7
+    vpermq               m5, [cq+32*1], q1302 ; 3 2
+    vpermq               m2, [cq+32*2], q3120 ; 4 5
+    call .main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpcklwd            m4, m0, m1
+    punpckhwd            m0, m1
+    punpcklwd            m1, m2, m3
+    punpckhwd            m2, m3
+    pxor                 m3, m3
+    psubw                m3, m5 ; negate odd elements during rounding
+    pmulhrsw             m4, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m3
+    punpcklwd            m3, m4, m0
+    punpckhwd            m4, m0
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
+    vperm2i128           m2, m3, m0, 0x31
+    vinserti128          m0, m3, xm0, 1
+    vperm2i128           m3, m4, m1, 0x31
+    vinserti128          m1, m4, xm1, 1
+    jmp                tx2q
+.pass2:
+    pshufd               m4, m0, q1032
+    pshufd               m5, m1, q1032
+    call .main
+    vpbroadcastd         m5, [o(pw_2048)]
+    vpbroadcastd        xm4, [o(pw_4096)]
+    psubw                m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+.end3:
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+.end4:
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0, 1, 4, 5
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             2, 3, 4, 5
+    RET
+ALIGN function_align
+.main:
+    IADST8_1D_PACKED
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m4, [cq+32*0], q1302 ; 1 0
+    vpermq               m3, [cq+32*3], q3120 ; 6 7
+    vpermq               m5, [cq+32*1], q1302 ; 3 2
+    vpermq               m2, [cq+32*2], q3120 ; 4 5
+    call m(iadst_8x8_internal).main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpckhwd            m4, m3, m2
+    punpcklwd            m3, m2
+    punpckhwd            m2, m1, m0
+    punpcklwd            m1, m0
+    pxor                 m0, m0
+    psubw                m0, m5
+    pmulhrsw             m4, m0
+    pmulhrsw             m3, m5
+    pmulhrsw             m2, m0
+    pmulhrsw             m1, m5
+    punpckhwd            m0, m4, m3
+    punpcklwd            m4, m3
+    punpckhwd            m3, m2, m1
+    punpcklwd            m2, m1
+    vinserti128          m1, m0, xm3, 1
+    vperm2i128           m3, m0, m3, 0x31
+    vinserti128          m0, m4, xm2, 1
+    vperm2i128           m2, m4, m2, 0x31
+    jmp                tx2q
+.pass2:
+    pshufd               m4, m0, q1032
+    pshufd               m5, m1, q1032
+    call m(iadst_8x8_internal).main
+    vpbroadcastd         m4, [o(pw_2048)]
+    vpbroadcastd        xm5, [o(pw_4096)]
+    psubw                m4, m5 ; lower half = -2048, upper half = 2048
+    vpermq               m5, m3, q2031
+    vpermq               m3, m0, q2031
+    vpermq               m0, m2, q2031
+    vpermq               m2, m1, q2031
+    pmulhrsw             m1, m0, m4
+    pmulhrsw             m0, m5, m4
+    jmp m(iadst_8x8_internal).end3
+
+INV_TXFM_8X8_FN identity, dct,      7
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                xm3,     [cq+16*0]
+    mova                xm2,     [cq+16*1]
+    vinserti128          m3, m3, [cq+16*4], 1
+    vinserti128          m2, m2, [cq+16*5], 1
+    mova                xm4,     [cq+16*2]
+    mova                xm0,     [cq+16*3]
+    vinserti128          m4, m4, [cq+16*6], 1
+    vinserti128          m0, m0, [cq+16*7], 1
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m4, [o(pw_4096)]
+    jmp m(iadst_8x8_internal).end
+
+%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 8x16, 13
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    psrlw               xm2, 3 ; pw_2048
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mov                 r2d, 4
+    jmp m(inv_txfm_add_dct_dct_8x8).end2
+%elifidn %1_%2, dct_identity
+    vpbroadcastd         m0, [o(pw_2896x8)]
+    pmulhrsw             m7, m0, [cq]
+    vpbroadcastd         m1, [o(pw_16384)]
+    vpbroadcastd         m2, [o(pw_5793x4)]
+    pmulhrsw             m7, m0
+    pmulhrsw             m7, m1
+    psrlw                m1, 3 ; pw_2048
+    psllw                m7, 2
+    pmulhrsw             m7, m2
+    pmulhrsw             m7, m1
+    punpcklwd            m5, m7, m7
+    punpckhwd            m7, m7
+    punpcklwd            m4, m5, m5
+    punpckhwd            m5, m5
+    punpcklwd            m6, m7, m7
+    punpckhwd            m7, m7
+    vpermq               m0, m4, q1100
+    vpermq               m1, m5, q1100
+    vpermq               m2, m6, q1100
+    vpermq               m3, m7, q1100
+    vpermq               m4, m4, q3322
+    vpermq               m5, m5, q3322
+    vpermq               m6, m6, q3322
+    vpermq               m7, m7, q3322
+    jmp m(idct_8x16_internal).end3
+%elifidn %1_%2, identity_dct
+    movd                xm0, [cq+32*0]
+    punpcklwd           xm0, [cq+32*1]
+    movd                xm2, [cq+32*2]
+    punpcklwd           xm2, [cq+32*3]
+    add                  cq, 32*4
+    movd                xm1, [cq+32*0]
+    punpcklwd           xm1, [cq+32*1]
+    movd                xm3, [cq+32*2]
+    punpcklwd           xm3, [cq+32*3]
+    vpbroadcastd        xm4, [o(pw_2896x8)]
+    vpbroadcastd        xm5, [o(pw_2048)]
+    xor                 eax, eax
+    mov           [cq-32*4], eax
+    mov           [cq-32*3], eax
+    mov           [cq-32*2], eax
+    mov           [cq-32*1], eax
+    punpckldq           xm0, xm2
+    punpckldq           xm1, xm3
+    punpcklqdq          xm0, xm1
+    pmulhrsw            xm0, xm4
+    pmulhrsw            xm0, xm4
+    pmulhrsw            xm0, xm5
+    mov           [cq+32*0], eax
+    mov           [cq+32*1], eax
+    mov           [cq+32*2], eax
+    mov           [cq+32*3], eax
+    vinserti128          m0, m0, xm0, 1
+    mov                 r2d, 4
+    jmp m(inv_txfm_add_dct_dct_8x8).end2
+%endif
+%endmacro
+
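+; Load the 8x16 coefficients, pre-scaled by 2896/4096 (~1/sqrt(2)) as required
+; for 2:1 rectangular transforms.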
+%macro ITX_8X16_LOAD_COEFS 0
+    vpbroadcastd         m4, [o(pw_2896x8)]
+    pmulhrsw             m0, m4, [cq+32*0]
+    add                  cq, 32*4
+    pmulhrsw             m7, m4, [cq+32*3]
+    pmulhrsw             m1, m4, [cq-32*3]
+    pmulhrsw             m6, m4, [cq+32*2]
+    pmulhrsw             m2, m4, [cq-32*2]
+    pmulhrsw             m5, m4, [cq+32*1]
+    pmulhrsw             m3, m4, [cq-32*1]
+    pmulhrsw             m4,     [cq+32*0]
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct,      0
+INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(idct_16x8_internal).main
+    vpbroadcastd        m10, [o(pw_16384)]
+.pass1_end:
+    vperm2i128           m9, m3, m7, 0x31
+    vinserti128          m3, m3, xm7, 1
+    vperm2i128           m8, m2, m6, 0x31
+    vinserti128          m2, m2, xm6, 1
+    vperm2i128           m6, m1, m5, 0x31
+    vinserti128          m1, m1, xm5, 1
+    vperm2i128           m5, m0, m4, 0x31
+    vinserti128          m0, m0, xm4, 1
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+.pass1_end2:
+    punpckhwd            m7, m5, m6
+    punpcklwd            m5, m6
+    punpcklwd            m6, m8, m9
+    punpckhwd            m8, m9
+    REPX  {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    punpckldq            m4, m5, m6
+    punpckhdq            m5, m6
+    punpckldq            m6, m7, m8
+    punpckhdq            m7, m8
+    jmp                tx2q
+.pass2:
+    call .main
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+    vpbroadcastd         m8, [o(pw_2048)]
+.end2:
+    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3:
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0, 1, 8, 9
+    lea                dstq, [dstq+strideq*4]
+    pxor                 m0, m0
+    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+    WRITE_8X4             2, 3, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             4, 5, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             6, 7, 0, 1
+    RET
+ALIGN function_align
+.main:
+    IDCT16_1D_PACKED
+    ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal).main
+    vpbroadcastd        m10, [o(pw_16384)]
+    pslld                m9, m10, 17
+    psubw               m10, m9 ; 16384, -16384
+    jmp m(idct_8x16_internal).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    vpbroadcastd         m9, [o(pw_2048)]
+    vpbroadcastd        xm8, [o(pw_4096)]
+    psubw                m8, m9
+    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+    jmp m(idct_8x16_internal).end2
+ALIGN function_align
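+; 16-point inverse ADST operating on packed word pairs.
+; .main2 is also called by the 16x8 ADST and flipADST functions.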
+.main:
+    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+    vpbroadcastd        m10, [o(pd_2048)]
+    punpckhwd            m8, m7, m0 ; in14 in1
+    punpcklwd            m0, m7     ; in0  in15
+    punpcklwd            m7, m6, m1 ; in12 in3
+    punpckhwd            m1, m6     ; in2  in13
+    punpckhwd            m6, m5, m2 ; in10 in5
+    punpcklwd            m2, m5     ; in4  in11
+    punpcklwd            m5, m4, m3 ; in8  in7
+    punpckhwd            m3, m4     ; in6  in9
+    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
+    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
+    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
+    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
+    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
+    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
+    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
+    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
+    psubw                m4, m0, m5 ; t9a  t8a
+    paddw                m0, m5     ; t1a  t0a
+    psubw                m5, m1, m6 ; t11a t10a
+    paddw                m1, m6     ; t3a  t2a
+    psubw                m6, m2, m7 ; t13a t12a
+    paddw                m2, m7     ; t5a  t4a
+    psubw                m7, m3, m8 ; t15a t14a
+    paddw                m3, m8     ; t7a  t6a
+    vpbroadcastd        m11, [o(pw_m4017_799)]
+    vpbroadcastd        m12, [o(pw_799_4017)]
+    pxor                 m9, m9
+    ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
+    psubw                m8, m9, m11
+    ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
+    vpbroadcastd        m11, [o(pw_m2276_3406)]
+    vpbroadcastd        m12, [o(pw_3406_2276)]
+    ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
+    psubw                m8, m9, m11
+    ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
+    psubw                m8, m1, m3 ; t7   t6
+    paddw                m1, m3     ; t3   t2
+    psubw                m3, m0, m2 ; t5   t4
+    paddw                m0, m2     ; t1   t0
+    psubw                m2, m5, m7 ; t14a t15a
+    paddw                m7, m5     ; t10a t11a
+    psubw                m5, m4, m6 ; t12a t13a
+    paddw                m4, m6     ; t8a  t9a
+    vpbroadcastd        m11, [o(pw_m3784_1567)]
+    vpbroadcastd        m12, [o(pw_1567_3784)]
+    ITX_MUL2X_PACK        3, 6, _, 10, 11, 12, 4 ; t4a t5a
+    psubw                m6, m9, m11
+    ITX_MUL2X_PACK        8, 12, _, 10, 12, 6, 4 ; t6a t7a
+    vpbroadcastd        m11, [o(pw_m1567_3784)]
+    vpbroadcastd        m12, [o(pw_3784_1567)]
+    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 4 ; t15 t14
+    psubw                m6, m9, m11
+    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 4 ; t13 t12
+    vbroadcasti128      m11, [o(deint_shuf)]
+    vpbroadcastd        m12, [o(pw_2896x8)]
+    psubw                m6, m0, m1        ;  t3a    t2a
+    paddw                m0, m1            ; -out15  out0
+    paddw                m1, m2, m5        ; -out13  out2
+    psubw                m5, m2            ;  t15a   t14a
+    paddw                m2, m4, m7        ; -out1  out14
+    psubw                m4, m7            ;  t10    t11
+    psubw                m7, m3, m8        ;  t6     t7
+    paddw                m8, m3            ; -out3   out12
+    REPX    {pshufb x, m11}, m6, m4, m0, m2
+    vpblendd             m3, m6, m4, 0xcc  ;  t3a    t11
+    shufps               m6, m6, m4, q1032 ;  t2a    t10
+    vpblendd             m4, m5, m7, 0xcc  ;  t15a   t7
+    shufps               m5, m5, m7, q1032 ;  t14a   t6
+    shufps               m7, m2, m0, q1032 ;  out14 -out15
+    vpblendd             m0, m0, m2, 0x33  ; -out1   out0
+    paddw                m2, m5, m4        ; -out5   out4
+    psubw                m5, m4            ;  out10 -out11
+    psubw                m4, m6, m3        ;  out8  -out9
+    paddw                m3, m6            ; -out7   out6
+    shufps               m6, m8, m1, q1032 ;  out12 -out13
+    vpblendd             m1, m1, m8, 0x33  ; -out3   out2
+    REPX  {pmulhrsw x, m12}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal).main
+    vpbroadcastd         m9, [o(pw_16384)]
+    pslld               m10, m9, 17
+    psubw               m10, m9 ; -16384, 16384
+    vperm2i128           m9, m4, m0, 0x31
+    vinserti128          m0, m4, xm0, 1
+    vperm2i128           m8, m5, m1, 0x31
+    vinserti128          m4, m5, xm1, 1
+    vperm2i128           m5, m7, m3, 0x31
+    vinserti128          m3, m7, xm3, 1
+    vinserti128          m1, m6, xm2, 1
+    vperm2i128           m6, m6, m2, 0x31
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    punpcklwd            m0, m3, m1
+    punpckhwd            m3, m1
+    jmp m(idct_8x16_internal).pass1_end2
+.pass2:
+    call m(iadst_8x16_internal).main
+    vpbroadcastd         m8, [o(pw_2048)]
+    vpbroadcastd        xm9, [o(pw_4096)]
+    psubw                m8, m9
+    vpermq               m9, m0, q3120
+    vpermq               m0, m7, q2031
+    vpermq               m7, m1, q3120
+    vpermq               m1, m6, q2031
+    vpermq               m6, m2, q3120
+    vpermq               m2, m5, q2031
+    vpermq               m5, m3, q3120
+    vpermq               m3, m4, q2031
+    pmulhrsw             m0, m8
+    pmulhrsw             m1, m8
+    pmulhrsw             m2, m8
+    pmulhrsw             m3, m8
+    pmulhrsw             m4, m5, m8
+    pmulhrsw             m5, m6, m8
+    pmulhrsw             m6, m7, m8
+    pmulhrsw             m7, m9, m8
+    jmp m(idct_8x16_internal).end3
+
+INV_TXFM_8X16_FN identity, dct,      7
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                xm3,     [cq+16*0]
+    mova                xm2,     [cq+16*2]
+    add                  cq, 16*8
+    vinserti128          m3, m3, [cq+16*0], 1
+    vinserti128          m2, m2, [cq+16*2], 1
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    mova                xm4,     [cq-16*4]
+    mova                xm5,     [cq-16*2]
+    vinserti128          m4, m4, [cq+16*4], 1
+    vinserti128          m5, m5, [cq+16*6], 1
+    mova                xm7,     [cq-16*7]
+    mova                xm6,     [cq-16*5]
+    vinserti128          m7, m7, [cq+16*1], 1
+    vinserti128          m6, m6, [cq+16*3], 1
+    mova                xm8,     [cq-16*3]
+    mova                xm0,     [cq-16*1]
+    vinserti128          m8, m8, [cq+16*5], 1
+    vinserti128          m0, m0, [cq+16*7], 1
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m5
+    punpckhwd            m4, m5
+    punpcklwd            m5, m7, m6
+    punpckhwd            m7, m6
+    punpcklwd            m6, m8, m0
+    punpckhwd            m8, m0
+    REPX   {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    punpckldq            m4, m5, m6
+    punpckhdq            m5, m6
+    punpckldq            m6, m7, m8
+    punpckhdq            m7, m8
+    jmp                tx2q
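+; 16-point identity transform: <<2 followed by *(5793*4)/2^15 scales by 2*sqrt(2)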
+.pass2:
+    vpbroadcastd         m8, [o(pw_5793x4)]
+    REPX {psllw    x, 2       }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {vpermq   x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pmulhrsw x, m8      }, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp m(idct_8x16_internal).end
+
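+; Add two rows of 16-bit residuals to 16 destination pixels each and store the
+; results with unsigned saturation.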
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+    pmovzxbw            m%3, [dstq+%5]
+%ifnum %1
+    paddw               m%3, m%1
+%else
+    paddw               m%3, %1
+%endif
+    pmovzxbw            m%4, [dstq+%6]
+%ifnum %2
+    paddw               m%4, m%2
+%else
+    paddw               m%4, %2
+%endif
+    packuswb            m%3, m%4
+    vpermq              m%3, m%3, q3120
+    mova          [dstq+%5], xm%3
+    vextracti128  [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 16x4, 11
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    vpbroadcastd        xm3, [o(pw_2896x8)]
+    pmulhrsw            xm3, [cq]
+    vpbroadcastd        xm0, [o(pw_16384)]
+    vpbroadcastd        xm1, [o(pw_5793x4)]
+    pmulhrsw            xm3, xm0
+    psrlw               xm0, 3 ; pw_2048
+    paddw               xm3, xm3
+    pmulhrsw            xm3, xm1
+    pmulhrsw            xm3, xm0
+    punpcklwd           xm3, xm3
+    punpckldq           xm1, xm3, xm3
+    punpckhdq           xm3, xm3
+    vpbroadcastq         m0, xm1
+    vpermq               m1, m1, q1111
+    vpbroadcastq         m2, xm3
+    vpermq               m3, m3, q1111
+    jmp m(iadst_16x4_internal).end2
+%elifidn %1_%2, identity_dct
+    mova                xm0,     [cq+16*0]
+    mova                xm2,     [cq+16*1]
+    vinserti128          m0, m0, [cq+16*4], 1
+    vinserti128          m2, m2, [cq+16*5], 1
+    mova                xm1,     [cq+16*2]
+    mova                xm3,     [cq+16*3]
+    vinserti128          m1, m1, [cq+16*6], 1
+    vinserti128          m3, m3, [cq+16*7], 1
+    vpbroadcastd         m4, [o(pw_5793x4)]
+    vpbroadcastd         m5, [o(pw_16384)]
+    packusdw             m0, m2
+    packusdw             m1, m3
+    packusdw             m0, m1
+    vpbroadcastd         m1, [o(pw_2896x8)]
+    psllw                m0, 2
+    pmulhrsw             m0, m4
+    pmulhrsw             m0, m5
+    psrlw                m5, 3 ; pw_2048
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m5
+    mov                 r3d, 2
+.end:
+    pxor                 m3, m3
+.end_loop:
+    mova          [cq+32*0], m3
+    mova          [cq+32*1], m3
+    add                  cq, 32*2
+    WRITE_16X2            0, 0, 1, 2, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*2]
+    dec                 r3d
+    jg .end_loop
+    RET
+%else
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+%ifidn %2, dct
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    mov                 r2d, 2
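+; DC-only path, also entered from the 16x8 and 16x16 dct_dct wrappers with
+; r2d preset to the number of 2-row iterations.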
+.dconly:
+    pmulhrsw            xm0, xm2
+    movd                xm2, [pw_2048] ; intentionally rip-relative
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pxor                 m3, m3
+.dconly_loop:
+    mova                xm1, [dstq]
+    vinserti128          m1, m1, [dstq+strideq], 1
+    punpckhbw            m2, m1, m3
+    punpcklbw            m1, m3
+    paddw                m2, m0
+    paddw                m1, m0
+    packuswb             m1, m2
+    mova             [dstq], xm1
+    vextracti128 [dstq+strideq], m1, 1
+    lea                dstq, [dstq+strideq*2]
+    dec                 r2d
+    jg .dconly_loop
+    RET
+%else ; adst / flipadst
+    movd                xm2, [o(pw_16384)]
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pmulhrsw             m0, [o(iadst4_dconly2a)]
+    vpbroadcastd         m3, [o(pw_2048)]
+    mov                [cq], eobd
+    pmulhrsw             m3, m0
+%ifidn %2, adst
+    vpbroadcastq         m0, xm3
+    vpermq               m1, m3, q1111
+    vpermq               m2, m3, q2222
+    vpermq               m3, m3, q3333
+%else ; flipadst
+    vpermq               m0, m3, q3333
+    vpermq               m1, m3, q2222
+    vpermq               m2, m3, q1111
+    vpbroadcastq         m3, xm3
+%endif
+    jmp m(iadst_16x4_internal).end3
+%endif
+%endif
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct,      0
+INV_TXFM_16X4_FN dct, adst,     0
+INV_TXFM_16X4_FN dct, flipadst, 0
+INV_TXFM_16X4_FN dct, identity, 3
+
+cglobal idct_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                xm0, [cq+16*0]
+    mova                xm1, [cq+16*1]
+    mova                xm2, [cq+16*2]
+    mova                xm3, [cq+16*3]
+    mova                xm4, [cq+16*4]
+    mova                xm5, [cq+16*5]
+    mova                xm6, [cq+16*6]
+    mova                xm7, [cq+16*7]
+    call m(idct_4x16_internal).main
+    vinserti128          m6, m2, xm6, 1
+    vinserti128          m2, m0, xm4, 1
+    vinserti128          m0, m1, xm5, 1
+    vinserti128          m1, m3, xm7, 1
+    punpcklwd            m3, m2, m6
+    punpckhwd            m2, m6
+    vpbroadcastd         m6, [o(pw_16384)]
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    mova                 m1, m6
+    jmp m(iadst_16x4_internal).pass1_end
+.pass2:
+    call .main
+    jmp m(iadst_16x4_internal).end
+ALIGN function_align
+.main:
+    vpbroadcastd         m6, [o(pd_2048)]
+    IDCT4_1D              0, 1, 2, 3, 4, 5, 6
+    ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q1230
+    vpermq               m3, [cq+32*3], q2103
+    vpermq               m1, [cq+32*1], q1230
+    vpermq               m2, [cq+32*2], q2103
+    call m(iadst_4x16_internal).main2
+    pshufd               m2, m2, q1032
+    punpcklwd            m4, m3, m1
+    punpcklwd            m5, m2, m0
+    punpckhwd            m0, m1
+    punpckhwd            m2, m3
+    vpbroadcastd         m1, [o(pw_16384)]
+    vinserti128          m3, m0, xm2, 1
+    vperm2i128           m2, m0, m2, 0x31
+    vinserti128          m0, m4, xm5, 1
+    vperm2i128           m4, m4, m5, 0x31
+    psubw                m6, m7, m1
+.pass1_end:
+    pmulhrsw             m3, m1
+    pmulhrsw             m2, m6
+    pmulhrsw             m4, m1
+    pmulhrsw             m0, m6
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    call .main
+.end:
+    vpbroadcastd         m4, [o(pw_2048)]
+    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
+.end2:
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+.end3:
+    WRITE_16X2            0, 1, 4, 5, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*2]
+    WRITE_16X2            2, 3, 4, 5, strideq*0, strideq*1
+    RET
+ALIGN function_align
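+; 4-point inverse ADST across 16 columns, computed with 32-bit intermediates
+; (pmaddwd) before packing back to words.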
+.main:
+    vpbroadcastd         m7, [o(pw_3803_1321)]
+    vpbroadcastd         m8, [o(pw_m1321_2482)]
+    vpbroadcastd         m9, [o(pw_2482_3344)]
+    punpcklwd            m4, m2, m0 ; in2 in0 l
+    psubw                m6, m0, m2
+    punpckhwd            m2, m0     ; in2 in0 h
+    paddw                m6, m3     ; t2
+    pmaddwd              m0, m7, m4 ; t0:02 l
+    pmaddwd              m7, m2     ; t0:02 h
+    pmaddwd              m4, m8     ; t1:02 l
+    pmaddwd              m8, m2     ; t1:02 h
+    punpckhwd            m2, m3, m1 ; in3 in1 h
+    punpcklwd            m3, m1     ; in3 in1 l
+    vpbroadcastd         m1, [o(pd_2048)]
+    pmaddwd              m5, m9, m3
+    pmaddwd              m9, m2
+    paddd                m0, m1
+    paddd                m7, m1
+    paddd                m0, m5     ; t0 + t3 + 2048 l
+    paddd                m7, m9     ; t0 + t3 + 2048 h
+    vpbroadcastd         m9, [o(pw_m3803_3344)]
+    pmaddwd              m5, m9, m2
+    pmaddwd              m9, m3
+    paddd                m5, m1     ; t1:13 + 2048 h
+    paddd                m1, m9     ; t1:13 + 2048 l
+    vpbroadcastd         m9, [o(pw_m3803_m6688)]
+    pmaddwd              m2, m9
+    pmaddwd              m3, m9
+    paddd                m5, m8     ; t1 + t3 + 2048 h
+    paddd                m1, m4     ; t1 + t3 + 2048 l
+    paddd                m8, m7
+    paddd                m4, m0
+    paddd                m2, m8     ; t0 + t1 - t3 + 2048 h
+    paddd                m3, m4     ; t0 + t1 - t3 + 2048 l
+    REPX      {psrad x, 12}, m0, m7, m5, m1, m2, m3
+    packssdw             m0, m7
+    packssdw             m1, m5
+    packssdw             m3, m2
+    vpbroadcastd         m2, [o(pw_3344x8)]
+    pmulhrsw             m2, m6
+    ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q1230
+    vpermq               m3, [cq+32*3], q2103
+    vpermq               m1, [cq+32*1], q1230
+    vpermq               m2, [cq+32*2], q2103
+    call m(iadst_4x16_internal).main2
+    pshufd               m2, m2, q1032
+    punpckhwd            m4, m3, m2
+    punpckhwd            m5, m1, m0
+    punpcklwd            m0, m2
+    punpcklwd            m1, m3
+    vpbroadcastd         m6, [o(pw_16384)]
+    vinserti128          m3, m0, xm1, 1
+    vperm2i128           m2, m0, m1, 0x31
+    vinserti128          m0, m4, xm5, 1
+    vperm2i128           m4, m4, m5, 0x31
+    psubw                m1, m7, m6
+    jmp m(iadst_16x4_internal).pass1_end
+ALIGN function_align
+.pass2:
+    call m(iadst_16x4_internal).main
+    vpbroadcastd         m4, [o(pw_2048)]
+    REPX   {pmulhrsw x, m4}, m3, m2, m1, m0
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+    WRITE_16X2            3, 2, 4, 5, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*2]
+    WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
+    RET
+
+INV_TXFM_16X4_FN identity, dct,      15
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                xm2,     [cq+16*0]
+    mova                xm4,     [cq+16*1]
+    vinserti128          m2, m2, [cq+16*4], 1
+    vinserti128          m4, m4, [cq+16*5], 1
+    mova                xm0,     [cq+16*2]
+    mova                xm1,     [cq+16*3]
+    vinserti128          m0, m0, [cq+16*6], 1
+    vinserti128          m1, m1, [cq+16*7], 1
+    vpbroadcastd         m5, [o(pw_5793x4)]
+    punpcklwd            m3, m2, m4
+    punpckhwd            m2, m4
+    punpcklwd            m4, m0, m1
+    punpckhwd            m0, m1
+    REPX       {psllw x, 2}, m3, m2, m4, m0
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    REPX   {pmulhrsw x, m5}, m1, m3, m2, m4
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpcklqdq           m0, m1, m2
+    punpckhqdq           m1, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m4, [o(pw_5793x4)]
+    REPX   {paddw    x, x }, m0, m1, m2, m3
+    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
+    jmp m(iadst_16x4_internal).end
+
+%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 16x8, 13
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 4
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+%elifidn %1_%2, dct_identity
+    vbroadcasti128       m7, [cq]
+    vpbroadcastd         m0, [o(pw_2896x8)]
+    vpbroadcastd         m1, [o(pw_16384)]
+    pxor                xm2, xm2
+    mova               [cq], xm2
+    pmulhrsw             m7, m0
+    pmulhrsw             m7, m0
+    pmulhrsw             m7, m1
+    psrlw                m1, 2 ; pw_4096
+    pmulhrsw             m7, m1
+    punpcklwd            m3, m7, m7
+    punpckhwd            m7, m7
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    pshufd               m4, m7, q0000
+    pshufd               m5, m7, q1111
+    pshufd               m6, m7, q2222
+    pshufd               m7, m7, q3333
+    lea                  r3, [strideq*3]
+    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
+    jmp m(idct_16x8_internal).end4
+%elifidn %1_%2, identity_dct
+    mova                 m0, [cq+32*0]
+    packusdw             m0, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    packusdw             m2, [cq+32*3]
+    mova                 m1, [cq+32*4]
+    packusdw             m1, [cq+32*5]
+    mova                 m3, [cq+32*6]
+    packusdw             m3, [cq+32*7]
+    vpbroadcastd         m4, [o(pw_2896x8)]
+    vpbroadcastd         m5, [o(pw_5793x4)]
+    packusdw             m0, m2
+    packusdw             m1, m3
+    vpbroadcastd         m2, [o(pw_16384)]
+    packusdw             m0, m1
+    vpermq               m1, m0, q3322
+    vpermq               m0, m0, q1100
+    punpcklwd            m0, m1
+    pmulhrsw             m0, m4
+    psllw                m0, 2
+    pmulhrsw             m0, m5
+    pmulhrsw             m0, m2
+    psrlw                m2, 3 ; pw_2048
+    pmulhrsw             m0, m4
+    pmulhrsw             m0, m2
+    mov                 r3d, 4
+    jmp m(inv_txfm_add_identity_dct_16x4).end
+%endif
+%endmacro
+
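+; Load the 16x8 coefficients pre-scaled by 2896/4096 (~1/sqrt(2));
+; shuf_odd selects the qword permutation applied to the odd rows.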
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+    vpbroadcastd         m8, [o(pw_2896x8)]
+    vpermq               m0, [cq+32*0], q3120
+    add                  cq, 32*4
+    vpermq               m7, [cq+32*3], q%1
+    vpermq               m1, [cq-32*3], q%1
+    vpermq               m6, [cq+32*2], q3120
+    vpermq               m2, [cq-32*2], q3120
+    vpermq               m5, [cq+32*1], q%1
+    vpermq               m3, [cq-32*1], q%1
+    vpermq               m4, [cq+32*0], q3120
+    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct,      0
+INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 3120
+    call m(idct_8x16_internal).main
+    vpbroadcastd        m10, [o(pw_16384)]
+    punpckhwd            m8, m0, m2
+    punpcklwd            m0, m2
+    punpckhwd            m2, m1, m3
+    punpcklwd            m1, m3
+    punpcklwd            m9, m4, m6
+    punpckhwd            m4, m6
+    punpcklwd            m6, m5, m7
+    punpckhwd            m5, m7
+    REPX  {pmulhrsw x, m10}, m8, m1, m4, m6
+.pass1_end:
+    REPX  {pmulhrsw x, m10}, m0, m2, m9, m5
+    punpckhwd            m3, m0, m8
+    punpcklwd            m0, m8
+    punpckhwd            m8, m2, m1
+    punpcklwd            m2, m1
+    punpcklwd            m7, m9, m4
+    punpckhwd            m9, m4
+    punpcklwd            m4, m5, m6
+    punpckhwd            m5, m6
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m8
+    punpckhdq            m3, m8
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckldq            m8, m9, m5
+    punpckhdq            m9, m5
+    vperm2i128           m4, m0, m6, 0x31
+    vinserti128          m0, m0, xm6, 1
+    vperm2i128           m5, m1, m7, 0x31
+    vinserti128          m1, m1, xm7, 1
+    vperm2i128           m6, m2, m8, 0x31
+    vinserti128          m2, m2, xm8, 1
+    vperm2i128           m7, m3, m9, 0x31
+    vinserti128          m3, m3, xm9, 1
+    jmp                tx2q
+.pass2:
+    call .main
+    vpbroadcastd         m8, [o(pw_2048)]
+.end:
+    REPX   {pmulhrsw x, m8}, m0, m2, m4, m6
+.end2:
+    REPX   {pmulhrsw x, m8}, m1, m3, m5, m7
+    lea                  r3, [strideq*3]
+    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
+.end3:
+    pxor                 m0, m0
+    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+.end4:
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
+    RET
+ALIGN function_align
+.main:
+    vpbroadcastd        m10, [o(pd_2048)]
+.main2:
+    IDCT8_1D              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+    ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 1302
+    call m(iadst_8x16_internal).main2
+    vpbroadcastd        m10, [o(pw_16384)]
+    psubw               m11, m9, m10
+    punpcklwd            m8, m0, m2
+    punpckhwd            m0, m2
+    punpckhwd            m2, m1, m3
+    punpcklwd            m1, m3
+    punpcklwd            m9, m4, m6
+    punpckhwd            m4, m6
+    punpckhwd            m6, m5, m7
+    punpcklwd            m5, m7
+    REPX  {pmulhrsw x, m11}, m8, m1, m4, m6
+    jmp m(idct_16x8_internal).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    vpbroadcastd         m9, [o(pw_2048)]
+    pxor                 m8, m8
+    psubw                m8, m9
+    REPX   {pmulhrsw x, m9}, m0, m2, m4, m6
+    jmp m(idct_16x8_internal).end2
+ALIGN function_align
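+; 8-point inverse ADST applied across 16 columns, one 16-wide row per register.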
+.main:
+    vpbroadcastd        m10, [o(pd_2048)]
+    ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
+    ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
+    ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
+    ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
+    psubw                m8, m2, m6 ; t6
+    paddw                m2, m6     ; t2
+    psubw                m6, m0, m4 ; t4
+    paddw                m0, m4     ; t0
+    psubw                m4, m5, m1 ; t7
+    paddw                m5, m1     ; t3
+    psubw                m1, m7, m3 ; t5
+    paddw                m7, m3     ; t1
+    ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
+    ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
+    psubw                m9, m6, m8 ;  t7
+    paddw                m6, m8     ;  out6
+    vpbroadcastd         m8, [o(pw_2896x8)]
+    psubw                m3, m7, m5 ;  t3
+    paddw                m7, m5     ; -out7
+    psubw                m5, m0, m2 ;  t2
+    paddw                m0, m2     ;  out0
+    psubw                m2, m1, m4 ;  t6
+    paddw                m1, m4     ; -out1
+    psubw                m4, m5, m3
+    paddw                m3, m5
+    psubw                m5, m2, m9
+    paddw                m2, m9
+    pmulhrsw             m2, m8     ;  out2
+    pmulhrsw             m3, m8     ; -out3
+    pmulhrsw             m4, m8     ;  out4
+    pmulhrsw             m5, m8     ; -out5
+    ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 1302
+    call m(iadst_8x16_internal).main2
+    vpbroadcastd        m10, [o(pw_16384)]
+    psubw                m9, m10
+    punpcklwd            m8, m6, m4
+    punpckhwd            m6, m4
+    punpcklwd            m4, m7, m5
+    punpckhwd            m7, m5
+    punpckhwd            m5, m3, m1
+    punpcklwd            m3, m1
+    punpckhwd            m1, m2, m0
+    punpcklwd            m2, m0
+    REPX  {pmulhrsw x, m10}, m8, m4, m5, m1
+    REPX  {pmulhrsw x, m9 }, m6, m7, m3, m2
+    punpcklwd            m0, m7, m4
+    punpckhwd            m7, m4
+    punpckhwd            m4, m6, m8
+    punpcklwd            m6, m8
+    punpckhwd            m8, m3, m5
+    punpcklwd            m3, m5
+    punpcklwd            m5, m2, m1
+    punpckhwd            m2, m1
+    punpckhdq            m1, m0, m6
+    punpckldq            m0, m6
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckhdq            m4, m3, m5
+    punpckldq            m3, m5
+    punpckldq            m5, m8, m2
+    punpckhdq            m8, m2
+    vinserti128          m2, m6, xm5, 1
+    vperm2i128           m6, m6, m5, 0x31
+    vperm2i128           m5, m1, m4, 0x31
+    vinserti128          m1, m1, xm4, 1
+    vperm2i128           m4, m0, m3, 0x31
+    vinserti128          m0, m0, xm3, 1
+    vinserti128          m3, m7, xm8, 1
+    vperm2i128           m7, m7, m8, 0x31
+    jmp                tx2q
+.pass2:
+    call m(iadst_16x8_internal).main
+    vpbroadcastd         m9, [o(pw_2048)]
+    pxor                 m8, m8
+    psubw                m8, m9
+    pmulhrsw            m10, m7, m8
+    pmulhrsw             m7, m0, m9
+    pmulhrsw             m0, m6, m9
+    pmulhrsw             m6, m1, m8
+    pmulhrsw             m1, m5, m8
+    pmulhrsw             m5, m2, m9
+    pmulhrsw             m2, m4, m9
+    pmulhrsw             m4, m3, m8
+    lea                  r3, [strideq*3]
+    WRITE_16X2           10, 0, 8, 9, strideq*0, strideq*1
+    WRITE_16X2            1, 2, 0, 1, strideq*2, r3
+    jmp m(idct_16x8_internal).end3
+
+INV_TXFM_16X8_FN identity, dct,      15
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+    mova                xm7,     [cq+16*0]
+    mova                xm2,     [cq+16*1]
+    add                  cq, 16*8
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    vinserti128          m7, m7, [cq+16*0], 1
+    vinserti128          m2, m2, [cq+16*1], 1
+    mova                xm6,     [cq-16*6]
+    mova                xm4,     [cq-16*5]
+    vinserti128          m6, m6, [cq+16*2], 1
+    vinserti128          m4, m4, [cq+16*3], 1
+    mova                xm8,     [cq-16*4]
+    mova                xm5,     [cq-16*3]
+    vinserti128          m8, m8, [cq+16*4], 1
+    vinserti128          m5, m5, [cq+16*5], 1
+    mova                xm0,     [cq-16*2]
+    mova                xm1,     [cq-16*1]
+    vinserti128          m0, m0, [cq+16*6], 1
+    vinserti128          m1, m1, [cq+16*7], 1
+    vpbroadcastd         m9, [o(pw_5793x4)]
+    vpbroadcastd        m10, [o(pw_16384)]
+    REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
+    punpcklwd            m3, m7, m2
+    punpckhwd            m7, m2
+    punpcklwd            m2, m6, m4
+    punpckhwd            m6, m4
+    punpcklwd            m4, m8, m5
+    punpckhwd            m8, m5
+    punpcklwd            m5, m0, m1
+    punpckhwd            m0, m1
+    REPX       {psllw x, 2}, m3, m7, m2, m6, m4, m8, m5, m0
+    punpckldq            m1, m3, m2
+    punpckhdq            m3, m2
+    punpckldq            m2, m4, m5
+    punpckhdq            m4, m5
+    punpckldq            m5, m7, m6
+    punpckhdq            m7, m6
+    punpckldq            m6, m8, m0
+    punpckhdq            m8, m0
+    REPX   {pmulhrsw x, m9}, m1, m3, m2, m4, m5, m7, m6, m8
+    punpcklqdq           m0, m1, m2
+    punpckhqdq           m1, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m6
+    punpckhqdq           m5, m6
+    punpcklqdq           m6, m7, m8
+    punpckhqdq           m7, m8
+    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m8, [o(pw_4096)]
+    jmp m(idct_16x8_internal).end
+
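+; Rebase the constant pointer so the constants used by the following functions
+; stay within signed 8-bit displacement range.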
+%define o_base pw_5 + 128
+
+%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 16x16, 16
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 8
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+%elifidn %1_%2, dct_identity
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    pmulhrsw             m3, [cq]
+    vpbroadcastd         m0, [o(pw_8192)]
+    vpbroadcastd         m1, [o(pw_5793x4)]
+    vpbroadcastw         m4, [o(deint_shuf)] ; pb_0_1
+    pcmpeqb              m5, m5
+    pxor                 m6, m6
+    mova               [cq], m6
+    paddb                m5, m5 ; pb_m2
+    pmulhrsw             m3, m0
+    psrlw                m0, 2  ; pw_2048
+    psllw                m3, 2
+    pmulhrsw             m3, m1
+    pmulhrsw             m3, m0
+    mov                 r3d, 8
+.loop:
+    mova                xm1, [dstq]
+    vinserti128          m1, m1, [dstq+strideq*8], 1
+    pshufb               m0, m3, m4
+    psubb                m4, m5 ; += 2
+    punpckhbw            m2, m1, m6
+    punpcklbw            m1, m6
+    paddw                m2, m0
+    paddw                m1, m0
+    packuswb             m1, m2
+    mova             [dstq], xm1
+    vextracti128 [dstq+strideq*8], m1, 1
+    add                dstq, strideq
+    dec                 r3d
+    jg .loop
+    RET
+%elifidn %1_%2, identity_dct
+    movd                xm0,     [cq+32*0 ]
+    movd                xm2,     [cq+32*1 ]
+    movd                xm1,     [cq+32*2 ]
+    movd                xm3,     [cq+32*3 ]
+    vinserti128          m0, m0, [cq+32*8 ], 1
+    vinserti128          m2, m2, [cq+32*9 ], 1
+    vinserti128          m1, m1, [cq+32*10], 1
+    vinserti128          m3, m3, [cq+32*11], 1
+    punpcklwd            m0, m2
+    punpcklwd            m1, m3
+    punpckldq            m0, m1
+    movd                xm1,     [cq+32*4 ]
+    movd                xm3,     [cq+32*5 ]
+    movd                xm2,     [cq+32*6 ]
+    movd                xm4,     [cq+32*7 ]
+    vinserti128          m1, m1, [cq+32*12], 1
+    vinserti128          m3, m3, [cq+32*13], 1
+    vinserti128          m2, m2, [cq+32*14], 1
+    vinserti128          m4, m4, [cq+32*15], 1
+    punpcklwd            m1, m3
+    vpbroadcastd         m3, [o(pw_5793x4)]
+    punpcklwd            m2, m4
+    vpbroadcastd         m4, [o(pw_8192)]
+    punpckldq            m1, m2
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    punpcklqdq           m0, m1
+    psllw                m0, 2
+    pmulhrsw             m0, m3
+    pmulhrsw             m0, m4
+    psrlw                m4, 2 ; pw_2048
+    pmulhrsw             m0, m2
+    pmulhrsw             m0, m4
+    mov                 r3d, 8
+    jmp m(inv_txfm_add_identity_dct_16x4).end
+%endif
+%endmacro
+
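+; Load all 16 rows of coefficients; row 15 is also spilled to the stack so
+; that m15 can be repurposed during the main transform.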
+%macro ITX_16X16_LOAD_COEFS 0
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    add                  cq, 32*8
+    mova                 m4, [cq-32*4]
+    mova                 m5, [cq-32*3]
+    mova                 m6, [cq-32*2]
+    mova                 m7, [cq-32*1]
+    mova                 m8, [cq+32*0]
+    mova                 m9, [cq+32*1]
+    mova                m10, [cq+32*2]
+    mova                m11, [cq+32*3]
+    mova                m12, [cq+32*4]
+    mova                m13, [cq+32*5]
+    mova                m14, [cq+32*6]
+    mova                m15, [cq+32*7]
+    mova              [rsp], m15
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct,      0
+INV_TXFM_16X16_FN dct, identity, 15
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call .main
+.pass1_end:
+    vpbroadcastd         m1, [o(pw_8192)]
+    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    vextracti128 [rsp+16*5], m8, 1
+    mova         [rsp+16*1], xm8
+.pass1_end2:
+    vextracti128 [rsp+16*4], m0, 1
+    mova         [rsp+16*0], xm0
+    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+    pmulhrsw             m1, [rsp+32*1]
+    vperm2i128           m8, m1, m9, 0x31
+    vinserti128          m1, m1, xm9, 1
+    vperm2i128           m9, m2, m10, 0x31
+    vinserti128          m2, m2, xm10, 1
+    vperm2i128          m10, m3, m11, 0x31
+    vinserti128          m3, m3, xm11, 1
+    vperm2i128          m11, m4, m12, 0x31
+    vinserti128          m4, m4, xm12, 1
+    vperm2i128          m12, m5, m13, 0x31
+    vinserti128          m5, m5, xm13, 1
+    vperm2i128          m13, m6, m14, 0x31
+    vinserti128          m6, m6, xm14, 1
+    vperm2i128          m14, m7, m15, 0x31
+    vinserti128          m7, m7, xm15, 1
+    mova                m15, [rsp+32*2]
+.pass1_end3:
+    punpcklwd            m0, m9, m10
+    punpckhwd            m9, m10
+    punpcklwd           m10, m15, m8
+    punpckhwd           m15, m8
+    punpckhwd            m8, m11, m12
+    punpcklwd           m11, m12
+    punpckhwd           m12, m13, m14
+    punpcklwd           m13, m14
+    punpckhdq           m14, m11, m13
+    punpckldq           m11, m13
+    punpckldq           m13, m15, m9
+    punpckhdq           m15, m9
+    punpckldq            m9, m10, m0
+    punpckhdq           m10, m0
+    punpckhdq            m0, m8, m12
+    punpckldq            m8, m12
+    punpcklqdq          m12, m13, m8
+    punpckhqdq          m13, m8
+    punpcklqdq           m8, m9, m11
+    punpckhqdq           m9, m11
+    punpckhqdq          m11, m10, m14
+    punpcklqdq          m10, m14
+    punpcklqdq          m14, m15, m0
+    punpckhqdq          m15, m0
+    mova                 m0, [rsp]
+    mova              [rsp], m15
+    punpckhwd           m15, m4, m5
+    punpcklwd            m4, m5
+    punpckhwd            m5, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m6, m7
+    punpcklwd            m6, m7
+    punpckhwd            m7, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m4, m6
+    punpckhdq            m4, m6
+    punpckhdq            m6, m5, m7
+    punpckldq            m5, m7
+    punpckldq            m7, m15, m1
+    punpckhdq           m15, m1
+    punpckhqdq           m1, m0, m2
+    punpcklqdq           m0, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m7
+    punpckhqdq           m5, m7
+    punpckhqdq           m7, m6, m15
+    punpcklqdq           m6, m15
+    jmp                tx2q
+.pass2:
+    call .main
+.end:
+    vpbroadcastd         m1, [o(pw_2048)]
+    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    mova              [rsp], m6
+.end2:
+    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+    pmulhrsw             m1, [rsp+32*1]
+    lea                  r3, [strideq*3]
+    WRITE_16X2            0,  1,  6,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2        [rsp],  7,  0,  1, strideq*2, r3
+.end3:
+    pxor                 m2, m2
+    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            8,  9,  0,  1, strideq*0, strideq*1
+    WRITE_16X2           10, 11,  0,  1, strideq*2, r3
+    REPX {mova [cq+32*x], m2},  0,  1,  2,  3,  4,  5,  6,  7
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2           12, 13,  0,  1, strideq*0, strideq*1
+    WRITE_16X2           14, 15,  0,  1, strideq*2, r3
+    RET
+ALIGN function_align
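+; Full 16-point IDCT. in1, in9, in15 and some temporaries are kept on the
+; stack due to register pressure.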
+.main:
+    vpbroadcastd        m15, [o(pd_2048)]
+    mova [rsp+gprsize+32*1], m1
+    mova [rsp+gprsize+32*2], m9
+    IDCT8_1D              0,  2,  4,  6,  8, 10, 12, 14,  1,  9, 15
+    mova                 m1, [rsp+gprsize+32*2] ; in9
+    mova [rsp+gprsize+32*2], m14 ; tmp7
+    mova                 m9, [rsp+gprsize+32*1] ; in1
+    mova [rsp+gprsize+32*1], m10 ; tmp5
+    mova                m14, [rsp+gprsize+32*0] ; in15
+    mova [rsp+gprsize+32*0], m6  ; tmp3
+    IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
+    mova                 m6, [rsp+gprsize+32*1] ; tmp5
+    psubw               m15, m0, m14  ; out15
+    paddw                m0, m14      ; out0
+    psubw               m14, m2, m13  ; out14
+    paddw                m2, m13      ; out1
+    mova [rsp+gprsize+32*1], m2
+    psubw               m13, m4, m11  ; out13
+    paddw                m2, m4, m11  ; out2
+    psubw               m11, m8, m7   ; out11
+    paddw                m4, m8, m7   ; out4
+    mova                 m7, [rsp+gprsize+32*2] ; tmp7
+    psubw               m10, m6, m5   ; out10
+    paddw                m5, m6       ; out5
+    psubw                m8, m7, m9   ; out8
+    paddw                m7, m9       ; out7
+    psubw                m9, m12, m3  ; out9
+    paddw                m6, m12, m3  ; out6
+    mova                 m3, [rsp+gprsize+32*0] ; tmp3
+    psubw               m12, m3, m1   ; out12
+    paddw                m3, m1       ; out3
+    ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call .main
+    vpbroadcastd         m1, [o(pw_8192)]
+    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    vextracti128 [rsp+16*5], m8, 1
+    mova         [rsp+16*1], xm8
+    pxor                 m8, m8
+    psubw                m1, m8, m1
+    jmp m(idct_16x16_internal).pass1_end2
+ALIGN function_align
+.pass2:
+    call .main
+    vpbroadcastd         m1, [o(pw_2048)]
+    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    mova         [rsp+32*0], m6
+    pxor                 m6, m6
+    psubw                m1, m6, m1
+    jmp m(idct_16x16_internal).end2
+ALIGN function_align
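+; Full 16-point inverse ADST. in0, in4, in15 and some temporaries are kept on
+; the stack due to register pressure.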
+.main:
+    vpbroadcastd        m15, [o(pd_2048)]
+    mova [rsp+gprsize+32*1], m0
+    mova [rsp+gprsize+32*2], m4
+    ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
+    ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
+    ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
+    ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
+    psubw                m0, m2, m10  ; t10a
+    paddw                m2, m10      ; t2a
+    psubw               m10, m13, m5  ; t11a
+    paddw               m13, m5       ; t3a
+    psubw                m5, m6, m14  ; t14a
+    paddw                m6, m14      ; t6a
+    psubw               m14, m9, m1   ; t15a
+    paddw                m9, m1       ; t7a
+    ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
+    ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
+    psubw                m1, m10, m14 ; t14a
+    paddw               m10, m14      ; t10a
+    psubw               m14, m0, m5   ; t15a
+    paddw                m0, m5       ; t11a
+    psubw                m5, m2, m6   ; t6
+    paddw                m2, m6       ; t2
+    psubw                m6, m13, m9  ; t7
+    paddw               m13, m9       ; t3
+    ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
+    ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
+    mova                 m9, [rsp+gprsize+32*0] ; in15
+    mova [rsp+gprsize+32*0], m10 ; t10a
+    mova                 m4, [rsp+gprsize+32*1] ; in0
+    mova [rsp+gprsize+32*1], m6  ; t6a
+    mova                 m6, [rsp+gprsize+32*2] ; in4
+    mova [rsp+gprsize+32*2], m2  ; t2
+    ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
+    ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
+    ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
+    ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
+    psubw               m10, m4, m8  ; t8a
+    paddw                m8, m4      ; t0a
+    psubw                m4, m9, m7  ; t9a
+    paddw                m9, m7      ; t1a
+    psubw                m7, m6, m12 ; t12a
+    paddw                m6, m12     ; t4a
+    psubw               m12, m11, m3 ; t13a
+    paddw               m11, m3      ; t5a
+    ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
+    ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
+    psubw                m3, m9, m11 ; t5
+    paddw                m9, m11     ; t1
+    psubw               m11, m4, m12 ; t12a
+    paddw                m4, m12     ; t8a
+    paddw               m12, m8, m6  ; t0
+    psubw                m8, m6      ; t4
+    paddw                m6, m10, m7 ; t9a
+    psubw               m10, m7      ; t13a
+    ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
+    ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
+    mova                 m7, [rsp+gprsize+32*0] ; t10a
+    mova                 m2, [rsp+gprsize+32*1] ; t6a
+    paddw               m15, m9, m13  ; -out15
+    psubw                m9, m13      ;  t3a
+    paddw               m13, m11, m1  ; -out13
+    psubw               m11, m1       ;  t15a
+    psubw                m1, m4, m7   ;  t10
+    paddw                m7, m4       ; -out1
+    psubw                m4, m3, m2   ;  t6
+    paddw                m3, m2       ; -out3
+    paddw                m2, m10, m14 ;  out2
+    psubw               m10, m14      ;  t14a
+    paddw               m14, m6, m0   ;  out14
+    psubw                m6, m0       ;  t11
+    mova                 m0, [rsp+gprsize+32*2] ; t2
+    mova [rsp+gprsize+32*1], m7
+    psubw                m7, m12, m0  ;  t2a
+    paddw                m0, m12      ;  out0
+    paddw               m12, m8, m5   ;  out12
+    psubw                m8, m5       ;  t7
+    paddw                m5, m10, m11 ; -out5
+    psubw               m10, m11      ;  out10
+    psubw               m11, m4, m8   ; -out11
+    paddw                m4, m8       ;  out4
+    psubw                m8, m7, m9   ;  out8
+    paddw                m7, m9       ; -out7
+    psubw                m9, m1, m6   ; -out9
+    paddw                m6, m1       ;  out6
+    vpbroadcastd         m1, [o(pw_2896x8)]
+    REPX   {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+    ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call m(iadst_16x16_internal).main
+    vpbroadcastd         m1, [o(pw_8192)]
+    pmulhrsw             m6, m1
+    mova         [rsp+32*2], m6
+    pmulhrsw             m6, m1, m4
+    pmulhrsw             m4, m1, m10
+    pmulhrsw            m10, m1, m12
+    pmulhrsw            m12, m1, m2
+    pmulhrsw             m2, m1, m8
+    pmulhrsw             m8, m1, m14
+    pmulhrsw            m14, m1, m0
+    pxor                 m0, m0
+    psubw                m0, m1
+    REPX   {pmulhrsw x, m0}, m3, m5, m7, m11, m15
+    pmulhrsw             m1, m0, m9
+    pmulhrsw             m9, m0, m13
+    pmulhrsw             m0, [rsp+32*1]
+    mova         [rsp+16*0], xm15
+    mova         [rsp+16*1], xm7
+    vperm2i128          m15, m15, m7, 0x31
+    vinserti128          m7, m2, xm14, 1
+    vperm2i128          m14, m2, m14, 0x31
+    vinserti128          m2, m9, xm5, 1
+    vperm2i128           m9, m9, m5, 0x31
+    vinserti128          m5, m4, xm12, 1
+    vperm2i128          m12, m4, m12, 0x31
+    vinserti128          m4, m11, xm3, 1
+    vperm2i128          m11, m11, m3, 0x31
+    vinserti128          m3, m10, xm6, 1
+    vperm2i128          m10, m10, m6, 0x31
+    vinserti128          m6, m1, xm0, 1
+    vperm2i128          m13, m1, m0, 0x31
+    vinserti128          m1, m8, [rsp+32*2], 1
+    vperm2i128           m8, m8, [rsp+32*2], 0x31
+    jmp m(idct_16x16_internal).pass1_end3
+.pass2:
+    call m(iadst_16x16_internal).main
+    vpbroadcastd         m1, [o(pw_2048)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m8, m1
+    mova         [rsp+32*0], m0
+    mova         [rsp+32*2], m8
+    pxor                 m0, m0
+    psubw                m0, m1
+    pmulhrsw             m8, m0, m7
+    pmulhrsw             m7, m0, m9
+    pmulhrsw             m9, m1, m6
+    pmulhrsw             m6, m1, m10
+    pmulhrsw            m10, m0, m5
+    pmulhrsw             m5, m0, m11
+    pmulhrsw            m11, m1, m4
+    pmulhrsw             m4, m1, m12
+    pmulhrsw            m12, m0, m3
+    pmulhrsw             m3, m0, m13
+    pmulhrsw            m13, m1, m2
+    pmulhrsw             m1, m14
+    pmulhrsw            m14, m0, [rsp+32*1]
+    pmulhrsw             m0, m15
+    lea                  r3, [strideq*3]
+    WRITE_16X2            0,  1,  2,  0, strideq*0, strideq*1
+    mova                m15, [rsp+32*0]
+    WRITE_16X2            3,  4,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            5,  6,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            7, [rsp+32*2],  0,  1, strideq*2, r3
+    jmp m(idct_16x16_internal).end3
+
+INV_TXFM_16X16_FN identity, dct,      15
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+    mova                xm0,      [cq+16*0]
+    mova               xm15,      [cq+16*1]
+    mova                xm1,      [cq+16*2]
+    mova                xm8,      [cq+16*3]
+    mova                xm2,      [cq+16*4]
+    mova                xm9,      [cq+16*5]
+    mova                xm3,      [cq+16*6]
+    mova               xm10,      [cq+16*7]
+    add                  cq, 16*16
+    vinserti128          m0, m0,  [cq+16*0], 1
+    vinserti128         m15, m15, [cq+16*1], 1
+    mova                xm4,      [cq-16*8]
+    mova               xm11,      [cq-16*7]
+    vinserti128          m1, m1,  [cq+16*2], 1
+    vinserti128          m8, m8,  [cq+16*3], 1
+    mova                xm5,      [cq-16*6]
+    mova               xm12,      [cq-16*5]
+    vinserti128          m2, m2,  [cq+16*4], 1
+    vinserti128          m9, m9,  [cq+16*5], 1
+    mova                xm6,      [cq-16*4]
+    mova               xm13,      [cq-16*3]
+    vinserti128          m3, m3,  [cq+16*6], 1
+    vinserti128         m10, m10, [cq+16*7], 1
+    mova                xm7,      [cq-16*2]
+    mova               xm14,      [cq-16*1]
+    vinserti128          m4, m4,  [cq+16*8], 1
+    vinserti128         m11, m11, [cq+16*9], 1
+    vinserti128          m5, m5,  [cq+16*10], 1
+    vinserti128         m12, m12, [cq+16*11], 1
+    vinserti128          m6, m6,  [cq+16*12], 1
+    vinserti128         m13, m13, [cq+16*13], 1
+    vinserti128          m7, m7,  [cq+16*14], 1
+    vinserti128         m14, m14, [cq+16*15], 1
+    REPX   {psllw    x, 2 }, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
+                             m8,  m9,  m10, m11, m12, m13, m14, m15
+    mova              [rsp], m0
+    vpbroadcastd         m0, [o(pw_5793x4)]
+    REPX   {pmulhrsw x, m0},      m1,  m2,  m3,  m4,  m5,  m6,  m7, \
+                             m8,  m9,  m10, m11, m12, m13, m14, m15
+    pmulhrsw             m0, [rsp]
+    mova              [rsp], m1
+    vpbroadcastd         m1, [o(pw_8192)]
+    REPX   {pmulhrsw x, m1}, m0,       m2,  m3,  m4,  m5,  m6,  m7, \
+                             m8,  m9,  m10, m11, m12, m13, m14, m15
+    pmulhrsw             m1, [rsp]
+    mova              [rsp], m0
+    jmp m(idct_16x16_internal).pass1_end3
+ALIGN function_align
+.pass2:
+    vpbroadcastd        m15, [o(pw_5793x4)]
+    REPX  {psllw    x, 2  }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {pmulhrsw x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+    mova         [rsp+32*1], m1
+    mova                 m1, [rsp+32*0]
+    REPX  {psllw    x, 2  }, m8, m9, m10, m11, m12, m13, m14, m1
+    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
+    pmulhrsw            m15, m1
+    jmp m(idct_16x16_internal).end
+
+%define o_base iadst4_dconly2a + 128 ; rax-relative base for o() addressing in this section
+
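+; When is_rect2 is set, the loaded rows are pre-scaled by 2896/4096
+; (~1/sqrt(2)), the extra scale factor applied to rect2 transform sizes.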
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+    vpbroadcastd        m15, [o(pw_2896x8)]
+    pmulhrsw             m0, m15, [%1+%2*0]
+    pmulhrsw             m1, m15, [%1+%2*1]
+    pmulhrsw             m2, m15, [%1+%2*2]
+    pmulhrsw             m3, m15, [%1+%2*3]
+    pmulhrsw             m4, m15, [%1+%2*4]
+    pmulhrsw             m5, m15, [%1+%2*5]
+    pmulhrsw             m6, m15, [%1+%2*6]
+    pmulhrsw             m7, m15, [%1+%2*7]
+%else
+    mova                 m0, [%1+%2*0]
+    mova                 m1, [%1+%2*1]
+    mova                 m2, [%1+%2*2]
+    mova                 m3, [%1+%2*3]
+    mova                 m4, [%1+%2*4]
+    mova                 m5, [%1+%2*5]
+    mova                 m6, [%1+%2*6]
+    mova                 m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
+%if %3
+%if %3 == 1
+    vpbroadcastd        m15, [o(pw_2896x8)]
+%endif
+    pmulhrsw             m8, m15, [%1+%2*0]
+    pmulhrsw             m9, m15, [%1+%2*1]
+    pmulhrsw            m10, m15, [%1+%2*2]
+    pmulhrsw            m11, m15, [%1+%2*3]
+    pmulhrsw            m12, m15, [%1+%2*4]
+    pmulhrsw            m13, m15, [%1+%2*5]
+    pmulhrsw            m14, m15, [%1+%2*6]
+    pmulhrsw            m15,      [%1+%2*7]
+%else
+    mova                 m8, [%1+%2*0]
+    mova                 m9, [%1+%2*1]
+    mova                m10, [%1+%2*2]
+    mova                m11, [%1+%2*3]
+    mova                m12, [%1+%2*4]
+    mova                m13, [%1+%2*5]
+    mova                m14, [%1+%2*6]
+    mova                m15, [%1+%2*7]
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jz .dconly
+    PROLOGUE              0, 0, 16, 32*3, dst, stride, c, eob
+    %undef cmp
+    cmp                eobd, 106
+    jle .fast
+    LOAD_8ROWS      cq+32*1, 32*2
+    call m(idct_16x8_internal).main
+    vperm2i128          m11, m0, m4, 0x31
+    vinserti128          m0, m0, xm4, 1
+    vperm2i128           m4, m1, m5, 0x31
+    vinserti128          m1, m1, xm5, 1
+    vperm2i128           m5, m2, m6, 0x31
+    vinserti128          m2, m2, xm6, 1
+    vperm2i128           m6, m3, m7, 0x31
+    vinserti128          m3, m3, xm7, 1
+    pxor                 m7, m7
+    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
+    punpckhwd            m7, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    punpcklwd            m3, m11, m4
+    punpckhwd           m11, m4
+    punpckhwd            m4, m5, m6
+    punpcklwd            m5, m6
+    punpckhdq            m6, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m5
+    punpckhdq            m3, m5
+    punpckhdq            m5, m11, m4
+    punpckldq           m11, m4
+    punpckldq            m4, m7, m1
+    punpckhdq            m7, m1
+    punpckhqdq          m12, m6, m0
+    punpcklqdq           m0, m6     ; out4
+    punpckhqdq          m13, m7, m4
+    punpcklqdq           m4, m7     ; out5
+    punpckhqdq          m14, m3, m2
+    punpcklqdq           m2, m3     ; out6
+    punpckhqdq          m15, m5, m11
+    punpcklqdq          m11, m5     ; out7
+    mova         [rsp+32*0], m0
+    mova         [rsp+32*1], m4
+    mova         [rsp+32*2], m2
+.fast:
+    LOAD_8ROWS      cq+32*0, 32*2
+    call m(idct_16x8_internal).main
+    vperm2i128           m8, m0, m4, 0x31
+    vinserti128          m0, m0, xm4, 1
+    vperm2i128           m4, m1, m5, 0x31
+    vinserti128          m1, m1, xm5, 1
+    vperm2i128           m5, m2, m6, 0x31
+    vinserti128          m2, m2, xm6, 1
+    vperm2i128           m6, m3, m7, 0x31
+    vinserti128          m3, m3, xm7, 1
+    vpbroadcastd         m9, [o(pw_8192)]
+    pxor                 m7, m7
+    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
+    punpckhwd            m7, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m8, m4
+    punpcklwd            m8, m4
+    punpckhwd            m4, m5, m6
+    punpcklwd            m5, m6
+    punpckhdq            m6, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m8, m5
+    punpckhdq            m8, m5
+    punpckhdq            m5, m3, m4
+    punpckldq            m3, m4
+    punpckhdq            m4, m7, m1
+    punpckldq            m7, m1
+    punpcklqdq           m1, m7, m4
+    punpckhqdq           m7, m4     ; out9
+    punpckhqdq           m4, m2, m8 ; out10
+    punpcklqdq           m2, m8
+    punpckhqdq           m8, m3, m5
+    punpcklqdq           m3, m5
+    punpckhqdq           m5, m0, m6 ; out8
+    punpcklqdq           m0, m6
+    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
+    cmp                eobd, 106
+    jg .full
+    mova         [rsp+32*0], m5
+    mova         [rsp+32*1], m7
+    mova         [rsp+32*2], m4
+    pmulhrsw            m11, m9, m8
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    call .main_fast
+    jmp .pass2
+.dconly:
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm2
+    psrlw               xm2, 2 ; pw_2048
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mov                 r2d, 8
+    jmp m(inv_txfm_add_dct_dct_8x8).end2
+.full:
+    REPX   {pmulhrsw x, m9}, m12, m13, m14, m15
+    pmulhrsw             m6, m9, [rsp+32*2]
+    mova         [rsp+32*2], m4
+    pmulhrsw             m4, m9, [rsp+32*0]
+    mova         [rsp+32*0], m5
+    pmulhrsw             m5, m9, [rsp+32*1]
+    mova         [rsp+32*1], m7
+    pmulhrsw             m7, m9, m11
+    pmulhrsw            m11, m9, m8
+    call .main
+.pass2:
+    vpbroadcastd        m12, [o(pw_2048)]
+    REPX  {pmulhrsw x, m12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
+                             m8,  m9,  m10, m11,      m13, m14, m15
+    pmulhrsw            m12, [rsp]
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
+    mova         [rsp+32*0], m4
+    mova         [rsp+32*1], m6
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0,  1,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             2,  3,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4    [rsp+32*0],  5,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4    [rsp+32*1],  7,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             8,  9,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4            10, 11,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4            12, 13,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4            14, 15,  4,  6
+    RET
+ALIGN function_align
+.main_fast: ; bottom half is zero
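+    ; One element of every would-be input pair (see .main below) is zero, so
+    ; the initial rotations collapse into single-source multiplies
+    ; (ITX_MULHRSW_SHL3) instead of packed ITX_MUL2X_PACK butterflies.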
+    call m(idct_8x16_internal).main
+    mova                 m8, [rsp+gprsize+0*32]
+    mova [rsp+gprsize+0*32], m0
+    mova                 m9, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+1*32], m1
+    mova                 m0, [rsp+gprsize+2*32]
+    mova [rsp+gprsize+2*32], m6
+    punpcklwd            m1, m8, m8
+    punpckhwd            m8, m8
+    punpcklwd           m15, m9, m9
+    punpckhwd            m9, m9
+    punpcklwd           m14, m0, m0
+    punpckhwd            m0, m0
+    punpcklwd           m13, m11, m11
+    punpckhwd           m11, m11
+    ITX_MULHRSW_SHL3      1,  6,   201, 4091 ; t16a, t31a
+    ITX_MULHRSW_SHL3      8,  6,  m601, 4052 ; t23a, t24a
+    ITX_MULHRSW_SHL3     15,  6,   995, 3973 ; t20a, t27a
+    ITX_MULHRSW_SHL3      9,  6, m1380, 3857 ; t19a, t28a
+    ITX_MULHRSW_SHL3     14,  6,  1751, 3703 ; t18a, t29a
+    ITX_MULHRSW_SHL3      0,  6, m2106, 3513 ; t21a, t26a
+    ITX_MULHRSW_SHL3     13,  6,  2440, 3290 ; t22a, t25a
+    ITX_MULHRSW_SHL3     11,  6, m2751, 3035 ; t17a, t30a
+    jmp .main2
+ALIGN function_align
+.main:
+    call m(idct_8x16_internal).main
+    mova                 m8, [rsp+gprsize+0*32]
+    mova [rsp+gprsize+0*32], m0
+    mova                 m9, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+1*32], m1
+    mova                 m0, [rsp+gprsize+2*32]
+    mova [rsp+gprsize+2*32], m6
+    punpcklwd            m1, m15, m8  ; in31 in1
+    punpckhwd            m8, m15      ; in3  in29
+    punpcklwd           m15, m14, m9  ; in27 in5
+    punpckhwd            m9, m14      ; in7  in25
+    punpcklwd           m14, m13, m0  ; in23 in9
+    punpckhwd            m0, m13      ; in11 in21
+    punpcklwd           m13, m12, m11 ; in19 in13
+    punpckhwd           m11, m12      ; in15 in17
+    ITX_MUL2X_PACK        1,  6, 12, 10,  201, 4091, 3 ; t16a, t31a
+    ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
+    ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
+    ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
+    ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
+    ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
+    ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
+    ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
+.main2:
+    psubw                m6, m1, m11  ; t17 t30
+    paddw                m1, m11      ; t16 t31
+    psubw               m11, m9, m14  ; t18 t29
+    paddw                m9, m14      ; t19 t28
+    psubw               m14, m15, m0  ; t21 t26
+    paddw               m15, m0       ; t20 t27
+    psubw                m0, m8, m13  ; t22 t25
+    paddw                m8, m13      ; t23 t24
+    ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
+    ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
+    ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
+    ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
+    psubw               m13, m1, m9   ; t19a t28a
+    paddw                m1, m9       ; t16a t31a
+    psubw                m9, m8, m15  ; t20a t27a
+    paddw                m8, m15      ; t23a t24a
+    psubw               m15, m6, m11  ; t18  t29
+    paddw                m6, m11      ; t17  t30
+    psubw               m11, m0, m14  ; t21  t26
+    paddw                m0, m14      ; t22  t25
+    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 1 ; t18a t29a
+    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 1 ; t19  t28
+    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 1 ; t20  t27
+    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
+    vbroadcasti128      m12, [o(deint_shuf)]
+    REPX    {pshufb x, m12}, m0, m1, m6, m8
+    psubw               m14, m1, m8   ; t23  t24
+    paddw                m1, m8       ; t16  t31
+    psubw                m8, m6, m0   ; t22a t25a
+    paddw                m6, m0       ; t17a t30a
+    psubw                m0, m15, m11 ; t21  t26
+    paddw               m15, m11      ; t18  t29
+    psubw               m11, m13, m9  ; t20a t27a
+    paddw               m13, m9       ; t19a t28a
+    vpbroadcastd        m12, [o(pw_2896x8)]
+    punpcklqdq            m9, m11, m0 ; t20a t21
+    punpckhqdq           m11, m0      ; t27a t26
+    punpcklqdq            m0, m14, m8 ; t23  t22a
+    punpckhqdq           m14, m8      ; t24  t25a
+    psubw                 m8, m11, m9 ; t20  t21a
+    paddw                m11, m9      ; t27  t26a
+    psubw                 m9, m14, m0 ; t23a t22
+    paddw                m14, m0      ; t24a t25
+    REPX   {pmulhrsw x, m12}, m8, m9, m14, m11
+    punpcklqdq           m0, m1, m6   ; t16  t17a
+    punpckhqdq           m1, m6       ; t31  t30a
+    psubw               m10, m5, m8   ; out20 out21
+    paddw                m5, m8       ; out11 out10
+    psubw                m6, m3, m14  ; out24 out25
+    paddw                m3, m14      ; out7  out6
+    psubw                m8, m7, m0   ; out16 out17
+    paddw                m7, m0       ; out15 out14
+    mova                 m0, [rsp+gprsize+0*32]
+    punpcklqdq          m12, m13, m15 ; t19a t18
+    punpckhqdq          m13, m15      ; t28a t29
+    psubw               m15, m0, m1   ; out31 out30
+    paddw                m0, m1       ; out0  out1
+    mova                 m1, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+0*32], m6
+    mova                 m6, [rsp+gprsize+2*32]
+    psubw               m14, m1, m13  ; out28 out29
+    paddw                m1, m13      ; out3  out2
+    psubw               m13, m2, m11  ; out27 out26
+    paddw                m2, m11      ; out4  out5
+    psubw               m11, m4, m9   ; out23 out22
+    paddw                m4, m9       ; out8  out9
+    psubw                m9, m6, m12  ; out19 out18
+    paddw                m6, m12      ; out12 out13
+    ret
+
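+; Broadcast two 8-word sources and merge them so that each 128-bit lane holds
+; the matching 4-word halves of both: { %3[0-3], %4[0-3] | %3[4-7], %4[4-7] }.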
+%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
+    vbroadcasti128      m%1, [cq+16*%3]
+    vbroadcasti128      m%2, [cq+16*%4]
+    shufpd              m%1, m%1, m%2, 0x0c
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 8
+.dconly:
+    pmulhrsw            xm0, xm2
+    movd                xm2, [pw_2048] ; intentionally rip-relative: also entered with a different o_base in rax
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pxor                 m3, m3
+.dconly_loop:
+    mova                 m1, [dstq]
+    punpckhbw            m2, m1, m3
+    punpcklbw            m1, m3
+    paddw                m2, m0
+    paddw                m1, m0
+    packuswb             m1, m2
+    mova             [dstq], m1
+    add                dstq, strideq
+    dec                 r2d
+    jg .dconly_loop
+    RET
+.normal:
+    PROLOGUE              0, 0, 16, 32*3, dst, stride, c, eob
+    %undef cmp
+    LOAD_PACKED_16X2      0,  7,  0,  2 ; in0  in2
+    LOAD_PACKED_16X2      4,  7,  1,  3 ; in1  in3
+    LOAD_PACKED_16X2      1,  7,  4,  6 ; in4  in6
+    LOAD_PACKED_16X2      5,  7,  5,  7 ; in5  in7
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
+    add                  cq, 16*16
+    LOAD_PACKED_16X2      2,  7, -8, -6 ; in8  in10
+    LOAD_PACKED_16X2      6,  7, -7, -5 ; in9  in11
+    LOAD_PACKED_16X2      3,  7, -4, -2 ; in12 in14
+    LOAD_PACKED_16X2     11,  7, -3, -1 ; in13 in15
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
+    mova         [rsp+32*0], m4
+    mova         [rsp+32*1], m5
+    mova         [rsp+32*2], m6
+    cmp                eobd, 106
+    jg .full
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    call m(inv_txfm_add_dct_dct_8x32).main_fast
+    jmp .pass2
+.full:
+    LOAD_PACKED_16X2      4,  7,  0,  2 ; in16 in18
+    LOAD_PACKED_16X2     12,  7,  3,  1 ; in19 in17
+    LOAD_PACKED_16X2      5,  7,  4,  6 ; in20 in22
+    LOAD_PACKED_16X2     13,  7,  7,  5 ; in23 in21
+    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
+    add                  cq, 16*8
+    LOAD_PACKED_16X2      6,  7,  0,  2 ; in24 in26
+    LOAD_PACKED_16X2     14,  7,  3,  1 ; in27 in25
+    LOAD_PACKED_16X2      7,  8,  4,  6 ; in28 in30
+    LOAD_PACKED_16X2     15,  8,  7,  5 ; in31 in29
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
+    call m(inv_txfm_add_dct_dct_8x32).main
+.pass2:
+    vpbroadcastd        m12, [o(pw_8192)]
+    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
+    mova         [rsp+32*1], m9
+    mova         [rsp+32*2], m10
+    punpckhwd            m9, m0, m2
+    punpcklwd            m0, m2
+    punpckhwd            m2, m1, m3
+    punpcklwd            m1, m3
+    punpcklwd           m10, m4, m6
+    punpckhwd            m4, m6
+    punpcklwd            m6, m5, m7
+    punpckhwd            m5, m7
+    punpckhwd            m3, m0, m9
+    punpcklwd            m0, m9
+    punpckhwd            m9, m2, m1
+    punpcklwd            m2, m1
+    punpcklwd            m7, m10, m4
+    punpckhwd           m10, m4
+    punpcklwd            m4, m5, m6
+    punpckhwd            m5, m6
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m9
+    punpckhdq            m3, m9
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckldq            m9, m10, m5
+    punpckhdq           m10, m5
+    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
+    pmulhrsw            m12, [rsp+32*0]
+    mova         [rsp+32*0], m8
+    vperm2i128           m4, m0, m6, 0x31
+    vinserti128          m0, m0, xm6, 1
+    vperm2i128           m5, m1, m7, 0x31
+    vinserti128          m1, m1, xm7, 1
+    vperm2i128           m6, m2, m9, 0x31
+    vinserti128          m2, m2, xm9, 1
+    vperm2i128           m7, m3, m10, 0x31
+    vinserti128          m3, m3, xm10, 1
+    call m(idct_16x8_internal).main
+    vpbroadcastd         m8, [o(pw_2048)]
+    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    lea                  r2, [strideq*3]
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
+    lea                  r3, [dstq+strideq*4]
+    %define dstq r3
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
+    mova                 m0, [rsp+32*0]
+    mova                 m1, [rsp+32*1]
+    mova                 m2, [rsp+32*2]
+    punpckhwd            m7, m0, m2
+    punpcklwd            m0, m2
+    punpckhwd            m2, m1, m11
+    punpcklwd            m1, m11
+    punpckhwd            m4, m12, m14
+    punpcklwd           m12, m14
+    punpckhwd            m5, m13, m15
+    punpcklwd           m13, m15
+    punpckhwd            m3, m0, m7
+    punpcklwd            m0, m7
+    punpckhwd            m9, m2, m1
+    punpcklwd            m2, m1
+    punpcklwd            m7, m12, m4
+    punpckhwd           m12, m4
+    punpcklwd            m4, m5, m13
+    punpckhwd            m5, m13
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m9
+    punpckhdq            m3, m9
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckldq            m9, m12, m5
+    punpckhdq           m12, m5
+    vperm2i128           m4, m0, m6, 0x31
+    vinserti128          m0, m0, xm6, 1
+    vperm2i128           m5, m1, m7, 0x31
+    vinserti128          m1, m1, xm7, 1
+    vperm2i128           m6, m2, m9, 0x31
+    vinserti128          m2, m2, xm9, 1
+    vperm2i128           m7, m3, m12, 0x31
+    vinserti128          m3, m3, xm12, 1
+    call m(idct_16x8_internal).main2
+    vpbroadcastd         m8, [o(pw_2048)]
+    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    add                  r0, 16
+    add                  r3, 16
+    %define dstq r0
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
+    %define dstq r3
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
+    RET
+
+cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
+    vpbroadcastd         m9, [pw_5]
+    lea                  r4, [strideq*3]
+    sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
+.loop:
+    mova                xm0,     [cq+16* 0]
+    mova                xm1,     [cq+16* 4]
+    vinserti128          m0, m0, [cq+16* 1], 1
+    vinserti128          m1, m1, [cq+16* 5], 1
+    pxor                 m8, m8
+    mova          [cq+32*0], m8
+    mova          [cq+32*2], m8
+    add                  cq, 16*16
+    mova                xm2,     [cq-16* 8]
+    mova                xm3,     [cq-16* 4]
+    vinserti128          m2, m2, [cq-16* 7], 1
+    vinserti128          m3, m3, [cq-16* 3], 1
+    mova                xm4,     [cq+16* 0]
+    mova                xm5,     [cq+16* 4]
+    vinserti128          m4, m4, [cq+16* 1], 1
+    vinserti128          m5, m5, [cq+16* 5], 1
+    mova                xm6,     [cq+16* 8]
+    mova                xm7,     [cq+16*12]
+    vinserti128          m6, m6, [cq+16* 9], 1
+    vinserti128          m7, m7, [cq+16*13], 1
+    REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
+    REPX  {paddw     x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    call .transpose8x8
+    REPX  {psraw     x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_8X4             0,  4,  8, 10, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    WRITE_8X4             1,  5,  0,  4, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    WRITE_8X4             2,  6,  0,  4, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    WRITE_8X4             3,  7,  0,  4, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    sub                  cq, 16*16-32
+    lea                dstq, [dstq+r4*4]
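+    ; eobd was biased by -107 above, so bit 31 is set when eob < 107; adding
+    ; 0x80000000 sets the carry flag only if bit 31 was already set, making
+    ; the loop run once for eob < 107 and twice otherwise.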
+    add                eobd, 0x80000000
+    jnc .loop
+    RET
+ALIGN function_align
+.transpose8x8: ; two independent 8x8 word transposes, one per 128-bit lane
+    punpckhwd            m8, m4, m5
+    punpcklwd            m4, m5
+    punpckhwd            m5, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m6, m7
+    punpcklwd            m6, m7
+    punpckhwd            m7, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m4, m6
+    punpckhdq            m4, m6
+    punpckhdq            m6, m5, m7
+    punpckldq            m5, m7
+    punpckldq            m7, m8, m1
+    punpckhdq            m8, m1
+    punpckhqdq           m1, m0, m2
+    punpcklqdq           m0, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m7
+    punpckhqdq           m5, m7
+    punpckhqdq           m7, m6, m8
+    punpcklqdq           m6, m8
+    ret
+
+cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob
+    add                  cq, 16*8
+    vpbroadcastd         m9, [pw_4096]
+    lea                  r4, [strideq*3]
+    lea                  r5, [dstq+strideq*4]
+    sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
+.loop:
+    mova                xm0,     [cq-16*8]
+    mova                xm1,     [cq-16*7]
+    vinserti128          m0, m0, [cq+16*0], 1
+    vinserti128          m1, m1, [cq+16*1], 1
+    mova                xm2,     [cq-16*6]
+    mova                xm3,     [cq-16*5]
+    vinserti128          m2, m2, [cq+16*2], 1
+    vinserti128          m3, m3, [cq+16*3], 1
+    mova                xm4,     [cq-16*4]
+    mova                xm5,     [cq-16*3]
+    vinserti128          m4, m4, [cq+16*4], 1
+    vinserti128          m5, m5, [cq+16*5], 1
+    mova                xm6,     [cq-16*2]
+    mova                xm7,     [cq-16*1]
+    vinserti128          m6, m6, [cq+16*6], 1
+    vinserti128          m7, m7, [cq+16*7], 1
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
+    %define dstq r5
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
+    add                  cq, 16*16
+    add                  r0, 16
+    add                  r5, 16
+    add                eobd, 0x80000000
+    jnc .loop
+    RET
+
+%define o_base pw_5 + 128 ; re-base o() for the functions below
+
+%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
+%if %3
+    vpbroadcastd        m15, [o(pw_2896x8)]
+    pmulhrsw             m0, m15, [%1+%2* 0]
+    pmulhrsw             m1, m15, [%1+%2* 1]
+    pmulhrsw             m2, m15, [%1+%2* 2]
+    pmulhrsw             m3, m15, [%1+%2* 3]
+    pmulhrsw             m4, m15, [%1+%2* 4]
+    pmulhrsw             m5, m15, [%1+%2* 5]
+    pmulhrsw             m6, m15, [%1+%2* 6]
+    pmulhrsw             m7, m15, [%1+%2* 7]
+    pmulhrsw             m8, m15, [%1+%2* 8]
+    pmulhrsw             m9, m15, [%1+%2* 9]
+    pmulhrsw            m10, m15, [%1+%2*10]
+    pmulhrsw            m11, m15, [%1+%2*11]
+    pmulhrsw            m12, m15, [%1+%2*12]
+    pmulhrsw            m13, m15, [%1+%2*13]
+    pmulhrsw            m14, m15, [%1+%2*14]
+    pmulhrsw            m15,      [%1+%2*15]
+%else
+    mova                 m0, [%1+%2* 0]
+    mova                 m1, [%1+%2* 1]
+    mova                 m2, [%1+%2* 2]
+    mova                 m3, [%1+%2* 3]
+    mova                 m4, [%1+%2* 4]
+    mova                 m5, [%1+%2* 5]
+    mova                 m6, [%1+%2* 6]
+    mova                 m7, [%1+%2* 7]
+    mova                 m8, [%1+%2* 8]
+    mova                 m9, [%1+%2* 9]
+    mova                m10, [%1+%2*10]
+    mova                m11, [%1+%2*11]
+    mova                m12, [%1+%2*12]
+    mova                m13, [%1+%2*13]
+    mova                m14, [%1+%2*14]
+    mova                m15, [%1+%2*15]
+%endif
+    mova              [rsp], m15
+%if %4
+    pxor                m15, m15
+    REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
+                                8,  9, 10, 11, 12, 13, 14, 15
+%endif
+%endmacro
+
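+; Final sumsub of one even-half (idct16) register with the matching odd-half
+; row buffered at [%2]: the sum and difference are rounded with %5, added to
+; the destination pixels at dstq+%6 and at the mirrored row r2+%7, then packed
+; back to bytes and stored.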
+%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
+    mova                m%4, [%2]
+    paddw               m%3, m%1, m%4
+    psubw               m%1, m%4
+    pmovzxbw            m%4, [dstq+%6]
+    pmulhrsw            m%3, m%5
+    pmulhrsw            m%1, m%5
+    paddw               m%3, m%4
+    pmovzxbw            m%4, [r2+%7]
+    paddw               m%1, m%4
+    packuswb            m%3, m%1
+    vpermq              m%3, m%3, q3120
+    mova          [dstq+%6], xm%3
+    vextracti128    [r2+%7], m%3, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32, 4, 8, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jz .dconly
+    PROLOGUE              0, 0, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
+                                           base, tmp3
+    %undef cmp
+    LOAD_16ROWS          cq, 64, 1
+    call m(idct_16x16_internal).main
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    lea               tmp3q, [tmp1q+32*16]
+    mova                 m1, [rsp+32*1]
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_16384)]
+    call .transpose_2x8x8_round
+    mova                m15, [rsp+32*0]
+    mova         [tmp3q-32*4+ 0], xm0
+    vextracti128 [tmp3q+32*0+ 0], m0, 1
+    mova         [tmp3q-32*3+ 0], xm2
+    vextracti128 [tmp3q+32*1+ 0], m2, 1
+    mova         [tmp3q-32*2+ 0], xm4
+    vextracti128 [tmp3q+32*2+ 0], m4, 1
+    mova         [tmp3q-32*1+ 0], xm6
+    vextracti128 [tmp3q+32*3+ 0], m6, 1
+    mova         [tmp3q-32*4+16], xm8
+    vextracti128 [tmp3q+32*0+16], m8, 1
+    mova         [tmp3q-32*3+16], xm10
+    vextracti128 [tmp3q+32*1+16], m10, 1
+    mova         [tmp3q-32*2+16], xm12
+    vextracti128 [tmp3q+32*2+16], m12, 1
+    mova         [tmp3q-32*1+16], xm14
+    vextracti128 [tmp3q+32*3+16], m14, 1
+    cmp                eobd, 150
+    jg .full
+    vinserti128          m0, m1, xm9, 1
+    vperm2i128           m4, m1, m9, 0x31
+    vinserti128          m2, m5, xm13, 1
+    vperm2i128           m6, m5, m13, 0x31
+    vinserti128          m1, m3, xm11, 1
+    vperm2i128           m5, m3, m11, 0x31
+    vinserti128          m3, m7, xm15, 1
+    vperm2i128           m7, m7, m15, 0x31
+    call .main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp .idct16
+.dconly:
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 16
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+.full:
+    mova       [tmp1q-32*4], m1
+    mova       [tmp1q-32*3], m3
+    mova       [tmp1q-32*2], m5
+    mova       [tmp1q-32*1], m7
+    mova       [tmp1q+32*0], m9
+    mova       [tmp1q+32*1], m11
+    mova       [tmp1q+32*2], m13
+    mova       [tmp1q+32*3], m15
+    LOAD_16ROWS       cq+32, 64, 1
+    call m(idct_16x16_internal).main
+    lea                  r2, [tmp3q+32*8]
+    mova                 m1, [rsp+32*1]
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_16384)]
+    call .transpose_2x8x8_round
+    mova                m15, [rsp+32*0]
+    mova         [r2-32*4+ 0], xm0
+    vextracti128 [r2+32*0+ 0], m0, 1
+    mova         [r2-32*3+ 0], xm2
+    vextracti128 [r2+32*1+ 0], m2, 1
+    mova         [r2-32*2+ 0], xm4
+    vextracti128 [r2+32*2+ 0], m4, 1
+    mova         [r2-32*1+ 0], xm6
+    vextracti128 [r2+32*3+ 0], m6, 1
+    mova         [r2-32*4+16], xm8
+    vextracti128 [r2+32*0+16], m8, 1
+    mova         [r2-32*3+16], xm10
+    vextracti128 [r2+32*1+16], m10, 1
+    mova         [r2-32*2+16], xm12
+    vextracti128 [r2+32*2+16], m12, 1
+    mova         [r2-32*1+16], xm14
+    vextracti128 [r2+32*3+16], m14, 1
+    vinserti128          m8, m1, xm9, 1
+    vperm2i128          m12, m1, m9, 0x31
+    mova                xm0,     [tmp1q-32*4]
+    mova                xm1,     [tmp1q-32*3]
+    vinserti128          m0, m0, [tmp1q+32*0], 1
+    vinserti128          m1, m1, [tmp1q+32*1], 1
+    vinserti128         m10, m5, xm13, 1
+    vperm2i128          m14, m5, m13, 0x31
+    mova                xm4,     [tmp1q-32*4+16]
+    mova                xm5,     [tmp1q-32*3+16]
+    vinserti128          m4, m4, [tmp1q+32*0+16], 1
+    vinserti128          m5, m5, [tmp1q+32*1+16], 1
+    vinserti128          m9, m3, xm11, 1
+    vperm2i128          m13, m3, m11, 0x31
+    mova                xm2,     [tmp1q-32*2]
+    mova                xm3,     [tmp1q-32*1]
+    vinserti128          m2, m2, [tmp1q+32*2], 1
+    vinserti128          m3, m3, [tmp1q+32*3], 1
+    vinserti128         m11, m7, xm15, 1
+    vperm2i128          m15, m7, m15, 0x31
+    mova                xm6,     [tmp1q-32*2+16]
+    mova                xm7,     [tmp1q-32*1+16]
+    vinserti128          m6, m6, [tmp1q+32*2+16], 1
+    vinserti128          m7, m7, [tmp1q+32*3+16], 1
+    call .main_oddhalf
+    LOAD_8ROWS_H    r2-32*4, 32
+.idct16:
+    LOAD_8ROWS   tmp3q-32*4, 32
+    mova              [rsp], m15
+    call m(idct_16x16_internal).main
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    call .pass2_end
+    RET
+ALIGN function_align
+.main_oddhalf_fast: ; lower half is zero
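+    ; With the lower-half inputs zero, each initial rotation has one zero
+    ; operand, so the ITX_MULSUB_2W butterflies reduce to plain pmulhrsw
+    ; multiplies by broadcast constants.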
+    mova [rsp+gprsize+32*1], m7
+    pxor                 m7, m7
+    mova [rsp+gprsize+32*0], m7
+    mova [rsp+gprsize+32*2], m7
+    vpbroadcastd        m11, [o(pw_3703x8)]
+    vpbroadcastd         m7, [o(pw_1751x8)]
+    vpbroadcastd        m12, [o(pw_m1380x8)]
+    vpbroadcastd         m8, [o(pw_3857x8)]
+    vpbroadcastd        m13, [o(pw_3973x8)]
+    vpbroadcastd        m15, [o(pw_995x8)]
+    pmulhrsw            m11, m4  ; t29a
+    pmulhrsw             m4, m7  ; t18a
+    pmulhrsw            m12, m3  ; t19a
+    pmulhrsw             m3, m8  ; t28a
+    pmulhrsw            m13, m2  ; t27a
+    pmulhrsw             m2, m15 ; t20a
+    vpbroadcastd        m10, [o(pw_m2106x8)]
+    vpbroadcastd         m7, [o(pw_3513x8)]
+    vpbroadcastd         m9, [o(pw_3290x8)]
+    vpbroadcastd         m8, [o(pw_2440x8)]
+    vpbroadcastd        m14, [o(pw_m601x8)]
+    vpbroadcastd        m15, [o(pw_4052x8)]
+    pmulhrsw            m10, m5  ; t21a
+    pmulhrsw             m5, m7  ; t26a
+    pmulhrsw             m9, m6  ; t25a
+    pmulhrsw             m6, m8  ; t22a
+    pmulhrsw            m14, m1  ; t23a
+    pmulhrsw             m1, m15 ; t24a
+    vpbroadcastd        m15, [o(pd_2048)]
+    jmp .main2
+ALIGN function_align
+.main_oddhalf:
+    mova [rsp+gprsize+32*0], m15
+    mova [rsp+gprsize+32*1], m7
+    mova [rsp+gprsize+32*2], m8
+    vpbroadcastd        m15, [o(pd_2048)]
+    ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
+    ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
+    ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
+    ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
+    ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
+    ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
+.main2:
+    psubw                m7, m12, m4  ; t18
+    paddw               m12, m4       ; t19
+    psubw                m4, m2, m10  ; t21
+    paddw                m2, m10      ; t20
+    psubw               m10, m14, m6  ; t22
+    paddw               m14, m6       ; t23
+    psubw                m6, m1, m9   ; t25
+    paddw                m1, m9       ; t24
+    psubw                m9, m13, m5  ; t26
+    paddw               m13, m5       ; t27
+    psubw                m5, m3, m11  ; t29
+    paddw                m3, m11      ; t28
+    ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
+    ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
+    ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
+    psubw                m8, m14, m2  ; t20a
+    paddw               m14, m2       ; t23a
+    psubw                m2, m1, m13  ; t27a
+    paddw                m1, m13      ; t24a
+    psubw               m13, m6, m9   ; t21
+    paddw                m6, m9       ; t22
+    psubw                m9, m10, m4  ; t26
+    paddw               m10, m4       ; t25
+    ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
+    ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
+    mova                 m4, [rsp+gprsize+32*0] ; in31
+    mova [rsp+gprsize+32*0], m6  ; t22
+    mova                 m6, [rsp+gprsize+32*1] ; in15
+    mova [rsp+gprsize+32*1], m14 ; t23a
+    mova                m14, [rsp+gprsize+32*2] ; in17
+    mova [rsp+gprsize+32*2], m1  ; t24a
+    ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
+    ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
+    psubw                m1, m0, m14  ; t17
+    paddw                m0, m14      ; t16
+    psubw               m14, m4, m6   ; t30
+    paddw                m4, m6       ; t31
+    ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
+    psubw                m6, m0, m12  ; t19a
+    paddw                m0, m12      ; t16a
+    psubw               m12, m4, m3   ; t28a
+    paddw                m4, m3       ; t31a
+    psubw                m3, m14, m5  ; t18
+    paddw               m14, m5       ; t17
+    psubw                m5, m1, m7   ; t29
+    paddw                m1, m7       ; t30
+    ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
+    ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
+    psubw                m7, m1, m10  ; t25a
+    paddw                m1, m10      ; t30a
+    psubw               m10, m5, m9   ; t21
+    paddw                m5, m9       ; t18
+    psubw                m9, m12, m2  ; t20a
+    paddw               m12, m2       ; t19a
+    psubw                m2, m3, m13  ; t26
+    paddw                m3, m13      ; t29
+    psubw               m13, m6, m8   ; t27a
+    paddw                m6, m8       ; t28a
+    mova       [tmp1q-32*2], m5
+    mova       [tmp1q-32*1], m12
+    mova       [tmp2q+32*0], m6
+    mova       [tmp2q+32*1], m3
+    mova       [tmp2q+32*2], m1
+    mova                 m5, [rsp+gprsize+32*0] ; t22
+    mova                 m6, [rsp+gprsize+32*1] ; t23
+    mova                 m3, [rsp+gprsize+32*2] ; t24a
+    vpbroadcastd         m8, [o(pw_2896x8)]
+    psubw                m1, m14, m5  ; t22a
+    paddw               m14, m5       ; t17a
+    psubw                m5, m0, m6   ; t23
+    paddw                m0, m6       ; t16
+    psubw                m6, m4, m3   ; t24
+    paddw                m4, m3       ; t31
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m14
+    mova       [tmp2q+32*3], m4
+    psubw                m3, m13, m9  ; t20
+    paddw               m13, m9       ; t27
+    psubw                m9, m2, m10  ; t21a
+    paddw                m2, m10      ; t26a
+    psubw               m10, m7, m1   ; t22
+    paddw                m7, m1       ; t25
+    psubw                m1, m6, m5   ; t23a
+    paddw                m6, m5       ; t24a
+    REPX   {pmulhrsw x, m8}, m3, m13, m9, m2, m10, m7, m1, m6
+    mova       [tmp1q+32*0], m3
+    mova       [tmp1q+32*1], m9
+    mova       [tmp1q+32*2], m10
+    mova       [tmp1q+32*3], m1
+    mova       [tmp2q-32*4], m6
+    mova       [tmp2q-32*3], m7
+    mova       [tmp2q-32*2], m2
+    mova       [tmp2q-32*1], m13
+    ret
+ALIGN function_align
+.transpose_2x8x8_round:
+    punpckhwd            m6, m12, m13
+    punpcklwd           m12, m13
+    punpckhwd           m13, m8, m9
+    punpcklwd            m8, m9
+    punpckhwd            m9, m14, m15
+    punpcklwd           m14, m15
+    punpckhwd           m15, m10, m11
+    punpcklwd           m10, m11
+    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
+    punpckhdq           m11, m8, m10
+    punpckldq            m8, m10
+    punpckldq           m10, m12, m14
+    punpckhdq           m12, m14
+    punpckhdq           m14, m13, m15
+    punpckldq           m13, m15
+    punpckldq           m15, m6, m9
+    punpckhdq            m6, m9
+    punpckhqdq           m9, m8, m10
+    punpcklqdq           m8, m10
+    punpcklqdq          m10, m11, m12
+    punpckhqdq          m11, m12
+    punpcklqdq          m12, m13, m15
+    punpckhqdq          m13, m15
+    punpckhqdq          m15, m14, m6
+    punpcklqdq          m14, m6
+    pmulhrsw             m6, m7, [rsp+gprsize+32*0]
+    REPX   {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
+    pmulhrsw             m7, [rsp+gprsize+32*1]
+    mova [rsp+gprsize+32*0], m15
+    punpckhwd           m15, m4, m5
+    punpcklwd            m4, m5
+    punpckhwd            m5, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m6, m7
+    punpcklwd            m6, m7
+    punpckhwd            m7, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m4, m6
+    punpckhdq            m4, m6
+    punpckhdq            m6, m5, m7
+    punpckldq            m5, m7
+    punpckldq            m7, m15, m1
+    punpckhdq           m15, m1
+    punpckhqdq           m1, m0, m2
+    punpcklqdq           m0, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m7
+    punpckhqdq           m5, m7
+    punpckhqdq           m7, m6, m15
+    punpcklqdq           m6, m15
+    ret
+ALIGN function_align
+.pass2_end: ; combine with the buffered odd half and write out all 32 rows
+    mova [rsp+gprsize+32*0], m7
+    mova [rsp+gprsize+32*2], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    IDCT32_PASS2_END      0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END      8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END     12, tmp1q-32*1, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    mova                 m1, [rsp+gprsize+32*1]
+    IDCT32_PASS2_END      1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END      9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END     13, tmp1q-32*2, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    IDCT32_PASS2_END      2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END     10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END     14, tmp1q-32*3, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    mova                 m7, [rsp+gprsize+32*0]
+    mova                 m1, [rsp+gprsize+32*2]
+    IDCT32_PASS2_END      3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
+    ret
+
+; Perform the final sumsub step and YMM lane shuffling
+%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
+    mova                m%3, [tmp2q+32*( 3-%1)]
+    psubw               m%4, m%1, m%3
+    paddw               m%1, m%3
+    mova                m%3, [tmp1q+32*(11-%2)]
+    mova         [tmp1q+32*(11-%2)+16], xm%4
+    vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
+    paddw               m%4, m%2, m%3
+    psubw               m%2, m%3
+    mova         [tmp1q+32*(11-%2)], xm%2
+    vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
+    vperm2i128          m%2, m%1, m%4, 0x31
+    vinserti128         m%1, m%1, xm%4, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 16
+    jmp m(inv_txfm_add_dct_dct_32x8).dconly
+.normal:
+    PROLOGUE              0, 0, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+    vpbroadcastd        m15, [o(pw_2896x8)]
+    pmulhrsw             m0, m15, [cq+32* 1]
+    pmulhrsw             m1, m15, [cq+32* 3]
+    pmulhrsw             m2, m15, [cq+32* 5]
+    pmulhrsw             m3, m15, [cq+32* 7]
+    pmulhrsw             m4, m15, [cq+32* 9]
+    pmulhrsw             m5, m15, [cq+32*11]
+    pmulhrsw             m6, m15, [cq+32*13]
+    pmulhrsw             m7, m15, [cq+32*15]
+    pmulhrsw             m8, m15, [cq+32*17]
+    pmulhrsw             m9, m15, [cq+32*19]
+    pmulhrsw            m10, m15, [cq+32*21]
+    pmulhrsw            m11, m15, [cq+32*23]
+    pmulhrsw            m12, m15, [cq+32*25]
+    pmulhrsw            m13, m15, [cq+32*27]
+    pmulhrsw            m14, m15, [cq+32*29]
+    pmulhrsw            m15,      [cq+32*31]
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    LOAD_16ROWS     cq+32*0, 32*2, 1, 0
+    pxor                m15, m15
+    mov                 r3d, 8
+.zero_loop:
+    mova          [cq+32*0], m15
+    mova          [cq+32*1], m15
+    mova          [cq+32*2], m15
+    mova          [cq+32*3], m15
+    add                  cq, 32*4
+    dec                 r3d
+    jg .zero_loop
+    call m(idct_16x16_internal).main
+    call .pass1_end
+    lea                  r2, [strideq*3]
+    mov                  r3, dstq
+.pass2:
+    vpbroadcastd         m7, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    call m(idct_16x16_internal).main
+    mova         [rsp+32*2], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    REPX  {pmulhrsw x, m15}, m2, m3, m0
+    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
+    pmulhrsw             m1, m15, [rsp+32*1]
+    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*4]
+    REPX  {pmulhrsw x, m15}, m4, m5, m6, m7
+    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
+    lea                dstq, [dstq+strideq*4]
+    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11
+    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
+    lea                dstq, [dstq+strideq*4]
+    REPX  {pmulhrsw x, m15}, m11, m12, m13, m14
+    pmulhrsw            m15, [rsp+32*2]
+    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
+    test                 r3, r3
+    jnz .right_half
+    RET
+.right_half:
+    LOAD_8ROWS   tmp1q-32*4, 32
+    LOAD_8ROWS_H tmp2q-32*4, 32
+    lea                dstq, [r3+16]
+    xor                 r3d, r3d
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    jmp .pass2
+ALIGN function_align
+.pass1_end:
+    mova [rsp+gprsize+32*0], m9
+    IDCT32_PASS1_END      0,  8,  1,  9
+    IDCT32_PASS1_END      2, 10,  1,  9
+    IDCT32_PASS1_END      3, 11,  1,  9
+    IDCT32_PASS1_END      4, 12,  1,  9
+    IDCT32_PASS1_END      5, 13,  1,  9
+    IDCT32_PASS1_END      6, 14,  1,  9
+    IDCT32_PASS1_END      7, 15,  1,  9
+    mova                 m1, [rsp+gprsize+32*1]
+    mova                 m9, [rsp+gprsize+32*0]
+    mova [rsp+gprsize+32*0], m6
+    mova [rsp+gprsize+32*1], m7
+    IDCT32_PASS1_END      1,  9,  6,  7
+    ret
+
+cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
+%undef cmp
+    lea                 rax, [o_base]
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    vpbroadcastd        m10, [o(pw_5793x4)]
+    vpbroadcastd        m11, [o(pw_5)]
+    cmp                eobd, 43   ; if (eob > 43)
+    setg                r4b       ;   iteration_count++
+    cmp                eobd, 150  ; if (eob > 150)
+    setg                 al       ;   iteration_count++
+    add                eobd, -279 ; if (eob > 278)
+    adc                 r4b, al   ;   iteration_count++
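+    ; r4 = (eob > 43) + (eob > 150) + (eob > 278), so the dec/jge loop below
+    ; runs 1 to 4 times.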
+    lea                  r3, [strideq*3]
+    mov                 rax, cq
+.loop:
+    mova                xm0,     [cq+64* 0]
+    mova                xm1,     [cq+64* 1]
+    vinserti128          m0, m0, [cq+64* 8], 1
+    vinserti128          m1, m1, [cq+64* 9], 1
+    mova                xm2,     [cq+64* 2]
+    mova                xm3,     [cq+64* 3]
+    vinserti128          m2, m2, [cq+64*10], 1
+    vinserti128          m3, m3, [cq+64*11], 1
+    mova                xm4,     [cq+64* 4]
+    mova                xm5,     [cq+64* 5]
+    vinserti128          m4, m4, [cq+64*12], 1
+    vinserti128          m5, m5, [cq+64*13], 1
+    mova                xm6,     [cq+64* 6]
+    mova                xm7,     [cq+64* 7]
+    vinserti128          m6, m6, [cq+64*14], 1
+    vinserti128          m7, m7, [cq+64*15], 1
+    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {psllw    x, 2  }, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {paddw    x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {psraw    x, 3  }, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    dec                 r4b
+    jge .loop
+    sub                  cq, 32
+    pxor                 m0, m0
+    mov                 r0d, 8
+    cmp                  cq, rax
+    jg .zero_loop
+.zero_loop_half:
+    mova         [rax+64*0], m0
+    mova         [rax+64*1], m0
+    mova         [rax+64*2], m0
+    mova         [rax+64*3], m0
+    add                 rax, 64*4
+    sub                 r0d, 2
+    jg .zero_loop_half
+    RET
+.zero_loop:
+    mova         [rax+32*0], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*2], m0
+    mova         [rax+32*3], m0
+    add                 rax, 32*4
+    dec                 r0d
+    jg .zero_loop
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
+%undef cmp
+    lea                 rax, [o_base]
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    vpbroadcastd        m10, [o(pw_5793x4)]
+    vpbroadcastd        m11, [o(pw_2048)]
+    cmp                eobd, 35  ; if (eob > 35)
+    setg                r4b      ;   iteration_count++
+    cmp                eobd, 150 ; if (eob > 150)
+    setg                r3b      ;   iteration_count += 2
+    lea                 r4d, [r4+r3*2]
+    lea                  r3, [strideq*3]
+    mov                  r5, dstq
+    mov                 rax, cq
+.loop:
+    mova                xm0,     [cq+32* 0]
+    mova                xm1,     [cq+32* 1]
+    vinserti128          m0, m0, [cq+32* 8], 1
+    vinserti128          m1, m1, [cq+32* 9], 1
+    mova                xm2,     [cq+32* 2]
+    mova                xm3,     [cq+32* 3]
+    vinserti128          m2, m2, [cq+32*10], 1
+    vinserti128          m3, m3, [cq+32*11], 1
+    mova                xm4,     [cq+32* 4]
+    mova                xm5,     [cq+32* 5]
+    vinserti128          m4, m4, [cq+32*12], 1
+    vinserti128          m5, m5, [cq+32*13], 1
+    mova                xm6,     [cq+32* 6]
+    mova                xm7,     [cq+32* 7]
+    vinserti128          m6, m6, [cq+32*14], 1
+    vinserti128          m7, m7, [cq+32*15], 1
+    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {psllw    x, 3  }, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    dec                 r4b
+    jl .ret
+    test                r4b, 1
+    jz .loop
+    add                  cq, 32*15
+    lea                dstq, [r5+16]
+    jmp .loop
+.ret:
+    sub                  cq, 32
+    pxor                 m0, m0
+    mov                 r0d, 4
+    mov                 r1d, 8
+    cmp                  cq, rax
+    cmovg               r0d, r1d
+.zero_loop:
+    mova         [rax+32*0], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*2], m0
+    mova         [rax+32*3], m0
+    add                 rax, 32*4
+    dec                 r0d
+    jg .zero_loop
+    RET
+
+cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 32
+    jmp m(inv_txfm_add_dct_dct_32x8).dconly
+.normal:
+    PROLOGUE              0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
+                                           base, tmp3, tmp4
+    %undef cmp
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    sub                eobd, 136
+    mov               tmp4d, eobd
+.pass1_loop:
+    LOAD_8ROWS      cq+64*1, 64*2
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+    test              tmp4d, tmp4d
+    jl .fast
+    LOAD_8ROWS_H   cq+64*17, 64*2
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    LOAD_8ROWS_H   cq+64*16, 64*2
+    pxor                 m0, m0
+    REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                               24, 25, 26, 27, 28, 29, 30, 31
+    mova              [rsp], m15
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    LOAD_8ROWS      cq+64*0, 64*2
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_32x16).pass1_end
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    lea               tmp3q, [tmp1q+32*32]
+    mova                m15, [rsp]
+    mova       [tmp3q-32*4], m0
+    mova       [tmp3q-32*3], m2
+    mova       [tmp3q-32*2], m4
+    mova       [tmp3q-32*1], m6
+    mova       [tmp3q+32*0], m8
+    mova       [tmp3q+32*1], m10
+    mova       [tmp3q+32*2], m12
+    mova       [tmp3q+32*3], m14
+    add               tmp3q, 32*8
+    mova       [tmp3q-32*4], m1
+    mova       [tmp3q-32*3], m3
+    mova       [tmp3q-32*2], m5
+    mova       [tmp3q-32*1], m7
+    mova       [tmp3q+32*0], m9
+    mova       [tmp3q+32*1], m11
+    mova       [tmp3q+32*2], m13
+    mova       [tmp3q+32*3], m15
+    vpbroadcastd         m9, [o(pw_8192)]
+    pmulhrsw             m0, m9, [tmp1q-32*4]
+    pmulhrsw             m1, m9, [tmp1q-32*3]
+    pmulhrsw             m2, m9, [tmp1q-32*2]
+    pmulhrsw             m3, m9, [tmp1q-32*1]
+    pmulhrsw             m4, m9, [tmp1q+32*0]
+    pmulhrsw             m5, m9, [tmp1q+32*1]
+    pmulhrsw             m6, m9, [tmp1q+32*2]
+    pmulhrsw             m7, m9, [tmp1q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q-32*4], m0
+    pmulhrsw             m0, m9, [tmp2q-32*4]
+    mova       [tmp2q-32*4], m1
+    pmulhrsw             m1, m9, [tmp2q-32*3]
+    mova       [tmp1q-32*3], m2
+    pmulhrsw             m2, m9, [tmp2q-32*2]
+    mova       [tmp2q-32*3], m3
+    pmulhrsw             m3, m9, [tmp2q-32*1]
+    mova       [tmp1q-32*2], m4
+    pmulhrsw             m4, m9, [tmp2q+32*0]
+    mova       [tmp2q-32*2], m5
+    pmulhrsw             m5, m9, [tmp2q+32*1]
+    mova       [tmp1q-32*1], m6
+    pmulhrsw             m6, m9, [tmp2q+32*2]
+    mova       [tmp2q-32*1], m7
+    pmulhrsw             m7, m9, [tmp2q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add                  cq, 32
+    add               tmp1q, 32*16
+    add               tmp2q, 32*16
+    add                eobd, 0x80000000
+    jnc .pass1_loop
+    add               tmp1q, 32*24
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    test              tmp4d, tmp4d
+    jge .pass2_loop
+    add               tmp1q, 32*16
+    add               tmp2q, 32*16
+    add               tmp3q, 32*16
+.pass2_loop:
+    LOAD_8ROWS   tmp2q-32*4, 32
+    test              tmp4d, tmp4d
+    jl .fast2
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    sub               tmp3q, 32*8
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    sub               tmp3q, 32*16
+    jmp .pass2_loop_end
+.fast2:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    sub               tmp3q, 32*24
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+.pass2_loop_end:
+    LOAD_8ROWS   tmp3q-32*4, 32
+    mova              [rsp], m15
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_16x32).pass2_end
+    lea               tmp3q, [tmp1q-32*32]
+    cmp               tmp2q, tmp3q
+    jl .ret
+    sub               tmp2q, 32*32
+    sub                dstq, r3
+    lea                  r2, [r2+r3+16]
+    add                dstq, 16
+    jmp .pass2_loop
+.ret:
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob
+    %undef cmp
+    vpbroadcastd         m9, [pw_8192]
+    sub                eobd, 136 ; if (eob < 136)
+    shr                eobd, 30  ;     topleft 16x16 only
+    lea                eobd, [eobq*2-8]
+    lea                  r4, [strideq*3]
+    mov                  r5, dstq
+    lea                 rax, [cq+32]
+.loop:
+    mova                xm0,     [cq+64* 0]
+    mova                xm1,     [cq+64* 1]
+    vinserti128          m0, m0, [cq+64* 8], 1
+    vinserti128          m1, m1, [cq+64* 9], 1
+    mova                xm2,     [cq+64* 2]
+    mova                xm3,     [cq+64* 3]
+    vinserti128          m2, m2, [cq+64*10], 1
+    vinserti128          m3, m3, [cq+64*11], 1
+    mova                xm4,     [cq+64* 4]
+    mova                xm5,     [cq+64* 5]
+    vinserti128          m4, m4, [cq+64*12], 1
+    vinserti128          m5, m5, [cq+64*13], 1
+    mova                xm6,     [cq+64* 6]
+    mova                xm7,     [cq+64* 7]
+    vinserti128          m6, m6, [cq+64*14], 1
+    vinserti128          m7, m7, [cq+64*15], 1
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    inc                eobd
+    jz .ret
+    test               eobd, 3
+    jnz .loop
+    add                  cq, 64*15
+    lea                dstq, [r5+16]
+    jmp .loop
+.ret:
+    pxor                 m0, m0
+    mov                 r0d, 16
+    cmp                  cq, rax
+    jne .zero_loop
+.zero_loop_topleft:
+    mova         [rax-32*1], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*3], m0
+    mova         [rax+32*5], m0
+    add                 rax, 64*4
+    sub                 r0d, 4
+    jg .zero_loop_topleft
+    RET
+.zero_loop:
+    mova         [rax-32*1], m0
+    mova         [rax+32*0], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*2], m0
+    add                 rax, 32*4
+    dec                 r0d
+    jg .zero_loop
+    RET
+
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
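+; with 6 args (pass 1) the results are written back to the tmp1q/tmp2q rows;
+; with 10 args (pass 2) they are rounded via pmulhrsw with m14 and added to
+; the dst/r2 pixels at the supplied offsets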
+%if %1 & 1
+    mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
+    mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
+%else
+    mova                m%5, [tmp1q-32*(45-%1)]
+    mova                m%4, [tmp2q-32*(20+%1)]
+%endif
+    psubw               m%6, m%5, m%4 ; idct32 out31-n
+    paddw               m%5, m%4      ; idct32 out 0+n
+    psubw               m%4, m%6, m%3 ; out32+n
+    paddw               m%6, m%3      ; out31-n
+    psubw               m%3, m%5, m%2 ; out63-n
+    paddw               m%5, m%2      ; out 0+n
+%if %0 == 6 ; pass 1
+%if %1 & 1
+    mova [tmp2q-32*(19-%1)], m%4
+    mova [tmp1q-32*(14+%1)], m%6
+    mova [tmp1q+32*(18-%1)], m%3
+    mova [tmp2q-32*(51-%1)], m%5
+%else
+    mova [tmp1q-32*(13-%1)], m%4
+    mova [tmp2q-32*(20+%1)], m%6
+    mova [tmp2q+32*(12-%1)], m%3
+    mova [tmp1q-32*(45-%1)], m%5
+%endif
+%else ; pass 2
+    REPX  {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
+%if %1 & 1
+    %define %%d0 r2
+    %define %%d1 dstq
+%else
+    %define %%d0 dstq
+    %define %%d1 r2
+%endif
+    pmovzxbw            m%2, [%%d0+%9 ]
+    paddw               m%2, m%4
+    pmovzxbw            m%4, [%%d1+%8 ]
+    paddw               m%4, m%6
+    pmovzxbw            m%6, [%%d1+%10]
+    paddw               m%3, m%6
+    pmovzxbw            m%6, [%%d0+%7 ]
+    paddw               m%5, m%6
+    packuswb            m%2, m%4
+    packuswb            m%3, m%5
+    vpermq              m%2, m%2, q3120
+    vpermq              m%3, m%3, q3120
+    mova         [%%d0+%9 ], xm%2
+    vextracti128 [%%d1+%8 ], m%2, 1
+    mova         [%%d1+%10], xm%3
+    vextracti128 [%%d0+%7 ], m%3, 1
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 32
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+.normal:
+    PROLOGUE              0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+    %undef cmp
+    lea               tmp1q, [rsp+32*23]
+    lea               tmp2q, [tmp1q+32*24]
+    sub                eobd, 151
+    mov                 r7d, eobd
+.pass1_loop:
+    LOAD_16ROWS          cq, 64
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    mova                m15, [rsp+32*0]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m2
+    mova       [tmp1q-32*2], m4
+    mova       [tmp1q-32*1], m6
+    mova       [tmp1q+32*0], m8
+    mova       [tmp1q+32*1], m10
+    mova       [tmp1q+32*2], m12
+    mova       [tmp1q+32*3], m14
+    mova       [tmp2q-32*4], m1
+    mova       [tmp2q-32*3], m3
+    mova       [tmp2q-32*2], m5
+    mova       [tmp2q-32*1], m7
+    mova       [tmp2q+32*0], m9
+    mova       [tmp2q+32*1], m11
+    mova       [tmp2q+32*2], m13
+    mova       [tmp2q+32*3], m15
+    add                  cq, 32
+    add               tmp1q, 32*8
+    add               tmp2q, 32*8
+    add                eobd, 0x80000000
+    jnc .pass1_loop
+    lea                  r2, [rsp+32*23]
+    mova                xm0,     [r2-32*4+ 0]
+    mova                xm1,     [r2-32*2+ 0]
+    vinserti128          m0, m0, [r2+32*0+ 0], 1
+    vinserti128          m1, m1, [r2+32*2+ 0], 1
+    mova                xm2,     [r2-32*4+16]
+    mova                xm3,     [r2-32*2+16]
+    vinserti128          m2, m2, [r2+32*0+16], 1
+    vinserti128          m3, m3, [r2+32*2+16], 1
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    test                r7d, r7d
+    jl .fast
+    lea                  r3, [r2+32*8]
+    mova                xm4,     [r3-32*4+ 0]
+    mova                xm5,     [r3-32*2+ 0]
+    vinserti128          m4, m4, [r3+32*0+ 0], 1
+    vinserti128          m5, m5, [r3+32*2+ 0], 1
+    mova                xm6,     [r3-32*4+16]
+    mova                xm7,     [r3-32*2+16]
+    vinserti128          m6, m6, [r3+32*0+16], 1
+    vinserti128          m7, m7, [r3+32*2+16], 1
+.fast:
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*7]
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                xm0,     [r2-32*3+ 0]
+    mova                xm1,     [r2-32*1+ 0]
+    vinserti128          m0, m0, [r2+32*1+ 0], 1
+    vinserti128          m1, m1, [r2+32*3+ 0], 1
+    mova                xm2,     [r2-32*3+16]
+    mova                xm3,     [r2-32*1+16]
+    vinserti128          m2, m2, [r2+32*1+16], 1
+    vinserti128          m3, m3, [r2+32*3+16], 1
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test                r7d, r7d
+    jl .fast2
+    mova                xm4,     [r3-32*3+ 0]
+    mova                xm5,     [r3-32*1+ 0]
+    vinserti128          m4, m4, [r3+32*1+ 0], 1
+    vinserti128          m5, m5, [r3+32*3+ 0], 1
+    mova                xm6,     [r3-32*3+16]
+    mova                xm7,     [r3-32*1+16]
+    vinserti128          m6, m6, [r3+32*1+16], 1
+    vinserti128          m7, m7, [r3+32*3+16], 1
+.fast2:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    add                  r2, 32*24
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                xm0,     [r2-32*4+ 0]
+    mova                xm3,     [r2-32*1+16]
+    vinserti128          m0, m0, [r2+32*0+ 0], 1
+    vinserti128          m3, m3, [r2+32*3+16], 1
+    mova                xm4,     [r2-32*4+16]
+    mova                xm7,     [r2-32*1+ 0]
+    vinserti128          m4, m4, [r2+32*0+16], 1
+    vinserti128          m7, m7, [r2+32*3+ 0], 1
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test                r7d, r7d
+    jl .fast3
+    add                  r3, 32*24
+    mova                xm1,     [r3-32*1+16]
+    mova                xm2,     [r3-32*4+ 0]
+    vinserti128          m1, m1, [r3+32*3+16], 1
+    vinserti128          m2, m2, [r3+32*0+ 0], 1
+    mova                xm5,     [r3-32*1+ 0]
+    mova                xm6,     [r3-32*4+16]
+    vinserti128          m5, m5, [r3+32*3+ 0], 1
+    vinserti128          m6, m6, [r3+32*0+16], 1
+.fast3:
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                xm0,     [r2-32*2+ 0]
+    mova                xm3,     [r2-32*3+16]
+    vinserti128          m0, m0, [r2+32*2+ 0], 1
+    vinserti128          m3, m3, [r2+32*1+16], 1
+    mova                xm4,     [r2-32*2+16]
+    mova                xm7,     [r2-32*3+ 0]
+    vinserti128          m4, m4, [r2+32*2+16], 1
+    vinserti128          m7, m7, [r2+32*1+ 0], 1
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test                r7d, r7d
+    jl .fast4
+    mova                xm1,     [r3-32*3+16]
+    mova                xm2,     [r3-32*2+ 0]
+    vinserti128          m1, m1, [r3+32*1+16], 1
+    vinserti128          m2, m2, [r3+32*2+ 0], 1
+    mova                xm5,     [r3-32*3+ 0]
+    mova                xm6,     [r3-32*2+16]
+    vinserti128          m5, m5, [r3+32*1+ 0], 1
+    vinserti128          m6, m6, [r3+32*2+16], 1
+.fast4:
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
+    RET
+ALIGN function_align
+%define o_base idct64_mul - 8
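+; main_part1 fetches its coefficients from the idct64_mul table, so the rodata
+; base is temporarily redirected there (and switched back to pw_5+128 below)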
+.main_part1:
+    ; idct64 steps 1-5:
+    ; in1/31/17/15/ 9/23/25/ 7 ->
+    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
+    ; in5/27/21/11/13/19/29/ 3 ->
+    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
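+    ; the 64xN callers preload m0-m7 with coefficient rows 1/31/17/15/ 9/23/25/ 7
+    ; for the first call and 5/27/21/11/13/19/29/ 3 for the second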
+    vpbroadcastd        m11, [o(idct64_mul+4* 0)]
+    vpbroadcastd        m13, [o(idct64_mul+4* 1)]
+    vpbroadcastd        m10, [o(idct64_mul+4* 4)]
+    vpbroadcastd        m12, [o(idct64_mul+4* 5)]
+    pmulhrsw            m11, m0  ; t63a
+    pmulhrsw             m0, m13 ; t32a
+    pmulhrsw            m10, m1  ; t62a
+    pmulhrsw             m1, m12 ; t33a
+    vpbroadcastd         m9, [o(idct64_mul+4* 8)]
+    vpbroadcastd        m13, [o(idct64_mul+4* 9)]
+    vpbroadcastd         m8, [o(idct64_mul+4*12)]
+    vpbroadcastd        m12, [o(idct64_mul+4*13)]
+    pmulhrsw             m9, m2  ; t61a
+    pmulhrsw             m2, m13 ; t34a
+    pmulhrsw             m8, m3  ; t60a
+    pmulhrsw             m3, m12 ; t35a
+    psubw               m12, m0, m1   ; t33
+    paddw                m0, m1       ; t32
+    psubw                m1, m3, m2   ; t34
+    paddw                m3, m2       ; t35
+    psubw                m2, m8, m9   ; t61
+    paddw                m8, m9       ; t60
+    psubw                m9, m11, m10 ; t62
+    paddw               m11, m10      ; t63
+    ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
+    vpbroadcastd        m14, [o(pw_401_4076)]
+    ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
+    psubw               m10, m0, m3  ; t35a
+    paddw                m0, m3      ; t32a
+    psubw                m3, m11, m8 ; t60a
+    paddw               m11, m8      ; t63a
+    psubw                m8, m9, m2  ; t34
+    paddw                m9, m2      ; t33
+    psubw                m2, m12, m1 ; t61
+    paddw               m12, m1      ; t62
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m9
+    mova       [tmp2q+32*2], m12
+    mova       [tmp2q+32*3], m11
+    vpbroadcastd        m13, [o(pw_m4017_799)]
+    vpbroadcastd        m14, [o(pw_799_4017)]
+    ITX_MULSUB_2W         2,  8,  0,  1, 15, 14, 13 ; t34a, t61a
+    ITX_MULSUB_2W         3, 10,  0,  1, 15, 14, 13 ; t35,  t60
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp2q+32*0], m10
+    mova       [tmp2q+32*1], m8
+    vpbroadcastd         m3, [o(idct64_mul+4*16)]
+    vpbroadcastd        m11, [o(idct64_mul+4*17)]
+    vpbroadcastd         m2, [o(idct64_mul+4*20)]
+    vpbroadcastd        m10, [o(idct64_mul+4*21)]
+    vpbroadcastd         m1, [o(idct64_mul+4*24)]
+    vpbroadcastd         m9, [o(idct64_mul+4*25)]
+    vpbroadcastd         m0, [o(idct64_mul+4*28)]
+    vpbroadcastd         m8, [o(idct64_mul+4*29)]
+    pmulhrsw             m3, m4  ; t59a
+    pmulhrsw             m4, m11 ; t36a
+    pmulhrsw             m2, m5  ; t58a
+    pmulhrsw             m5, m10 ; t37a
+    pmulhrsw             m1, m6  ; t57a
+    pmulhrsw             m6, m9  ; t38a
+    pmulhrsw             m0, m7  ; t56a
+    pmulhrsw             m7, m8  ; t39a
+    psubw                m8, m4, m5 ; t37
+    paddw                m4, m5     ; t36
+    psubw                m5, m7, m6 ; t38
+    paddw                m7, m6     ; t39
+    psubw                m6, m0, m1 ; t57
+    paddw                m0, m1     ; t56
+    psubw                m1, m3, m2 ; t58
+    paddw                m3, m2     ; t59
+    ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
+    vpbroadcastd        m10, [o(pw_3166_2598)]
+    ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
+    psubw                m2, m7, m4 ; t36a
+    paddw                m7, m4     ; t39a
+    psubw                m4, m0, m3 ; t59a
+    paddw                m0, m3     ; t56a
+    psubw                m3, m6, m1 ; t37
+    paddw                m6, m1     ; t38
+    psubw                m1, m5, m8 ; t58
+    paddw                m5, m8     ; t57
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    mova       [tmp2q-32*4], m0
+    mova       [tmp2q-32*3], m5
+    vpbroadcastd         m6, [o(pw_m799_m4017)]
+    vpbroadcastd         m7, [o(pw_m4017_799)]
+    ITX_MULSUB_2W         4,  2,  0,  5, 15,  7,  6 ; t36,  t59
+    ITX_MULSUB_2W         1,  3,  0,  5, 15,  7,  6 ; t37a, t58a
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m1
+    mova       [tmp2q-32*2], m3
+    mova       [tmp2q-32*1], m2
+    ret
+%define o_base pw_5 + 128
+.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
+    sub                 rax, o_idct64_offset + 8
+    vpbroadcastd        m11, [o(pw_1567_3784)]
+    vpbroadcastd        m12, [o(pw_m3784_1567)]
+    vpbroadcastd        m13, [o(pw_m1567_m3784)]
+    vpbroadcastd        m14, [o(pw_2896x8)]
+.main_part2_pass1_loop:
+    call .main_part2_internal
+    REPX  {pmulhrsw x, m14}, m1, m2, m4, m3
+    IDCT64_PART2_END      0,  7,  0,  6,  9, 10
+    IDCT64_PART2_END      7,  8,  5,  0,  6,  7
+    IDCT64_PART2_END      8,  2,  1,  0,  6,  7
+    IDCT64_PART2_END     15,  3,  4,  0,  6,  7
+    cmp               tmp1q, tmp2q
+    jne .main_part2_pass1_loop
+    ret
+.main_part2_internal:
+    mova                 m0, [tmp1q-32*12] ; t32a
+    mova                 m1, [tmp2q-32*13] ; t39a
+    mova                 m2, [tmp1q-32* 4] ; t40a
+    mova                 m5, [tmp2q+32* 3] ; t55a
+    add               tmp1q, 32
+    sub               tmp2q, 32
+    mova                 m4, [tmp1q+32* 3] ; t48a
+    mova                 m3, [tmp2q-32* 4] ; t47a
+    mova                 m6, [tmp1q+32*11] ; t56a
+    mova                 m7, [tmp2q+32*12] ; t63a
+    psubw                m8, m0, m1 ; t39
+    paddw                m0, m1     ; t32
+    psubw                m1, m3, m2 ; t40
+    paddw                m3, m2     ; t47
+    psubw                m2, m4, m5 ; t55
+    paddw                m4, m5     ; t48
+    psubw                m5, m7, m6 ; t56
+    paddw                m7, m6     ; t63
+    ITX_MULSUB_2W         5,  8,  6,  9, 15, 11, 12 ; t39a, t56a
+    ITX_MULSUB_2W         2,  1,  6,  9, 15, 12, 13 ; t40a, t55a
+    psubw                m6, m0, m3 ; t47a
+    paddw                m0, m3     ; t32a
+    psubw                m3, m7, m4 ; t48a
+    paddw                m7, m4     ; t63a
+    psubw                m4, m5, m2 ; t40
+    paddw                m5, m2     ; t39
+    psubw                m2, m8, m1 ; t55
+    paddw                m8, m1     ; t56
+    psubw                m1, m2, m4 ; t40a
+    paddw                m2, m4     ; t55a
+    psubw                m4, m3, m6 ; t47
+    paddw                m3, m6     ; t48
+    ret
+.main_part2_pass2:
+    sub                 rax, o_idct64_offset + 8
+    vpbroadcastd        m11, [o(pw_1567_3784)]
+    vpbroadcastd        m12, [o(pw_m3784_1567)]
+    vpbroadcastd        m13, [o(pw_m1567_m3784)]
+    vpbroadcastd        m14, [o(pw_2048)]
+    lea                  r9, [strideq*5]    ; stride*5
+    lea                  r3, [r9+strideq*1] ; stride*6
+    lea                  r7, [r9+strideq*2] ; stride*7
+    lea                  r8, [r3+strideq*2] ; stride*8
+    lea                  r2, [dstq+r7]
+.main_part2_pass2_loop:
+    call .main_part2_internal
+    vpbroadcastd        m10, [o(pw_2896x8)]
+    REPX  {pmulhrsw x, m10}, m1, m2, m4, m3
+    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
+    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
+    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
+    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
+    add                dstq, strideq
+    sub                  r2, strideq
+    cmp               tmp1q, tmp2q
+    jne .main_part2_pass2_loop
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 16
+.dconly:
+    pmulhrsw            xm0, xm2
+    movd                xm2, [o(pw_2048)]
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pxor                 m1, m1
+.dconly_loop:
+    mova                 m2, [dstq+32*0]
+    mova                 m3, [dstq+32*1]
+    punpckhbw            m4, m2, m1
+    punpcklbw            m2, m1
+    punpckhbw            m5, m3, m1
+    punpcklbw            m3, m1
+    paddw                m4, m0
+    paddw                m2, m0
+    paddw                m5, m0
+    paddw                m3, m0
+    packuswb             m2, m4
+    packuswb             m3, m5
+    mova        [dstq+32*0], m2
+    mova        [dstq+32*1], m3
+    add                dstq, strideq
+    dec                 r2d
+    jg .dconly_loop
+    RET
+.normal:
+    PROLOGUE              0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+    LOAD_8ROWS      cq+32*0, 32*4
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*7]
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS      cq+32*2, 32*4
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [cq+32* 1]
+    mova                 m1, [cq+32*31]
+    mova                 m2, [cq+32*17]
+    mova                 m3, [cq+32*15]
+    mova                 m4, [cq+32* 9]
+    mova                 m5, [cq+32*23]
+    mova                 m6, [cq+32*25]
+    mova                 m7, [cq+32* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [cq+32* 5]
+    mova                 m1, [cq+32*27]
+    mova                 m2, [cq+32*21]
+    mova                 m3, [cq+32*11]
+    mova                 m4, [cq+32*13]
+    mova                 m5, [cq+32*19]
+    mova                 m6, [cq+32*29]
+    mova                 m7, [cq+32* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
+    sub               tmp1q, 32*36
+    lea                  r2, [strideq*3]
+    mov               tmp2d, 4
+.pass2_loop:
+    lea                  r3, [tmp1q-32*8]
+    mova                xm0,      [r3   -32*4]
+    mova                xm1,      [r3   -32*3]
+    vinserti128          m0, m0,  [tmp1q-32*4], 1
+    vinserti128          m1, m1,  [tmp1q-32*3], 1
+    mova                xm2,      [r3   -32*2]
+    mova                xm3,      [r3   -32*1]
+    vinserti128          m2, m2,  [tmp1q-32*2], 1
+    vinserti128          m3, m3,  [tmp1q-32*1], 1
+    mova                xm4,      [r3   +32*0]
+    mova                xm5,      [r3   +32*1]
+    vinserti128          m4, m4,  [tmp1q+32*0], 1
+    vinserti128          m5, m5,  [tmp1q+32*1], 1
+    mova                xm6,      [r3   +32*2]
+    mova                xm7,      [r3   +32*3]
+    vinserti128          m6, m6,  [tmp1q+32*2], 1
+    vinserti128          m7, m7,  [tmp1q+32*3], 1
+    mova                xm8,      [r3   -32*4+16]
+    mova                xm9,      [r3   -32*3+16]
+    vinserti128          m8, m8,  [tmp1q-32*4+16], 1
+    vinserti128          m9, m9,  [tmp1q-32*3+16], 1
+    mova               xm10,      [r3   -32*2+16]
+    mova               xm11,      [r3   -32*1+16]
+    vinserti128         m10, m10, [tmp1q-32*2+16], 1
+    vinserti128         m11, m11, [tmp1q-32*1+16], 1
+    mova               xm12,      [r3   +32*0+16]
+    mova               xm13,      [r3   +32*1+16]
+    vinserti128         m12, m12, [tmp1q+32*0+16], 1
+    vinserti128         m13, m13, [tmp1q+32*1+16], 1
+    mova               xm14,      [r3   +32*2+16]
+    mova               xm15,      [r3   +32*3+16]
+    vinserti128         m14, m14, [tmp1q+32*2+16], 1
+    vinserti128         m15, m15, [tmp1q+32*3+16], 1
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    call m(idct_16x16_internal).main
+    mova         [rsp+32*0], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    REPX  {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
+    pmulhrsw             m1, m15, [rsp+32*1]
+    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
+    lea                  r3, [dstq+strideq*4]
+    %define dstq r3
+    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
+    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
+    lea                  r3, [r3+strideq*4]
+    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
+    pmulhrsw            m15, [rsp+32*0]
+    lea                  r3, [r3+strideq*4]
+    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
+    add               tmp1q, 32*16
+    add                  r0, 16
+    dec               tmp2d
+    jg .pass2_loop
+    RET
+
+cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 64
+    jmp m(inv_txfm_add_dct_dct_32x8).dconly
+.normal:
+    PROLOGUE              0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
+    lea               tmp1q, [rsp+32*7]
+    lea                r10d, [eobq-136]
+    sar                r10d, 31
+.pass1_loop:
+    lea               tmp2q, [tmp1q+32*16]
+    LOAD_8ROWS      cq+64*1, 64*2, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+    test               r10b, r10b
+    jnz .fast
+    LOAD_8ROWS_H   cq+64*17, 64*2, 2
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    LOAD_8ROWS_H   cq+64*16, 64*2, 1
+    mova              [rsp], m15
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                                24, 25, 26, 27, 28, 29, 30, 31
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    LOAD_8ROWS      cq+64*0, 64*2, 1
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_32x16).pass1_end
+    vpbroadcastd         m7, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    lea                  r3, [tmp1q+32*48]
+    mova                m15, [rsp]
+    mova          [r3-32*4], m0
+    mova          [r3-32*3], m2
+    mova          [r3-32*2], m4
+    mova          [r3-32*1], m6
+    mova          [r3+32*0], m8
+    mova          [r3+32*1], m10
+    mova          [r3+32*2], m12
+    mova          [r3+32*3], m14
+    add                  r3, 32*24
+    mova          [r3-32*4], m1
+    mova          [r3-32*3], m3
+    mova          [r3-32*2], m5
+    mova          [r3-32*1], m7
+    mova          [r3+32*0], m9
+    mova          [r3+32*1], m11
+    mova          [r3+32*2], m13
+    mova          [r3+32*3], m15
+    vpbroadcastd         m9, [o(pw_16384)]
+    pmulhrsw             m0, m9, [tmp1q-32*4]
+    pmulhrsw             m1, m9, [tmp1q-32*3]
+    pmulhrsw             m2, m9, [tmp1q-32*2]
+    pmulhrsw             m3, m9, [tmp1q-32*1]
+    pmulhrsw             m4, m9, [tmp1q+32*0]
+    pmulhrsw             m5, m9, [tmp1q+32*1]
+    pmulhrsw             m6, m9, [tmp1q+32*2]
+    pmulhrsw             m7, m9, [tmp1q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q-32*4], m0
+    pmulhrsw             m0, m9, [tmp2q-32*4]
+    mova       [tmp2q-32*4], m1
+    pmulhrsw             m1, m9, [tmp2q-32*3]
+    mova       [tmp1q-32*3], m2
+    pmulhrsw             m2, m9, [tmp2q-32*2]
+    mova       [tmp2q-32*3], m3
+    pmulhrsw             m3, m9, [tmp2q-32*1]
+    mova       [tmp1q-32*2], m4
+    pmulhrsw             m4, m9, [tmp2q+32*0]
+    mova       [tmp2q-32*2], m5
+    pmulhrsw             m5, m9, [tmp2q+32*1]
+    mova       [tmp1q-32*1], m6
+    pmulhrsw             m6, m9, [tmp2q+32*2]
+    mova       [tmp2q-32*1], m7
+    pmulhrsw             m7, m9, [tmp2q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add                  cq, 32
+    add               tmp1q, 32*8
+    add                r10d, 0x80000000
+    jnc .pass1_loop
+    lea                  r2, [rsp+32*55]
+    lea                  r7, [r2+32*24]
+.pass2_loop:
+    lea                  r3, [r2+32*8]
+    lea                  r8, [r7+32*8]
+    mova                 m0, [r2-32*4]
+    mova                 m1, [r2-32*2]
+    mova                 m2, [r2+32*0]
+    mova                 m3, [r2+32*2]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    test               r10b, r10b
+    jnz .fast2
+    mova                 m4, [r3-32*4]
+    mova                 m5, [r3-32*2]
+    mova                 m6, [r3+32*0]
+    mova                 m7, [r3+32*2]
+.fast2:
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*39]
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                 m0, [r2-32*3]
+    mova                 m1, [r2-32*1]
+    mova                 m2, [r2+32*1]
+    mova                 m3, [r2+32*3]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test               r10b, r10b
+    jnz .fast3
+    mova                 m4, [r3-32*3]
+    mova                 m5, [r3-32*1]
+    mova                 m6, [r3+32*1]
+    mova                 m7, [r3+32*3]
+.fast3:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [r7-32*4]
+    mova                 m3, [r7+32*3]
+    mova                 m4, [r7+32*0]
+    mova                 m7, [r7-32*1]
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10b, r10b
+    jnz .fast4
+    mova                 m1, [r8+32*3]
+    mova                 m2, [r8-32*4]
+    mova                 m5, [r8-32*1]
+    mova                 m6, [r8+32*0]
+.fast4:
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [r7-32*2]
+    mova                 m3, [r7+32*1]
+    mova                 m4, [r7+32*2]
+    mova                 m7, [r7-32*3]
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10b, r10b
+    jnz .fast5
+    mova                 m1, [r8+32*1]
+    mova                 m2, [r8-32*2]
+    mova                 m5, [r8-32*3]
+    mova                 m6, [r8+32*2]
+.fast5:
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
+    add                r10d, 0x80000000
+    jc .ret
+    lea                  r2, [rsp+32*7]
+    lea                  r7, [r2+32*16]
+    sub                dstq, r8
+    lea                dstq, [dstq+strideq*4+16]
+    jmp .pass2_loop
+.ret:
+    RET
+
+cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 32
+    jmp m(inv_txfm_add_dct_dct_64x16).dconly
+.normal:
+    PROLOGUE              0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
+                                            base, tmp3, tmp4
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp4d, [eobq-136]
+.pass1_loop:
+    LOAD_8ROWS      cq+64*0, 64*4, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS      cq+64*2, 64*4, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    vpbroadcastd         m7, [o(pw_2896x8)]
+    pmulhrsw             m0, m7, [cq+64* 1]
+    pmulhrsw             m1, m7, [cq+64*31]
+    pmulhrsw             m2, m7, [cq+64*17]
+    pmulhrsw             m3, m7, [cq+64*15]
+    pmulhrsw             m4, m7, [cq+64* 9]
+    pmulhrsw             m5, m7, [cq+64*23]
+    pmulhrsw             m6, m7, [cq+64*25]
+    pmulhrsw             m7,     [cq+64* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    vpbroadcastd         m7, [o(pw_2896x8-(o_idct64_offset))]
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    pmulhrsw             m0, m7, [cq+64* 5]
+    pmulhrsw             m1, m7, [cq+64*27]
+    pmulhrsw             m2, m7, [cq+64*21]
+    pmulhrsw             m3, m7, [cq+64*11]
+    pmulhrsw             m4, m7, [cq+64*13]
+    pmulhrsw             m5, m7, [cq+64*19]
+    pmulhrsw             m6, m7, [cq+64*29]
+    pmulhrsw             m7,     [cq+64* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
+    sub               tmp1q, 32*44
+    vpbroadcastd        m10, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
+    add                  cq, 32
+    add               tmp4d, 0x80000000
+    jnc .pass1_loop
+    lea               tmp1q, [rsp+32*15]
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    mov               tmp4b, 4
+.pass2_loop:
+    lea               tmp2q, [tmp1q+32*64]
+    LOAD_8ROWS   tmp1q-32*4, 32
+    test              tmp4d, 0x40000000
+    jnz .fast
+    LOAD_8ROWS_H tmp2q-32*4, 32
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    lea               tmp3q, [tmp2q-32*8]
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    mova              [rsp], m15
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    lea               tmp3q, [tmp1q-32*8]
+    LOAD_8ROWS   tmp3q-32*4, 32
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_16x32).pass2_end
+    add               tmp1q, 32*16
+    sub                dstq, r3
+    lea                  r2, [r2+r3+16]
+    add                dstq, 16
+    dec               tmp4b
+    jg .pass2_loop
+    RET
+ALIGN function_align
+.transpose_round_interleave:
+    mov               tmp3d, 4
+.loop:
+    lea               tmp2q, [tmp1q+32*8]
+    mova                xm0,      [tmp1q-32*4]
+    mova                xm1,      [tmp1q-32*3]
+    vinserti128          m0, m0,  [tmp2q-32*4], 1
+    vinserti128          m1, m1,  [tmp2q-32*3], 1
+    mova                xm2,      [tmp1q-32*2]
+    mova                xm3,      [tmp1q-32*1]
+    vinserti128          m2, m2,  [tmp2q-32*2], 1
+    vinserti128          m3, m3,  [tmp2q-32*1], 1
+    mova                xm4,      [tmp1q+32*0]
+    mova                xm5,      [tmp1q+32*1]
+    vinserti128          m4, m4,  [tmp2q+32*0], 1
+    vinserti128          m5, m5,  [tmp2q+32*1], 1
+    mova                xm6,      [tmp1q+32*2]
+    mova                xm7,      [tmp1q+32*3]
+    vinserti128          m6, m6,  [tmp2q+32*2], 1
+    vinserti128          m7, m7,  [tmp2q+32*3], 1
+    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova                xm8,      [tmp1q-32*4+16]
+    mova                xm9,      [tmp1q-32*3+16]
+    vinserti128          m8, m8,  [tmp2q-32*4+16], 1
+    vinserti128          m9, m9,  [tmp2q-32*3+16], 1
+    mova       [tmp1q-32*4], m0
+    mova       [tmp2q-32*4], m1
+    mova       [tmp1q-32*3], m2
+    mova       [tmp2q-32*3], m3
+    mova                xm2,     [tmp1q-32*2+16]
+    mova                xm3,     [tmp1q-32*1+16]
+    vinserti128          m2, m2, [tmp2q-32*2+16], 1
+    vinserti128          m3, m3, [tmp2q-32*1+16], 1
+    mova       [tmp1q-32*2], m4
+    mova       [tmp2q-32*2], m5
+    mova       [tmp1q-32*1], m6
+    mova       [tmp2q-32*1], m7
+    mova                xm4,     [tmp1q+32*0+16]
+    mova                xm5,     [tmp1q+32*1+16]
+    vinserti128          m4, m4, [tmp2q+32*0+16], 1
+    vinserti128          m5, m5, [tmp2q+32*1+16], 1
+    mova                xm6,     [tmp1q+32*2+16]
+    mova                xm7,     [tmp1q+32*3+16]
+    vinserti128          m6, m6, [tmp2q+32*2+16], 1
+    vinserti128          m7, m7, [tmp2q+32*3+16], 1
+    pmulhrsw             m0, m8, m10
+    pmulhrsw             m1, m9, m10
+    REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add               tmp1q, 32*16
+    dec               tmp3d
+    jg .loop
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 64
+    jmp m(inv_txfm_add_dct_dct_64x16).dconly
+.normal:
+    PROLOGUE              0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
+    lea               tmp1q, [rsp+32*71]
+    lea                r10d, [eobq-136]
+.pass1_loop:
+    LOAD_8ROWS      cq+64*0, 64*4
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS      cq+64*2, 64*4
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [cq+64* 1]
+    mova                 m1, [cq+64*31]
+    mova                 m2, [cq+64*17]
+    mova                 m3, [cq+64*15]
+    mova                 m4, [cq+64* 9]
+    mova                 m5, [cq+64*23]
+    mova                 m6, [cq+64*25]
+    mova                 m7, [cq+64* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [cq+64* 5]
+    mova                 m1, [cq+64*27]
+    mova                 m2, [cq+64*21]
+    mova                 m3, [cq+64*11]
+    mova                 m4, [cq+64*13]
+    mova                 m5, [cq+64*19]
+    mova                 m6, [cq+64*29]
+    mova                 m7, [cq+64* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
+    sub               tmp1q, 32*44
+    vpbroadcastd        m10, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
+    add                  cq, 32
+    add                r10d, 0x80000000
+    jnc .pass1_loop
+    lea               tmp1q, [rsp+32*7]
+    mov                r10b, 4
+.pass2_loop:
+    lea                  r2, [tmp1q+32*64]
+    mova                 m0, [r2-32*4]
+    mova                 m1, [r2-32*2]
+    mova                 m2, [r2+32*0]
+    mova                 m3, [r2+32*2]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m4
+    test               r10d, 0x40000000
+    jnz .fast
+    lea                  r3, [r2+32*64]
+    mova                 m4, [r3-32*4]
+    mova                 m5, [r3-32*2]
+    mova                 m6, [r3+32*0]
+    mova                 m7, [r3+32*2]
+.fast:
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                 m0, [r2-32*3]
+    mova                 m1, [r2-32*1]
+    mova                 m2, [r2+32*1]
+    mova                 m3, [r2+32*3]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test               r10d, 0x40000000
+    jnz .fast2
+    mova                 m4, [r3-32*3]
+    mova                 m5, [r3-32*1]
+    mova                 m6, [r3+32*1]
+    mova                 m7, [r3+32*3]
+.fast2:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add                  r2, 32*8
+    add                  r3, 32*8
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [r2-32*4] ;  1
+    mova                 m3, [r2+32*3] ; 15
+    mova                 m4, [r2+32*0] ;  9
+    mova                 m7, [r2-32*1] ;  7
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10d, 0x40000000
+    jnz .fast3
+    mova                 m1, [r3+32*3] ; 31
+    mova                 m2, [r3-32*4] ; 17
+    mova                 m5, [r3-32*1] ; 23
+    mova                 m6, [r3+32*0] ; 25
+.fast3:
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [r2-32*2] ;  5
+    mova                 m3, [r2+32*1] ; 11
+    mova                 m4, [r2+32*2] ; 13
+    mova                 m7, [r2-32*3] ;  3
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10d, 0x40000000
+    jnz .fast4
+    mova                 m1, [r3+32*1] ; 27
+    mova                 m2, [r3-32*2] ; 21
+    mova                 m5, [r3-32*3] ; 19
+    mova                 m6, [r3+32*2] ; 29
+.fast4:
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
+    sub               tmp1q, 32*28
+    sub                dstq, r8
+    lea                dstq, [dstq+strideq*4+16]
+    dec                r10b
+    jg .pass2_loop
+    RET
+
+%endif ; ARCH_X86_64 && UNIX64
--- /dev/null
+++ b/src/x86/itx_init.c
@@ -1,0 +1,141 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt)
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt)
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt)
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt)
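+
+/* e.g. decl_itx2_fns(8, 32, avx2) declares dav1d_inv_txfm_add_dct_dct_8x32_avx2
+ * and dav1d_inv_txfm_add_identity_identity_8x32_avx2 through decl_itx_fn() */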
+
+decl_itx17_fns( 4,  4, avx2);
+decl_itx16_fns( 4,  8, avx2);
+decl_itx16_fns( 4, 16, avx2);
+decl_itx16_fns( 8,  4, avx2);
+decl_itx16_fns( 8,  8, avx2);
+decl_itx16_fns( 8, 16, avx2);
+decl_itx2_fns ( 8, 32, avx2);
+decl_itx16_fns(16,  4, avx2);
+decl_itx16_fns(16,  8, avx2);
+decl_itx12_fns(16, 16, avx2);
+decl_itx2_fns (16, 32, avx2);
+decl_itx2_fns (32,  8, avx2);
+decl_itx2_fns (32, 16, avx2);
+decl_itx2_fns (32, 32, avx2);
+
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
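+
+/* transform sizes with a 64-sample dimension only allow DCT_DCT in AV1 */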
+
+void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
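+/* e.g. assign_itx_fn(R, 4, 8, dct_dct, DCT_DCT, avx2) expands to
+ * c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_avx2 */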
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+    assign_itx1_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+    assign_itx2_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
+    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
+    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
+    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
+    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
+    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
+    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+    assign_itx12_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
+    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+    assign_itx16_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
+
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64 && !defined(_WIN32) // FIXME: Windows
+    assign_itx17_fn( ,  4,  4, avx2);
+    assign_itx16_fn(R,  4,  8, avx2);
+    assign_itx16_fn(R,  4, 16, avx2);
+    assign_itx16_fn(R,  8,  4, avx2);
+    assign_itx16_fn( ,  8,  8, avx2);
+    assign_itx16_fn(R,  8, 16, avx2);
+    assign_itx2_fn (R,  8, 32, avx2);
+    assign_itx16_fn(R, 16,  4, avx2);
+    assign_itx16_fn(R, 16,  8, avx2);
+    assign_itx12_fn( , 16, 16, avx2);
+    assign_itx2_fn (R, 16, 32, avx2);
+    assign_itx1_fn (R, 16, 64, avx2);
+    assign_itx2_fn (R, 32,  8, avx2);
+    assign_itx2_fn (R, 32, 16, avx2);
+    assign_itx2_fn ( , 32, 32, avx2);
+    assign_itx1_fn (R, 32, 64, avx2);
+    assign_itx1_fn (R, 64, 16, avx2);
+    assign_itx1_fn (R, 64, 32, avx2);
+    assign_itx1_fn ( , 64, 64, avx2);
+#endif
+}