ref: a3856f049d188b971c0956b1b5cef350212f03a3
dir: /src/x86/itx.asm/
; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 16 ; Note: The order of (at least some of) those constants matter! deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 %macro COEF_PAIR 2 pw_%1_%2: dw %1, %2 pw_m%2_%1: dw -%2, %1 %endmacro ; ADST-only pw_3803_1321: dw 3803, 1321 pw_m1321_2482: dw -1321, 2482 pw_2482_3344: dw 2482, 3344 pw_m3344_3344: dw -3344, 3344 pw_m3803_3344: dw -3803, 3344 pw_m3803_m6688: dw -3803, -6688 pw_2896_m2896: dw 2896, -2896 pw_5: times 2 dw 5 pw_2048: times 2 dw 2048 pw_4096: times 2 dw 4096 pw_8192: times 2 dw 8192 pw_16384: times 2 dw 16384 pw_1697x16: times 2 dw 1697*16 pw_1697x8: times 2 dw 1697*8 pw_2896x8: times 2 dw 2896*8 pd_2048: dd 2048 COEF_PAIR 2896, 2896 COEF_PAIR 1567, 3784 COEF_PAIR 3784, 1567 COEF_PAIR 201, 4091 COEF_PAIR 995, 3973 COEF_PAIR 1751, 3703 COEF_PAIR 2440, 3290 COEF_PAIR 3035, 2751 COEF_PAIR 3513, 2106 COEF_PAIR 3857, 1380 COEF_PAIR 4052, 601 COEF_PAIR 401, 4076 COEF_PAIR 1931, 3612 COEF_PAIR 3166, 2598 COEF_PAIR 3920, 1189 COEF_PAIR 799, 4017 COEF_PAIR 3406, 2276 pw_m799_m4017: dw -799, -4017 pw_m1567_m3784: dw -1567, -3784 pw_m3406_m2276: dw -3406, -2276 pw_m401_m4076: dw -401, -4076 pw_m3166_m2598: dw -3166, -2598 pw_m1931_m3612: dw -1931, -3612 pw_m3920_m1189: dw -3920, -1189 COEF_PAIR 2276, 3406 COEF_PAIR 4017, 799 %macro COEF_X8 1-* %rep %0 dw %1*8, %1*8 %rotate 1 %endrep %endmacro pw_3703x8: COEF_X8 3703 pw_1751x8: COEF_X8 1751 pw_m1380x8: COEF_X8 -1380 pw_3857x8: COEF_X8 3857 pw_3973x8: COEF_X8 3973 pw_995x8: COEF_X8 995 pw_m2106x8: COEF_X8 -2106 pw_3513x8: COEF_X8 3513 pw_3290x8: COEF_X8 3290 pw_2440x8: COEF_X8 2440 pw_m601x8: COEF_X8 -601 pw_4052x8: COEF_X8 4052 idct64_mul: COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 pw_201_4091x8: dw 201*8, 4091*8 pw_m601_4052x8: dw -601*8, 4052*8 pw_995_3973x8: dw 995*8, 3973*8 pw_m1380_3857x8: dw -1380*8, 3857*8 pw_1751_3703x8: dw 1751*8, 3703*8 pw_m2106_3513x8: dw -2106*8, 3513*8 pw_2440_3290x8: dw 2440*8, 3290*8 pw_m2751_3035x8: dw -2751*8, 3035*8 %define o_idct64_offset idct64_mul - (o_base) - 8 SECTION .text ; Code size reduction trickery: Intead of using rip-relative loads with ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a ; single rip-relative lea and then address things relative from that with ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. %define o_base deint_shuf + 128 %define o(x) (rax - (o_base) + (x)) %macro REPX 2-* %xdefine %%f(x) %1 %rep %0 - 1 %rotate 1 %%f(%1) %endrep %endmacro %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; flags: 1 = swap, 2 = interleave, 4: coef_regs %macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags %if %7 & 4 pmaddwd m%2, m%5, m%1 pmaddwd m%1, m%6 %else %if %7 & 1 vpbroadcastd m%2, [o(pw_%5_%6)] vpbroadcastd m%3, [o(pw_m%6_%5)] %else vpbroadcastd m%2, [o(pw_m%6_%5)] vpbroadcastd m%3, [o(pw_%5_%6)] %endif pmaddwd m%2, m%1 pmaddwd m%1, m%3 %endif paddd m%2, m%4 paddd m%1, m%4 %if %7 & 2 pslld m%2, 4 psrld m%1, 12 pblendw m%1, m%2, 0xaa %else psrad m%2, 12 psrad m%1, 12 packssdw m%1, m%2 %endif %endmacro ; flags: 1 = swap, 2 = interleave, 4 = coef_regs %macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags %if %10 & 1 vpbroadcastd m%3, [o(pw_%8_%9)] vpbroadcastd m%4, [o(pw_m%9_%8)] vpbroadcastd xm%2, [o(pw_%6_%7)] vpblendd m%2, m%3, 0xf0 vpbroadcastd xm%3, [o(pw_m%7_%6)] %else vpbroadcastd m%3, [o(pw_m%9_%8)] vpbroadcastd m%4, [o(pw_%8_%9)] vpbroadcastd xm%2, [o(pw_m%7_%6)] vpblendd m%2, m%3, 0xf0 vpbroadcastd xm%3, [o(pw_%6_%7)] %endif vpblendd m%3, m%4, 0xf0 ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) %endmacro ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 %macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 punpckhwd m%3, m%2, m%1 punpcklwd m%2, m%1 %if %7 < 32 pmaddwd m%1, m%7, m%2 pmaddwd m%4, m%7, m%3 %else vpbroadcastd m%1, [o(pw_m%7_%6)] pmaddwd m%4, m%3, m%1 pmaddwd m%1, m%2 %endif paddd m%4, m%5 paddd m%1, m%5 psrad m%4, 12 psrad m%1, 12 packssdw m%1, m%4 %if %7 < 32 pmaddwd m%3, m%6 pmaddwd m%2, m%6 %else vpbroadcastd m%4, [o(pw_%6_%7)] pmaddwd m%3, m%4 pmaddwd m%2, m%4 %endif paddd m%3, m%5 paddd m%2, m%5 psrad m%3, 12 psrad m%2, 12 %if %0 == 8 packssdw m%8, m%2, m%3 %else packssdw m%2, m%3 %endif %endmacro %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 psubsw m%3, m%1, m%2 paddsw m%2, m%1 paddsw m%1, m%4, m%5 psubsw m%4, m%5 %endmacro %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 paddsw m%9, m%2, m%6 ; t4 psubsw m%2, m%6 ; t5a paddsw m%10, m%8, m%4 ; t7 psubsw m%8, m%4 ; t6a ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 psubsw m%6, m%1, m%3 ; dct4 out2 paddsw m%3, m%1 ; dct4 out1 paddsw m%1, m%5, m%7 ; dct4 out0 psubsw m%5, m%7 ; dct4 out3 psubsw m%7, m%3, m%2 ; out6 paddsw m%2, m%3 ; out1 paddsw m%3, m%6, m%8 ; out2 psubsw m%6, m%8 ; out5 psubsw m%8, m%1, m%10 ; out7 paddsw m%1, m%10 ; out0 paddsw m%4, m%5, m%9 ; out3 psubsw m%5, m%9 ; out4 %endmacro ; in1 = %1, in3 = %2, in5 = %3, in7 = %4 ; in9 = %5, in11 = %6, in13 = %7, in15 = %8 %macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a psubsw m%9, m%2, m%6 ; t13 paddsw m%6, m%2 ; t12 psubsw m%2, m%8, m%4 ; t14 paddsw m%8, m%4 ; t15 psubsw m%4, m%7, m%3 ; t10 paddsw m%3, m%7 ; t11 psubsw m%7, m%1, m%5 ; t9 paddsw m%1, m%5 ; t8 ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a psubsw m%5, m%1, m%3 ; t11a paddsw m%1, m%3 ; t8a psubsw m%3, m%7, m%4 ; t13 paddsw m%7, m%4 ; t14 psubsw m%4, m%8, m%6 ; t12a paddsw m%8, m%6 ; t15a psubsw m%6, m%2, m%9 ; t10 paddsw m%2, m%9 ; t9 ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 %endmacro %macro WRAP_XMM 1+ INIT_XMM cpuname %1 INIT_YMM cpuname %endmacro %macro ITX4_END 4-5 2048 ; row[1-4], rnd %if %5 vpbroadcastd m2, [o(pw_%5)] pmulhrsw m0, m2 pmulhrsw m1, m2 %endif lea r2, [dstq+strideq*2] %assign %%i 1 %rep 4 %if %1 & 2 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) %else CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) %endif %assign %%i %%i + 1 %rotate 1 %endrep movd m2, [%%row_adr1] pinsrd m2, [%%row_adr2], 1 movd m3, [%%row_adr3] pinsrd m3, [%%row_adr4], 1 pmovzxbw m2, m2 pmovzxbw m3, m3 paddw m0, m2 paddw m1, m3 packuswb m0, m1 movd [%%row_adr1], m0 pextrd [%%row_adr2], m0, 1 pextrd [%%row_adr3], m0, 2 pextrd [%%row_adr4], m0, 3 ret %endmacro %macro IWHT4_1D_PACKED 0 punpckhqdq m3, m0, m1 ; in1 in3 punpcklqdq m0, m1 ; in0 in2 psubw m2, m0, m3 paddw m0, m3 punpckhqdq m2, m2 ; t2 t2 punpcklqdq m0, m0 ; t0 t0 psubw m1, m0, m2 psraw m1, 1 psubw m1, m3 ; t1 t3 psubw m0, m1 ; ____ out0 paddw m2, m1 ; out3 ____ %endmacro INIT_XMM avx2 cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c mova m0, [cq+16*0] mova m1, [cq+16*1] pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 psraw m0, 2 psraw m1, 2 IWHT4_1D_PACKED punpckhwd m0, m1 punpcklwd m3, m1, m2 punpckhdq m1, m0, m3 punpckldq m0, m3 IWHT4_1D_PACKED vpblendd m0, m2, 0x03 ITX4_END 3, 0, 2, 1, 0 %macro INV_TXFM_FN 3 ; type1, type2, size cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%3_internal) lea rax, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. lea tx2q, [m(i%2_%3_internal).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endmacro %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x4 %ifidn %1_%2, dct_dct vpbroadcastw m0, [cq] vpbroadcastd m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [cq], eobd ; 0 pmulhrsw m0, m1 mova m1, m0 jmp m(iadst_4x4_internal).end2 %endif %endmacro %macro IDCT4_1D_PACKED 0 vpbroadcastd m4, [o(pd_2048)] punpckhwd m2, m1, m0 punpcklwd m1, m0 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 paddsw m0, m1, m2 ; out0 out1 psubsw m1, m2 ; out3 out2 %endmacro %macro IADST4_1D_PACKED 0 punpcklwd m2, m1, m0 punpckhwd m3, m1, m0 vpbroadcastd m5, [o(pw_m3344_3344)] vpbroadcastd m0, [o(pw_3803_1321)] vpbroadcastd m4, [o(pw_m1321_2482)] pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 psrld m5, 16 pmaddwd m0, m2 pmaddwd m2, m4 pmaddwd m5, m3 ; 3344*in0 paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 vpbroadcastd m4, [o(pw_2482_3344)] vpbroadcastd m5, [o(pw_m3803_3344)] pmaddwd m4, m3 pmaddwd m5, m3 paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 vpbroadcastd m0, [o(pw_m3803_m6688)] pmaddwd m3, m0 vpbroadcastd m0, [o(pd_2048)] paddd m2, m0 paddd m1, m0 paddd m0, m4 paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 paddd m2, m4 paddd m2, m3 REPX {psrad x, 12}, m1, m2, m0, m5 packssdw m0, m5 ; out0 out1 packssdw m1, m2 ; out2 out3 %endmacro INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] IDCT4_1D_PACKED mova m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q .pass2: IDCT4_1D_PACKED pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 ITX4_END 0, 1, 3, 2 INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call .main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call .main .end: pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 .end2: ITX4_END 0, 1, 2, 3 ALIGN function_align .main: IADST4_1D_PACKED ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call m(iadst_4x4_internal).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 jmp tx2q .pass2: call m(iadst_4x4_internal).main .end: pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 .end2: ITX4_END 3, 2, 1, 0 INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_4x4_internal).end %macro WRITE_4X8 2 ; coefs[1-2] movd xm4, [dstq+strideq*0] pinsrd xm4, [dstq+strideq*1], 1 movd xm5, [dstq+strideq*2] pinsrd xm5, [dstq+r3 ], 1 pinsrd xm4, [r2 +strideq*0], 2 pinsrd xm4, [r2 +strideq*1], 3 pinsrd xm5, [r2 +strideq*2], 2 pinsrd xm5, [r2 +r3 ], 3 pmovzxbw m4, xm4 pmovzxbw m5, xm5 paddw m4, m%1 paddw m5, m%2 packuswb m4, m5 vextracti128 xm5, m4, 1 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 pextrd [dstq+strideq*2], xm4, 2 pextrd [dstq+r3 ], xm4, 3 movd [r2 +strideq*0], xm5 pextrd [r2 +strideq*1], xm5, 1 pextrd [r2 +strideq*2], xm5, 2 pextrd [r2 +r3 ], xm5, 3 %endmacro %macro INV_TXFM_4X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x8 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_2048)] mov [cq], eobd pmulhrsw xm0, xm1 pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 jmp m(iadst_4x8_internal).end3 %endif %endmacro %macro IDCT8_1D_PACKED 0 vpbroadcastd m6, [o(pd_2048)] punpckhwd m5, m3, m0 ; in7 in1 punpckhwd m4, m1, m2 ; in3 in5 punpcklwd m3, m1 ; in6 in2 punpcklwd m2, m0 ; in4 in0 ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 psubsw m0, m5, m4 ; t5a t6a (interleaved) paddsw m4, m5 ; t4 t7 (interleaved) ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 vpbroadcastd m1, [o(pw_m2896_2896)] ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 %if mmsize > 16 vbroadcasti128 m1, [o(deint_shuf)] pshufb m4, m1 %else pshufb m4, [o(deint_shuf)] %endif psubsw m1, m2, m3 ; tmp3 tmp2 paddsw m3, m2 ; tmp0 tmp1 shufps m2, m4, m0, q1032 ; t7 t6 vpblendd m4, m0, 0xcc ; t4 t5 paddsw m0, m3, m2 ; out0 out1 psubsw m3, m2 ; out7 out6 psubsw m2, m1, m4 ; out4 out5 paddsw m1, m4 ; out3 out2 %endmacro %macro IADST8_1D_PACKED 1 ; pass vpbroadcastd m6, [o(pd_2048)] punpckhwd m0, m4, m3 ; 0 7 punpckhwd m1, m5, m2 ; 2 5 punpcklwd m2, m5 ; 4 3 punpcklwd m3, m4 ; 6 1 %if %1 == 1 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a psubsw m4, m0, m2 ; t5 t4 paddsw m0, m2 ; t1 t0 psubsw m5, m1, m3 ; t6 t7 paddsw m1, m3 ; t2 t3 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a %if mmsize > 16 vbroadcasti128 m2, [o(deint_shuf)] %else mova m2, [o(deint_shuf)] %endif pshuflw m1, m1, q2301 pshufhw m1, m1, q2301 psubsw m3, m0, m1 ; t3 t2 paddsw m0, m1 ; -out7 out0 psubsw m1, m4, m5 ; t7 t6 paddsw m4, m5 ; out6 -out1 pshufb m0, m2 pshufb m4, m2 vpbroadcastd m5, [o(pw_m2896_2896)] pmaddwd m2, m5, m3 pmaddwd m5, m1 paddd m2, m6 paddd m5, m6 psrad m2, 12 psrad m5, 12 packssdw m2, m5 ; out4 -out5 vpbroadcastd m5, [o(pw_2896_2896)] pmaddwd m3, m5 pmaddwd m1, m5 paddd m3, m6 paddd m1, m6 psrad m3, 12 psrad m1, 12 packssdw m1, m3 ; out2 -out3 punpcklqdq m3, m4, m0 ; out6 -out7 punpckhqdq m0, m4 ; out0 -out1 %else ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a psubsw m4, m0, m2 ; t4 t5 paddsw m0, m2 ; t0 t1 psubsw m5, m1, m3 ; t6 t7 paddsw m1, m3 ; t2 t3 shufps m2, m5, m4, q1032 punpckhwd m4, m2 punpcklwd m5, m2 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a psubsw m2, m0, m1 ; t2 t3 paddsw m0, m1 ; out0 -out7 psubsw m1, m4, m5 ; t7 t6 paddsw m4, m5 ; out6 -out1 vpbroadcastd m5, [o(pw_2896x8)] vpblendd m3, m0, m4, 0x33 ; out6 -out7 vpblendd m0, m4, 0xcc ; out0 -out1 shufps m4, m2, m1, q1032 ; t3 t7 vpblendd m1, m2, 0x33 ; t2 t6 psubsw m2, m1, m4 ; t2-t3 t6-t7 paddsw m1, m4 ; t2+t3 t6+t7 pmulhrsw m2, m5 ; out4 -out5 pshufd m1, m1, q1032 pmulhrsw m1, m5 ; out2 -out3 %endif %endmacro INIT_YMM avx2 INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst INV_TXFM_4X8_FN dct, identity cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 IDCT4_1D_PACKED vbroadcasti128 m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q .pass2: vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 call .main vpbroadcastd m4, [o(pw_2048)] vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 pshufd m1, m1, q1032 jmp m(iadst_4x8_internal).end2 ALIGN function_align .main: WRAP_XMM IDCT8_1D_PACKED ret INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 call m(iadst_8x4_internal).main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 call .main_pass2 vpbroadcastd m4, [o(pw_2048)] vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 pxor m5, m5 psubw m5, m4 .end: vpblendd m4, m5, 0xcc .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 WIN64_RESTORE_XMM pxor m2, m2 mova [cq+32*0], m2 mova [cq+32*1], m2 .end3: lea r2, [dstq+strideq*4] lea r3, [strideq*3] WRITE_4X8 0, 1 RET ALIGN function_align .main_pass1: WRAP_XMM IADST8_1D_PACKED 1 ret ALIGN function_align .main_pass2: WRAP_XMM IADST8_1D_PACKED 2 ret INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 call m(iadst_8x4_internal).main punpcklwd m3, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m3 punpckhwd m1, m3 jmp tx2q .pass2: vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 call m(iadst_4x8_internal).main_pass2 vpbroadcastd m5, [o(pw_2048)] vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 pxor m4, m4 psubw m4, m5 pshufd m0, m3, q1032 pshufd m1, m2, q1032 jmp m(iadst_4x8_internal).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m2, [cq+32*0], q3120 vpermq m0, [cq+32*1], q3120 vpbroadcastd m3, [o(pw_2896x8)] vpbroadcastd m4, [o(pw_1697x8)] punpcklwd m1, m2, m0 punpckhwd m2, m0 pmulhrsw m1, m3 pmulhrsw m2, m3 punpcklwd m0, m1, m2 punpckhwd m1, m2 pmulhrsw m2, m4, m0 pmulhrsw m4, m1 paddsw m0, m2 paddsw m1, m4 jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_4x8_internal).end2 %macro INV_TXFM_4X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] movd xm3, [o(pw_2048)] mov [cq], eobd pmulhrsw xm0, xm2 pmulhrsw xm0, xm1 pmulhrsw xm0, xm3 vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 jmp m(iadst_4x16_internal).end3 %endif %endmacro %macro IDCT16_1D_PACKED 0 vpbroadcastd m10, [o(pd_2048)] .main2: punpckhwd m8, m7, m0 ; dct16 in15 in1 punpcklwd m9, m4, m0 ; dct4 in2 in0 punpckhwd m0, m3, m4 ; dct16 in7 in9 punpcklwd m7, m1 ; dct8 in7 in1 punpckhwd m1, m6 ; dct16 in3 in13 punpcklwd m3, m5 ; dct8 in3 in5 punpckhwd m5, m2 ; dct16 in11 in5 punpcklwd m6, m2 ; dct4 in3 in1 ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 psubsw m2, m8, m0 ; t9 t14 paddsw m8, m0 ; t8 t15 psubsw m0, m1, m5 ; t10 t13 paddsw m1, m5 ; t11 t12 vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a psubsw m4, m8, m1 ; t11a t12a paddsw m8, m1 ; t8a t15a psubsw m1, m7, m3 ; t5a t6a paddsw m7, m3 ; t4 t7 paddsw m3, m2, m0 ; t9 t14 psubsw m2, m0 ; t10 t13 %if mmsize > 16 vbroadcasti128 m0, [o(deint_shuf)] %else mova m0, [o(deint_shuf)] %endif pshufb m8, m0 pshufb m7, m0 pshufb m3, m0 ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 vpbroadcastd m0, [o(pw_m2896_2896)] ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 vpbroadcastd m5, [o(pw_2896_2896)] ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 vpbroadcastd m0, [o(pw_m2896_2896)] ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a punpckhqdq m0, m8, m3 ; t15a t14 punpcklqdq m8, m3 ; t8a t9 shufps m5, m4, m2, q1032 ; t12 t13a vpblendd m4, m2, 0xcc ; t11 t10a shufps m2, m7, m1, q1032 ; t7 t6 vpblendd m7, m1, 0xcc ; t4 t5 psubsw m1, m9, m6 ; dct4 out3 out2 paddsw m9, m6 ; dct4 out0 out1 psubsw m3, m9, m2 ; dct8 out7 out6 paddsw m9, m2 ; dct8 out0 out1 psubsw m2, m1, m7 ; dct8 out4 out5 paddsw m1, m7 ; dct8 out3 out2 psubsw m7, m9, m0 ; out15 out14 paddsw m0, m9 ; out0 out1 psubsw m6, m1, m5 ; out12 out13 paddsw m1, m5 ; out3 out2 psubsw m5, m2, m4 ; out11 out10 paddsw m2, m4 ; out4 out5 psubsw m4, m3, m8 ; out8 out9 paddsw m3, m8 ; out7 out6 %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] call m(idct_16x4_internal).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 REPX {pmulhrsw x, m5}, m0, m4, m2, m3 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: vextracti128 xm4, m0, 1 vextracti128 xm5, m1, 1 vextracti128 xm6, m2, 1 vextracti128 xm7, m3, 1 call .main vinserti128 m0, xm4, 1 vinserti128 m1, xm5, 1 vpbroadcastd m5, [o(pw_2048)] vinserti128 m2, xm6, 1 vinserti128 m3, xm7, 1 pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp m(iadst_4x16_internal).end2 ALIGN function_align .main: WRAP_XMM IDCT16_1D_PACKED ret INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] call m(iadst_16x4_internal).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 REPX {pmulhrsw x, m5}, m4, m2, m3, m0 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: call .main vpbroadcastd m5, [o(pw_2896x8)] paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 vpbroadcastd m5, [o(pw_2048)] pshufd m1, m1, q1032 vpblendd m4, m1, m0, 0x33 vpblendd m0, m2, 0x33 vpblendd m2, m3, 0x33 vpblendd m3, m1, 0x33 vpermq m0, m0, q2031 vpermq m1, m2, q1302 vpermq m2, m3, q3120 vpermq m3, m4, q0213 psubw m6, m7, m5 .end: vpblendd m5, m6, 0xcc .end2: REPX {pmulhrsw x, m5}, m0, m1, m2, m3 WIN64_RESTORE_XMM pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 .end3: lea r2, [dstq+strideq*8] lea r3, [strideq*3] WRITE_4X8 0, 1 lea dstq, [dstq+strideq*4] lea r2, [r2 +strideq*4] WRITE_4X8 2, 3 RET ALIGN function_align .main: vpblendd m4, m1, m0, 0xcc vpblendd m1, m0, 0x33 vpblendd m5, m2, m3, 0xcc vpblendd m2, m3, 0x33 vperm2i128 m3, m5, m2, 0x31 vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 vperm2i128 m4, m1, m4, 0x31 vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 pshufd m3, m3, q1032 ; in12 in15 in13 in14 pshufd m2, m4, q1032 ; in11 in8 in9 in10 .main2: vpbroadcastd m8, [o(pd_2048)] pxor m7, m7 punpckhwd m4, m3, m0 ; in12 in3 in14 in1 punpcklwd m0, m3 ; in0 in15 in2 in13 punpckhwd m3, m2, m1 ; in8 in7 in10 in5 punpcklwd m1, m2 ; in4 in11 in6 in9 ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 psubsw m2, m0, m3 ; t9a t8a t11a t10a paddsw m0, m3 ; t1a t0a t3a t2a psubsw m3, m1, m4 ; t13a t12a t15a t14a paddsw m1, m4 ; t5a t4a t7a t6a ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 psubw m6, m7, m5 ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 vpbroadcastd m6, [o(pw_m3784_1567)] vpbroadcastd m5, [o(pw_1567_3784)] psubsw m4, m0, m1 ; t5 t4 t7 t6 paddsw m0, m1 ; t1 t0 t3 t2 psubsw m1, m2, m3 ; t13a t12a t15a t14a paddsw m2, m3 ; t9a t8a t11a t10a psubw m3, m7, m6 ; pw_3784_m1567 vpblendd m6, m3, 0xf0 ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 vbroadcasti128 m5, [o(deint_shuf)] pshufb m0, m5 pshufb m2, m5 vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 pshufd m2, m2, q1032 ; t6a t7a t14 t15 psubsw m1, m0, m3 ; t3a t2a t11 t10 paddsw m0, m3 ; -out15 out0 out14 -out1 paddsw m3, m4, m2 ; -out3 out12 out2 -out13 psubsw m4, m2 ; t6 t7 t14a t15a shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a ret ALIGN function_align .main_pass1_end: vpbroadcastd m5, [o(pw_m2896_2896)] vpbroadcastd m6, [o(pw_2896_2896)] punpcklwd m1, m4, m2 punpckhwd m4, m2 pmaddwd m2, m5, m4 pmaddwd m4, m6 pmaddwd m5, m1 pmaddwd m1, m6 REPX {paddd x, m8}, m5, m1, m2, m4 REPX {psrad x, 12}, m5, m2, m1, m4 packssdw m2, m5 ; -out11 out8 out10 -out9 packssdw m1, m4 ; -out7 out4 out6 -out5 ret INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] call m(iadst_16x4_internal).main vpbroadcastd m5, [o(pw_16384)] punpcklwd m4, m1, m0 punpckhwd m1, m0 punpcklwd m0, m3, m2 punpckhwd m3, m2 REPX {pmulhrsw x, m5}, m4, m1, m0, m3 punpckldq m2, m3, m1 punpckhdq m3, m1 punpckhdq m1, m0, m4 punpckldq m0, m4 jmp tx2q .pass2: call m(iadst_4x16_internal).main vpbroadcastd m5, [o(pw_2896x8)] paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 vpbroadcastd m6, [o(pw_2048)] pshufd m1, m1, q1032 vpblendd m4, m0, m2, 0x33 vpblendd m0, m1, 0xcc vpblendd m1, m3, 0xcc vpblendd m2, m3, 0x33 vpermq m0, m0, q3120 vpermq m1, m1, q0213 vpermq m2, m2, q2031 vpermq m3, m4, q1302 psubw m5, m7, m6 jmp m(iadst_4x16_internal).end INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova m3, [cq+32*0] mova m2, [cq+32*1] mova m4, [cq+32*2] mova m5, [cq+32*3] vpbroadcastd m8, [o(pw_1697x8)] pcmpeqw m0, m0 ; -1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m5 punpckhwd m4, m5 pmulhrsw m5, m8, m1 pmulhrsw m6, m8, m2 pmulhrsw m7, m8, m3 pmulhrsw m8, m4 pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is pxor m1, m9 ; unsigned. as long as both signs are equal pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the pxor m2, m9 ; pmulhrsw result will become 0 which causes pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless pxor m3, m9 ; we explicitly deal with that case here. pcmpeqw m0, m4 pxor m4, m0 pavgw m1, m5 pavgw m2, m6 pavgw m3, m7 pavgw m4, m8 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: vpbroadcastd m8, [o(pw_1697x16)] vpbroadcastd m5, [o(pw_2048)] pmulhrsw m4, m8, m0 pmulhrsw m6, m8, m1 pmulhrsw m7, m8, m2 pmulhrsw m8, m3 REPX {paddsw x, x}, m0, m1, m2, m3 paddsw m0, m4 paddsw m1, m6 paddsw m2, m7 paddsw m3, m8 jmp m(iadst_4x16_internal).end2 %macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] movq xm%3, [dstq ] movhps xm%3, [dstq+%5] movq xm%4, [dstq+%6] movhps xm%4, [dstq+%7] pmovzxbw m%3, xm%3 pmovzxbw m%4, xm%4 %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 vextracti128 xm%4, m%3, 1 movq [dstq ], xm%3 movhps [dstq+%6], xm%3 movq [dstq+%5], xm%4 movhps [dstq+%7], xm%4 %endmacro %macro INV_TXFM_8X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x4 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] pmulhrsw xm0, xm1 movd xm2, [o(pw_2048)] pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 jmp m(iadst_8x4_internal).end3 %endif %endmacro INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm3, [o(pw_2896x8)] pmulhrsw xm0, xm3, [cq+16*0] pmulhrsw xm1, xm3, [cq+16*1] pmulhrsw xm2, xm3, [cq+16*2] pmulhrsw xm3, [cq+16*3] call m(idct_4x8_internal).main vbroadcasti128 m4, [o(deint_shuf)] vinserti128 m3, m1, xm3, 1 vinserti128 m1, m0, xm2, 1 shufps m0, m1, m3, q0220 shufps m1, m3, q1331 pshufb m0, m4 pshufb m1, m4 jmp tx2q .pass2: IDCT4_1D_PACKED vpermq m0, m0, q3120 vpermq m1, m1, q2031 jmp m(iadst_8x4_internal).end2 INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] pshufd xm5, [cq+16*1], q1032 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal).main_pass1 vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 punpckhwd m2, m0, m1 punpcklwd m0, m1 pxor m3, m3 psubsw m3, m2 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call .main .end: vpermq m0, m0, q3120 vpermq m1, m1, q3120 .end2: vpbroadcastd m2, [o(pw_2048)] pmulhrsw m0, m2 pmulhrsw m1, m2 WIN64_RESTORE_XMM .end3: pxor m2, m2 mova [cq+32*0], m2 mova [cq+32*1], m2 lea r3, [strideq*3] WRITE_8X4 0, 1, 4, 5 RET ALIGN function_align .main: IADST4_1D_PACKED ret INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] pshufd xm5, [cq+16*1], q1032 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal).main_pass1 vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 punpckhwd m1, m3, m2 punpcklwd m3, m2 pxor m0, m0 psubsw m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call m(iadst_8x4_internal).main mova m2, m1 vpermq m1, m0, q2031 vpermq m0, m2, q2031 jmp m(iadst_8x4_internal).end2 INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm0, [cq+16*1] vinserti128 m2, [cq+16*2], 1 vinserti128 m0, [cq+16*3], 1 vpbroadcastd m3, [o(pw_2896x8)] punpcklwd m1, m2, m0 punpckhwd m2, m0 pmulhrsw m1, m3 pmulhrsw m2, m3 punpcklwd m0, m1, m2 punpckhwd m1, m2 paddsw m0, m0 paddsw m1, m1 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_8x4_internal).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm2 psrlw xm2, 3 ; pw_2048 pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 .end: mov r2d, 2 .end2: lea r3, [strideq*3] .loop: WRITE_8X4 0, 0, 1, 2 lea dstq, [dstq+strideq*4] dec r2d jg .loop RET %endif %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m2, [cq+32*2], q3120 ; 4 5 vpermq m1, [cq+32*1], q3120 ; 2 3 call .main shufps m4, m0, m1, q0220 shufps m5, m0, m1, q1331 shufps m1, m2, m3, q0220 shufps m3, m2, m3, q1331 vbroadcasti128 m0, [o(deint_shuf)] vpbroadcastd m2, [o(pw_16384)] REPX {pshufb x, m0}, m4, m5, m1, m3 REPX {pmulhrsw x, m2}, m4, m5, m1, m3 vinserti128 m0, m4, xm1, 1 vperm2i128 m2, m4, m1, 0x31 vinserti128 m1, m5, xm3, 1 vperm2i128 m3, m5, m3, 0x31 jmp tx2q .pass2: call .main vpbroadcastd m4, [o(pw_2048)] vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 jmp m(iadst_8x8_internal).end2 ALIGN function_align .main: IDCT8_1D_PACKED ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 call .main_pass1 vpbroadcastd m5, [o(pw_16384)] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 pxor m3, m3 psubw m3, m5 ; negate odd elements during rounding pmulhrsw m4, m5 pmulhrsw m0, m3 pmulhrsw m1, m5 pmulhrsw m2, m3 punpcklwd m3, m4, m0 punpckhwd m4, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 vperm2i128 m2, m3, m0, 0x31 vinserti128 m0, m3, xm0, 1 vperm2i128 m3, m4, m1, 0x31 vinserti128 m1, m4, xm1, 1 jmp tx2q .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 call .main_pass2 vpbroadcastd m5, [o(pw_2048)] vpbroadcastd xm4, [o(pw_4096)] psubw m4, m5 ; lower half = 2048, upper half = -2048 .end: REPX {vpermq x, x, q3120}, m0, m1, m2, m3 .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 .end3: pmulhrsw m2, m4 pmulhrsw m3, m4 WIN64_RESTORE_XMM .end4: pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 lea r3, [strideq*3] WRITE_8X4 0, 1, 4, 5 lea dstq, [dstq+strideq*4] WRITE_8X4 2, 3, 4, 5 RET ALIGN function_align .main_pass1: IADST8_1D_PACKED 1 ret ALIGN function_align .main_pass2: IADST8_1D_PACKED 2 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 call m(iadst_8x8_internal).main_pass1 vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m3, m2 punpcklwd m3, m2 punpckhwd m2, m1, m0 punpcklwd m1, m0 pxor m0, m0 psubw m0, m5 pmulhrsw m4, m0 pmulhrsw m3, m5 pmulhrsw m2, m0 pmulhrsw m1, m5 punpckhwd m0, m4, m3 punpcklwd m4, m3 punpckhwd m3, m2, m1 punpcklwd m2, m1 vinserti128 m1, m0, xm3, 1 vperm2i128 m3, m0, m3, 0x31 vinserti128 m0, m4, xm2, 1 vperm2i128 m2, m4, m2, 0x31 jmp tx2q .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal).main_pass2 vpbroadcastd m4, [o(pw_2048)] vpbroadcastd xm5, [o(pw_4096)] psubw m4, m5 ; lower half = -2048, upper half = 2048 vpermq m5, m3, q2031 vpermq m3, m0, q2031 vpermq m0, m2, q2031 vpermq m2, m1, q2031 pmulhrsw m1, m0, m4 pmulhrsw m0, m5, m4 jmp m(iadst_8x8_internal).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*1] vinserti128 m3, [cq+16*4], 1 vinserti128 m2, [cq+16*5], 1 mova xm4, [cq+16*2] mova xm0, [cq+16*3] vinserti128 m4, [cq+16*6], 1 vinserti128 m0, [cq+16*7], 1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 punpckhwd m4, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_8x8_internal).end %macro INV_TXFM_8X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 psrlw xm2, 3 ; pw_2048 pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mov r2d, 4 jmp m(inv_txfm_add_dct_dct_8x8).end2 %endif %endmacro %macro ITX_8X16_LOAD_COEFS 0 vpbroadcastd m4, [o(pw_2896x8)] pmulhrsw m0, m4, [cq+32*0] add cq, 32*4 pmulhrsw m7, m4, [cq+32*3] pmulhrsw m1, m4, [cq-32*3] pmulhrsw m6, m4, [cq+32*2] pmulhrsw m2, m4, [cq-32*2] pmulhrsw m5, m4, [cq+32*1] pmulhrsw m3, m4, [cq-32*1] pmulhrsw m4, [cq+32*0] %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(idct_16x8_internal).main vpbroadcastd m10, [o(pw_16384)] .pass1_end: vperm2i128 m9, m3, m7, 0x31 vinserti128 m3, xm7, 1 vperm2i128 m8, m2, m6, 0x31 vinserti128 m2, xm6, 1 vperm2i128 m6, m1, m5, 0x31 vinserti128 m1, xm5, 1 vperm2i128 m5, m0, m4, 0x31 vinserti128 m0, xm4, 1 punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 .pass1_end2: punpckhwd m7, m5, m6 punpcklwd m5, m6 punpcklwd m6, m8, m9 punpckhwd m8, m9 REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 punpckldq m4, m5, m6 punpckhdq m5, m6 punpckldq m6, m7, m8 punpckhdq m7, m8 jmp tx2q .pass2: call .main REPX {vpermq x, x, q3120}, m0, m2, m4, m6 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 .end: vpbroadcastd m8, [o(pw_2048)] .end2: REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 .end3: pxor m8, m8 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 lea r3, [strideq*3] WRITE_8X4 0, 1, 8, 9 lea dstq, [dstq+strideq*4] WRITE_8X4 2, 3, 0, 1 lea dstq, [dstq+strideq*4] WRITE_8X4 4, 5, 0, 1 lea dstq, [dstq+strideq*4] WRITE_8X4 6, 7, 0, 1 RET ALIGN function_align .main: IDCT16_1D_PACKED ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(iadst_16x8_internal).main call m(iadst_16x8_internal).main_pass1_end vpbroadcastd m10, [o(pw_16384)] pslld m9, m10, 17 psubw m10, m9 ; 16384, -16384 jmp m(idct_8x16_internal).pass1_end ALIGN function_align .pass2: call .main call .main_pass2_end vpbroadcastd m9, [o(pw_2048)] vpbroadcastd xm8, [o(pw_4096)] psubw m8, m9 REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 jmp m(idct_8x16_internal).end2 ALIGN function_align .main: REPX {pshufd x, x, q1032}, m7, m1, m5, m3 .main2: vpbroadcastd m10, [o(pd_2048)] punpckhwd m8, m7, m0 ; in14 in1 punpcklwd m0, m7 ; in0 in15 punpcklwd m7, m6, m1 ; in12 in3 punpckhwd m1, m6 ; in2 in13 punpckhwd m6, m5, m2 ; in10 in5 punpcklwd m2, m5 ; in4 in11 punpcklwd m5, m4, m3 ; in8 in7 punpckhwd m3, m4 ; in6 in9 ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 psubsw m4, m0, m5 ; t9a t8a paddsw m0, m5 ; t1a t0a psubsw m5, m1, m6 ; t11a t10a paddsw m1, m6 ; t3a t2a psubsw m6, m2, m7 ; t13a t12a paddsw m2, m7 ; t5a t4a psubsw m7, m3, m8 ; t15a t14a paddsw m3, m8 ; t7a t6a vpbroadcastd m11, [o(pw_m4017_799)] vpbroadcastd m12, [o(pw_799_4017)] pxor m9, m9 ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 psubw m8, m9, m11 ; pw_4017_m799 ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 vpbroadcastd m11, [o(pw_m2276_3406)] vpbroadcastd m12, [o(pw_3406_2276)] ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 psubw m8, m9, m11 ; pw_2276_m3406 ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 psubsw m8, m1, m3 ; t7 t6 paddsw m1, m3 ; t3 t2 psubsw m3, m0, m2 ; t5 t4 paddsw m0, m2 ; t1 t0 psubsw m2, m5, m7 ; t14a t15a paddsw m7, m5 ; t10a t11a psubsw m5, m4, m6 ; t12a t13a paddsw m4, m6 ; t8a t9a vpbroadcastd m11, [o(pw_m3784_1567)] vpbroadcastd m12, [o(pw_1567_3784)] ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a psubw m6, m9, m11 ; pw_3784_m1567 ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a vpbroadcastd m11, [o(pw_m1567_3784)] vpbroadcastd m12, [o(pw_3784_1567)] ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 psubw m6, m9, m11 ; pw_1567_m3784 ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 vbroadcasti128 m12, [o(deint_shuf)] paddsw m6, m4, m7 ; -out1 out14 psubsw m4, m7 ; t10 t11 psubsw m11, m3, m8 ; t7 t6 paddsw m8, m3 ; out12 -out3 psubsw m3, m0, m1 ; t3a t2a paddsw m0, m1 ; -out15 out0 paddsw m1, m2, m5 ; -out13 out2 psubsw m5, m2 ; t15a t14a pshufb m0, m12 pshufb m6, m12 pshufb m8, m12 pshufb m1, m12 shufps m7, m6, m0, q1032 ; out14 -out15 vpblendd m0, m6, 0x33 ; -out1 out0 punpcklqdq m6, m8, m1 ; out12 -out13 punpckhqdq m1, m8, m1 ; -out3 out2 ret ALIGN function_align .main_pass1_end: vpbroadcastd m8, [o(pw_m2896_2896)] vpbroadcastd m12, [o(pw_2896_2896)] pmaddwd m9, m8, m11 ; -out11 pmaddwd m2, m12, m5 ; -out5 pmaddwd m5, m8 ; out10 pmaddwd m11, m12 ; out4 REPX {paddd x, m10}, m9, m5, m2, m11 REPX {psrad x, 12 }, m9, m5, m2, m11 packssdw m5, m9 ; out10 -out11 packssdw m2, m11 ; -out5 out4 pmaddwd m11, m8, m3 ; out8 vpbroadcastd m8, [o(pw_2896_m2896)] pmaddwd m3, m12 ; -out7 pmaddwd m8, m4 ; -out9 pmaddwd m4, m12 ; out6 REPX {paddd x, m10}, m11, m3, m8, m4 REPX {psrad x, 12 }, m11, m3, m8, m4 packssdw m3, m4 ; -out7 out6 packssdw m4, m11, m8 ; out8 -out9 vpbroadcastd m10, [o(pw_16384)] pxor m9, m9 ret ALIGN function_align .main_pass2_end: vpbroadcastd m8, [o(pw_2896x8)] pshufb m2, m11, m12 pshufb m5, m12 pshufb m3, m12 pshufb m4, m12 punpcklqdq m11, m5, m2 ; t15a t7 punpckhqdq m5, m2 ; t14a t6 shufps m2, m3, m4, q1032 ; t2a t10 vpblendd m3, m4, 0xcc ; t3a t11 psubsw m4, m2, m3 ; out8 -out9 paddsw m3, m2 ; -out7 out6 paddsw m2, m5, m11 ; -out5 out4 psubsw m5, m11 ; out10 -out11 REPX {pmulhrsw x, m8}, m2, m3, m4, m5 ret INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(iadst_16x8_internal).main call m(iadst_16x8_internal).main_pass1_end vpbroadcastd m9, [o(pw_16384)] pslld m10, m9, 17 psubw m10, m9 ; -16384, 16384 vperm2i128 m9, m4, m0, 0x31 vinserti128 m0, m4, xm0, 1 vperm2i128 m8, m5, m1, 0x31 vinserti128 m4, m5, xm1, 1 vperm2i128 m5, m7, m3, 0x31 vinserti128 m3, m7, xm3, 1 vinserti128 m1, m6, xm2, 1 vperm2i128 m6, m6, m2, 0x31 punpcklwd m2, m4, m0 punpckhwd m4, m0 punpcklwd m0, m3, m1 punpckhwd m3, m1 jmp m(idct_8x16_internal).pass1_end2 .pass2: call m(iadst_8x16_internal).main call m(iadst_8x16_internal).main_pass2_end vpbroadcastd m8, [o(pw_2048)] vpbroadcastd xm9, [o(pw_4096)] psubw m8, m9 vpermq m9, m0, q3120 vpermq m0, m7, q2031 vpermq m7, m1, q3120 vpermq m1, m6, q2031 vpermq m6, m2, q3120 vpermq m2, m5, q2031 vpermq m5, m3, q3120 vpermq m3, m4, q2031 pmulhrsw m0, m8 pmulhrsw m1, m8 pmulhrsw m2, m8 pmulhrsw m3, m8 pmulhrsw m4, m5, m8 pmulhrsw m5, m6, m8 pmulhrsw m6, m7, m8 pmulhrsw m7, m9, m8 jmp m(idct_8x16_internal).end3 INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] pmulhrsw m%2, m%3, m%1 %if %0 == 4 ; if downshifting by 1 pmulhrsw m%2, m%4 %else paddsw m%1, m%1 %endif paddsw m%1, m%2 %endmacro cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*2] add cq, 16*8 vinserti128 m3, [cq+16*0], 1 vinserti128 m2, [cq+16*2], 1 vpbroadcastd m9, [o(pw_2896x8)] mova xm4, [cq-16*4] mova xm5, [cq-16*2] vinserti128 m4, [cq+16*4], 1 vinserti128 m5, [cq+16*6], 1 mova xm7, [cq-16*7] mova xm6, [cq-16*5] vinserti128 m7, [cq+16*1], 1 vinserti128 m6, [cq+16*3], 1 mova xm8, [cq-16*3] mova xm0, [cq-16*1] vinserti128 m8, [cq+16*5], 1 vinserti128 m0, [cq+16*7], 1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m5 punpckhwd m4, m5 punpcklwd m5, m7, m6 punpckhwd m7, m6 punpcklwd m6, m8, m0 punpckhwd m8, m0 REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 punpckldq m4, m5, m6 punpckhdq m5, m6 punpckldq m6, m7, m8 punpckhdq m7, m8 jmp tx2q .pass2: vpbroadcastd m8, [o(pw_1697x16)] REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 jmp m(idct_8x16_internal).end %macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] pmovzxbw m%3, [dstq+%5] %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif pmovzxbw m%4, [dstq+%6] %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 vpermq m%3, m%3, q3120 mova [dstq+%5], xm%3 vextracti128 [dstq+%6], m%3, 1 %endmacro %macro INV_TXFM_16X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x4 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd mov r2d, 2 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 pxor m3, m3 .dconly_loop: mova xm1, [dstq] vinserti128 m1, [dstq+strideq], 1 punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 paddw m1, m0 packuswb m1, m2 mova [dstq], xm1 vextracti128 [dstq+strideq], m1, 1 lea dstq, [dstq+strideq*2] dec r2d jg .dconly_loop RET %endif %endmacro INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova xm0, [cq+16*0] mova xm1, [cq+16*1] mova xm2, [cq+16*2] mova xm3, [cq+16*3] mova xm4, [cq+16*4] mova xm5, [cq+16*5] mova xm6, [cq+16*6] mova xm7, [cq+16*7] call m(idct_4x16_internal).main vinserti128 m6, m2, xm6, 1 vinserti128 m2, m0, xm4, 1 vinserti128 m0, m1, xm5, 1 vinserti128 m1, m3, xm7, 1 punpcklwd m3, m2, m6 punpckhwd m2, m6 vpbroadcastd m6, [o(pw_16384)] punpckhwd m4, m0, m1 punpcklwd m0, m1 mova m1, m6 jmp m(iadst_16x4_internal).pass1_end .pass2: call .main jmp m(iadst_16x4_internal).end ALIGN function_align .main: vpbroadcastd m6, [o(pd_2048)] IDCT4_1D 0, 1, 2, 3, 4, 5, 6 ret INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 call m(iadst_4x16_internal).main2 call m(iadst_4x16_internal).main_pass1_end punpcklwd m4, m3, m1 punpcklwd m5, m2, m0 punpckhwd m0, m1 punpckhwd m2, m3 vpbroadcastd m1, [o(pw_16384)] vinserti128 m3, m0, xm2, 1 vperm2i128 m2, m0, m2, 0x31 vinserti128 m0, m4, xm5, 1 vperm2i128 m4, m4, m5, 0x31 psubw m6, m7, m1 .pass1_end: pmulhrsw m3, m1 pmulhrsw m2, m6 pmulhrsw m4, m1 pmulhrsw m0, m6 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 punpckhwd m4, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: call .main .end: vpbroadcastd m4, [o(pw_2048)] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 WIN64_RESTORE_XMM .end2: pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 .end3: WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 lea dstq, [dstq+strideq*2] WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 RET ALIGN function_align .main: vpbroadcastd m6, [o(pw_m3344_3344)] vpbroadcastd m7, [o(pw_3803_1321)] vpbroadcastd m8, [o(pw_m1321_2482)] vpbroadcastd m9, [o(pw_2482_3344)] punpcklwd m4, m2, m0 ; in2 in0 l punpckhwd m2, m0 ; in2 in0 h psrld m5, m6, 16 pmaddwd m10, m6, m4 ; t2:02 l pmaddwd m6, m2 ; t2:02 h pmaddwd m0, m7, m4 ; t0:02 l pmaddwd m7, m2 ; t0:02 h pmaddwd m4, m8 ; t1:02 l pmaddwd m8, m2 ; t1:02 h punpckhwd m2, m3, m1 ; in3 in1 h punpcklwd m3, m1 ; in3 in1 l pmaddwd m1, m5, m2 ; t2:3 h pmaddwd m5, m3 ; t2:3 l paddd m6, m1 vpbroadcastd m1, [o(pd_2048)] paddd m10, m5 pmaddwd m5, m9, m3 pmaddwd m9, m2 paddd m0, m1 paddd m7, m1 paddd m0, m5 ; t0 + t3 + 2048 l paddd m7, m9 ; t0 + t3 + 2048 h vpbroadcastd m9, [o(pw_m3803_3344)] pmaddwd m5, m9, m2 pmaddwd m9, m3 paddd m10, m1 ; t2 + 2048 l paddd m6, m1 ; t2 + 2048 h paddd m5, m1 ; t1:13 + 2048 h paddd m1, m9 ; t1:13 + 2048 l vpbroadcastd m9, [o(pw_m3803_m6688)] pmaddwd m2, m9 pmaddwd m3, m9 paddd m5, m8 ; t1 + t3 + 2048 h paddd m1, m4 ; t1 + t3 + 2048 l paddd m8, m7 paddd m4, m0 paddd m2, m8 ; t0 + t1 - t3 + 2048 h paddd m3, m4 ; t0 + t1 - t3 + 2048 l REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 packssdw m0, m7 packssdw m1, m5 packssdw m3, m2 packssdw m2, m10, m6 ret INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 call m(iadst_4x16_internal).main2 call m(iadst_4x16_internal).main_pass1_end punpckhwd m4, m3, m2 punpckhwd m5, m1, m0 punpcklwd m0, m2 punpcklwd m1, m3 vpbroadcastd m6, [o(pw_16384)] vinserti128 m3, m0, xm1, 1 vperm2i128 m2, m0, m1, 0x31 vinserti128 m0, m4, xm5, 1 vperm2i128 m4, m4, m5, 0x31 psubw m1, m7, m6 jmp m(iadst_16x4_internal).pass1_end ALIGN function_align .pass2: call m(iadst_16x4_internal).main vpbroadcastd m4, [o(pw_2048)] REPX {pmulhrsw x, m4}, m3, m2, m1, m0 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 lea dstq, [dstq+strideq*2] WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 RET INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm4, [cq+16*1] vinserti128 m2, [cq+16*4], 1 vinserti128 m4, [cq+16*5], 1 mova xm0, [cq+16*2] mova xm1, [cq+16*3] vinserti128 m0, [cq+16*6], 1 vinserti128 m1, [cq+16*7], 1 vpbroadcastd m7, [o(pw_1697x16)] vpbroadcastd m8, [o(pw_16384)] punpcklwd m3, m2, m4 punpckhwd m2, m4 punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 punpckhwd m4, m0 pmulhrsw m0, m7, m1 pmulhrsw m5, m7, m2 pmulhrsw m6, m7, m3 pmulhrsw m7, m4 REPX {pmulhrsw x, m8}, m0, m5, m6, m7 paddsw m1, m0 paddsw m2, m5 paddsw m3, m6 paddsw m4, m7 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 jmp tx2q .pass2: vpbroadcastd m7, [o(pw_1697x8)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(iadst_16x4_internal).end %macro INV_TXFM_16X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x8 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 4 jmp m(inv_txfm_add_dct_dct_16x4).dconly %endif %endmacro %macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd vpbroadcastd m8, [o(pw_2896x8)] vpermq m0, [cq+32*0], q3120 add cq, 32*4 vpermq m7, [cq+32*3], q%1 vpermq m1, [cq-32*3], q%1 vpermq m6, [cq+32*2], q3120 vpermq m2, [cq-32*2], q3120 vpermq m5, [cq+32*1], q%1 vpermq m3, [cq-32*1], q%1 vpermq m4, [cq+32*0], q3120 REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 3120 call m(idct_8x16_internal).main vpbroadcastd m10, [o(pw_16384)] punpckhwd m8, m0, m2 punpcklwd m0, m2 punpckhwd m2, m1, m3 punpcklwd m1, m3 punpcklwd m9, m4, m6 punpckhwd m4, m6 punpcklwd m6, m5, m7 punpckhwd m5, m7 REPX {pmulhrsw x, m10}, m8, m1, m4, m6 .pass1_end: REPX {pmulhrsw x, m10}, m0, m2, m9, m5 punpckhwd m3, m0, m8 punpcklwd m0, m8 punpckhwd m8, m2, m1 punpcklwd m2, m1 punpcklwd m7, m9, m4 punpckhwd m9, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m8 punpckhdq m3, m8 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m8, m9, m5 punpckhdq m9, m5 vperm2i128 m4, m0, m6, 0x31 vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m8, 0x31 vinserti128 m2, xm8, 1 vperm2i128 m7, m3, m9, 0x31 vinserti128 m3, xm9, 1 jmp tx2q .pass2: call .main vpbroadcastd m8, [o(pw_2048)] .end: REPX {pmulhrsw x, m8}, m0, m2, m4, m6 .end2: REPX {pmulhrsw x, m8}, m1, m3, m5, m7 lea r3, [strideq*3] WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 .end3: pxor m0, m0 REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 .end4: lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 RET ALIGN function_align .main: vpbroadcastd m10, [o(pd_2048)] .main2: IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ret INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 call m(iadst_8x16_internal).main2 call m(iadst_8x16_internal).main_pass1_end psubw m11, m9, m10 punpcklwd m8, m0, m2 punpckhwd m0, m2 punpckhwd m2, m1, m3 punpcklwd m1, m3 punpcklwd m9, m4, m6 punpckhwd m4, m6 punpckhwd m6, m5, m7 punpcklwd m5, m7 REPX {pmulhrsw x, m11}, m8, m1, m4, m6 jmp m(idct_16x8_internal).pass1_end ALIGN function_align .pass2: call .main call .main_pass2_end pxor m8, m8 psubw m8, m9 REPX {pmulhrsw x, m9}, m0, m2, m4, m6 jmp m(idct_16x8_internal).end2 ALIGN function_align .main: vpbroadcastd m10, [o(pd_2048)] ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a psubsw m8, m2, m6 ; t6 paddsw m2, m6 ; t2 psubsw m6, m0, m4 ; t4 paddsw m0, m4 ; t0 psubsw m4, m5, m1 ; t7 paddsw m5, m1 ; t3 psubsw m1, m7, m3 ; t5 paddsw m7, m3 ; t1 ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a psubsw m9, m6, m8 ; t7 paddsw m6, m8 ; out6 psubsw m3, m7, m5 ; t3 paddsw m7, m5 ; -out7 psubsw m5, m0, m2 ; t2 paddsw m0, m2 ; out0 psubsw m2, m1, m4 ; t6 paddsw m1, m4 ; -out1 ret ALIGN function_align .main_pass1_end: vpbroadcastd m11, [o(pw_m2896_2896)] vpbroadcastd m12, [o(pw_2896_2896)] punpckhwd m4, m3, m5 punpcklwd m3, m5 pmaddwd m5, m11, m4 pmaddwd m4, m12 pmaddwd m8, m11, m3 pmaddwd m3, m12 REPX {paddd x, m10}, m5, m4, m8, m3 REPX {psrad x, 12 }, m5, m8, m4, m3 packssdw m3, m4 ; -out3 packssdw m4, m8, m5 ; out4 punpcklwd m5, m9, m2 punpckhwd m9, m2 pmaddwd m2, m12, m5 pmaddwd m5, m11 pmaddwd m12, m9 pmaddwd m11, m9 REPX {paddd x, m10}, m2, m5, m12, m11 REPX {psrad x, 12 }, m2, m12, m5, m11 packssdw m2, m12 ; out2 packssdw m5, m11 ; -out5 ret ALIGN function_align .main_pass2_end: vpbroadcastd m8, [o(pw_2896x8)] psubsw m4, m5, m3 paddsw m3, m5 psubsw m5, m2, m9 paddsw m2, m9 pmulhrsw m2, m8 ; out2 pmulhrsw m3, m8 ; -out3 pmulhrsw m4, m8 ; out4 pmulhrsw m5, m8 ; -out5 vpbroadcastd m9, [o(pw_2048)] ret INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 call m(iadst_8x16_internal).main2 call m(iadst_8x16_internal).main_pass1_end psubw m9, m10 punpcklwd m8, m6, m4 punpckhwd m6, m4 punpcklwd m4, m7, m5 punpckhwd m7, m5 punpckhwd m5, m3, m1 punpcklwd m3, m1 punpckhwd m1, m2, m0 punpcklwd m2, m0 REPX {pmulhrsw x, m10}, m8, m4, m5, m1 REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 punpcklwd m0, m7, m4 punpckhwd m7, m4 punpckhwd m4, m6, m8 punpcklwd m6, m8 punpckhwd m8, m3, m5 punpcklwd m3, m5 punpcklwd m5, m2, m1 punpckhwd m2, m1 punpckhdq m1, m0, m6 punpckldq m0, m6 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckhdq m4, m3, m5 punpckldq m3, m5 punpckldq m5, m8, m2 punpckhdq m8, m2 vinserti128 m2, m6, xm5, 1 vperm2i128 m6, m5, 0x31 vperm2i128 m5, m1, m4, 0x31 vinserti128 m1, xm4, 1 vperm2i128 m4, m0, m3, 0x31 vinserti128 m0, xm3, 1 vinserti128 m3, m7, xm8, 1 vperm2i128 m7, m8, 0x31 jmp tx2q .pass2: call m(iadst_16x8_internal).main call m(iadst_16x8_internal).main_pass2_end pxor m8, m8 psubw m8, m9 pmulhrsw m10, m7, m8 pmulhrsw m7, m0, m9 pmulhrsw m0, m6, m9 pmulhrsw m6, m1, m8 pmulhrsw m1, m5, m8 pmulhrsw m5, m2, m9 pmulhrsw m2, m4, m9 pmulhrsw m4, m3, m8 lea r3, [strideq*3] WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 WRITE_16X2 1, 2, 0, 1, strideq*2, r3 jmp m(idct_16x8_internal).end3 INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 mova xm7, [cq+16*0] mova xm2, [cq+16*1] add cq, 16*8 vpbroadcastd m3, [o(pw_2896x8)] vinserti128 m7, [cq+16*0], 1 vinserti128 m2, [cq+16*1], 1 mova xm6, [cq-16*6] mova xm4, [cq-16*5] vinserti128 m6, [cq+16*2], 1 vinserti128 m4, [cq+16*3], 1 mova xm8, [cq-16*4] mova xm5, [cq-16*3] vinserti128 m8, [cq+16*4], 1 vinserti128 m5, [cq+16*5], 1 mova xm0, [cq-16*2] mova xm1, [cq-16*1] vinserti128 m0, [cq+16*6], 1 vinserti128 m1, [cq+16*7], 1 vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m11, [o(pw_16384)] REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 punpcklwd m3, m7, m2 punpckhwd m7, m2 punpcklwd m2, m6, m4 punpckhwd m6, m4 punpcklwd m4, m8, m5 punpckhwd m8, m5 punpcklwd m5, m0, m1 punpckhwd m0, m1 punpckldq m1, m3, m2 punpckhdq m3, m2 punpckldq m2, m4, m5 punpckhdq m4, m5 punpckldq m5, m7, m6 punpckhdq m7, m6 punpckldq m6, m8, m0 punpckhdq m8, m0 REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m6 punpckhqdq m5, m6 punpcklqdq m6, m7, m8 punpckhqdq m7, m8 jmp tx2q .pass2: vpbroadcastd m8, [o(pw_4096)] jmp m(idct_16x8_internal).end %define o_base pw_5 + 128 %macro INV_TXFM_16X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 8 jmp m(inv_txfm_add_dct_dct_16x4).dconly %endif %endmacro %macro ITX_16X16_LOAD_COEFS 0 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] add cq, 32*8 mova m4, [cq-32*4] mova m5, [cq-32*3] mova m6, [cq-32*2] mova m7, [cq-32*1] mova m8, [cq+32*0] mova m9, [cq+32*1] mova m10, [cq+32*2] mova m11, [cq+32*3] mova m12, [cq+32*4] mova m13, [cq+32*5] mova m14, [cq+32*6] mova m15, [cq+32*7] mova [rsp], m15 %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main .pass1_end: vpbroadcastd m1, [o(pw_8192)] REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 vextracti128 [rsp+16*5], m8, 1 mova [rsp+16*1], xm8 .pass1_end2: vextracti128 [rsp+16*4], m0, 1 mova [rsp+16*0], xm0 REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 pmulhrsw m1, [rsp+32*1] vperm2i128 m8, m1, m9, 0x31 vinserti128 m1, xm9, 1 vperm2i128 m9, m2, m10, 0x31 vinserti128 m2, xm10, 1 vperm2i128 m10, m3, m11, 0x31 vinserti128 m3, xm11, 1 vperm2i128 m11, m4, m12, 0x31 vinserti128 m4, xm12, 1 vperm2i128 m12, m5, m13, 0x31 vinserti128 m5, xm13, 1 vperm2i128 m13, m6, m14, 0x31 vinserti128 m6, xm14, 1 vperm2i128 m14, m7, m15, 0x31 vinserti128 m7, xm15, 1 mova m15, [rsp+32*2] .pass1_end3: punpcklwd m0, m9, m10 punpckhwd m9, m10 punpcklwd m10, m15, m8 punpckhwd m15, m8 punpckhwd m8, m11, m12 punpcklwd m11, m12 punpckhwd m12, m13, m14 punpcklwd m13, m14 punpckhdq m14, m11, m13 punpckldq m11, m13 punpckldq m13, m15, m9 punpckhdq m15, m9 punpckldq m9, m10, m0 punpckhdq m10, m0 punpckhdq m0, m8, m12 punpckldq m8, m12 punpcklqdq m12, m13, m8 punpckhqdq m13, m8 punpcklqdq m8, m9, m11 punpckhqdq m9, m11 punpckhqdq m11, m10, m14 punpcklqdq m10, m14 punpcklqdq m14, m15, m0 punpckhqdq m15, m0 mova m0, [rsp] mova [rsp], m15 punpckhwd m15, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m15, m1 punpckhdq m15, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m15 punpcklqdq m6, m15 jmp tx2q .pass2: call .main .end: vpbroadcastd m1, [o(pw_2048)] REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 mova [rsp], m6 .end2: REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 pmulhrsw m1, [rsp+32*1] lea r3, [strideq*3] WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 .end3: pxor m2, m2 REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 lea dstq, [dstq+strideq*4] WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 WRITE_16X2 10, 11, 0, 1, strideq*2, r3 REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 lea dstq, [dstq+strideq*4] WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 WRITE_16X2 14, 15, 0, 1, strideq*2, r3 RET ALIGN function_align .main: vpbroadcastd m15, [o(pd_2048)] mova [rsp+gprsize+32*1], m1 mova [rsp+gprsize+32*2], m9 IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 mova m1, [rsp+gprsize+32*2] ; in9 mova [rsp+gprsize+32*2], m14 ; tmp7 mova m9, [rsp+gprsize+32*1] ; in1 mova [rsp+gprsize+32*1], m10 ; tmp5 mova m14, [rsp+gprsize+32*0] ; in15 mova [rsp+gprsize+32*0], m6 ; tmp3 IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 mova m6, [rsp+gprsize+32*1] ; tmp5 psubsw m15, m0, m14 ; out15 paddsw m0, m14 ; out0 psubsw m14, m2, m13 ; out14 paddsw m2, m13 ; out1 mova [rsp+gprsize+32*1], m2 psubsw m13, m4, m11 ; out13 paddsw m2, m4, m11 ; out2 psubsw m11, m8, m7 ; out11 paddsw m4, m8, m7 ; out4 mova m7, [rsp+gprsize+32*2] ; tmp7 psubsw m10, m6, m5 ; out10 paddsw m5, m6 ; out5 psubsw m8, m7, m9 ; out8 paddsw m7, m9 ; out7 psubsw m9, m12, m3 ; out9 paddsw m6, m12, m3 ; out6 mova m3, [rsp+gprsize+32*0] ; tmp3 psubsw m12, m3, m1 ; out12 paddsw m3, m1 ; out3 ret INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main call .main_pass1_end pmulhrsw m0, m1, [cq+32*0] pmulhrsw m2, m1, [cq+32*1] REPX {pmulhrsw x, m1}, m4, m6, m8, m10 pmulhrsw m12, m1, [cq+32*2] pmulhrsw m14, m1, [cq+32*3] vextracti128 [rsp+16*5], m8, 1 mova [rsp+16*1], xm8 pxor m8, m8 psubw m1, m8, m1 jmp m(idct_16x16_internal).pass1_end2 ALIGN function_align .pass2: call .main call .main_pass2_end REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 mova [rsp+32*0], m6 pxor m6, m6 psubw m1, m6, m1 jmp m(idct_16x16_internal).end2 ALIGN function_align .main: vpbroadcastd m15, [o(pd_2048)] mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*2], m4 ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2 ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 psubsw m0, m2, m10 ; t10a paddsw m2, m10 ; t2a psubsw m10, m13, m5 ; t11a paddsw m13, m5 ; t3a psubsw m5, m6, m14 ; t14a paddsw m6, m14 ; t6a psubsw m14, m9, m1 ; t15a paddsw m9, m1 ; t7a ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15 psubsw m1, m10, m14 ; t14a paddsw m10, m14 ; t10a psubsw m14, m0, m5 ; t15a paddsw m0, m5 ; t11a psubsw m5, m2, m6 ; t6 paddsw m2, m6 ; t2 psubsw m6, m13, m9 ; t7 paddsw m13, m9 ; t3 ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15 mova m9, [rsp+gprsize+32*0] ; in15 mova [rsp+gprsize+32*0], m10 ; t10a mova m4, [rsp+gprsize+32*1] ; in0 mova [rsp+gprsize+32*1], m6 ; t6a mova m6, [rsp+gprsize+32*2] ; in4 mova [rsp+gprsize+32*2], m2 ; t2 ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0 ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4 ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8 ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12 psubsw m10, m4, m8 ; t8a paddsw m8, m4 ; t0a psubsw m4, m9, m7 ; t9a paddsw m9, m7 ; t1a psubsw m7, m6, m12 ; t12a paddsw m6, m12 ; t4a psubsw m12, m11, m3 ; t13a paddsw m11, m3 ; t5a ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8 ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13 psubsw m3, m9, m11 ; t5 paddsw m9, m11 ; t1 psubsw m11, m4, m12 ; t12a paddsw m4, m12 ; t8a paddsw m12, m8, m6 ; t0 psubsw m8, m6 ; t4 paddsw m6, m10, m7 ; t9a psubsw m10, m7 ; t13a ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12 mova m7, [rsp+gprsize+32*0] ; t10a mova m2, [rsp+gprsize+32*1] ; t6a paddsw m15, m9, m13 ; -out15 psubsw m9, m13 ; t3a paddsw m13, m11, m1 ; -out13 psubsw m11, m1 ; t15a psubsw m1, m4, m7 ; t10 paddsw m7, m4 ; -out1 psubsw m4, m3, m2 ; t6 paddsw m3, m2 ; -out3 paddsw m2, m10, m14 ; out2 psubsw m10, m14 ; t14a paddsw m14, m6, m0 ; out14 psubsw m6, m0 ; t11 mova m0, [rsp+gprsize+32*2] ; t2 mova [rsp+gprsize+32*1], m7 psubsw m7, m12, m0 ; t2a paddsw m0, m12 ; out0 paddsw m12, m8, m5 ; out12 psubsw m8, m5 ; t7 ret ALIGN function_align .main_pass1_end: mova [cq+32*0], m0 mova [cq+32*1], m2 mova [cq+32*2], m12 mova [cq+32*3], m14 vpbroadcastd m14, [pw_m2896_2896] vpbroadcastd m12, [pw_2896_2896] vpbroadcastd m2, [pd_2048] punpcklwd m5, m11, m10 punpckhwd m11, m10 pmaddwd m10, m14, m5 pmaddwd m0, m14, m11 pmaddwd m5, m12 pmaddwd m11, m12 REPX {paddd x, m2}, m10, m0, m5, m11 REPX {psrad x, 12}, m10, m0, m5, m11 packssdw m10, m0 ; out10 packssdw m5, m11 ; -out5 punpcklwd m11, m8, m4 punpckhwd m8, m4 pmaddwd m4, m12, m11 pmaddwd m0, m12, m8 pmaddwd m11, m14 pmaddwd m8, m14 REPX {paddd x, m2}, m4, m0, m11, m8 REPX {psrad x, 12}, m4, m0, m11, m8 packssdw m4, m0 ; out4 packssdw m11, m8 ; -out11 punpcklwd m8, m9, m7 punpckhwd m9, m7 pmaddwd m7, m12, m8 pmaddwd m0, m12, m9 pmaddwd m8, m14 pmaddwd m9, m14 REPX {paddd x, m2}, m7, m0, m8, m9 REPX {psrad x, 12}, m7, m0, m8, m9 packssdw m7, m0 ; -out7 packssdw m8, m9 ; out8 punpckhwd m0, m6, m1 punpcklwd m6, m1 pmaddwd m1, m14, m0 pmaddwd m9, m14, m6 pmaddwd m0, m12 pmaddwd m6, m12 REPX {paddd x, m2}, m1, m9, m0, m6 REPX {psrad x, 12}, m1, m9, m0, m6 packssdw m9, m1 ; -out7 packssdw m6, m0 ; out8 vpbroadcastd m1, [o(pw_8192)] ret ALIGN function_align .main_pass2_end: ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to ; 16-bit here will produce the same result as using 32-bit intermediates. paddsw m5, m10, m11 ; -out5 psubsw m10, m11 ; out10 psubsw m11, m4, m8 ; -out11 paddsw m4, m8 ; out4 psubsw m8, m7, m9 ; out8 paddsw m7, m9 ; -out7 psubsw m9, m1, m6 ; -out9 paddsw m6, m1 ; out6 vpbroadcastd m1, [o(pw_2896x8)] REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 vpbroadcastd m1, [o(pw_2048)] ret INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call m(iadst_16x16_internal).main call m(iadst_16x16_internal).main_pass1_end pmulhrsw m6, m1 pmulhrsw m2, m1, m8 mova [rsp+32*2], m6 pmulhrsw m6, m1, m4 pmulhrsw m4, m1, m10 pmulhrsw m8, m1, [cq+32*3] pmulhrsw m10, m1, [cq+32*2] pmulhrsw m12, m1, [cq+32*1] pmulhrsw m14, m1, [cq+32*0] pxor m0, m0 psubw m0, m1 REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 pmulhrsw m1, m0, m9 pmulhrsw m9, m0, m13 pmulhrsw m0, [rsp+32*1] mova [rsp+16*0], xm15 mova [rsp+16*1], xm7 vperm2i128 m15, m15, m7, 0x31 vinserti128 m7, m2, xm14, 1 vperm2i128 m14, m2, m14, 0x31 vinserti128 m2, m9, xm5, 1 vperm2i128 m9, m9, m5, 0x31 vinserti128 m5, m4, xm12, 1 vperm2i128 m12, m4, m12, 0x31 vinserti128 m4, m11, xm3, 1 vperm2i128 m11, m11, m3, 0x31 vinserti128 m3, m10, xm6, 1 vperm2i128 m10, m10, m6, 0x31 vinserti128 m6, m1, xm0, 1 vperm2i128 m13, m1, m0, 0x31 vinserti128 m1, m8, [rsp+32*2], 1 vperm2i128 m8, m8, [rsp+32*2], 0x31 jmp m(idct_16x16_internal).pass1_end3 .pass2: call m(iadst_16x16_internal).main call m(iadst_16x16_internal).main_pass2_end pmulhrsw m0, m1 pmulhrsw m8, m1 mova [rsp+32*0], m0 mova [rsp+32*2], m8 pxor m0, m0 psubw m0, m1 pmulhrsw m8, m0, m7 pmulhrsw m7, m0, m9 pmulhrsw m9, m1, m6 pmulhrsw m6, m1, m10 pmulhrsw m10, m0, m5 pmulhrsw m5, m0, m11 pmulhrsw m11, m1, m4 pmulhrsw m4, m1, m12 pmulhrsw m12, m0, m3 pmulhrsw m3, m0, m13 pmulhrsw m13, m1, m2 pmulhrsw m1, m14 pmulhrsw m14, m0, [rsp+32*1] pmulhrsw m0, m15 lea r3, [strideq*3] WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 mova m15, [rsp+32*0] WRITE_16X2 3, 4, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 jmp m(idct_16x16_internal).end3 %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 pmulhrsw m%2, m%3, m%1 psraw m%2, 1 pavgw m%1, m%2 ; signs are guaranteed to be equal %endmacro INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 vpbroadcastd m7, [o(pw_1697x16)] mova xm0, [cq+16* 0] vinserti128 m0, [cq+16*16], 1 mova xm15, [cq+16* 1] vinserti128 m15, [cq+16*17], 1 mova xm1, [cq+16* 2] vinserti128 m1, [cq+16*18], 1 mova xm8, [cq+16* 3] vinserti128 m8, [cq+16*19], 1 mova xm2, [cq+16* 4] vinserti128 m2, [cq+16*20], 1 mova xm9, [cq+16* 5] vinserti128 m9, [cq+16*21], 1 mova xm3, [cq+16* 6] vinserti128 m3, [cq+16*22], 1 mova xm10, [cq+16* 7] add cq, 16*16 vinserti128 m10, [cq+16* 7], 1 mova xm4, [cq-16* 8] vinserti128 m4, [cq+16* 8], 1 mova xm11, [cq-16* 7] vinserti128 m11, [cq+16* 9], 1 mova xm5, [cq-16* 6] vinserti128 m5, [cq+16*10], 1 mova xm12, [cq-16* 5] vinserti128 m12, [cq+16*11], 1 mova xm13, [cq-16* 3] vinserti128 m13, [cq+16*13], 1 mova xm14, [cq-16* 1] vinserti128 m14, [cq+16*15], 1 REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ 10, 4, 11, 5, 12, 13, 14 mova xm6, [cq-16* 4] vinserti128 m6, [cq+16*12], 1 mova [rsp], m0 IDTX16B 6, 0, 7 mova xm0, [cq-16* 2] vinserti128 m0, [cq+16*14], 1 pmulhrsw m7, m0 psraw m7, 1 pavgw m7, m0 jmp m(idct_16x16_internal).pass1_end3 ALIGN function_align .pass2: vpbroadcastd m15, [o(pw_1697x16)] mova [rsp+32*1], m0 REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14 mova m0, [rsp+32*1] mova [rsp+32*1], m1 IDTX16 0, 1, 15 mova m1, [rsp+32*0] pmulhrsw m15, m1 paddsw m1, m1 paddsw m15, m1 jmp m(idct_16x16_internal).end %define o_base deint_shuf + 128 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 %if %3 vpbroadcastd m15, [o(pw_2896x8)] pmulhrsw m0, m15, [%1+%2*0] pmulhrsw m1, m15, [%1+%2*1] pmulhrsw m2, m15, [%1+%2*2] pmulhrsw m3, m15, [%1+%2*3] pmulhrsw m4, m15, [%1+%2*4] pmulhrsw m5, m15, [%1+%2*5] pmulhrsw m6, m15, [%1+%2*6] pmulhrsw m7, m15, [%1+%2*7] %else mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] mova m4, [%1+%2*4] mova m5, [%1+%2*5] mova m6, [%1+%2*6] mova m7, [%1+%2*7] %endif %endmacro %macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 %if %3 %if %3 == 1 vpbroadcastd m15, [o(pw_2896x8)] %endif pmulhrsw m8, m15, [%1+%2*0] pmulhrsw m9, m15, [%1+%2*1] pmulhrsw m10, m15, [%1+%2*2] pmulhrsw m11, m15, [%1+%2*3] pmulhrsw m12, m15, [%1+%2*4] pmulhrsw m13, m15, [%1+%2*5] pmulhrsw m14, m15, [%1+%2*6] pmulhrsw m15, [%1+%2*7] %else mova m8, [%1+%2*0] mova m9, [%1+%2*1] mova m10, [%1+%2*2] mova m11, [%1+%2*3] mova m12, [%1+%2*4] mova m13, [%1+%2*5] mova m14, [%1+%2*6] mova m15, [%1+%2*7] %endif %endmacro %macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8] punpcklwd m%1, m%2, m%2 pmulhrsw m%1, m%3 vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8] punpckhwd m%2, m%2 pmulhrsw m%2, m%3 %endmacro cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob %undef cmp cmp eobd, 106 jle .fast LOAD_8ROWS cq+32*1, 32*2 call m(idct_16x8_internal).main vperm2i128 m11, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 vinserti128 m1, xm5, 1 vperm2i128 m5, m2, m6, 0x31 vinserti128 m2, xm6, 1 vperm2i128 m6, m3, m7, 0x31 vinserti128 m3, xm7, 1 pxor m7, m7 REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpcklwd m3, m11, m4 punpckhwd m11, m4 punpckhwd m4, m5, m6 punpcklwd m5, m6 punpckhdq m6, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m5 punpckhdq m3, m5 punpckhdq m5, m11, m4 punpckldq m11, m4 punpckldq m4, m7, m1 punpckhdq m7, m1 punpckhqdq m12, m6, m0 punpcklqdq m0, m6 ; out4 punpckhqdq m13, m7, m4 punpcklqdq m4, m7 ; out5 punpckhqdq m14, m3, m2 punpcklqdq m2, m3 ; out6 punpckhqdq m15, m5, m11 punpcklqdq m11, m5 ; out7 mova [rsp+32*0], m0 mova [rsp+32*1], m4 mova [rsp+32*2], m2 .fast: LOAD_8ROWS cq+32*0, 32*2 call m(idct_16x8_internal).main vperm2i128 m8, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 vinserti128 m1, xm5, 1 vperm2i128 m5, m2, m6, 0x31 vinserti128 m2, xm6, 1 vperm2i128 m6, m3, m7, 0x31 vinserti128 m3, xm7, 1 vpbroadcastd m9, [o(pw_8192)] pxor m7, m7 REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m8, m4 punpcklwd m8, m4 punpckhwd m4, m5, m6 punpcklwd m5, m6 punpckhdq m6, m0, m2 punpckldq m0, m2 punpckldq m2, m8, m5 punpckhdq m8, m5 punpckhdq m5, m3, m4 punpckldq m3, m4 punpckhdq m4, m7, m1 punpckldq m7, m1 punpcklqdq m1, m7, m4 punpckhqdq m7, m4 ; out9 punpckhqdq m4, m2, m8 ; out10 punpcklqdq m2, m8 punpckhqdq m8, m3, m5 punpcklqdq m3, m5 punpckhqdq m5, m0, m6 ; out8 punpcklqdq m0, m6 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 cmp eobd, 106 jg .full mova [rsp+32*0], m5 mova [rsp+32*1], m7 mova [rsp+32*2], m4 pmulhrsw m11, m9, m8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call .main_fast jmp .pass2 .dconly: movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd pmulhrsw xm0, xm2 psrlw xm2, 2 ; pw_2048 pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mov r2d, 8 jmp m(inv_txfm_add_dct_dct_8x8).end2 .full: REPX {pmulhrsw x, m9}, m12, m13, m14, m15 pmulhrsw m6, m9, [rsp+32*2] mova [rsp+32*2], m4 pmulhrsw m4, m9, [rsp+32*0] mova [rsp+32*0], m5 pmulhrsw m5, m9, [rsp+32*1] mova [rsp+32*1], m7 pmulhrsw m7, m9, m11 pmulhrsw m11, m9, m8 call .main .pass2: vpbroadcastd m12, [o(pw_2048)] REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m13, m14, m15 pmulhrsw m12, [rsp] REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 mova [rsp+32*0], m4 mova [rsp+32*1], m6 lea r3, [strideq*3] WRITE_8X4 0, 1, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 2, 3, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 [rsp+32*0], 5, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 [rsp+32*1], 7, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 8, 9, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 10, 11, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 12, 13, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 14, 15, 4, 6 RET ALIGN function_align .main_fast: ; bottom half is zero call m(idct_8x16_internal).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] mova [rsp+gprsize+1*32], m1 mova m0, [rsp+gprsize+2*32] mova [rsp+gprsize+2*32], m6 lea r5, [rax-(o_base)+pw_201_4091x8] ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main2 ALIGN function_align .main: call m(idct_8x16_internal).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] mova [rsp+gprsize+1*32], m1 mova m0, [rsp+gprsize+2*32] mova [rsp+gprsize+2*32], m6 punpcklwd m1, m15, m8 ; in31 in1 punpckhwd m8, m15 ; in3 in29 punpcklwd m15, m14, m9 ; in27 in5 punpckhwd m9, m14 ; in7 in25 punpcklwd m14, m13, m0 ; in23 in9 punpckhwd m0, m13 ; in11 in21 punpcklwd m13, m12, m11 ; in19 in13 punpckhwd m11, m12 ; in15 in17 ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a .main2: psubsw m6, m1, m11 ; t17 t30 paddsw m1, m11 ; t16 t31 psubsw m11, m9, m14 ; t18 t29 paddsw m9, m14 ; t19 t28 psubsw m14, m15, m0 ; t21 t26 paddsw m15, m0 ; t20 t27 psubsw m0, m8, m13 ; t22 t25 paddsw m8, m13 ; t23 t24 ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a psubsw m13, m1, m9 ; t19a t28a paddsw m1, m9 ; t16a t31a psubsw m9, m8, m15 ; t20a t27a paddsw m8, m15 ; t23a t24a psubsw m15, m6, m11 ; t18 t29 paddsw m6, m11 ; t17 t30 psubsw m11, m0, m14 ; t21 t26 paddsw m0, m14 ; t22 t25 ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28 ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27 ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a vbroadcasti128 m12, [o(deint_shuf)] psubsw m14, m1, m8 ; t23 t24 paddsw m1, m8 ; t16 t31 psubsw m8, m6, m0 ; t22a t25a paddsw m6, m0 ; t17a t30a psubsw m0, m15, m11 ; t21 t26 paddsw m15, m11 ; t18 t29 psubsw m11, m13, m9 ; t20a t27a paddsw m13, m9 ; t19a t28a REPX {pshufb x, m12}, m1, m6, m15, m13 ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a vpbroadcastd m9, [o(pw_m2896_2896)] ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 vpbroadcastd m12, [o(pw_2896_2896)] ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a vpbroadcastd m12, [o(pw_2896_2896)] ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 shufps m9, m14, m8, q1032 ; t23a t22 vpblendd m14, m8, 0xcc ; t24a t25 shufps m8, m11, m0, q1032 ; t20 t21a vpblendd m11, m0, 0xcc ; t27 t26a punpcklqdq m0, m1, m6 ; t16 t17a punpckhqdq m1, m6 ; t31 t30a psubsw m10, m5, m8 ; out20 out21 paddsw m5, m8 ; out11 out10 psubsw m6, m3, m14 ; out24 out25 paddsw m3, m14 ; out7 out6 psubsw m8, m7, m0 ; out16 out17 paddsw m7, m0 ; out15 out14 mova m0, [rsp+gprsize+0*32] punpcklqdq m12, m13, m15 ; t19a t18 punpckhqdq m13, m15 ; t28a t29 psubsw m15, m0, m1 ; out31 out30 paddsw m0, m1 ; out0 out1 mova m1, [rsp+gprsize+1*32] mova [rsp+gprsize+0*32], m6 mova m6, [rsp+gprsize+2*32] psubsw m14, m1, m13 ; out28 out29 paddsw m1, m13 ; out3 out2 psubsw m13, m2, m11 ; out27 out26 paddsw m2, m11 ; out4 out5 psubsw m11, m4, m9 ; out23 out22 paddsw m4, m9 ; out8 out9 psubsw m9, m6, m12 ; out19 out18 paddsw m6, m12 ; out12 out13 ret %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] vbroadcasti128 m%1, [cq+16*%3] vbroadcasti128 m%2, [cq+16*%4] shufpd m%1, m%2, 0x0c %endmacro cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 8 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 pxor m3, m3 .dconly_loop: mova m1, [dstq] punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 paddw m1, m0 packuswb m1, m2 mova [dstq], m1 add dstq, strideq dec r2d jg .dconly_loop RET .normal: PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob %undef cmp LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 add cq, 16*16 LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 mova [rsp+32*0], m4 mova [rsp+32*1], m5 mova [rsp+32*2], m6 cmp eobd, 106 jg .full pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(inv_txfm_add_dct_dct_8x32).main_fast jmp .pass2 .full: LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 add cq, 16*8 LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 call m(inv_txfm_add_dct_dct_8x32).main .pass2: vpbroadcastd m12, [o(pw_8192)] REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 mova [rsp+32*1], m9 mova [rsp+32*2], m10 punpckhwd m9, m0, m2 punpcklwd m0, m2 punpckhwd m2, m1, m3 punpcklwd m1, m3 punpcklwd m10, m4, m6 punpckhwd m4, m6 punpcklwd m6, m5, m7 punpckhwd m5, m7 punpckhwd m3, m0, m9 punpcklwd m0, m9 punpckhwd m9, m2, m1 punpcklwd m2, m1 punpcklwd m7, m10, m4 punpckhwd m10, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m9 punpckhdq m3, m9 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m9, m10, m5 punpckhdq m10, m5 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 pmulhrsw m12, [rsp+32*0] mova [rsp+32*0], m8 vperm2i128 m4, m0, m6, 0x31 vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m9, 0x31 vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m10, 0x31 vinserti128 m3, xm10, 1 call m(idct_16x8_internal).main vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 lea r2, [strideq*3] WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r2 lea r3, [dstq+strideq*4] %define dstq r3 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r2 mova m0, [rsp+32*0] mova m1, [rsp+32*1] mova m2, [rsp+32*2] punpckhwd m7, m0, m2 punpcklwd m0, m2 punpckhwd m2, m1, m11 punpcklwd m1, m11 punpckhwd m4, m12, m14 punpcklwd m12, m14 punpckhwd m5, m13, m15 punpcklwd m13, m15 punpckhwd m3, m0, m7 punpcklwd m0, m7 punpckhwd m9, m2, m1 punpcklwd m2, m1 punpcklwd m7, m12, m4 punpckhwd m12, m4 punpcklwd m4, m5, m13 punpckhwd m5, m13 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m9 punpckhdq m3, m9 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m9, m12, m5 punpckhdq m12, m5 vperm2i128 m4, m0, m6, 0x31 vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m9, 0x31 vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m12, 0x31 vinserti128 m3, xm12, 1 call m(idct_16x8_internal).main2 vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 add r0, 16 add r3, 16 %define dstq r0 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r2 %define dstq r3 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r2 RET cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob vpbroadcastd m9, [pw_5] lea r4, [strideq*3] sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) .loop: mova xm0,[cq+16* 0] mova xm1, [cq+16* 4] vinserti128 m0, [cq+16* 1], 1 vinserti128 m1, [cq+16* 5], 1 pxor m8, m8 mova [cq+32*0], m8 mova [cq+32*2], m8 add cq, 16*16 mova xm2, [cq-16* 8] mova xm3, [cq-16* 4] vinserti128 m2, [cq-16* 7], 1 vinserti128 m3, [cq-16* 3], 1 mova xm4, [cq+16* 0] mova xm5, [cq+16* 4] vinserti128 m4, [cq+16* 1], 1 vinserti128 m5, [cq+16* 5], 1 mova xm6, [cq+16* 8] mova xm7, [cq+16*12] vinserti128 m6, [cq+16* 9], 1 vinserti128 m7, [cq+16*13], 1 REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose8x8 REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 add dstq, strideq WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 add dstq, strideq WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 add dstq, strideq WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 add dstq, strideq sub cq, 16*16-32 lea dstq, [dstq+r4*4] add eobd, 0x80000000 jnc .loop RET ALIGN function_align .transpose8x8: punpckhwd m8, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m8, m1 punpckhdq m8, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m8 punpcklqdq m6, m8 ret cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob add cq, 16*8 vpbroadcastd m9, [pw_4096] lea r4, [strideq*3] lea r5, [dstq+strideq*4] sub eobd, 107 .loop: mova xm0, [cq-16*8] mova xm1, [cq-16*7] vinserti128 m0, [cq+16*0], 1 vinserti128 m1, [cq+16*1], 1 mova xm2, [cq-16*6] mova xm3, [cq-16*5] vinserti128 m2, [cq+16*2], 1 vinserti128 m3, [cq+16*3], 1 mova xm4, [cq-16*4] mova xm5, [cq-16*3] vinserti128 m4, [cq+16*4], 1 vinserti128 m5, [cq+16*5], 1 mova xm6, [cq-16*2] mova xm7, [cq-16*1] vinserti128 m6, [cq+16*6], 1 vinserti128 m7, [cq+16*7], 1 pxor m8, m8 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 %define dstq r5 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r4 add cq, 16*16 add r0, 16 add r5, 16 add eobd, 0x80000000 jnc .loop RET %define o_base pw_5 + 128 %macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs %if %3 vpbroadcastd m15, [o(pw_2896x8)] pmulhrsw m0, m15, [%1+%2* 0] pmulhrsw m1, m15, [%1+%2* 1] pmulhrsw m2, m15, [%1+%2* 2] pmulhrsw m3, m15, [%1+%2* 3] pmulhrsw m4, m15, [%1+%2* 4] pmulhrsw m5, m15, [%1+%2* 5] pmulhrsw m6, m15, [%1+%2* 6] pmulhrsw m7, m15, [%1+%2* 7] pmulhrsw m8, m15, [%1+%2* 8] pmulhrsw m9, m15, [%1+%2* 9] pmulhrsw m10, m15, [%1+%2*10] pmulhrsw m11, m15, [%1+%2*11] pmulhrsw m12, m15, [%1+%2*12] pmulhrsw m13, m15, [%1+%2*13] pmulhrsw m14, m15, [%1+%2*14] pmulhrsw m15, [%1+%2*15] %else mova m0, [%1+%2* 0] mova m1, [%1+%2* 1] mova m2, [%1+%2* 2] mova m3, [%1+%2* 3] mova m4, [%1+%2* 4] mova m5, [%1+%2* 5] mova m6, [%1+%2* 6] mova m7, [%1+%2* 7] mova m8, [%1+%2* 8] mova m9, [%1+%2* 9] mova m10, [%1+%2*10] mova m11, [%1+%2*11] mova m12, [%1+%2*12] mova m13, [%1+%2*13] mova m14, [%1+%2*14] mova m15, [%1+%2*15] %endif mova [rsp], m15 %if %4 pxor m15, m15 REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15 %endif %endmacro %macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] mova m%4, [%2] paddsw m%3, m%1, m%4 psubsw m%1, m%4 pmovzxbw m%4, [dstq+%6] pmulhrsw m%3, m%5 pmulhrsw m%1, m%5 paddw m%3, m%4 pmovzxbw m%4, [r2+%7] paddw m%1, m%4 packuswb m%3, m%1 vpermq m%3, m%3, q3120 mova [dstq+%6], xm%3 vextracti128 [r2+%7], m%3, 1 %endmacro cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3 %undef cmp LOAD_16ROWS cq, 64, 1 call m(idct_16x16_internal).main lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] lea tmp3q, [tmp1q+32*16] mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_16384)] call .transpose_2x8x8_round mova m15, [rsp+32*0] mova [tmp3q-32*4+ 0], xm0 vextracti128 [tmp3q+32*0+ 0], m0, 1 mova [tmp3q-32*3+ 0], xm2 vextracti128 [tmp3q+32*1+ 0], m2, 1 mova [tmp3q-32*2+ 0], xm4 vextracti128 [tmp3q+32*2+ 0], m4, 1 mova [tmp3q-32*1+ 0], xm6 vextracti128 [tmp3q+32*3+ 0], m6, 1 mova [tmp3q-32*4+16], xm8 vextracti128 [tmp3q+32*0+16], m8, 1 mova [tmp3q-32*3+16], xm10 vextracti128 [tmp3q+32*1+16], m10, 1 mova [tmp3q-32*2+16], xm12 vextracti128 [tmp3q+32*2+16], m12, 1 mova [tmp3q-32*1+16], xm14 vextracti128 [tmp3q+32*3+16], m14, 1 cmp eobd, 150 jg .full vinserti128 m0, m1, xm9, 1 vperm2i128 m4, m1, m9, 0x31 vinserti128 m2, m5, xm13, 1 vperm2i128 m6, m5, m13, 0x31 vinserti128 m1, m3, xm11, 1 vperm2i128 m5, m3, m11, 0x31 vinserti128 m3, m7, xm15, 1 vperm2i128 m7, m7, m15, 0x31 call .main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp .idct16 .dconly: movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 16 jmp m(inv_txfm_add_dct_dct_16x4).dconly .full: mova [tmp1q-32*4], m1 mova [tmp1q-32*3], m3 mova [tmp1q-32*2], m5 mova [tmp1q-32*1], m7 mova [tmp1q+32*0], m9 mova [tmp1q+32*1], m11 mova [tmp1q+32*2], m13 mova [tmp1q+32*3], m15 LOAD_16ROWS cq+32, 64, 1 call m(idct_16x16_internal).main lea r2, [tmp3q+32*8] mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_16384)] call .transpose_2x8x8_round mova m15, [rsp+32*0] mova [r2-32*4+ 0], xm0 vextracti128 [r2+32*0+ 0], m0, 1 mova [r2-32*3+ 0], xm2 vextracti128 [r2+32*1+ 0], m2, 1 mova [r2-32*2+ 0], xm4 vextracti128 [r2+32*2+ 0], m4, 1 mova [r2-32*1+ 0], xm6 vextracti128 [r2+32*3+ 0], m6, 1 mova [r2-32*4+16], xm8 vextracti128 [r2+32*0+16], m8, 1 mova [r2-32*3+16], xm10 vextracti128 [r2+32*1+16], m10, 1 mova [r2-32*2+16], xm12 vextracti128 [r2+32*2+16], m12, 1 mova [r2-32*1+16], xm14 vextracti128 [r2+32*3+16], m14, 1 vinserti128 m8, m1, xm9, 1 vperm2i128 m12, m1, m9, 0x31 mova xm0, [tmp1q-32*4] mova xm1, [tmp1q-32*3] vinserti128 m0, [tmp1q+32*0], 1 vinserti128 m1, [tmp1q+32*1], 1 vinserti128 m10, m5, xm13, 1 vperm2i128 m14, m5, m13, 0x31 mova xm4, [tmp1q-32*4+16] mova xm5, [tmp1q-32*3+16] vinserti128 m4, [tmp1q+32*0+16], 1 vinserti128 m5, [tmp1q+32*1+16], 1 vinserti128 m9, m3, xm11, 1 vperm2i128 m13, m3, m11, 0x31 mova xm2, [tmp1q-32*2] mova xm3, [tmp1q-32*1] vinserti128 m2, [tmp1q+32*2], 1 vinserti128 m3, [tmp1q+32*3], 1 vinserti128 m11, m7, xm15, 1 vperm2i128 m15, m7, m15, 0x31 mova xm6, [tmp1q-32*2+16] mova xm7, [tmp1q-32*1+16] vinserti128 m6, [tmp1q+32*2+16], 1 vinserti128 m7, [tmp1q+32*3+16], 1 call .main_oddhalf LOAD_8ROWS_H r2-32*4, 32 .idct16: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 call m(idct_16x16_internal).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq call .pass2_end RET ALIGN function_align .main_oddhalf_fast: ; lower half is zero mova [rsp+gprsize+32*1], m7 pxor m7, m7 mova [rsp+gprsize+32*0], m7 mova [rsp+gprsize+32*2], m7 vpbroadcastd m11, [o(pw_3703x8)] vpbroadcastd m7, [o(pw_1751x8)] vpbroadcastd m12, [o(pw_m1380x8)] vpbroadcastd m8, [o(pw_3857x8)] vpbroadcastd m13, [o(pw_3973x8)] vpbroadcastd m15, [o(pw_995x8)] pmulhrsw m11, m4 ; t29a pmulhrsw m4, m7 ; t18a pmulhrsw m12, m3 ; t19a pmulhrsw m3, m8 ; t28a pmulhrsw m13, m2 ; t27a pmulhrsw m2, m15 ; t20a vpbroadcastd m10, [o(pw_m2106x8)] vpbroadcastd m7, [o(pw_3513x8)] vpbroadcastd m9, [o(pw_3290x8)] vpbroadcastd m8, [o(pw_2440x8)] vpbroadcastd m14, [o(pw_m601x8)] vpbroadcastd m15, [o(pw_4052x8)] pmulhrsw m10, m5 ; t21a pmulhrsw m5, m7 ; t26a pmulhrsw m9, m6 ; t25a pmulhrsw m6, m8 ; t22a pmulhrsw m14, m1 ; t23a pmulhrsw m1, m15 ; t24a vpbroadcastd m15, [o(pd_2048)] jmp .main2 ALIGN function_align .main_oddhalf: mova [rsp+gprsize+32*0], m15 mova [rsp+gprsize+32*1], m7 mova [rsp+gprsize+32*2], m8 vpbroadcastd m15, [o(pd_2048)] ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a .main2: psubsw m7, m12, m4 ; t18 paddsw m12, m4 ; t19 psubsw m4, m2, m10 ; t21 paddsw m2, m10 ; t20 psubsw m10, m14, m6 ; t22 paddsw m14, m6 ; t23 psubsw m6, m1, m9 ; t25 paddsw m1, m9 ; t24 psubsw m9, m13, m5 ; t26 paddsw m13, m5 ; t27 psubsw m5, m3, m11 ; t29 paddsw m3, m11 ; t28 ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a psubsw m8, m14, m2 ; t20a paddsw m14, m2 ; t23a psubsw m2, m1, m13 ; t27a paddsw m1, m13 ; t24a psubsw m13, m6, m9 ; t21 paddsw m6, m9 ; t22 psubsw m9, m10, m4 ; t26 paddsw m10, m4 ; t25 ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a mova m4, [rsp+gprsize+32*0] ; in31 mova [rsp+gprsize+32*0], m6 ; t22 mova m6, [rsp+gprsize+32*1] ; in15 mova [rsp+gprsize+32*1], m14 ; t23a mova m14, [rsp+gprsize+32*2] ; in17 mova [rsp+gprsize+32*2], m1 ; t24a ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a psubsw m1, m0, m14 ; t17 paddsw m0, m14 ; t16 psubsw m14, m4, m6 ; t30 paddsw m4, m6 ; t31 ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a psubsw m6, m0, m12 ; t19a paddsw m0, m12 ; t16a psubsw m12, m4, m3 ; t28a paddsw m4, m3 ; t31a psubsw m3, m14, m5 ; t18 paddsw m14, m5 ; t17 psubsw m5, m1, m7 ; t29 paddsw m1, m7 ; t30 ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 psubsw m7, m1, m10 ; t25a paddsw m1, m10 ; t30a psubsw m10, m5, m9 ; t21 paddsw m5, m9 ; t18 psubsw m9, m12, m2 ; t20a paddsw m12, m2 ; t19a psubsw m2, m3, m13 ; t26 paddsw m3, m13 ; t29 psubsw m13, m6, m8 ; t27a paddsw m6, m8 ; t28a mova [tmp1q-32*2], m5 mova [tmp1q-32*1], m12 mova [tmp2q+32*0], m6 mova [tmp2q+32*1], m3 mova [tmp2q+32*2], m1 mova m5, [rsp+gprsize+32*0] ; t22 mova m6, [rsp+gprsize+32*1] ; t23 mova m3, [rsp+gprsize+32*2] ; t24a psubsw m1, m14, m5 ; t22a paddsw m14, m5 ; t17a psubsw m5, m0, m6 ; t23 paddsw m0, m6 ; t16 psubsw m6, m4, m3 ; t24 paddsw m4, m3 ; t31 vpbroadcastd m8, [o(pw_m2896_2896)] vpbroadcastd m3, [o(pw_2896_2896)] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m14 mova [tmp2q+32*3], m4 ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a mova [tmp1q+32*0], m13 mova [tmp1q+32*1], m2 mova [tmp1q+32*2], m7 mova [tmp1q+32*3], m6 mova [tmp2q-32*4], m5 mova [tmp2q-32*3], m1 mova [tmp2q-32*2], m10 mova [tmp2q-32*1], m9 ret ALIGN function_align .transpose_2x8x8_round: punpckhwd m6, m12, m13 punpcklwd m12, m13 punpckhwd m13, m8, m9 punpcklwd m8, m9 punpckhwd m9, m14, m15 punpcklwd m14, m15 punpckhwd m15, m10, m11 punpcklwd m10, m11 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 punpckhdq m11, m8, m10 punpckldq m8, m10 punpckldq m10, m12, m14 punpckhdq m12, m14 punpckhdq m14, m13, m15 punpckldq m13, m15 punpckldq m15, m6, m9 punpckhdq m6, m9 punpckhqdq m9, m8, m10 punpcklqdq m8, m10 punpcklqdq m10, m11, m12 punpckhqdq m11, m12 punpcklqdq m12, m13, m15 punpckhqdq m13, m15 punpckhqdq m15, m14, m6 punpcklqdq m14, m6 pmulhrsw m6, m7, [rsp+gprsize+32*0] REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 pmulhrsw m7, [rsp+gprsize+32*1] mova [rsp+gprsize+32*0], m15 punpckhwd m15, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m15, m1 punpckhdq m15, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m15 punpcklqdq m6, m15 ret ALIGN function_align .pass2_end: mova [rsp+gprsize+32*0], m7 mova [rsp+gprsize+32*2], m15 vpbroadcastd m15, [o(pw_2048)] IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, [rsp+gprsize+32*1] IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 add dstq, strideq sub r2, strideq IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m7, [rsp+gprsize+32*0] mova m1, [rsp+gprsize+32*2] IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 ret ; Perform the final sumsub step and YMM lane shuffling %macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] mova m%3, [tmp2q+32*( 3-%1)] psubsw m%4, m%1, m%3 paddsw m%1, m%3 mova m%3, [tmp1q+32*(11-%2)] mova [tmp1q+32*(11-%2)+16], xm%4 vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 paddsw m%4, m%2, m%3 psubsw m%2, m%3 mova [tmp1q+32*(11-%2)], xm%2 vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 vperm2i128 m%2, m%1, m%4, 0x31 vinserti128 m%1, xm%4, 1 %endmacro cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 16 jmp m(inv_txfm_add_dct_dct_32x8).dconly .normal: PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 vpbroadcastd m15, [o(pw_2896x8)] pmulhrsw m0, m15, [cq+32* 1] pmulhrsw m1, m15, [cq+32* 3] pmulhrsw m2, m15, [cq+32* 5] pmulhrsw m3, m15, [cq+32* 7] pmulhrsw m4, m15, [cq+32* 9] pmulhrsw m5, m15, [cq+32*11] pmulhrsw m6, m15, [cq+32*13] pmulhrsw m7, m15, [cq+32*15] pmulhrsw m8, m15, [cq+32*17] pmulhrsw m9, m15, [cq+32*19] pmulhrsw m10, m15, [cq+32*21] pmulhrsw m11, m15, [cq+32*23] pmulhrsw m12, m15, [cq+32*25] pmulhrsw m13, m15, [cq+32*27] pmulhrsw m14, m15, [cq+32*29] pmulhrsw m15, [cq+32*31] lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32).main_oddhalf LOAD_16ROWS cq+32*0, 32*2, 1, 0 pxor m15, m15 mov r3d, 8 .zero_loop: mova [cq+32*0], m15 mova [cq+32*1], m15 mova [cq+32*2], m15 mova [cq+32*3], m15 add cq, 32*4 dec r3d jg .zero_loop call m(idct_16x16_internal).main call .pass1_end lea r2, [strideq*3] mov r3, dstq .pass2: vpbroadcastd m7, [o(pw_16384)] call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round call m(idct_16x16_internal).main mova [rsp+32*2], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m2, m3, m0 WRITE_16X2 2, 3, 1, 2, strideq*2, r2 pmulhrsw m1, m15, [rsp+32*1] WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 lea dstq, [dstq+strideq*4] REPX {pmulhrsw x, m15}, m4, m5, m6, m7 WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 WRITE_16X2 6, 7, 2, 3, strideq*2, r2 lea dstq, [dstq+strideq*4] REPX {pmulhrsw x, m15}, m8, m9, m10, m11 WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 WRITE_16X2 10, 11, 2, 3, strideq*2, r2 lea dstq, [dstq+strideq*4] REPX {pmulhrsw x, m15}, m11, m12, m13, m14 pmulhrsw m15, [rsp+32*2] WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 WRITE_16X2 14, 15, 2, 3, strideq*2, r2 test r3, r3 jnz .right_half RET .right_half: LOAD_8ROWS tmp1q-32*4, 32 LOAD_8ROWS_H tmp2q-32*4, 32 lea dstq, [r3+16] xor r3d, r3d mova [rsp+32*0], m6 mova [rsp+32*1], m7 jmp .pass2 ALIGN function_align .pass1_end: mova [rsp+gprsize+32*0], m9 IDCT32_PASS1_END 0, 8, 1, 9 IDCT32_PASS1_END 2, 10, 1, 9 IDCT32_PASS1_END 3, 11, 1, 9 IDCT32_PASS1_END 4, 12, 1, 9 IDCT32_PASS1_END 5, 13, 1, 9 IDCT32_PASS1_END 6, 14, 1, 9 IDCT32_PASS1_END 7, 15, 1, 9 mova m1, [rsp+gprsize+32*1] mova m9, [rsp+gprsize+32*0] mova [rsp+gprsize+32*0], m6 mova [rsp+gprsize+32*1], m7 IDCT32_PASS1_END 1, 9, 6, 7 ret cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob %undef cmp lea rax, [o_base] vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m12, [o(pw_8192)] cmp eobd, 43 ; if (eob > 43) setg r4b ; iteration_count++ cmp eobd, 150 ; if (eob > 150) setg al ; iteration_count++ add eobd, -279 ; if (eob > 278) adc r4b, al ; iteration_count++ lea r3, [strideq*3] mov rax, cq paddw m11, m12, m12 ; pw_16384 .loop: mova xm0, [cq+64* 0] mova xm1, [cq+64* 1] vinserti128 m0, [cq+64* 8], 1 vinserti128 m1, [cq+64* 9], 1 mova xm2, [cq+64* 2] mova xm3, [cq+64* 3] vinserti128 m2, [cq+64*10], 1 vinserti128 m3, [cq+64*11], 1 mova xm4, [cq+64* 4] mova xm5, [cq+64* 5] vinserti128 m4, [cq+64*12], 1 vinserti128 m5, [cq+64*13], 1 mova xm6, [cq+64* 6] mova xm7, [cq+64* 7] vinserti128 m6, [cq+64*14], 1 vinserti128 m7, [cq+64*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] add cq, 16 dec r4b jge .loop sub cq, 32 pxor m0, m0 mov r0d, 8 cmp cq, rax ja .zero_loop .zero_loop_half: mova [rax+64*0], m0 mova [rax+64*1], m0 add rax, 64*4 mova [rax-64*2], m0 mova [rax-64*1], m0 sub r0d, 2 jg .zero_loop_half RET .zero_loop: mova [rax+32*0], m0 mova [rax+32*1], m0 mova [rax+32*2], m0 mova [rax+32*3], m0 add rax, 32*4 dec r0d jg .zero_loop RET cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob %undef cmp lea rax, [o_base] vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m11, [o(pw_2048)] cmp eobd, 35 ; if (eob > 35) setg r4b ; iteration_count++ cmp eobd, 150 ; if (eob > 150) setg r3b ; iteration_count += 2 lea r4d, [r4+r3*2] lea r3, [strideq*3] mov r5, dstq mov rax, cq .loop: mova xm0, [cq+32* 0] mova xm1, [cq+32* 1] vinserti128 m0, [cq+32* 8], 1 vinserti128 m1, [cq+32* 9], 1 mova xm2, [cq+32* 2] mova xm3, [cq+32* 3] vinserti128 m2, [cq+32*10], 1 vinserti128 m3, [cq+32*11], 1 mova xm4, [cq+32* 4] mova xm5, [cq+32* 5] vinserti128 m4, [cq+32*12], 1 vinserti128 m5, [cq+32*13], 1 mova xm6, [cq+32* 6] mova xm7, [cq+32* 7] vinserti128 m6, [cq+32*14], 1 vinserti128 m7, [cq+32*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] add cq, 16 dec r4b jl .ret test r4b, 1 jz .loop add cq, 32*15 lea dstq, [r5+16] jmp .loop .ret: sub cd, eax pxor m0, m0 add cd, 384 .zero_loop: mova [rax+32*0], m0 mova [rax+32*1], m0 mova [rax+32*2], m0 mova [rax+32*3], m0 add rax, 32*4 sub cd, 128 jge .zero_loop RET cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 32 jmp m(inv_txfm_add_dct_dct_32x8).dconly .normal: PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 %undef cmp lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] sub eobd, 136 mov tmp4d, eobd .pass1_loop: LOAD_8ROWS cq+64*1, 64*2 pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 test tmp4d, tmp4d jl .fast LOAD_8ROWS_H cq+64*17, 64*2 call m(inv_txfm_add_dct_dct_16x32).main_oddhalf LOAD_8ROWS_H cq+64*16, 64*2 pxor m0, m0 REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 mova [rsp], m15 jmp .idct16 .fast: call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: LOAD_8ROWS cq+64*0, 64*2 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 call m(idct_16x16_internal).main call m(inv_txfm_add_dct_dct_32x16).pass1_end vpbroadcastd m7, [o(pw_8192)] call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round lea tmp3q, [tmp1q+32*32] mova m15, [rsp] mova [tmp3q-32*4], m0 mova [tmp3q-32*3], m2 mova [tmp3q-32*2], m4 mova [tmp3q-32*1], m6 mova [tmp3q+32*0], m8 mova [tmp3q+32*1], m10 mova [tmp3q+32*2], m12 mova [tmp3q+32*3], m14 add tmp3q, 32*8 mova [tmp3q-32*4], m1 mova [tmp3q-32*3], m3 mova [tmp3q-32*2], m5 mova [tmp3q-32*1], m7 mova [tmp3q+32*0], m9 mova [tmp3q+32*1], m11 mova [tmp3q+32*2], m13 mova [tmp3q+32*3], m15 vpbroadcastd m9, [o(pw_8192)] pmulhrsw m0, m9, [tmp1q-32*4] pmulhrsw m1, m9, [tmp1q-32*3] pmulhrsw m2, m9, [tmp1q-32*2] pmulhrsw m3, m9, [tmp1q-32*1] pmulhrsw m4, m9, [tmp1q+32*0] pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] call m(inv_txfm_add_identity_identity_8x32).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 pmulhrsw m1, m9, [tmp2q-32*3] mova [tmp1q-32*3], m2 pmulhrsw m2, m9, [tmp2q-32*2] mova [tmp2q-32*3], m3 pmulhrsw m3, m9, [tmp2q-32*1] mova [tmp1q-32*2], m4 pmulhrsw m4, m9, [tmp2q+32*0] mova [tmp2q-32*2], m5 pmulhrsw m5, m9, [tmp2q+32*1] mova [tmp1q-32*1], m6 pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] call m(inv_txfm_add_identity_identity_8x32).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 mova [tmp2q+32*1], m3 mova [tmp1q+32*2], m4 mova [tmp2q+32*2], m5 mova [tmp1q+32*3], m6 mova [tmp2q+32*3], m7 add cq, 32 add tmp1q, 32*16 add tmp2q, 32*16 add eobd, 0x80000000 jnc .pass1_loop add tmp1q, 32*24 imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq test tmp4d, tmp4d jge .pass2_loop add tmp1q, 32*16 add tmp2q, 32*16 add tmp3q, 32*16 .pass2_loop: LOAD_8ROWS tmp2q-32*4, 32 test tmp4d, tmp4d jl .fast2 LOAD_8ROWS_H tmp3q-32*4, 32 call m(inv_txfm_add_dct_dct_16x32).main_oddhalf sub tmp3q, 32*8 LOAD_8ROWS_H tmp3q-32*4, 32 sub tmp3q, 32*16 jmp .pass2_loop_end .fast2: call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast sub tmp3q, 32*24 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 .pass2_loop_end: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 call m(idct_16x16_internal).main call m(inv_txfm_add_dct_dct_16x32).pass2_end lea tmp3q, [tmp1q-32*32] cmp tmp2q, tmp3q jb .ret sub tmp2q, 32*32 sub dstq, r3 lea r2, [r2+r3+16] add dstq, 16 jmp .pass2_loop .ret: RET cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob %undef cmp vpbroadcastd m9, [pw_8192] sub eobd, 136 ; if (eob < 136) shr eobd, 30 ; topleft 16x16 only lea eobd, [eobq*2-8] lea r4, [strideq*3] mov r5, dstq lea rax, [cq+32] .loop: mova xm0, [cq+64* 0] mova xm1, [cq+64* 1] vinserti128 m0, [cq+64* 8], 1 vinserti128 m1, [cq+64* 9], 1 mova xm2, [cq+64* 2] mova xm3, [cq+64* 3] vinserti128 m2, [cq+64*10], 1 vinserti128 m3, [cq+64*11], 1 mova xm4, [cq+64* 4] mova xm5, [cq+64* 5] vinserti128 m4, [cq+64*12], 1 vinserti128 m5, [cq+64*13], 1 mova xm6, [cq+64* 6] mova xm7, [cq+64* 7] vinserti128 m6, [cq+64*14], 1 vinserti128 m7, [cq+64*15], 1 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r4 lea dstq, [dstq+strideq*4] add cq, 16 inc eobd jz .ret test eobd, 3 jnz .loop add cq, 64*15 lea dstq, [r5+16] jmp .loop .ret: pxor m0, m0 mov r0d, 16 cmp cq, rax jne .zero_loop .zero_loop_topleft: mova [rax-32*1], m0 mova [rax+32*1], m0 mova [rax+32*3], m0 mova [rax+32*5], m0 add rax, 64*4 sub r0d, 4 jg .zero_loop_topleft RET .zero_loop: mova [rax-32*1], m0 mova [rax+32*0], m0 mova [rax+32*1], m0 mova [rax+32*2], m0 add rax, 32*4 dec r0d jg .zero_loop RET %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) %if %1 & 1 mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n %else mova m%5, [tmp1q-32*(45-%1)] mova m%4, [tmp2q-32*(20+%1)] %endif psubsw m%6, m%5, m%4 ; idct32 out31-n paddsw m%5, m%4 ; idct32 out 0+n psubsw m%4, m%6, m%3 ; out32+n paddsw m%6, m%3 ; out31-n psubsw m%3, m%5, m%2 ; out63-n paddsw m%5, m%2 ; out 0+n %if %0 == 6 ; pass 1 %if %1 & 1 mova [tmp2q-32*(19-%1)], m%4 mova [tmp1q-32*(14+%1)], m%6 mova [tmp1q+32*(18-%1)], m%3 mova [tmp2q-32*(51-%1)], m%5 %else mova [tmp1q-32*(13-%1)], m%4 mova [tmp2q-32*(20+%1)], m%6 mova [tmp2q+32*(12-%1)], m%3 mova [tmp1q-32*(45-%1)], m%5 %endif %else ; pass 2 REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 %if %1 & 1 %define %%d0 r2 %define %%d1 dstq %else %define %%d0 dstq %define %%d1 r2 %endif pmovzxbw m%2, [%%d0+%9 ] paddw m%2, m%4 pmovzxbw m%4, [%%d1+%8 ] paddw m%4, m%6 pmovzxbw m%6, [%%d1+%10] paddw m%3, m%6 pmovzxbw m%6, [%%d0+%7 ] paddw m%5, m%6 packuswb m%2, m%4 packuswb m%3, m%5 vpermq m%2, m%2, q3120 vpermq m%3, m%3, q3120 mova [%%d0+%9 ], xm%2 vextracti128 [%%d1+%8 ], m%2, 1 mova [%%d1+%10], xm%3 vextracti128 [%%d0+%7 ], m%3, 1 %endif %endmacro cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 32 jmp m(inv_txfm_add_dct_dct_16x4).dconly .normal: PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 %undef cmp lea tmp1q, [rsp+32*23] lea tmp2q, [tmp1q+32*24] sub eobd, 151 mov r7d, eobd .pass1_loop: LOAD_16ROWS cq, 64 call m(idct_16x16_internal).main mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round mova m15, [rsp+32*0] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m2 mova [tmp1q-32*2], m4 mova [tmp1q-32*1], m6 mova [tmp1q+32*0], m8 mova [tmp1q+32*1], m10 mova [tmp1q+32*2], m12 mova [tmp1q+32*3], m14 mova [tmp2q-32*4], m1 mova [tmp2q-32*3], m3 mova [tmp2q-32*2], m5 mova [tmp2q-32*1], m7 mova [tmp2q+32*0], m9 mova [tmp2q+32*1], m11 mova [tmp2q+32*2], m13 mova [tmp2q+32*3], m15 add cq, 32 add tmp1q, 32*8 add tmp2q, 32*8 add eobd, 0x80000000 jnc .pass1_loop lea r2, [rsp+32*23] mova xm0, [r2-32*4+ 0] mova xm1, [r2-32*2+ 0] vinserti128 m0, [r2+32*0+ 0], 1 vinserti128 m1, [r2+32*2+ 0], 1 mova xm2, [r2-32*4+16] mova xm3, [r2-32*2+16] vinserti128 m2, [r2+32*0+16], 1 vinserti128 m3, [r2+32*2+16], 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 test r7d, r7d jl .fast lea r3, [r2+32*8] mova xm4, [r3-32*4+ 0] mova xm5, [r3-32*2+ 0] vinserti128 m4, [r3+32*0+ 0], 1 vinserti128 m5, [r3+32*2+ 0], 1 mova xm6, [r3-32*4+16] mova xm7, [r3-32*2+16] vinserti128 m6, [r3+32*0+16], 1 vinserti128 m7, [r3+32*2+16], 1 .fast: mova [rsp], m8 lea tmp1q, [rsp+32*7] call m(idct_16x16_internal).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 mova xm0, [r2-32*3+ 0] mova xm1, [r2-32*1+ 0] vinserti128 m0, [r2+32*1+ 0], 1 vinserti128 m1, [r2+32*3+ 0], 1 mova xm2, [r2-32*3+16] mova xm3, [r2-32*1+16] vinserti128 m2, [r2+32*1+16], 1 vinserti128 m3, [r2+32*3+16], 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 test r7d, r7d jl .fast2 mova xm4, [r3-32*3+ 0] mova xm5, [r3-32*1+ 0] vinserti128 m4, [r3+32*1+ 0], 1 vinserti128 m5, [r3+32*3+ 0], 1 mova xm6, [r3-32*3+16] mova xm7, [r3-32*1+16] vinserti128 m6, [r3+32*1+16], 1 vinserti128 m7, [r3+32*3+16], 1 .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast add r2, 32*24 vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova xm0, [r2-32*4+ 0] mova xm3, [r2-32*1+16] vinserti128 m0, [r2+32*0+ 0], 1 vinserti128 m3, [r2+32*3+16], 1 mova xm4, [r2-32*4+16] mova xm7, [r2-32*1+ 0] vinserti128 m4, [r2+32*0+16], 1 vinserti128 m7, [r2+32*3+ 0], 1 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r7d, r7d jl .fast3 add r3, 32*24 mova xm1, [r3-32*1+16] mova xm2, [r3-32*4+ 0] vinserti128 m1, [r3+32*3+16], 1 vinserti128 m2, [r3+32*0+ 0], 1 mova xm5, [r3-32*1+ 0] mova xm6, [r3-32*4+16] vinserti128 m5, [r3+32*3+ 0], 1 vinserti128 m6, [r3+32*0+16], 1 .fast3: add rax, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova xm0, [r2-32*2+ 0] mova xm3, [r2-32*3+16] vinserti128 m0, [r2+32*2+ 0], 1 vinserti128 m3, [r2+32*1+16], 1 mova xm4, [r2-32*2+16] mova xm7, [r2-32*3+ 0] vinserti128 m4, [r2+32*2+16], 1 vinserti128 m7, [r2+32*1+ 0], 1 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r7d, r7d jl .fast4 mova xm1, [r3-32*3+16] mova xm2, [r3-32*2+ 0] vinserti128 m1, [r3+32*1+16], 1 vinserti128 m2, [r3+32*2+ 0], 1 mova xm5, [r3-32*3+ 0] mova xm6, [r3-32*2+16] vinserti128 m5, [r3+32*1+ 0], 1 vinserti128 m6, [r3+32*2+16], 1 .fast4: call m(inv_txfm_add_dct_dct_16x64).main_part1 call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 RET ALIGN function_align %define o_base idct64_mul - 8 .main_part1: ; idct64 steps 1-5: ; in1/31/17/15/ 9/23/25/ 7 -> ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a ; in5/27/21/11/13/19/29/ 3 -> ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a vpbroadcastd m11, [o(idct64_mul+4* 0)] vpbroadcastd m13, [o(idct64_mul+4* 1)] vpbroadcastd m10, [o(idct64_mul+4* 4)] vpbroadcastd m12, [o(idct64_mul+4* 5)] pmulhrsw m11, m0 ; t63a pmulhrsw m0, m13 ; t32a pmulhrsw m10, m1 ; t62a pmulhrsw m1, m12 ; t33a vpbroadcastd m9, [o(idct64_mul+4* 8)] vpbroadcastd m13, [o(idct64_mul+4* 9)] vpbroadcastd m8, [o(idct64_mul+4*12)] vpbroadcastd m12, [o(idct64_mul+4*13)] pmulhrsw m9, m2 ; t61a pmulhrsw m2, m13 ; t34a pmulhrsw m8, m3 ; t60a pmulhrsw m3, m12 ; t35a psubsw m12, m0, m1 ; t33 paddsw m0, m1 ; t32 psubsw m1, m3, m2 ; t34 paddsw m3, m2 ; t35 psubsw m2, m8, m9 ; t61 paddsw m8, m9 ; t60 psubsw m9, m11, m10 ; t62 paddsw m11, m10 ; t63 ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a vpbroadcastd m14, [o(pw_401_4076)] ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a psubsw m10, m0, m3 ; t35a paddsw m0, m3 ; t32a psubsw m3, m11, m8 ; t60a paddsw m11, m8 ; t63a psubsw m8, m9, m2 ; t34 paddsw m9, m2 ; t33 psubsw m2, m12, m1 ; t61 paddsw m12, m1 ; t62 mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m9 mova [tmp2q+32*2], m12 mova [tmp2q+32*3], m11 vpbroadcastd m13, [o(pw_m4017_799)] vpbroadcastd m14, [o(pw_799_4017)] ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp2q+32*0], m10 mova [tmp2q+32*1], m8 vpbroadcastd m3, [o(idct64_mul+4*16)] vpbroadcastd m11, [o(idct64_mul+4*17)] vpbroadcastd m2, [o(idct64_mul+4*20)] vpbroadcastd m10, [o(idct64_mul+4*21)] vpbroadcastd m1, [o(idct64_mul+4*24)] vpbroadcastd m9, [o(idct64_mul+4*25)] vpbroadcastd m0, [o(idct64_mul+4*28)] vpbroadcastd m8, [o(idct64_mul+4*29)] pmulhrsw m3, m4 ; t59a pmulhrsw m4, m11 ; t36a pmulhrsw m2, m5 ; t58a pmulhrsw m5, m10 ; t37a pmulhrsw m1, m6 ; t57a pmulhrsw m6, m9 ; t38a pmulhrsw m0, m7 ; t56a pmulhrsw m7, m8 ; t39a psubsw m8, m4, m5 ; t37 paddsw m4, m5 ; t36 psubsw m5, m7, m6 ; t38 paddsw m7, m6 ; t39 psubsw m6, m0, m1 ; t57 paddsw m0, m1 ; t56 psubsw m1, m3, m2 ; t58 paddsw m3, m2 ; t59 ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a vpbroadcastd m10, [o(pw_3166_2598)] ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a psubsw m2, m7, m4 ; t36a paddsw m7, m4 ; t39a psubsw m4, m0, m3 ; t59a paddsw m0, m3 ; t56a psubsw m3, m6, m1 ; t37 paddsw m6, m1 ; t38 psubsw m1, m5, m8 ; t58 paddsw m5, m8 ; t57 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 mova [tmp2q-32*4], m0 mova [tmp2q-32*3], m5 vpbroadcastd m6, [o(pw_m799_m4017)] vpbroadcastd m7, [o(pw_m4017_799)] ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59 ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m1 mova [tmp2q-32*2], m3 mova [tmp2q-32*1], m2 ret %define o_base pw_5 + 128 .main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub sub rax, o_idct64_offset + 8 vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m13, [o(pw_2896_2896)] vpbroadcastd m14, [o(pw_m2896_2896)] .main_part2_pass1_loop: call .main_part2_internal IDCT64_PART2_END 0, 7, 0, 6, 9, 10 IDCT64_PART2_END 7, 8, 5, 0, 6, 7 IDCT64_PART2_END 8, 2, 1, 0, 6, 7 IDCT64_PART2_END 15, 3, 4, 0, 6, 7 cmp tmp1q, tmp2q jne .main_part2_pass1_loop ret .main_part2_internal: mova m0, [tmp1q-32*12] ; t32a mova m6, [tmp2q-32*13] ; t39a mova m1, [tmp1q-32* 4] ; t40a mova m5, [tmp2q+32* 3] ; t55a add tmp1q, 32 sub tmp2q, 32 mova m2, [tmp1q+32* 3] ; t48a mova m4, [tmp2q-32* 4] ; t47a mova m3, [tmp1q+32*11] ; t56a mova m7, [tmp2q+32*12] ; t63a psubsw m8, m0, m6 ; t39 paddsw m0, m6 ; t32 psubsw m6, m4, m1 ; t40 paddsw m4, m1 ; t47 psubsw m1, m2, m5 ; t55 paddsw m2, m5 ; t48 psubsw m5, m7, m3 ; t56 paddsw m7, m3 ; t63 ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a vpbroadcastd m9, [o(pw_m1567_m3784)] ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a psubsw m3, m0, m4 ; t47a paddsw m0, m4 ; t32a psubsw m4, m7, m2 ; t48a paddsw m7, m2 ; t63a psubsw m2, m5, m1 ; t40 paddsw m5, m1 ; t39 psubsw m1, m8, m6 ; t55 paddsw m8, m6 ; t56 ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48 ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a ret .main_part2_pass2: sub rax, o_idct64_offset + 8 vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m13, [o(pw_2896_2896)] lea r9, [strideq*5] ; stride*5 lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; stride*7 lea r8, [r3+strideq*2] ; stride*8 lea r2, [dstq+r7] .main_part2_pass2_loop: vpbroadcastd m14, [o(pw_m2896_2896)] call .main_part2_internal vpbroadcastd m14, [o(pw_2048)] IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 add dstq, strideq sub r2, strideq cmp tmp1q, tmp2q jne .main_part2_pass2_loop ret cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 16 .dconly: pmulhrsw xm0, xm2 movd xm2, [o(pw_2048)] pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 pxor m1, m1 .dconly_loop: mova m2, [dstq+32*0] mova m3, [dstq+32*1] punpckhbw m4, m2, m1 punpcklbw m2, m1 punpckhbw m5, m3, m1 punpcklbw m3, m1 paddw m4, m0 paddw m2, m0 paddw m5, m0 paddw m3, m0 packuswb m2, m4 packuswb m3, m5 mova [dstq+32*0], m2 mova [dstq+32*1], m3 add dstq, strideq dec r2d jg .dconly_loop RET .normal: PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 LOAD_8ROWS cq+32*0, 32*4 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 lea tmp1q, [rsp+32*7] call m(idct_16x16_internal).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 LOAD_8ROWS cq+32*2, 32*4 pxor m8, m8 REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [cq+32* 1] mova m1, [cq+32*31] mova m2, [cq+32*17] mova m3, [cq+32*15] mova m4, [cq+32* 9] mova m5, [cq+32*23] mova m6, [cq+32*25] mova m7, [cq+32* 7] pxor m8, m8 REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [cq+32* 5] mova m1, [cq+32*27] mova m2, [cq+32*21] mova m3, [cq+32*11] mova m4, [cq+32*13] mova m5, [cq+32*19] mova m6, [cq+32*29] mova m7, [cq+32* 3] pxor m8, m8 REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 call m(inv_txfm_add_dct_dct_16x64).main_part1 call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 sub tmp1q, 32*36 lea r2, [strideq*3] mov tmp2d, 4 .pass2_loop: lea r3, [tmp1q-32*8] mova xm0, [r3 -32*4] mova xm1, [r3 -32*3] vinserti128 m0, [tmp1q-32*4], 1 vinserti128 m1, [tmp1q-32*3], 1 mova xm2, [r3 -32*2] mova xm3, [r3 -32*1] vinserti128 m2, [tmp1q-32*2], 1 vinserti128 m3, [tmp1q-32*1], 1 mova xm4, [r3 +32*0] mova xm5, [r3 +32*1] vinserti128 m4, [tmp1q+32*0], 1 vinserti128 m5, [tmp1q+32*1], 1 mova xm6, [r3 +32*2] mova xm7, [r3 +32*3] vinserti128 m6, [tmp1q+32*2], 1 vinserti128 m7, [tmp1q+32*3], 1 mova xm8, [r3 -32*4+16] mova xm9, [r3 -32*3+16] vinserti128 m8, [tmp1q-32*4+16], 1 vinserti128 m9, [tmp1q-32*3+16], 1 mova xm10, [r3 -32*2+16] mova xm11, [r3 -32*1+16] vinserti128 m10, [tmp1q-32*2+16], 1 vinserti128 m11, [tmp1q-32*1+16], 1 mova xm12, [r3 +32*0+16] mova xm13, [r3 +32*1+16] vinserti128 m12, [tmp1q+32*0+16], 1 vinserti128 m13, [tmp1q+32*1+16], 1 mova xm14, [r3 +32*2+16] mova xm15, [r3 +32*3+16] vinserti128 m14, [tmp1q+32*2+16], 1 vinserti128 m15, [tmp1q+32*3+16], 1 mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round call m(idct_16x16_internal).main mova [rsp+32*0], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 WRITE_16X2 2, 3, 1, 2, strideq*2, r2 pmulhrsw m1, m15, [rsp+32*1] WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 lea r3, [dstq+strideq*4] %define dstq r3 WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 WRITE_16X2 6, 7, 2, 3, strideq*2, r2 REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 lea r3, [r3+strideq*4] WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 WRITE_16X2 10, 11, 2, 3, strideq*2, r2 pmulhrsw m15, [rsp+32*0] lea r3, [r3+strideq*4] WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 WRITE_16X2 14, 15, 2, 3, strideq*2, r2 add tmp1q, 32*16 add r0, 16 dec tmp2d jg .pass2_loop RET cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 64 jmp m(inv_txfm_add_dct_dct_32x8).dconly .normal: PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*7] lea r10d, [eobq-136] sar r10d, 31 .pass1_loop: lea tmp2q, [tmp1q+32*16] LOAD_8ROWS cq+64*1, 64*2, 1 pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 test r10b, r10b jnz .fast LOAD_8ROWS_H cq+64*17, 64*2, 2 call m(inv_txfm_add_dct_dct_16x32).main_oddhalf LOAD_8ROWS_H cq+64*16, 64*2, 1 mova [rsp], m15 pxor m15, m15 REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 jmp .idct16 .fast: call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: LOAD_8ROWS cq+64*0, 64*2, 1 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 call m(idct_16x16_internal).main call m(inv_txfm_add_dct_dct_32x16).pass1_end vpbroadcastd m7, [o(pw_16384)] call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round lea r3, [tmp1q+32*48] mova m15, [rsp] mova [r3-32*4], m0 mova [r3-32*3], m2 mova [r3-32*2], m4 mova [r3-32*1], m6 mova [r3+32*0], m8 mova [r3+32*1], m10 mova [r3+32*2], m12 mova [r3+32*3], m14 add r3, 32*24 mova [r3-32*4], m1 mova [r3-32*3], m3 mova [r3-32*2], m5 mova [r3-32*1], m7 mova [r3+32*0], m9 mova [r3+32*1], m11 mova [r3+32*2], m13 mova [r3+32*3], m15 vpbroadcastd m9, [o(pw_16384)] pmulhrsw m0, m9, [tmp1q-32*4] pmulhrsw m1, m9, [tmp1q-32*3] pmulhrsw m2, m9, [tmp1q-32*2] pmulhrsw m3, m9, [tmp1q-32*1] pmulhrsw m4, m9, [tmp1q+32*0] pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] call m(inv_txfm_add_identity_identity_8x32).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 pmulhrsw m1, m9, [tmp2q-32*3] mova [tmp1q-32*3], m2 pmulhrsw m2, m9, [tmp2q-32*2] mova [tmp2q-32*3], m3 pmulhrsw m3, m9, [tmp2q-32*1] mova [tmp1q-32*2], m4 pmulhrsw m4, m9, [tmp2q+32*0] mova [tmp2q-32*2], m5 pmulhrsw m5, m9, [tmp2q+32*1] mova [tmp1q-32*1], m6 pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] call m(inv_txfm_add_identity_identity_8x32).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 mova [tmp2q+32*1], m3 mova [tmp1q+32*2], m4 mova [tmp2q+32*2], m5 mova [tmp1q+32*3], m6 mova [tmp2q+32*3], m7 add cq, 32 add tmp1q, 32*8 add r10d, 0x80000000 jnc .pass1_loop lea r2, [rsp+32*55] lea r7, [r2+32*24] .pass2_loop: lea r3, [r2+32*8] lea r8, [r7+32*8] mova m0, [r2-32*4] mova m1, [r2-32*2] mova m2, [r2+32*0] mova m3, [r2+32*2] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 test r10b, r10b jnz .fast2 mova m4, [r3-32*4] mova m5, [r3-32*2] mova m6, [r3+32*0] mova m7, [r3+32*2] .fast2: mova [rsp], m8 lea tmp1q, [rsp+32*39] call m(idct_16x16_internal).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 mova m0, [r2-32*3] mova m1, [r2-32*1] mova m2, [r2+32*1] mova m3, [r2+32*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 test r10b, r10b jnz .fast3 mova m4, [r3-32*3] mova m5, [r3-32*1] mova m6, [r3+32*1] mova m7, [r3+32*3] .fast3: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [r7-32*4] mova m3, [r7+32*3] mova m4, [r7+32*0] mova m7, [r7-32*1] pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10b, r10b jnz .fast4 mova m1, [r8+32*3] mova m2, [r8-32*4] mova m5, [r8-32*1] mova m6, [r8+32*0] .fast4: add rax, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [r7-32*2] mova m3, [r7+32*1] mova m4, [r7+32*2] mova m7, [r7-32*3] pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10b, r10b jnz .fast5 mova m1, [r8+32*1] mova m2, [r8-32*2] mova m5, [r8-32*3] mova m6, [r8+32*2] .fast5: call m(inv_txfm_add_dct_dct_16x64).main_part1 call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 add r10d, 0x80000000 jc .ret lea r2, [rsp+32*7] lea r7, [r2+32*16] sub dstq, r8 lea dstq, [dstq+strideq*4+16] jmp .pass2_loop .ret: RET cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 32 jmp m(inv_txfm_add_dct_dct_64x16).dconly .normal: PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 lea tmp1q, [rsp+32*7] lea tmp4d, [eobq-136] .pass1_loop: LOAD_8ROWS cq+64*0, 64*4, 1 pxor m8, m8 REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 LOAD_8ROWS cq+64*2, 64*4, 1 pxor m8, m8 REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 vpbroadcastd m7, [o(pw_2896x8)] pmulhrsw m0, m7, [cq+64* 1] pmulhrsw m1, m7, [cq+64*31] pmulhrsw m2, m7, [cq+64*17] pmulhrsw m3, m7, [cq+64*15] pmulhrsw m4, m7, [cq+64* 9] pmulhrsw m5, m7, [cq+64*23] pmulhrsw m6, m7, [cq+64*25] pmulhrsw m7, [cq+64* 7] pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64).main_part1 vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 pmulhrsw m0, m7, [cq+64* 5] pmulhrsw m1, m7, [cq+64*27] pmulhrsw m2, m7, [cq+64*21] pmulhrsw m3, m7, [cq+64*11] pmulhrsw m4, m7, [cq+64*13] pmulhrsw m5, m7, [cq+64*19] pmulhrsw m6, m7, [cq+64*29] pmulhrsw m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 call m(inv_txfm_add_dct_dct_16x64).main_part1 call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_16384)] call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave add cq, 32 add tmp4d, 0x80000000 jnc .pass1_loop lea tmp1q, [rsp+32*15] imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq mov tmp4b, 4 .pass2_loop: lea tmp2q, [tmp1q+32*64] LOAD_8ROWS tmp1q-32*4, 32 test tmp4d, 0x40000000 jnz .fast LOAD_8ROWS_H tmp2q-32*4, 32 call m(inv_txfm_add_dct_dct_16x32).main_oddhalf lea tmp3q, [tmp2q-32*8] LOAD_8ROWS_H tmp3q-32*4, 32 mova [rsp], m15 jmp .idct16 .fast: call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: lea tmp3q, [tmp1q-32*8] LOAD_8ROWS tmp3q-32*4, 32 call m(idct_16x16_internal).main call m(inv_txfm_add_dct_dct_16x32).pass2_end add tmp1q, 32*16 sub dstq, r3 lea r2, [r2+r3+16] add dstq, 16 dec tmp4b jg .pass2_loop RET ALIGN function_align .transpose_round_interleave: mov tmp3d, 4 .loop: lea tmp2q, [tmp1q+32*8] mova xm0, [tmp1q-32*4] mova xm1, [tmp1q-32*3] vinserti128 m0, [tmp2q-32*4], 1 vinserti128 m1, [tmp2q-32*3], 1 mova xm2, [tmp1q-32*2] mova xm3, [tmp1q-32*1] vinserti128 m2, [tmp2q-32*2], 1 vinserti128 m3, [tmp2q-32*1], 1 mova xm4, [tmp1q+32*0] mova xm5, [tmp1q+32*1] vinserti128 m4, [tmp2q+32*0], 1 vinserti128 m5, [tmp2q+32*1], 1 mova xm6, [tmp1q+32*2] mova xm7, [tmp1q+32*3] vinserti128 m6, [tmp2q+32*2], 1 vinserti128 m7, [tmp2q+32*3], 1 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 mova xm8, [tmp1q-32*4+16] mova xm9, [tmp1q-32*3+16] vinserti128 m8, [tmp2q-32*4+16], 1 vinserti128 m9, [tmp2q-32*3+16], 1 mova [tmp1q-32*4], m0 mova [tmp2q-32*4], m1 mova [tmp1q-32*3], m2 mova [tmp2q-32*3], m3 mova xm2, [tmp1q-32*2+16] mova xm3, [tmp1q-32*1+16] vinserti128 m2, [tmp2q-32*2+16], 1 vinserti128 m3, [tmp2q-32*1+16], 1 mova [tmp1q-32*2], m4 mova [tmp2q-32*2], m5 mova [tmp1q-32*1], m6 mova [tmp2q-32*1], m7 mova xm4, [tmp1q+32*0+16] mova xm5, [tmp1q+32*1+16] vinserti128 m4, [tmp2q+32*0+16], 1 vinserti128 m5, [tmp2q+32*1+16], 1 mova xm6, [tmp1q+32*2+16] mova xm7, [tmp1q+32*3+16] vinserti128 m6, [tmp2q+32*2+16], 1 vinserti128 m7, [tmp2q+32*3+16], 1 pmulhrsw m0, m8, m10 pmulhrsw m1, m9, m10 REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 mova [tmp2q+32*1], m3 mova [tmp1q+32*2], m4 mova [tmp2q+32*2], m5 mova [tmp1q+32*3], m6 mova [tmp2q+32*3], m7 add tmp1q, 32*16 dec tmp3d jg .loop ret cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 64 jmp m(inv_txfm_add_dct_dct_64x16).dconly .normal: PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*71] lea r10d, [eobq-136] .pass1_loop: LOAD_8ROWS cq+64*0, 64*4 pxor m8, m8 REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 LOAD_8ROWS cq+64*2, 64*4 pxor m8, m8 REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [cq+64* 1] mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] mova m4, [cq+64* 9] mova m5, [cq+64*23] mova m6, [cq+64*25] mova m7, [cq+64* 7] pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] mova m4, [cq+64*13] mova m5, [cq+64*19] mova m6, [cq+64*29] mova m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 call m(inv_txfm_add_dct_dct_16x64).main_part1 call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_8192)] call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave add cq, 32 add r10d, 0x80000000 jnc .pass1_loop lea tmp1q, [rsp+32*7] mov r10b, 4 .pass2_loop: lea r2, [tmp1q+32*64] mova m0, [r2-32*4] mova m1, [r2-32*2] mova m2, [r2+32*0] mova m3, [r2+32*2] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 mova [rsp], m4 test r10d, 0x40000000 jnz .fast lea r3, [r2+32*64] mova m4, [r3-32*4] mova m5, [r3-32*2] mova m6, [r3+32*0] mova m7, [r3+32*2] .fast: call m(idct_16x16_internal).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 mova m0, [r2-32*3] mova m1, [r2-32*1] mova m2, [r2+32*1] mova m3, [r2+32*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 test r10d, 0x40000000 jnz .fast2 mova m4, [r3-32*3] mova m5, [r3-32*1] mova m6, [r3+32*1] mova m7, [r3+32*3] .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add r2, 32*8 add r3, 32*8 add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [r2-32*4] ; 1 mova m3, [r2+32*3] ; 15 mova m4, [r2+32*0] ; 9 mova m7, [r2-32*1] ; 7 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10d, 0x40000000 jnz .fast3 mova m1, [r3+32*3] ; 31 mova m2, [r3-32*4] ; 17 mova m5, [r3-32*1] ; 23 mova m6, [r3+32*0] ; 25 .fast3: add rax, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [r2-32*2] ; 5 mova m3, [r2+32*1] ; 11 mova m4, [r2+32*2] ; 13 mova m7, [r2-32*3] ; 3 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10d, 0x40000000 jnz .fast4 mova m1, [r3+32*1] ; 27 mova m2, [r3-32*2] ; 21 mova m5, [r3-32*3] ; 19 mova m6, [r3+32*2] ; 29 .fast4: call m(inv_txfm_add_dct_dct_16x64).main_part1 call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 sub tmp1q, 32*28 sub dstq, r8 lea dstq, [dstq+strideq*4+16] dec r10b jg .pass2_loop RET %endif ; ARCH_X86_64