; ref: 241dafa0454e02e1af7a85f08c6465357402f710
; dir: /src/x86/itx.asm/
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
; Note: The order of (at least some of) those constants matter!
; Naming: pw_* = packed 16-bit words, pd_* = packed 32-bit dwords, a leading
; m in a coefficient name means that value is negated. The dw coefficient
; pairs below are consumed by pmaddwd in the ITX_MUL*/ITX_MULSUB macros and
; rounded with pd_2048 followed by a >>12 shift.
iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
iadst4_dconly1a: dw 10568, 19856, 26752, 30424
iadst4_dconly1b: dw 30424, 26752, 19856, 10568
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
; COEF_PAIR: emit both the (a, b) pair and its (-b, a) rotation companion,
; named pw_<a>_<b> and pw_m<b>_<a>, for use as pmaddwd multiplier pairs.
%macro COEF_PAIR 2
pw_%1_%2: dw %1, %2
pw_m%2_%1: dw -%2, %1
%endmacro
; ADST-only
pw_3803_1321: dw 3803, 1321
pw_m1321_2482: dw -1321, 2482
pw_2482_3344: dw 2482, 3344
pw_m3803_3344: dw -3803, 3344
pw_m3803_m6688: dw -3803, -6688
%define pw_3344x8 iadst4_dconly2b
pw_5: times 2 dw 5
pw_2048: times 2 dw 2048
pw_4096: times 2 dw 4096
pw_8192: times 2 dw 8192
pw_16384: times 2 dw 16384
pw_2896x8: times 2 dw 2896*8
pw_5793x4: times 2 dw 5793*4
pd_2048: dd 2048
COEF_PAIR 1567, 3784
COEF_PAIR 3784, 1567
COEF_PAIR 201, 4091
COEF_PAIR 995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4052, 601
COEF_PAIR 401, 4076
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
COEF_PAIR 799, 4017
COEF_PAIR 3406, 2276
pw_m799_m4017: dw -799, -4017
pw_m1567_m3784: dw -1567, -3784
pw_m3406_m2276: dw -3406, -2276
pw_m401_m4076: dw -401, -4076
pw_m3166_m2598: dw -3166, -2598
pw_m1931_m3612: dw -1931, -3612
pw_m3920_m1189: dw -3920, -1189
COEF_PAIR 2276, 3406
COEF_PAIR 4017, 799
; COEF_X8: emit each argument multiplied by 8, duplicated into a word pair
; (the x8 pre-scaling matches pmulhrsw's implicit >>15, i.e. >>12 overall).
%macro COEF_X8 1-*
%rep %0
dw %1*8, %1*8
%rotate 1
%endrep
%endmacro
pw_3703x8: COEF_X8 3703
pw_1751x8: COEF_X8 1751
pw_m1380x8: COEF_X8 -1380
pw_3857x8: COEF_X8 3857
pw_3973x8: COEF_X8 3973
pw_995x8: COEF_X8 995
pw_m2106x8: COEF_X8 -2106
pw_3513x8: COEF_X8 3513
pw_3290x8: COEF_X8 3290
pw_2440x8: COEF_X8 2440
pw_m601x8: COEF_X8 -601
pw_4052x8: COEF_X8 4052
idct64_mul: COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520
COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092
COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842
COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301
; Extra displacement needed to reach idct64_mul from the o() base pointer.
%define o_idct64_offset idct64_mul - (o_base) - 8
SECTION .text
; Code size reduction trickery: Instead of using rip-relative loads with
; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
; single rip-relative lea and then address things relative from that with
; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
; rax is loaded with o_base in INV_TXFM_FN before any internal function runs.
%define o_base iadst4_dconly2a + 128
%define o(x) (rax - (o_base) + (x))
; REPX: apply the instruction template %1 (with "x" as the placeholder) to
; every remaining argument, e.g. REPX {pmulhrsw x, m5}, m0, m1, m2, m3.
%macro REPX 2-*
%xdefine %%f(x) %1
%rep %0 - 1
%rotate 1
%%f(%1)
%endrep
%endmacro
; m(x): full mangled symbol name of another function in this file, used for
; cross-function call/jmp targets.
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; flags: 1 = swap, 2 = interleave, 4: coef_regs
; Packed butterfly rotation on word pairs:
;   out.lo = (src.lo*coef1 - src.hi*coef2 + 2048) >> 12  (in low qword lanes)
;   out.hi = (src.lo*coef2 + src.hi*coef1 + 2048) >> 12
; flag 4: %5/%6 are registers already holding the coefficient pairs;
; flag 1: swap which coefficient pair goes to which half;
; flag 2: interleave the two results into alternating word lanes instead of
;         packing them into separate qword halves.
%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
%if %7 & 4
pmaddwd m%2, m%5, m%1
pmaddwd m%1, m%6
%else
%if %7 & 1
vpbroadcastd m%2, [o(pw_%5_%6)]
vpbroadcastd m%3, [o(pw_m%6_%5)]
%else
vpbroadcastd m%2, [o(pw_m%6_%5)]
vpbroadcastd m%3, [o(pw_%5_%6)]
%endif
pmaddwd m%2, m%1
pmaddwd m%1, m%3
%endif
paddd m%2, m%4 ; add the pd_2048 rounding bias
paddd m%1, m%4
%if %7 & 2
; interleave: align both >>12 results into odd/even word positions and blend
pslld m%2, 4
psrld m%1, 12
pblendw m%1, m%2, 0xaa
%else
psrad m%2, 12
psrad m%1, 12
packssdw m%1, m%2
%endif
%endmacro
; flags: 1 = swap, 2 = interleave, 4 = coef_regs
; Like ITX_MUL2X_PACK but with two different coefficient pairs, one per
; 128-bit lane: coef[1-2] are used in the low lane, coef[3-4] in the high
; lane. The blended coefficient registers are handed to ITX_MUL2X_PACK with
; the coef_regs flag forced on.
%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
%if %10 & 1
vpbroadcastd m%3, [o(pw_%8_%9)]
vpbroadcastd m%4, [o(pw_m%9_%8)]
vpbroadcastd xm%2, [o(pw_%6_%7)]
vpblendd m%2, m%2, m%3, 0xf0
vpbroadcastd xm%3, [o(pw_m%7_%6)]
%else
vpbroadcastd m%3, [o(pw_m%9_%8)]
vpbroadcastd m%4, [o(pw_%8_%9)]
vpbroadcastd xm%2, [o(pw_m%7_%6)]
vpblendd m%2, m%2, m%3, 0xf0
vpbroadcastd xm%3, [o(pw_%6_%7)]
%endif
vpblendd m%3, m%3, m%4, 0xf0
ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10)
%endmacro
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; Full-width (unpacked) butterfly rotation of two word vectors. If coef1 is
; a small number (< 32) it is taken to be a register index holding the
; pre-broadcast coefficient pair instead of a constant name.
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
punpckhwd m%3, m%2, m%1 ; interleave src2/src1 words for pmaddwd
punpcklwd m%2, m%1
%if %7 < 32
pmaddwd m%1, m%7, m%2
pmaddwd m%4, m%7, m%3
%else
vpbroadcastd m%1, [o(pw_m%7_%6)]
pmaddwd m%4, m%3, m%1
pmaddwd m%1, m%2
%endif
paddd m%4, m%5 ; + rnd (pd_2048)
paddd m%1, m%5
psrad m%4, 12
psrad m%1, 12
packssdw m%1, m%4 ; dst1
%if %7 < 32
pmaddwd m%3, m%6
pmaddwd m%2, m%6
%else
vpbroadcastd m%4, [o(pw_%6_%7)]
pmaddwd m%3, m%4
pmaddwd m%2, m%4
%endif
paddd m%3, m%5
paddd m%2, m%5
psrad m%3, 12
psrad m%2, 12
packssdw m%2, m%3 ; dst2
%endmacro
; Multiply by a coefficient pair shifted left by 3 via pmulhrsw.
; NOTE(review): this load is not o()-relative, unlike every other constant
; load in this file — confirm callers don't rely on the rax base pointer
; being usable here (works as-is only with plain rip-relative addressing).
%macro ITX_MULHRSW_SHL3 4 ; dst/src, tmp, coef[1-2]
vpbroadcastd m%2, [pw_%3_%4]
psllw m%2, 3
pmulhrsw m%1, m%2
%endmacro
; Full-width 4-point inverse DCT on unpacked word vectors.
; src1..src4 are in/out; results are the 4 idct4 outputs in src order.
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
vpbroadcastd m%6, [o(pw_2896x8)]
paddw m%5, m%1, m%3
psubw m%1, m%3
pmulhrsw m%1, m%6 ; t1
pmulhrsw m%5, m%6 ; t0
psubw m%3, m%1, m%2 ; out2
paddw m%2, m%1 ; out1
paddw m%1, m%5, m%4 ; out0
psubw m%4, m%5, m%4 ; out3
%endmacro
; Full-width 8-point inverse DCT on unpacked word vectors: odd-half
; rotations first, then the embedded dct4 on the even inputs, then the
; final butterflies producing out0-out7.
%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a
ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
paddw m%9, m%2, m%6 ; t4
psubw m%2, m%6 ; t5a
paddw m%10, m%8, m%4 ; t7
psubw m%8, m%4 ; t6a
vpbroadcastd m%4, [o(pw_2896x8)]
psubw m%6, m%1, m%5
paddw m%1, m%5
psubw m%5, m%8, m%2
paddw m%8, m%2
pmulhrsw m%1, m%4 ; t0
pmulhrsw m%6, m%4 ; t1
pmulhrsw m%8, m%4 ; t6
pmulhrsw m%5, m%4 ; t5
psubw m%4, m%1, m%7 ; dct4 out3
paddw m%1, m%7 ; dct4 out0
paddw m%7, m%6, m%3 ; dct4 out1
psubw m%6, m%3 ; dct4 out2
paddw m%2, m%7, m%8 ; out1
psubw m%7, m%8 ; out6
psubw m%8, m%1, m%10 ; out7
paddw m%1, m%10 ; out0
paddw m%3, m%6, m%5 ; out2
psubw m%6, m%5 ; out5
psubw m%5, m%4, m%9 ; out4
paddw m%4, m%9 ; out3
%endmacro
; in1 = %1, in3 = %2, in5 = %3, in7 = %4
; in9 = %5, in11 = %6, in13 = %7, in15 = %8
; Odd-half of the 16-point inverse DCT (processes the 8 odd-index inputs);
; the even half is handled by IDCT8_1D at the call site.
%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a
ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a
ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
psubw m%9, m%2, m%6 ; t13
paddw m%6, m%2 ; t12
psubw m%2, m%8, m%4 ; t14
paddw m%8, m%4 ; t15
psubw m%4, m%7, m%3 ; t10
paddw m%3, m%7 ; t11
psubw m%7, m%1, m%5 ; t9
paddw m%1, m%5 ; t8
ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
vpbroadcastd m%10, [o(pw_2896x8)]
psubw m%5, m%2, m%9 ; t10
paddw m%2, m%9 ; t9
psubw m%9, m%1, m%3 ; t11a
paddw m%1, m%3 ; t8a
psubw m%3, m%7, m%4 ; t13
paddw m%7, m%4 ; t14
psubw m%4, m%8, m%6 ; t12a
paddw m%8, m%6 ; t15a
paddw m%6, m%3, m%5 ; t13a
psubw m%3, m%5 ; t10a
paddw m%5, m%4, m%9 ; t12
psubw m%4, m%9 ; t11
REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4 ; scale by 1/sqrt(2)
%endmacro
; Execute the wrapped statement with 128-bit (xmm) register semantics,
; then restore 256-bit (ymm) mode.
%macro WRAP_XMM 1+
INIT_XMM cpuname
%1
INIT_YMM cpuname
%endmacro
; Final 4x4 store: optionally round m0/m1 by pw_<rnd>, add the results to
; the four destination rows given by the row[1-4] permutation (value & 2
; selects the r2 half, value & 1 the stride offset), clip and write back.
; Ends with ret (meant to be reached via tail of an internal function).
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
vpbroadcastd m2, [o(pw_%5)]
pmulhrsw m0, m2
pmulhrsw m1, m2
%endif
lea r2, [dstq+strideq*2]
%assign %%i 1
%rep 4
%if %1 & 2
CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
%else
CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
%endif
%assign %%i %%i + 1
%rotate 1
%endrep
movd m2, [%%row_adr1] ; load the four 4-byte dst rows
pinsrd m2, [%%row_adr2], 1
movd m3, [%%row_adr3]
pinsrd m3, [%%row_adr4], 1
pmovzxbw m2, m2
pmovzxbw m3, m3
paddw m0, m2 ; add residual to prediction
paddw m1, m3
packuswb m0, m1 ; clip to 8-bit
movd [%%row_adr1], m0
pextrd [%%row_adr2], m0, 1
pextrd [%%row_adr3], m0, 2
pextrd [%%row_adr4], m0, 3
ret
%endmacro
; Packed 4-point inverse Walsh-Hadamard transform (lossless mode).
; In: m0 = in0 in2 (qwords), m1 = in1 in3. Out: out0 in m0 high qword,
; out1/out2 in m1, out3 in m2 low qword.
%macro IWHT4_1D_PACKED 0
punpckhqdq m3, m0, m1 ; in1 in3
punpcklqdq m0, m1 ; in0 in2
psubw m2, m0, m3
paddw m0, m3
punpckhqdq m2, m2 ; t2 t2
punpcklqdq m0, m0 ; t0 t0
psubw m1, m0, m2
psraw m1, 1
psubw m1, m3 ; t1 t3
psubw m0, m1 ; ____ out0
paddw m2, m1 ; out3 ____
%endmacro
INIT_XMM avx2
; 4x4 WHT+WHT add (lossless): load coeffs, zero them in memory, >>2,
; two 1-D WHT passes with a transpose in between, then add to dst.
cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c
mova m0, [cq+16*0]
mova m1, [cq+16*1]
pxor m2, m2
mova [cq+16*0], m2 ; clear coefficient buffer
mova [cq+16*1], m2
psraw m0, 2
psraw m1, 2
IWHT4_1D_PACKED
punpckhwd m0, m1 ; transpose
punpcklwd m3, m1, m2
punpckhdq m1, m0, m3
punpckldq m0, m3
IWHT4_1D_PACKED
vpblendd m0, m0, m2, 0x03
ITX4_END 3, 0, 2, 1, 0 ; rnd=0: no rounding in WHT path
; Common entry-point generator for inv_txfm_add_<type1>_<type2>_<size>.
; Sets up the o() base pointer in rax and tx2q (the pass2 target), then
; either falls into a DC/fast path (emitted by the size-specific wrapper
; that invoked this macro) or jumps to the first-pass internal function.
; fast_thresh: >0 = take fast path when eob <= thresh, 0 = only for eob==0,
; <0 = no fast path at all.
%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2
%undef cmp
%define %%p1 m(i%1_%4_internal)
lea rax, [o_base]
; Jump to the 1st txfm function if we're not taking the fast path, which
; in turn performs an indirect jump to the 2nd txfm function.
lea tx2q, [m(i%2_%4_internal).pass2]
%if %3 > 0
cmp eobd, %3
jg %%p1
%elif %3 == 0
test eobd, eobd
jnz %%p1
%else
; jump to the 1st txfm function unless it's located directly after this
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro
; 4x4 entry-point wrapper: emits the shared prologue via INV_TXFM_FN plus
; the specialized fast paths (dct_identity, identity_dct, and the generic
; DC-only path) that run when eob is at/below fast_thresh.
%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x4
%ifidn %1_%2, dct_identity
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m0, [cq]
vpbroadcastd m1, [o(pw_5793x4)]
paddw m0, m0
pmulhrsw m0, m1
punpcklwd m0, m0 ; broadcast DC into all rows
punpckhdq m1, m0, m0
punpckldq m0, m0
jmp m(iadst_4x4_internal).end
%elifidn %1_%2, identity_dct
mova m0, [cq+16*0]
packusdw m0, [cq+16*1]
vpbroadcastd m2, [o(pw_5793x4)]
vpbroadcastd m3, [o(pw_2896x8)]
packusdw m0, m0
paddw m0, m0
pmulhrsw m0, m2
pmulhrsw m0, m3
mova m1, m0
jmp m(iadst_4x4_internal).end
%elif %3 >= 0
; generic DC-only path: scale the single DC coefficient per type1
vpbroadcastw m0, [cq]
%ifidn %1, dct
vpbroadcastd m1, [o(pw_2896x8)]
pmulhrsw m0, m1
%elifidn %1, adst
movddup m1, [o(iadst4_dconly1a)]
pmulhrsw m0, m1
%elifidn %1, flipadst
movddup m1, [o(iadst4_dconly1b)]
pmulhrsw m0, m1
%endif
mov [cq], eobd ; 0
%ifidn %2, dct
%ifnidn %1, dct
vpbroadcastd m1, [o(pw_2896x8)]
%endif
pmulhrsw m0, m1
mova m1, m0
jmp m(iadst_4x4_internal).end2
%else ; adst / flipadst
pmulhrsw m1, m0, [o(iadst4_dconly2b)]
pmulhrsw m0, [o(iadst4_dconly2a)]
jmp m(i%2_4x4_internal).end2
%endif
%endif
%endmacro
; Packed 4-point inverse DCT operating on qword-paired rows in m0/m1.
; Optional arg: register index already holding pw_2896x8 (saves a reload).
; In: m0 = in0 in1, m1 = in2 in3. Out: m0 = out0 out1, m1 = out3 out2.
%macro IDCT4_1D_PACKED 0-1 ; pw_2896x8
vpbroadcastd m4, [o(pd_2048)]
punpckhwd m2, m1, m0 ; in3 in1 interleaved for the rotation
psubw m3, m0, m1
paddw m0, m1
punpcklqdq m0, m3 ; in0+in2 in0-in2
ITX_MUL2X_PACK 2, 1, 3, 4, 1567, 3784 ; t3 t2
%if %0 == 1
pmulhrsw m0, m%1
%else
vpbroadcastd m4, [o(pw_2896x8)]
pmulhrsw m0, m4 ; t0 t1
%endif
psubw m1, m0, m2 ; out3 out2
paddw m0, m2 ; out0 out1
%endmacro
; Packed 4-point inverse ADST. In: m0 = in0 in1, m1 = in2 in3.
; Out: m0 = out0 out1, m1 low qword = out2, m2 = out3 (duplicated).
; Uses full 32-bit accumulation since the adst4 sums can exceed 16 bits.
%macro IADST4_1D_PACKED 0
punpcklwd m2, m1, m0 ; in2 in0 pairs
punpckhwd m3, m1, m0 ; in3 in1 pairs
psubw m0, m1
punpckhqdq m1, m1
paddw m1, m0 ; in0 - in2 + in3
vpbroadcastd m0, [o(pw_3803_1321)]
vpbroadcastd m4, [o(pw_m1321_2482)]
pmaddwd m0, m2
pmaddwd m2, m4
vpbroadcastd m4, [o(pw_2482_3344)]
vpbroadcastd m5, [o(pw_m3803_3344)]
pmaddwd m4, m3
pmaddwd m5, m3
paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
vpbroadcastd m0, [o(pw_m3803_m6688)]
pmaddwd m3, m0
vpbroadcastd m0, [o(pw_3344x8)]
pmulhrsw m1, m0 ; out2 ____
vpbroadcastd m0, [o(pd_2048)]
paddd m2, m0
paddd m0, m4
paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
paddd m2, m4
paddd m2, m3
psrad m0, 12
psrad m5, 12
psrad m2, 12
packssdw m0, m5 ; out0 out1
packssdw m2, m2 ; out3 out3
%endmacro
INV_TXFM_4X4_FN dct, dct, 0
INV_TXFM_4X4_FN dct, adst, 0
INV_TXFM_4X4_FN dct, flipadst, 0
INV_TXFM_4X4_FN dct, identity, 3
; 4x4 inverse DCT, pass 1 (rows): idct4 + transpose, then indirect jump to
; the type2 pass-2 entry via tx2q. .pass2 does the column idct4, clears the
; coefficient buffer and stores via ITX4_END.
cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
IDCT4_1D_PACKED
mova m2, [o(deint_shuf)]
shufps m3, m0, m1, q1331 ; transpose 4x4
shufps m0, m0, m1, q0220
pshufb m0, m2
pshufb m1, m3, m2
jmp tx2q
.pass2:
IDCT4_1D_PACKED
pxor m2, m2
mova [cq+16*0], m2 ; clear coefficient buffer
mova [cq+16*1], m2
ITX4_END 0, 1, 3, 2
INV_TXFM_4X4_FN adst, dct, 0
INV_TXFM_4X4_FN adst, adst, 0
INV_TXFM_4X4_FN adst, flipadst, 0
INV_TXFM_4X4_FN adst, identity
; 4x4 inverse ADST. .end/.end2 are shared store tails that other 4x4
; transform functions jump into.
cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call .main
punpckhwd m3, m0, m2 ; transpose
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
.pass2:
call .main
vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2 ; clear coefficient buffer
mova [cq+16*1], m2
.end2:
ITX4_END 0, 1, 2, 3
ALIGN function_align
.main:
IADST4_1D_PACKED
ret
INV_TXFM_4X4_FN flipadst, dct, 0
INV_TXFM_4X4_FN flipadst, adst, 0
INV_TXFM_4X4_FN flipadst, flipadst, 0
INV_TXFM_4X4_FN flipadst, identity
; 4x4 inverse flipped ADST: shares the iadst main body; the flip is done
; by reversing the transpose in pass 1 and the row order in ITX4_END.
cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call m(iadst_4x4_internal).main
punpcklwd m1, m0 ; reversed transpose (flip)
punpckhwd m2, m0
punpcklwd m0, m2, m1
punpckhwd m1, m2, m1
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2 ; clear coefficient buffer
mova [cq+16*1], m2
.end2:
ITX4_END 3, 2, 1, 0 ; rows stored in reverse order
INV_TXFM_4X4_FN identity, dct, 3
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
; 4x4 identity transform: scale by 2*5793/4096 (~sqrt(2)) per pass via
; paddw + pmulhrsw with pw_5793x4, transposing between passes.
cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
vpbroadcastd m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
punpckhwd m2, m0, m1 ; transpose
punpcklwd m0, m1
punpckhwd m1, m0, m2
punpcklwd m0, m2
jmp tx2q
.pass2:
vpbroadcastd m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
jmp m(iadst_4x4_internal).end
; Add two ymm registers of residuals to 8 rows of 4-pixel destination data
; and store back. Expects r2 = dstq + 4 rows and r3 = strideq*3.
%macro WRITE_4X8 2 ; coefs[1-2]
movd xm4, [dstq+strideq*0]
pinsrd xm4, [dstq+strideq*1], 1
movd xm5, [dstq+strideq*2]
pinsrd xm5, [dstq+r3 ], 1
pinsrd xm4, [r2 +strideq*0], 2
pinsrd xm4, [r2 +strideq*1], 3
pinsrd xm5, [r2 +strideq*2], 2
pinsrd xm5, [r2 +r3 ], 3
pmovzxbw m4, xm4
pmovzxbw m5, xm5
paddw m4, m%1 ; add residual to prediction
paddw m5, m%2
packuswb m4, m5 ; clip to 8-bit
vextracti128 xm5, m4, 1
movd [dstq+strideq*0], xm4
pextrd [dstq+strideq*1], xm4, 1
pextrd [dstq+strideq*2], xm4, 2
pextrd [dstq+r3 ], xm4, 3
movd [r2 +strideq*0], xm5
pextrd [r2 +strideq*1], xm5, 1
pextrd [r2 +strideq*2], xm5, 2
pextrd [r2 +r3 ], xm5, 3
%endmacro
; 4x8 entry-point wrapper: shared prologue via INV_TXFM_FN plus the
; specialized fast paths (dct_identity, identity_dct, dct_dct DC-only,
; and adst/flipadst-by-dct DC-only).
%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x8
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd xm0, [o(pw_2896x8)]
pmulhrsw xm1, xm0, [cq]
vpbroadcastd xm2, [o(pw_4096)]
pmulhrsw xm1, xm0
pmulhrsw xm1, xm2
vpermq m1, m1, q1100
punpcklwd m1, m1 ; broadcast DC into all rows
punpckldq m0, m1, m1
punpckhdq m1, m1
jmp m(iadst_4x8_internal).end3
%elifidn %1_%2, identity_dct
movd xm0, [cq+16*0]
punpcklwd xm0, [cq+16*1]
movd xm1, [cq+16*2]
punpcklwd xm1, [cq+16*3]
vpbroadcastd xm2, [o(pw_2896x8)]
vpbroadcastd xm3, [o(pw_5793x4)]
vpbroadcastd xm4, [o(pw_2048)]
punpckldq xm0, xm1
pmulhrsw xm0, xm2
paddw xm0, xm0
pmulhrsw xm0, xm3
pmulhrsw xm0, xm2
pmulhrsw xm0, xm4
vpbroadcastq m0, xm0
mova m1, m0
jmp m(iadst_4x8_internal).end3
%elifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_2048)]
mov [cq], eobd ; clear the DC coefficient
pmulhrsw xm0, xm1
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
mova m1, m0
jmp m(iadst_4x8_internal).end4
%else ; adst_dct / flipadst_dct
vpbroadcastw xm0, [cq]
vpbroadcastd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1
pmulhrsw xm0, [o(iadst4_dconly1a)]
vpbroadcastd xm2, [o(pw_2048)]
mov [cq], eobd ; clear the DC coefficient
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
%ifidn %1, adst
vpbroadcastq m0, xm0
%else ; flipadst
vpermq m0, m0, q1111
%endif
mova m1, m0
jmp m(iadst_4x8_internal).end4
%endif
%endif
%endmacro
; Packed 8-point inverse DCT on qword-paired rows in m0-m3.
; In: m0 = in0 in1, m1 = in2 in3, m2 = in4 in5, m3 = in6 in7.
; Out: m0 = out0 out1, m1 = out3 out2, m2 = out4 out5, m3 = out7 out6.
%macro IDCT8_1D_PACKED 0
vpbroadcastd m6, [o(pd_2048)]
punpckhwd m5, m3, m0 ; in7 in1
punpckhwd m4, m1, m2 ; in3 in5
punpcklwd m3, m1 ; in2 in6
psubw m1, m0, m2
paddw m0, m2
punpcklqdq m0, m1 ; in0+in4 in0-in4
ITX_MUL2X_PACK 5, 1, 2, 6, 799, 4017, 1 ; t4a t7a
ITX_MUL2X_PACK 4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 3, 1, 2, 6, 1567, 3784 ; t3 t2
vpbroadcastd m6, [o(pw_2896x8)]
psubw m2, m5, m4 ; t4 t7
paddw m5, m4 ; t5a t6a
pshufd m4, m2, q1032
psubw m1, m2, m4
paddw m4, m2
vpblendd m4, m4, m1, 0xcc
pmulhrsw m0, m6 ; t0 t1
pmulhrsw m4, m6 ; t6 t5
psubw m1, m0, m3 ; tmp3 tmp2
paddw m0, m3 ; tmp0 tmp1
shufps m2, m5, m4, q1032 ; t7 t6
vpblendd m5, m5, m4, 0xcc ; t4 t5
psubw m3, m0, m2 ; out7 out6
paddw m0, m2 ; out0 out1
psubw m2, m1, m5 ; out4 out5
paddw m1, m5 ; out3 out2
%endmacro
; Packed 8-point inverse ADST on qword-paired rows. Outputs come out in
; mixed sign/order (noted per line); callers fix signs via negated rounding
; constants at store time.
%macro IADST8_1D_PACKED 0
vpbroadcastd m6, [o(pd_2048)]
punpckhwd m0, m4, m3 ; 0 7
punpckhwd m1, m5, m2 ; 2 5
punpcklwd m2, m5 ; 4 3
punpcklwd m3, m4 ; 6 1
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
psubw m4, m0, m2 ; t4 t5
paddw m0, m2 ; t0 t1
psubw m5, m1, m3 ; t6 t7
paddw m1, m3 ; t2 t3
shufps m2, m5, m4, q1032
punpckhwd m4, m2
punpcklwd m5, m2
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a
psubw m2, m0, m1 ; t2 t3
paddw m0, m1 ; out0 -out7
psubw m1, m4, m5 ; t7 t6
paddw m4, m5 ; out6 -out1
vpbroadcastd m5, [o(pw_2896x8)]
vpblendd m3, m0, m4, 0x33 ; out6 -out7
vpblendd m0, m0, m4, 0xcc ; out0 -out1
shufps m4, m2, m1, q1032 ; t3 t7
vpblendd m1, m2, m1, 0xcc ; t2 t6
psubw m2, m1, m4 ; t2-t3 t6-t7
paddw m1, m4 ; t2+t3 t6+t7
pmulhrsw m2, m5 ; out4 -out5
pshufd m1, m1, q1032
pmulhrsw m1, m5 ; out2 -out3
%endmacro
INIT_YMM avx2
INV_TXFM_4X8_FN dct, dct, 0
INV_TXFM_4X8_FN dct, identity, 7
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
; 4x8 inverse DCT. Pass 1 does idct4 on rows (pre-scaled by 2896/4096 for
; the rectangular size) + transpose; pass 2 splits the ymm regs into xmm
; halves for an 8-point column idct, then stores via the iadst tail.
cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m5, [o(pw_2896x8)]
pmulhrsw m0, m5 ; rect2 downscale
pmulhrsw m1, m5
IDCT4_1D_PACKED 5
vbroadcasti128 m2, [o(deint_shuf)]
shufps m3, m0, m1, q1331 ; transpose
shufps m0, m0, m1, q0220
pshufb m0, m2
pshufb m1, m3, m2
jmp tx2q
.pass2:
vextracti128 xm2, m0, 1
vextracti128 xm3, m1, 1
call .main
vpbroadcastd m4, [o(pw_2048)]
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
pshufd m1, m1, q1032
jmp m(iadst_4x8_internal).end2
ALIGN function_align
.main:
WRAP_XMM IDCT8_1D_PACKED
ret
INV_TXFM_4X8_FN adst, dct, 0
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
; 4x8 inverse ADST. Pass 1 reuses the 8x4 adst row kernel; pass 2 runs the
; packed 8-point adst on xmm halves. .end* are shared store tails: the
; alternating +/- rounding (m4/m5 blend) fixes the adst output signs.
cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m2, [o(pw_2896x8)]
pmulhrsw m0, m2 ; rect2 downscale
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
punpckhwd m3, m0, m2 ; transpose
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
.pass2:
vextracti128 xm2, m0, 1
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
call .main
vpbroadcastd m4, [o(pw_2048)]
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
pxor m5, m5
psubw m5, m4 ; -pw_2048 for the negated output lanes
.end:
vpblendd m4, m4, m5, 0xcc
.end2:
pmulhrsw m0, m4
pmulhrsw m1, m4
WIN64_RESTORE_XMM
.end3:
pxor m2, m2
mova [cq+32*0], m2 ; clear coefficient buffer
mova [cq+32*1], m2
.end4:
lea r2, [dstq+strideq*4]
lea r3, [strideq*3]
WRITE_4X8 0, 1
RET
ALIGN function_align
.main:
WRAP_XMM IADST8_1D_PACKED
ret
INV_TXFM_4X8_FN flipadst, dct, 0
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
; 4x8 inverse flipped ADST: shares the iadst kernels; the flip is applied
; through reversed transposes/permutes and a negated rounding constant.
cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m2, [o(pw_2896x8)]
pmulhrsw m0, m2 ; rect2 downscale
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
punpcklwd m3, m1, m0 ; reversed transpose (flip)
punpckhwd m1, m2, m0
punpcklwd m0, m1, m3
punpckhwd m1, m3
jmp tx2q
.pass2:
vextracti128 xm2, m0, 1
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
call m(iadst_4x8_internal).main
vpbroadcastd m5, [o(pw_2048)]
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
pxor m4, m4
psubw m4, m5 ; -pw_2048 (sign pattern swapped vs iadst)
pshufd m0, m3, q1032
pshufd m1, m2, q1032
jmp m(iadst_4x8_internal).end
INV_TXFM_4X8_FN identity, dct, 3
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity
; 4x8 identity transform: rect2 downscale, then the sqrt(2)-ish identity4
; scaling fused with the transpose; pass 2 is a pure pw_4096 rounding store.
cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m2, [cq+32*0], q3120
vpermq m0, [cq+32*1], q3120
vpbroadcastd m3, [o(pw_2896x8)]
vpbroadcastd m4, [o(pw_5793x4)]
punpcklwd m1, m2, m0 ; transpose
punpckhwd m2, m0
pmulhrsw m1, m3 ; rect2 downscale
pmulhrsw m2, m3
punpcklwd m0, m1, m2
punpckhwd m1, m2
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m4
pmulhrsw m1, m4
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_4096)]
jmp m(iadst_4x8_internal).end2
; 4x16 entry-point wrapper: shared prologue via INV_TXFM_FN plus the
; specialized fast paths (dct_identity, identity_dct, dct_dct DC-only,
; and adst/flipadst-by-dct DC-only).
%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x16
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m0, [cq]
vpbroadcastd m1, [o(pw_16384)]
vpbroadcastd m2, [o(pw_5793x4)]
vpbroadcastd m3, [o(pw_2048)]
pmulhrsw m0, m1
psllw m0, 2
pmulhrsw m0, m2
pmulhrsw m3, m0
punpcklwd m1, m3, m3 ; broadcast DC into all rows
punpckhwd m3, m3
punpckldq m0, m1, m1
punpckhdq m1, m1
punpckldq m2, m3, m3
punpckhdq m3, m3
jmp m(iadst_4x16_internal).end3
%elifidn %1_%2, identity_dct
movd xm0, [cq+32*0]
punpcklwd xm0, [cq+32*1]
movd xm1, [cq+32*2]
punpcklwd xm1, [cq+32*3]
vpbroadcastd xm2, [o(pw_5793x4)]
vpbroadcastd xm3, [o(pw_16384)]
vpbroadcastd xm4, [o(pw_2896x8)]
punpckldq xm0, xm1
paddw xm0, xm0
pmulhrsw xm0, xm2
pmulhrsw xm0, xm3
psrlw xm3, 3 ; pw_2048
pmulhrsw xm0, xm4
pmulhrsw xm0, xm3
vpbroadcastq m0, xm0
mova m1, m0
mova m2, m0
mova m3, m0
jmp m(iadst_4x16_internal).end3
%elifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
movd xm3, [o(pw_2048)]
mov [cq], eobd ; clear the DC coefficient
pmulhrsw xm0, xm2
pmulhrsw xm0, xm1
pmulhrsw xm0, xm3
vpbroadcastw m0, xm0
mova m1, m0
mova m2, m0
mova m3, m0
jmp m(iadst_4x16_internal).end4
%else ; adst_dct / flipadst_dct
vpbroadcastw xm0, [cq]
pmulhrsw xm0, [o(iadst4_dconly1a)]
vpbroadcastd xm1, [o(pw_16384)]
vpbroadcastd xm2, [o(pw_2896x8)]
mov [cq], eobd ; clear the DC coefficient
pmulhrsw xm0, xm1
psrlw xm1, 3 ; pw_2048
pmulhrsw xm0, xm2
pmulhrsw xm0, xm1
%ifidn %1, adst
vpbroadcastq m0, xm0
%else ; flipadst
vpermq m0, m0, q1111
%endif
mova m1, m0
mova m2, m0
mova m3, m0
jmp m(iadst_4x16_internal).end4
%endif
%endif
%endmacro
; Packed 16-point inverse DCT on qword-paired rows in m0-m7 (two points
; per register). The .main2 label allows re-entry with pd_2048 already
; loaded in m10. Outputs, in packed pairs, are noted per line below.
%macro IDCT16_1D_PACKED 0
vpbroadcastd m10, [o(pd_2048)]
.main2:
punpckhwd m8, m7, m0 ; dct16 in15 in1
paddw m9, m0, m4
psubw m0, m4
punpcklqdq m9, m0 ; dct4 in0+in2 in0-in2
punpckhwd m0, m3, m4 ; dct16 in7 in9
punpcklwd m7, m1 ; dct8 in7 in1
punpckhwd m1, m6 ; dct16 in3 in13
punpcklwd m3, m5 ; dct8 in3 in5
punpckhwd m5, m2 ; dct16 in11 in5
punpcklwd m6, m2 ; dct4 in3 in1
ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a
ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a
ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 1 ; t4a t7a
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
psubw m2, m8, m0 ; t9 t14
paddw m8, m0 ; t8 t15
psubw m0, m1, m5 ; t10 t13
paddw m1, m5 ; t11 t12
%if mmsize > 16
vbroadcasti128 m5, [o(deint_shuf)]
%else
mova m5, [o(deint_shuf)]
%endif
pshufb m8, m5
pshufb m1, m5
vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 4 ; t9a t14a
vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 4 ; t10a t13a
psubw m5, m7, m3 ; t5a t6a
paddw m7, m3 ; t4 t7
psubw m4, m8, m1 ; t11a t12a
paddw m8, m1 ; t8a t15a
paddw m1, m2, m0 ; t9 t14
psubw m2, m0 ; t10 t13
punpckhqdq m0, m8, m1 ; t15a t14
punpcklqdq m8, m1 ; t8a t9
pshufd m3, m5, q1032
psubw m1, m5, m3
paddw m3, m5
vpblendd m3, m3, m1, 0xcc ; t6 t5
vpbroadcastd m1, [o(pw_2896x8)]
punpckhqdq m5, m4, m2 ; t12a t13
punpcklqdq m2, m4, m2 ; t11a t10
psubw m4, m5, m2
paddw m5, m2
pmulhrsw m9, m1 ; t0 t1
pmulhrsw m3, m1 ; t6 t5
pmulhrsw m4, m1 ; t11 t10a
pmulhrsw m5, m1 ; t12 t13a
shufps m2, m7, m3, q1032 ; t7 t6
vpblendd m7, m7, m3, 0xcc ; t4 t5
psubw m1, m9, m6 ; dct4 out3 out2
paddw m9, m6 ; dct4 out0 out1
psubw m3, m9, m2 ; dct8 out7 out6
paddw m9, m2 ; dct8 out0 out1
psubw m2, m1, m7 ; dct8 out4 out5
paddw m1, m7 ; dct8 out3 out2
psubw m7, m9, m0 ; out15 out14
paddw m0, m9 ; out0 out1
psubw m6, m1, m5 ; out12 out13
paddw m1, m5 ; out3 out2
psubw m5, m2, m4 ; out11 out10
paddw m2, m4 ; out4 out5
psubw m4, m3, m8 ; out8 out9
paddw m3, m8 ; out7 out6
%endmacro
INV_TXFM_4X16_FN dct, dct, 0
INV_TXFM_4X16_FN dct, identity, 15
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
; 4x16 inverse DCT. Pass 1 reuses the 16x4 row kernel, scales by pw_16384
; (intermediate downshift) and transposes; pass 2 splits to xmm halves for
; the packed 16-point column idct and stores via the iadst tail.
cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
call m(idct_16x4_internal).main
vpbroadcastd m5, [o(pw_16384)]
punpckhwd m4, m2, m3 ; transpose
punpcklwd m2, m3
punpckhwd m3, m0, m1
punpcklwd m0, m1
REPX {pmulhrsw x, m5}, m0, m4, m2, m3 ; intermediate downscale
punpckhdq m1, m0, m2
punpckldq m0, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
jmp tx2q
.pass2:
vextracti128 xm4, m0, 1
vextracti128 xm5, m1, 1
vextracti128 xm6, m2, 1
vextracti128 xm7, m3, 1
call .main
vinserti128 m0, m0, xm4, 1
vinserti128 m1, m1, xm5, 1
vpbroadcastd m5, [o(pw_2048)]
vinserti128 m2, m2, xm6, 1
vinserti128 m3, m3, xm7, 1
pshufd m1, m1, q1032
pshufd m3, m3, q1032
jmp m(iadst_4x16_internal).end2
ALIGN function_align
.main:
WRAP_XMM IDCT16_1D_PACKED
ret
INV_TXFM_4X16_FN adst, dct, 0
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
; 4x16 inverse ADST. Pass 1 reuses the 16x4 adst row kernel; pass 2 runs
; the 16-point adst (.main below, also used by iflipadst_4x16) and fixes
; the mixed output order/signs with blends, permutes and +/- rounding.
; .main expects inputs in m0-m3 and leaves m7 = 0 for the callers' use.
cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
call m(iadst_16x4_internal).main
vpbroadcastd m5, [o(pw_16384)]
punpckhwd m4, m2, m3 ; transpose
punpcklwd m2, m3
punpckhwd m3, m0, m1
punpcklwd m0, m1
REPX {pmulhrsw x, m5}, m4, m2, m3, m0 ; intermediate downscale
punpckhdq m1, m0, m2
punpckldq m0, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
jmp tx2q
.pass2:
call .main
pshufd m1, m1, q1032
vpbroadcastd m5, [o(pw_2048)]
vpblendd m4, m1, m0, 0x33 ; reorder the mixed adst outputs
vpblendd m0, m0, m2, 0x33
vpblendd m2, m2, m3, 0x33
vpblendd m3, m3, m1, 0x33
vpermq m0, m0, q2031
vpermq m1, m2, q1302
vpermq m2, m3, q3120
vpermq m3, m4, q0213
psubw m6, m7, m5 ; -pw_2048 (m7 is zero after .main)
.end:
vpblendd m5, m5, m6, 0xcc ; alternate +/- rounding for sign fixup
.end2:
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
WIN64_RESTORE_XMM
.end3:
pxor m4, m4
mova [cq+32*0], m4 ; clear coefficient buffer
mova [cq+32*1], m4
mova [cq+32*2], m4
mova [cq+32*3], m4
.end4:
lea r2, [dstq+strideq*8]
lea r3, [strideq*3]
WRITE_4X8 0, 1
lea dstq, [dstq+strideq*4]
lea r2, [r2 +strideq*4]
WRITE_4X8 2, 3
RET
ALIGN function_align
.main:
vpblendd m4, m1, m0, 0xcc ; shuffle inputs into packed adst16 order
vpblendd m1, m1, m0, 0x33
vpblendd m5, m2, m3, 0xcc
vpblendd m2, m2, m3, 0x33
vperm2i128 m3, m5, m2, 0x31
vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1
vperm2i128 m4, m1, m4, 0x31
vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5
pshufd m3, m3, q1032 ; in12 in15 in13 in14
pshufd m2, m4, q1032 ; in11 in8 in9 in10
.main2:
vpbroadcastd m8, [o(pd_2048)]
pxor m7, m7 ; zero reg, also used for sign flips below
punpckhwd m4, m3, m0 ; in12 in3 in14 in1
punpcklwd m0, m3 ; in0 in15 in2 in13
punpckhwd m3, m2, m1 ; in8 in7 in10 in5
punpcklwd m1, m2 ; in4 in11 in6 in9
ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3
ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3
psubw m2, m0, m3 ; t9a t8a t11a t10a
paddw m0, m3 ; t1a t0a t3a t2a
psubw m3, m1, m4 ; t13a t12a t15a t14a
paddw m1, m4 ; t5a t4a t7a t6a
ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3
psubw m6, m7, m5 ; negate coefficient pair
ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6
vpbroadcastd m6, [o(pw_m3784_1567)]
vpbroadcastd m5, [o(pw_1567_3784)]
psubw m4, m0, m1 ; t5 t4 t7 t6
paddw m0, m1 ; t1 t0 t3 t2
psubw m1, m2, m3 ; t13a t12a t15a t14a
paddw m2, m3 ; t9a t8a t11a t10a
psubw m3, m7, m6 ; pw_3784_m1567
vpblendd m6, m6, m3, 0xf0
ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
vbroadcasti128 m5, [o(deint_shuf)]
pshufb m0, m5
pshufb m2, m5
vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a
vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a
vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
vpbroadcastd m5, [o(pw_2896x8)]
pshufd m2, m2, q1032 ; t6a t7a t14 t15
psubw m1, m0, m3 ; t3a t2a t11 t10
paddw m0, m3 ; -out15 out0 out14 -out1
paddw m3, m4, m2 ; -out3 out12 out2 -out13
psubw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
paddw m1, m2, m4
psubw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
ret
INV_TXFM_4X16_FN flipadst, dct, 0
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity
; 4x16 inverse flipped ADST: shares the iadst kernels; the flip is applied
; via reversed transposes in pass 1 and different output permutes/sign
; pattern in pass 2 before jumping into the shared iadst store tail.
cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
call m(iadst_16x4_internal).main
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m4, m1, m0 ; reversed transpose (flip)
punpckhwd m1, m0
punpcklwd m0, m3, m2
punpckhwd m3, m2
REPX {pmulhrsw x, m5}, m4, m1, m0, m3 ; intermediate downscale
punpckldq m2, m3, m1
punpckhdq m3, m1
punpckhdq m1, m0, m4
punpckldq m0, m4
jmp tx2q
.pass2:
call m(iadst_4x16_internal).main
pshufd m1, m1, q1032
vpbroadcastd m6, [o(pw_2048)]
vpblendd m4, m0, m2, 0x33 ; reorder outputs for the flipped variant
vpblendd m0, m0, m1, 0xcc
vpblendd m1, m1, m3, 0xcc
vpblendd m2, m2, m3, 0x33
vpermq m0, m0, q3120
vpermq m1, m1, q0213
vpermq m2, m2, q2031
vpermq m3, m4, q1302
psubw m5, m7, m6 ; -pw_2048 (m7 is zero after .main)
jmp m(iadst_4x16_internal).end
INV_TXFM_4X16_FN identity, dct, 3
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
; 4x16 identity transform: identity4 row scaling + transpose + pw_16384
; intermediate downscale in pass 1; pass 2 applies the identity16 scaling
; (<<2 then *5793/4096) and jumps into the shared iadst store tail.
cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m3, [cq+32*0]
mova m2, [cq+32*1]
mova m4, [cq+32*2]
mova m0, [cq+32*3]
vpbroadcastd m5, [o(pw_5793x4)]
punpcklwd m1, m3, m2 ; transpose
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
REPX {paddw x, x }, m1, m2, m3, m4
REPX {pmulhrsw x, m5}, m1, m2, m3, m4
vpbroadcastd m5, [o(pw_16384)]
punpckldq m0, m1, m2
punpckhdq m1, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
REPX {pmulhrsw x, m5}, m0, m1, m2, m3 ; intermediate downscale
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_5793x4)]
vpbroadcastd m5, [o(pw_2048)]
REPX {psllw x, 2 }, m0, m1, m2, m3
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
jmp m(iadst_4x16_internal).end2
; Add two registers (or memory operands) of residuals to 4 rows of 8-pixel
; destination data and store back. off[1-3] default to strideq, strideq*2
; and r3 (expected to hold strideq*3).
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
movq xm%3, [dstq ]
movhps xm%3, [dstq+%5]
movq xm%4, [dstq+%6]
movhps xm%4, [dstq+%7]
pmovzxbw m%3, xm%3
pmovzxbw m%4, xm%4
%ifnum %1
paddw m%3, m%1 ; add residual to prediction
%else
paddw m%3, %1
%endif
%ifnum %2
paddw m%4, m%2
%else
paddw m%4, %2
%endif
packuswb m%3, m%4 ; clip to 8-bit
vextracti128 xm%4, m%3, 1
movq [dstq ], xm%3
movhps [dstq+%6], xm%3
movq [dstq+%5], xm%4
movhps [dstq+%7], xm%4
%endmacro
; 8x4 entry-point wrapper: shared prologue via INV_TXFM_FN plus the
; specialized fast paths (dct_identity, identity_dct, and the generic
; DC-only path, with adst/flipadst second-pass variants).
%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x4
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd xm0, [o(pw_2896x8)]
pmulhrsw xm1, xm0, [cq]
vpbroadcastd xm2, [o(pw_5793x4)]
vpbroadcastd xm3, [o(pw_2048)]
pmulhrsw xm1, xm0
paddw xm1, xm1
pmulhrsw xm1, xm2
pmulhrsw xm1, xm3
punpcklwd xm1, xm1 ; broadcast DC into all rows
punpckldq xm0, xm1, xm1
punpckhdq xm1, xm1
vpermq m0, m0, q1100
vpermq m1, m1, q1100
%elifidn %1_%2, identity_dct
mova xm0, [cq+16*0]
packusdw xm0, [cq+16*1]
mova xm1, [cq+16*2]
packusdw xm1, [cq+16*3]
vpbroadcastd xm2, [o(pw_2896x8)]
vpbroadcastd xm3, [o(pw_2048)]
packusdw xm0, xm1
pmulhrsw xm0, xm2
paddw xm0, xm0
pmulhrsw xm0, xm2
pmulhrsw xm0, xm3
vinserti128 m0, m0, xm0, 1
mova m1, m0
%else
; generic DC-only path
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
pmulhrsw xm0, xm1
%ifidn %2, dct
movd xm2, [o(pw_2048)]
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
mova m1, m0
%else ; adst / flipadst
vpbroadcastw m0, xm0
pmulhrsw m0, [o(iadst4_dconly2a)]
vpbroadcastd m1, [o(pw_2048)]
pmulhrsw m1, m0
%ifidn %2, adst
vpermq m0, m1, q1100
vpermq m1, m1, q3322
%else ; flipadst
vpermq m0, m1, q2233
vpermq m1, m1, q0011
%endif
%endif
%endif
jmp m(iadst_8x4_internal).end3
%endif
%endmacro
INV_TXFM_8X4_FN dct, dct, 0
INV_TXFM_8X4_FN dct, adst, 0
INV_TXFM_8X4_FN dct, flipadst, 0
INV_TXFM_8X4_FN dct, identity, 3
cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2896x8)]
pmulhrsw xm0, xm3, [cq+16*0]
pmulhrsw xm1, xm3, [cq+16*1]
pmulhrsw xm2, xm3, [cq+16*2]
pmulhrsw xm3, [cq+16*3]
call m(idct_4x8_internal).main
vbroadcasti128 m4, [o(deint_shuf)]
vinserti128 m3, m1, xm3, 1
vinserti128 m1, m0, xm2, 1
shufps m0, m1, m3, q0220
shufps m1, m1, m3, q1331
pshufb m0, m4
pshufb m1, m4
jmp tx2q
.pass2:
IDCT4_1D_PACKED
vpermq m0, m0, q3120
vpermq m1, m1, q2031
jmp m(iadst_8x4_internal).end2
INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity
cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd xm0, [o(pw_2896x8)]
pshufd xm4, [cq+16*0], q1032
pmulhrsw xm3, xm0, [cq+16*3]
pshufd xm5, [cq+16*1], q1032
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
punpckhwd m2, m0, m1
punpcklwd m0, m1
pxor m3, m3
psubw m3, m2
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
.pass2:
call .main
vpblendd m1, m1, m2, 0xcc
.end:
vpermq m0, m0, q3120
vpermq m1, m1, q3120
.end2:
vpbroadcastd m2, [o(pw_2048)]
pmulhrsw m0, m2
pmulhrsw m1, m2
WIN64_RESTORE_XMM
.end3:
pxor m2, m2
mova [cq+32*0], m2
mova [cq+32*1], m2
lea r3, [strideq*3]
WRITE_8X4 0, 1, 4, 5
RET
ALIGN function_align
.main:
IADST4_1D_PACKED
ret
INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity
cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd xm0, [o(pw_2896x8)]
pshufd xm4, [cq+16*0], q1032
pmulhrsw xm3, xm0, [cq+16*3]
pshufd xm5, [cq+16*1], q1032
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
punpckhwd m1, m3, m2
punpcklwd m3, m2
pxor m0, m0
psubw m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
.pass2:
call m(iadst_8x4_internal).main
vpblendd m2, m2, m1, 0x33
vpermq m1, m0, q2031
vpermq m0, m2, q2031
jmp m(iadst_8x4_internal).end2
INV_TXFM_8X4_FN identity, dct, 7
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity
cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
mova xm2, [cq+16*0]
mova xm0, [cq+16*1]
vinserti128 m2, m2, [cq+16*2], 1
vinserti128 m0, m0, [cq+16*3], 1
vpbroadcastd m3, [o(pw_2896x8)]
punpcklwd m1, m2, m0
punpckhwd m2, m0
pmulhrsw m1, m3
pmulhrsw m2, m3
punpcklwd m0, m1, m2
punpckhwd m1, m2
paddw m0, m0
paddw m1, m1
jmp tx2q
.pass2:
vpbroadcastd m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
jmp m(iadst_8x4_internal).end
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x8
%ifidn %1_%2, dct_identity
vpbroadcastd xm0, [o(pw_2896x8)]
pmulhrsw xm0, [cq]
vpbroadcastd xm1, [o(pw_16384)]
pmulhrsw xm0, xm1
psrlw xm1, 2 ; pw_4096
pmulhrsw xm0, xm1
pshufb xm0, [o(deint_shuf)]
vpermq m3, m0, q1100
punpcklwd m3, m3
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
jmp m(iadst_8x8_internal).end4
%elif %3 >= 0
%ifidn %1, dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm2
psrlw xm2, 3 ; pw_2048
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
.end:
mov r2d, 2
.end2:
lea r3, [strideq*3]
.loop:
WRITE_8X4 0, 0, 1, 2
lea dstq, [dstq+strideq*4]
dec r2d
jg .loop
RET
%else ; identity
mova m0, [cq+32*0]
punpcklwd m0, [cq+32*1]
mova m1, [cq+32*2]
punpcklwd m1, [cq+32*3]
vpbroadcastd m2, [o(pw_2896x8)]
vpbroadcastd m3, [o(pw_2048)]
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
mova [cq+32*2], m4
mova [cq+32*3], m4
punpckldq m0, m1
vpermq m1, m0, q3232
vpermq m0, m0, q1010
punpcklwd m0, m1
pmulhrsw m0, m2
pmulhrsw m0, m3
jmp m(inv_txfm_add_dct_dct_8x8).end
%endif
%endif
%endmacro
INV_TXFM_8X8_FN dct, dct, 0
INV_TXFM_8X8_FN dct, identity, 7
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120 ; 0 1
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m2, [cq+32*2], q3120 ; 4 5
vpermq m1, [cq+32*1], q3120 ; 2 3
call .main
shufps m4, m0, m1, q0220
shufps m5, m0, m1, q1331
shufps m1, m2, m3, q0220
shufps m3, m2, m3, q1331
vbroadcasti128 m0, [o(deint_shuf)]
vpbroadcastd m2, [o(pw_16384)]
REPX {pshufb x, m0}, m4, m5, m1, m3
REPX {pmulhrsw x, m2}, m4, m5, m1, m3
vinserti128 m0, m4, xm1, 1
vperm2i128 m2, m4, m1, 0x31
vinserti128 m1, m5, xm3, 1
vperm2i128 m3, m5, m3, 0x31
jmp tx2q
.pass2:
call .main
vpbroadcastd m4, [o(pw_2048)]
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
jmp m(iadst_8x8_internal).end2
ALIGN function_align
.main:
IDCT8_1D_PACKED
ret
INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity
cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m4, [cq+32*0], q1302 ; 1 0
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
call .main
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m4, m0, m1
punpckhwd m0, m1
punpcklwd m1, m2, m3
punpckhwd m2, m3
pxor m3, m3
psubw m3, m5 ; negate odd elements during rounding
pmulhrsw m4, m5
pmulhrsw m0, m3
pmulhrsw m1, m5
pmulhrsw m2, m3
punpcklwd m3, m4, m0
punpckhwd m4, m0
punpcklwd m0, m1, m2
punpckhwd m1, m2
vperm2i128 m2, m3, m0, 0x31
vinserti128 m0, m3, xm0, 1
vperm2i128 m3, m4, m1, 0x31
vinserti128 m1, m4, xm1, 1
jmp tx2q
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call .main
vpbroadcastd m5, [o(pw_2048)]
vpbroadcastd xm4, [o(pw_4096)]
psubw m4, m5 ; lower half = 2048, upper half = -2048
.end:
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
.end2:
pmulhrsw m0, m4
pmulhrsw m1, m4
.end3:
pmulhrsw m2, m4
pmulhrsw m3, m4
WIN64_RESTORE_XMM
.end4:
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
mova [cq+32*2], m4
mova [cq+32*3], m4
lea r3, [strideq*3]
WRITE_8X4 0, 1, 4, 5
lea dstq, [dstq+strideq*4]
WRITE_8X4 2, 3, 4, 5
RET
ALIGN function_align
.main:
IADST8_1D_PACKED
ret
INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity
cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m4, [cq+32*0], q1302 ; 1 0
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
call m(iadst_8x8_internal).main
vpbroadcastd m5, [o(pw_16384)]
punpckhwd m4, m3, m2
punpcklwd m3, m2
punpckhwd m2, m1, m0
punpcklwd m1, m0
pxor m0, m0
psubw m0, m5
pmulhrsw m4, m0
pmulhrsw m3, m5
pmulhrsw m2, m0
pmulhrsw m1, m5
punpckhwd m0, m4, m3
punpcklwd m4, m3
punpckhwd m3, m2, m1
punpcklwd m2, m1
vinserti128 m1, m0, xm3, 1
vperm2i128 m3, m0, m3, 0x31
vinserti128 m0, m4, xm2, 1
vperm2i128 m2, m4, m2, 0x31
jmp tx2q
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal).main
vpbroadcastd m4, [o(pw_2048)]
vpbroadcastd xm5, [o(pw_4096)]
psubw m4, m5 ; lower half = -2048, upper half = 2048
vpermq m5, m3, q2031
vpermq m3, m0, q2031
vpermq m0, m2, q2031
vpermq m2, m1, q2031
pmulhrsw m1, m0, m4
pmulhrsw m0, m5, m4
jmp m(iadst_8x8_internal).end3
INV_TXFM_8X8_FN identity, dct, 7
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity
cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
mova xm3, [cq+16*0]
mova xm2, [cq+16*1]
vinserti128 m3, m3, [cq+16*4], 1
vinserti128 m2, m2, [cq+16*5], 1
mova xm4, [cq+16*2]
mova xm0, [cq+16*3]
vinserti128 m4, m4, [cq+16*6], 1
vinserti128 m0, m0, [cq+16*7], 1
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
punpckldq m0, m1, m2
punpckhdq m1, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_4096)]
jmp m(iadst_8x8_internal).end
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x16
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
psrlw xm2, 3 ; pw_2048
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
mov r2d, 4
jmp m(inv_txfm_add_dct_dct_8x8).end2
%elifidn %1_%2, dct_identity
WIN64_SPILL_XMM 13
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m7, m0, [cq]
vpbroadcastd m1, [o(pw_16384)]
vpbroadcastd m2, [o(pw_5793x4)]
pxor m3, m3
mova [cq], m3
pmulhrsw m7, m0
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
psllw m7, 2
pmulhrsw m7, m2
pmulhrsw m7, m1
punpcklwd m5, m7, m7
punpckhwd m7, m7
punpcklwd m4, m5, m5
punpckhwd m5, m5
punpcklwd m6, m7, m7
punpckhwd m7, m7
vpermq m0, m4, q1100
vpermq m1, m5, q1100
vpermq m2, m6, q1100
vpermq m3, m7, q1100
vpermq m4, m4, q3322
vpermq m5, m5, q3322
vpermq m6, m6, q3322
vpermq m7, m7, q3322
jmp m(idct_8x16_internal).end4
%elifidn %1_%2, identity_dct
movd xm0, [cq+32*0]
punpcklwd xm0, [cq+32*1]
movd xm2, [cq+32*2]
punpcklwd xm2, [cq+32*3]
add cq, 32*4
movd xm1, [cq+32*0]
punpcklwd xm1, [cq+32*1]
movd xm3, [cq+32*2]
punpcklwd xm3, [cq+32*3]
vpbroadcastd xm4, [o(pw_2896x8)]
vpbroadcastd xm5, [o(pw_2048)]
xor eax, eax
mov [cq-32*4], eax
mov [cq-32*3], eax
mov [cq-32*2], eax
mov [cq-32*1], eax
punpckldq xm0, xm2
punpckldq xm1, xm3
punpcklqdq xm0, xm1
pmulhrsw xm0, xm4
pmulhrsw xm0, xm4
pmulhrsw xm0, xm5
mov [cq+32*0], eax
mov [cq+32*1], eax
mov [cq+32*2], eax
mov [cq+32*3], eax
vinserti128 m0, m0, xm0, 1
mov r2d, 4
jmp m(inv_txfm_add_dct_dct_8x8).end2
%endif
%endmacro
%macro ITX_8X16_LOAD_COEFS 0
vpbroadcastd m4, [o(pw_2896x8)]
pmulhrsw m0, m4, [cq+32*0]
add cq, 32*4
pmulhrsw m7, m4, [cq+32*3]
pmulhrsw m1, m4, [cq-32*3]
pmulhrsw m6, m4, [cq+32*2]
pmulhrsw m2, m4, [cq-32*2]
pmulhrsw m5, m4, [cq+32*1]
pmulhrsw m3, m4, [cq-32*1]
pmulhrsw m4, [cq+32*0]
%endmacro
INV_TXFM_8X16_FN dct, dct, 0
INV_TXFM_8X16_FN dct, identity, 15
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst
cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(idct_16x8_internal).main
vpbroadcastd m10, [o(pw_16384)]
.pass1_end:
vperm2i128 m9, m3, m7, 0x31
vinserti128 m3, m3, xm7, 1
vperm2i128 m8, m2, m6, 0x31
vinserti128 m2, m2, xm6, 1
vperm2i128 m6, m1, m5, 0x31
vinserti128 m1, m1, xm5, 1
vperm2i128 m5, m0, m4, 0x31
vinserti128 m0, m0, xm4, 1
punpckhwd m4, m2, m3
punpcklwd m2, m3
punpckhwd m3, m0, m1
punpcklwd m0, m1
.pass1_end2:
punpckhwd m7, m5, m6
punpcklwd m5, m6
punpcklwd m6, m8, m9
punpckhwd m8, m9
REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
punpckhdq m1, m0, m2
punpckldq m0, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
punpckldq m4, m5, m6
punpckhdq m5, m6
punpckldq m6, m7, m8
punpckhdq m7, m8
jmp tx2q
.pass2:
call .main
REPX {vpermq x, x, q3120}, m0, m2, m4, m6
REPX {vpermq x, x, q2031}, m1, m3, m5, m7
.end:
vpbroadcastd m8, [o(pw_2048)]
.end2:
REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
.end3:
pxor m8, m8
REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
.end4:
lea r3, [strideq*3]
WRITE_8X4 0, 1, 8, 9
lea dstq, [dstq+strideq*4]
WRITE_8X4 2, 3, 0, 1
lea dstq, [dstq+strideq*4]
WRITE_8X4 4, 5, 0, 1
lea dstq, [dstq+strideq*4]
WRITE_8X4 6, 7, 0, 1
RET
ALIGN function_align
.main:
IDCT16_1D_PACKED
ret
INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity
cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
vpbroadcastd m10, [o(pw_16384)]
pslld m9, m10, 17
psubw m10, m9 ; 16384, -16384
jmp m(idct_8x16_internal).pass1_end
ALIGN function_align
.pass2:
call .main
vpbroadcastd m9, [o(pw_2048)]
vpbroadcastd xm8, [o(pw_4096)]
psubw m8, m9
REPX {vpermq x, x, q2031}, m0, m1, m2, m3
REPX {vpermq x, x, q3120}, m4, m5, m6, m7
jmp m(idct_8x16_internal).end2
ALIGN function_align
.main:
REPX {pshufd x, x, q1032}, m7, m1, m5, m3
.main2:
vpbroadcastd m10, [o(pd_2048)]
punpckhwd m8, m7, m0 ; in14 in1
punpcklwd m0, m7 ; in0 in15
punpcklwd m7, m6, m1 ; in12 in3
punpckhwd m1, m6 ; in2 in13
punpckhwd m6, m5, m2 ; in10 in5
punpcklwd m2, m5 ; in4 in11
punpcklwd m5, m4, m3 ; in8 in7
punpckhwd m3, m4 ; in6 in9
ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1
ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3
ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5
ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7
ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9
ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
psubw m4, m0, m5 ; t9a t8a
paddw m0, m5 ; t1a t0a
psubw m5, m1, m6 ; t11a t10a
paddw m1, m6 ; t3a t2a
psubw m6, m2, m7 ; t13a t12a
paddw m2, m7 ; t5a t4a
psubw m7, m3, m8 ; t15a t14a
paddw m3, m8 ; t7a t6a
vpbroadcastd m11, [o(pw_m4017_799)]
vpbroadcastd m12, [o(pw_799_4017)]
pxor m9, m9
ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
psubw m8, m9, m11
ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
vpbroadcastd m11, [o(pw_m2276_3406)]
vpbroadcastd m12, [o(pw_3406_2276)]
ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
psubw m8, m9, m11
ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
psubw m8, m1, m3 ; t7 t6
paddw m1, m3 ; t3 t2
psubw m3, m0, m2 ; t5 t4
paddw m0, m2 ; t1 t0
psubw m2, m5, m7 ; t14a t15a
paddw m7, m5 ; t10a t11a
psubw m5, m4, m6 ; t12a t13a
paddw m4, m6 ; t8a t9a
vpbroadcastd m11, [o(pw_m3784_1567)]
vpbroadcastd m12, [o(pw_1567_3784)]
ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
psubw m6, m9, m11
ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
vpbroadcastd m11, [o(pw_m1567_3784)]
vpbroadcastd m12, [o(pw_3784_1567)]
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
psubw m6, m9, m11
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
vbroadcasti128 m11, [o(deint_shuf)]
vpbroadcastd m12, [o(pw_2896x8)]
psubw m6, m0, m1 ; t3a t2a
paddw m0, m1 ; -out15 out0
paddw m1, m2, m5 ; -out13 out2
psubw m5, m2 ; t15a t14a
paddw m2, m4, m7 ; -out1 out14
psubw m4, m7 ; t10 t11
psubw m7, m3, m8 ; t6 t7
paddw m8, m3 ; -out3 out12
REPX {pshufb x, m11}, m6, m4, m0, m2
vpblendd m3, m6, m4, 0xcc ; t3a t11
shufps m6, m6, m4, q1032 ; t2a t10
vpblendd m4, m5, m7, 0xcc ; t15a t7
shufps m5, m5, m7, q1032 ; t14a t6
shufps m7, m2, m0, q1032 ; out14 -out15
vpblendd m0, m0, m2, 0x33 ; -out1 out0
paddw m2, m5, m4 ; -out5 out4
psubw m5, m4 ; out10 -out11
psubw m4, m6, m3 ; out8 -out9
paddw m3, m6 ; -out7 out6
shufps m6, m8, m1, q1032 ; out12 -out13
vpblendd m1, m1, m8, 0x33 ; -out3 out2
REPX {pmulhrsw x, m12}, m2, m3, m4, m5
ret
INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity
cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
vpbroadcastd m9, [o(pw_16384)]
pslld m10, m9, 17
psubw m10, m9 ; -16384, 16384
vperm2i128 m9, m4, m0, 0x31
vinserti128 m0, m4, xm0, 1
vperm2i128 m8, m5, m1, 0x31
vinserti128 m4, m5, xm1, 1
vperm2i128 m5, m7, m3, 0x31
vinserti128 m3, m7, xm3, 1
vinserti128 m1, m6, xm2, 1
vperm2i128 m6, m6, m2, 0x31
punpcklwd m2, m4, m0
punpckhwd m4, m0
punpcklwd m0, m3, m1
punpckhwd m3, m1
jmp m(idct_8x16_internal).pass1_end2
.pass2:
call m(iadst_8x16_internal).main
vpbroadcastd m8, [o(pw_2048)]
vpbroadcastd xm9, [o(pw_4096)]
psubw m8, m9
vpermq m9, m0, q3120
vpermq m0, m7, q2031
vpermq m7, m1, q3120
vpermq m1, m6, q2031
vpermq m6, m2, q3120
vpermq m2, m5, q2031
vpermq m5, m3, q3120
vpermq m3, m4, q2031
pmulhrsw m0, m8
pmulhrsw m1, m8
pmulhrsw m2, m8
pmulhrsw m3, m8
pmulhrsw m4, m5, m8
pmulhrsw m5, m6, m8
pmulhrsw m6, m7, m8
pmulhrsw m7, m9, m8
jmp m(idct_8x16_internal).end3
INV_TXFM_8X16_FN identity, dct, 7
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity
cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
mova xm3, [cq+16*0]
mova xm2, [cq+16*2]
add cq, 16*8
vinserti128 m3, m3, [cq+16*0], 1
vinserti128 m2, m2, [cq+16*2], 1
vpbroadcastd m9, [o(pw_2896x8)]
mova xm4, [cq-16*4]
mova xm5, [cq-16*2]
vinserti128 m4, m4, [cq+16*4], 1
vinserti128 m5, m5, [cq+16*6], 1
mova xm7, [cq-16*7]
mova xm6, [cq-16*5]
vinserti128 m7, m7, [cq+16*1], 1
vinserti128 m6, m6, [cq+16*3], 1
mova xm8, [cq-16*3]
mova xm0, [cq-16*1]
vinserti128 m8, m8, [cq+16*5], 1
vinserti128 m0, m0, [cq+16*7], 1
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m5
punpckhwd m4, m5
punpcklwd m5, m7, m6
punpckhwd m7, m6
punpcklwd m6, m8, m0
punpckhwd m8, m0
REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
punpckldq m0, m1, m2
punpckhdq m1, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
punpckldq m4, m5, m6
punpckhdq m5, m6
punpckldq m6, m7, m8
punpckhdq m7, m8
jmp tx2q
.pass2:
vpbroadcastd m8, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3, m4, m5, m6, m7
jmp m(idct_8x16_internal).end
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
pmovzxbw m%3, [dstq+%5]
%ifnum %1
paddw m%3, m%1
%else
paddw m%3, %1
%endif
pmovzxbw m%4, [dstq+%6]
%ifnum %2
paddw m%4, m%2
%else
paddw m%4, %2
%endif
packuswb m%3, m%4
vpermq m%3, m%3, q3120
mova [dstq+%5], xm%3
vextracti128 [dstq+%6], m%3, 1
%endmacro
%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x4
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd xm3, [o(pw_2896x8)]
pmulhrsw xm3, [cq]
vpbroadcastd xm0, [o(pw_16384)]
vpbroadcastd xm1, [o(pw_5793x4)]
pmulhrsw xm3, xm0
psrlw xm0, 3 ; pw_2048
paddw xm3, xm3
pmulhrsw xm3, xm1
pmulhrsw xm3, xm0
punpcklwd xm3, xm3
punpckldq xm1, xm3, xm3
punpckhdq xm3, xm3
vpbroadcastq m0, xm1
vpermq m1, m1, q1111
vpbroadcastq m2, xm3
vpermq m3, m3, q1111
jmp m(iadst_16x4_internal).end2
%elifidn %1_%2, identity_dct
mova xm0, [cq+16*0]
mova xm2, [cq+16*1]
vinserti128 m0, m0, [cq+16*4], 1
vinserti128 m2, m2, [cq+16*5], 1
mova xm1, [cq+16*2]
mova xm3, [cq+16*3]
vinserti128 m1, m1, [cq+16*6], 1
vinserti128 m3, m3, [cq+16*7], 1
vpbroadcastd m4, [o(pw_5793x4)]
vpbroadcastd m5, [o(pw_16384)]
packusdw m0, m2
packusdw m1, m3
packusdw m0, m1
vpbroadcastd m1, [o(pw_2896x8)]
psllw m0, 2
pmulhrsw m0, m4
pmulhrsw m0, m5
psrlw m5, 3 ; pw_2048
pmulhrsw m0, m1
pmulhrsw m0, m5
mov r3d, 2
.end:
pxor m3, m3
.end_loop:
mova [cq+32*0], m3
mova [cq+32*1], m3
add cq, 32*2
WRITE_16X2 0, 0, 1, 2, strideq*0, strideq*1
lea dstq, [dstq+strideq*2]
dec r3d
jg .end_loop
RET
%else
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
%ifidn %2, dct
movd xm2, [o(pw_16384)]
mov [cq], eobd
mov r2d, 2
.dconly:
pmulhrsw xm0, xm2
movd xm2, [pw_2048] ; intentionally rip-relative
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
pxor m3, m3
.dconly_loop:
mova xm1, [dstq]
vinserti128 m1, m1, [dstq+strideq], 1
punpckhbw m2, m1, m3
punpcklbw m1, m3
paddw m2, m0
paddw m1, m0
packuswb m1, m2
mova [dstq], xm1
vextracti128 [dstq+strideq], m1, 1
lea dstq, [dstq+strideq*2]
dec r2d
jg .dconly_loop
RET
%else ; adst / flipadst
movd xm2, [o(pw_16384)]
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
pmulhrsw m0, [o(iadst4_dconly2a)]
vpbroadcastd m3, [o(pw_2048)]
mov [cq], eobd
pmulhrsw m3, m0
%ifidn %2, adst
vpbroadcastq m0, xm3
vpermq m1, m3, q1111
vpermq m2, m3, q2222
vpermq m3, m3, q3333
%else ; flipadst
vpermq m0, m3, q3333
vpermq m1, m3, q2222
vpermq m2, m3, q1111
vpbroadcastq m3, xm3
%endif
jmp m(iadst_16x4_internal).end3
%endif
%endif
%endif
%endmacro
INV_TXFM_16X4_FN dct, dct, 0
INV_TXFM_16X4_FN dct, adst, 0
INV_TXFM_16X4_FN dct, flipadst, 0
INV_TXFM_16X4_FN dct, identity, 3
cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova xm0, [cq+16*0]
mova xm1, [cq+16*1]
mova xm2, [cq+16*2]
mova xm3, [cq+16*3]
mova xm4, [cq+16*4]
mova xm5, [cq+16*5]
mova xm6, [cq+16*6]
mova xm7, [cq+16*7]
call m(idct_4x16_internal).main
vinserti128 m6, m2, xm6, 1
vinserti128 m2, m0, xm4, 1
vinserti128 m0, m1, xm5, 1
vinserti128 m1, m3, xm7, 1
punpcklwd m3, m2, m6
punpckhwd m2, m6
vpbroadcastd m6, [o(pw_16384)]
punpckhwd m4, m0, m1
punpcklwd m0, m1
mova m1, m6
jmp m(iadst_16x4_internal).pass1_end
.pass2:
call .main
jmp m(iadst_16x4_internal).end
ALIGN function_align
.main:
vpbroadcastd m6, [o(pd_2048)]
IDCT4_1D 0, 1, 2, 3, 4, 5, 6
ret
INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity
cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q1230
vpermq m3, [cq+32*3], q2103
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
pshufd m2, m2, q1032
punpcklwd m4, m3, m1
punpcklwd m5, m2, m0
punpckhwd m0, m1
punpckhwd m2, m3
vpbroadcastd m1, [o(pw_16384)]
vinserti128 m3, m0, xm2, 1
vperm2i128 m2, m0, m2, 0x31
vinserti128 m0, m4, xm5, 1
vperm2i128 m4, m4, m5, 0x31
psubw m6, m7, m1
.pass1_end:
pmulhrsw m3, m1
pmulhrsw m2, m6
pmulhrsw m4, m1
pmulhrsw m0, m6
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
punpckldq m0, m1, m2
punpckhdq m1, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
jmp tx2q
.pass2:
call .main
.end:
vpbroadcastd m4, [o(pw_2048)]
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
WIN64_RESTORE_XMM
.end2:
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
mova [cq+32*2], m4
mova [cq+32*3], m4
.end3:
WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1
lea dstq, [dstq+strideq*2]
WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1
RET
ALIGN function_align
.main:
vpbroadcastd m7, [o(pw_3803_1321)]
vpbroadcastd m8, [o(pw_m1321_2482)]
vpbroadcastd m9, [o(pw_2482_3344)]
punpcklwd m4, m2, m0 ; in2 in0 l
psubw m6, m0, m2
punpckhwd m2, m0 ; in2 in0 h
paddw m6, m3 ; t2
pmaddwd m0, m7, m4 ; t0:02 l
pmaddwd m7, m2 ; t0:02 h
pmaddwd m4, m8 ; t1:02 l
pmaddwd m8, m2 ; t1:02 h
punpckhwd m2, m3, m1 ; in3 in1 h
punpcklwd m3, m1 ; in3 in1 l
vpbroadcastd m1, [o(pd_2048)]
pmaddwd m5, m9, m3
pmaddwd m9, m2
paddd m0, m1
paddd m7, m1
paddd m0, m5 ; t0 + t3 + 2048 l
paddd m7, m9 ; t0 + t3 + 2048 h
vpbroadcastd m9, [o(pw_m3803_3344)]
pmaddwd m5, m9, m2
pmaddwd m9, m3
paddd m5, m1 ; t1:13 + 2048 h
paddd m1, m9 ; t1:13 + 2048 l
vpbroadcastd m9, [o(pw_m3803_m6688)]
pmaddwd m2, m9
pmaddwd m3, m9
paddd m5, m8 ; t1 + t3 + 2048 h
paddd m1, m4 ; t1 + t3 + 2048 l
paddd m8, m7
paddd m4, m0
paddd m2, m8 ; t0 + t1 - t3 + 2048 h
paddd m3, m4 ; t0 + t1 - t3 + 2048 l
REPX {psrad x, 12}, m0, m7, m5, m1, m2, m3
packssdw m0, m7
packssdw m1, m5
packssdw m3, m2
vpbroadcastd m2, [o(pw_3344x8)]
pmulhrsw m2, m6
ret
INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity
cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q1230
vpermq m3, [cq+32*3], q2103
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
pshufd m2, m2, q1032
punpckhwd m4, m3, m2
punpckhwd m5, m1, m0
punpcklwd m0, m2
punpcklwd m1, m3
vpbroadcastd m6, [o(pw_16384)]
vinserti128 m3, m0, xm1, 1
vperm2i128 m2, m0, m1, 0x31
vinserti128 m0, m4, xm5, 1
vperm2i128 m4, m4, m5, 0x31
psubw m1, m7, m6
jmp m(iadst_16x4_internal).pass1_end
ALIGN function_align
.pass2:
call m(iadst_16x4_internal).main
vpbroadcastd m4, [o(pw_2048)]
REPX {pmulhrsw x, m4}, m3, m2, m1, m0
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
mova [cq+32*2], m4
mova [cq+32*3], m4
WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1
lea dstq, [dstq+strideq*2]
WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1
RET
INV_TXFM_16X4_FN identity, dct, 15
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova xm2, [cq+16*0]
mova xm4, [cq+16*1]
vinserti128 m2, m2, [cq+16*4], 1
vinserti128 m4, m4, [cq+16*5], 1
mova xm0, [cq+16*2]
mova xm1, [cq+16*3]
vinserti128 m0, m0, [cq+16*6], 1
vinserti128 m1, m1, [cq+16*7], 1
vpbroadcastd m5, [o(pw_5793x4)]
punpcklwd m3, m2, m4
punpckhwd m2, m4
punpcklwd m4, m0, m1
punpckhwd m0, m1
REPX {psllw x, 2}, m3, m2, m4, m0
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
REPX {pmulhrsw x, m5}, m1, m3, m2, m4
vpbroadcastd m5, [o(pw_16384)]
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_5793x4)]
REPX {paddw x, x }, m0, m1, m2, m3
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
jmp m(iadst_16x4_internal).end
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x8
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
mov r2d, 4
jmp m(inv_txfm_add_dct_dct_16x4).dconly
%elifidn %1_%2, dct_identity
WIN64_SPILL_XMM 13
vbroadcasti128 m7, [cq]
vpbroadcastd m0, [o(pw_2896x8)]
vpbroadcastd m1, [o(pw_16384)]
pxor xm2, xm2
mova [cq], xm2
pmulhrsw m7, m0
pmulhrsw m7, m0
pmulhrsw m7, m1
psrlw m1, 2 ; pw_4096
pmulhrsw m7, m1
punpcklwd m3, m7, m7
punpckhwd m7, m7
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
pshufd m4, m7, q0000
pshufd m5, m7, q1111
pshufd m6, m7, q2222
pshufd m7, m7, q3333
lea r3, [strideq*3]
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
jmp m(idct_16x8_internal).end4
%elifidn %1_%2, identity_dct
mova m0, [cq+32*0]
packusdw m0, [cq+32*1]
mova m2, [cq+32*2]
packusdw m2, [cq+32*3]
mova m1, [cq+32*4]
packusdw m1, [cq+32*5]
mova m3, [cq+32*6]
packusdw m3, [cq+32*7]
vpbroadcastd m4, [o(pw_2896x8)]
vpbroadcastd m5, [o(pw_5793x4)]
packusdw m0, m2
packusdw m1, m3
vpbroadcastd m2, [o(pw_16384)]
packusdw m0, m1
vpermq m1, m0, q3322
vpermq m0, m0, q1100
punpcklwd m0, m1
pmulhrsw m0, m4
psllw m0, 2
pmulhrsw m0, m5
pmulhrsw m0, m2
psrlw m2, 3 ; pw_2048
pmulhrsw m0, m4
pmulhrsw m0, m2
mov r3d, 4
jmp m(inv_txfm_add_identity_dct_16x4).end
%endif
%endmacro
%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
vpbroadcastd m8, [o(pw_2896x8)]
vpermq m0, [cq+32*0], q3120
add cq, 32*4
vpermq m7, [cq+32*3], q%1
vpermq m1, [cq-32*3], q%1
vpermq m6, [cq+32*2], q3120
vpermq m2, [cq-32*2], q3120
vpermq m5, [cq+32*1], q%1
vpermq m3, [cq-32*1], q%1
vpermq m4, [cq+32*0], q3120
REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro
INV_TXFM_16X8_FN dct, dct, 0
INV_TXFM_16X8_FN dct, identity, 7
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 3120
call m(idct_8x16_internal).main
vpbroadcastd m10, [o(pw_16384)]
punpckhwd m8, m0, m2
punpcklwd m0, m2
punpckhwd m2, m1, m3
punpcklwd m1, m3
punpcklwd m9, m4, m6
punpckhwd m4, m6
punpcklwd m6, m5, m7
punpckhwd m5, m7
REPX {pmulhrsw x, m10}, m8, m1, m4, m6
.pass1_end:
REPX {pmulhrsw x, m10}, m0, m2, m9, m5
punpckhwd m3, m0, m8
punpcklwd m0, m8
punpckhwd m8, m2, m1
punpcklwd m2, m1
punpcklwd m7, m9, m4
punpckhwd m9, m4
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpckhdq m1, m0, m2
punpckldq m0, m2
punpckldq m2, m3, m8
punpckhdq m3, m8
punpckldq m6, m7, m4
punpckhdq m7, m4
punpckldq m8, m9, m5
punpckhdq m9, m5
vperm2i128 m4, m0, m6, 0x31
vinserti128 m0, m0, xm6, 1
vperm2i128 m5, m1, m7, 0x31
vinserti128 m1, m1, xm7, 1
vperm2i128 m6, m2, m8, 0x31
vinserti128 m2, m2, xm8, 1
vperm2i128 m7, m3, m9, 0x31
vinserti128 m3, m3, xm9, 1
jmp tx2q
.pass2:
call .main
vpbroadcastd m8, [o(pw_2048)]
.end:
REPX {pmulhrsw x, m8}, m0, m2, m4, m6
.end2:
REPX {pmulhrsw x, m8}, m1, m3, m5, m7
lea r3, [strideq*3]
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
.end3:
pxor m0, m0
REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
.end4:
lea dstq, [dstq+strideq*4]
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 6, 7, 0, 1, strideq*2, r3
RET
ALIGN function_align
.main:
vpbroadcastd m10, [o(pd_2048)]
.main2:
IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
ret
INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity
cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
psubw m11, m9, m10
punpcklwd m8, m0, m2
punpckhwd m0, m2
punpckhwd m2, m1, m3
punpcklwd m1, m3
punpcklwd m9, m4, m6
punpckhwd m4, m6
punpckhwd m6, m5, m7
punpcklwd m5, m7
REPX {pmulhrsw x, m11}, m8, m1, m4, m6
jmp m(idct_16x8_internal).pass1_end
ALIGN function_align
.pass2:
call .main
vpbroadcastd m9, [o(pw_2048)]
pxor m8, m8
psubw m8, m9
REPX {pmulhrsw x, m9}, m0, m2, m4, m6
jmp m(idct_16x8_internal).end2
ALIGN function_align
.main:
vpbroadcastd m10, [o(pd_2048)]
ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a
ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
psubw m8, m2, m6 ; t6
paddw m2, m6 ; t2
psubw m6, m0, m4 ; t4
paddw m0, m4 ; t0
psubw m4, m5, m1 ; t7
paddw m5, m1 ; t3
psubw m1, m7, m3 ; t5
paddw m7, m3 ; t1
ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
psubw m9, m6, m8 ; t7
paddw m6, m8 ; out6
vpbroadcastd m8, [o(pw_2896x8)]
psubw m3, m7, m5 ; t3
paddw m7, m5 ; -out7
psubw m5, m0, m2 ; t2
paddw m0, m2 ; out0
psubw m2, m1, m4 ; t6
paddw m1, m4 ; -out1
psubw m4, m5, m3
paddw m3, m5
psubw m5, m2, m9
paddw m2, m9
pmulhrsw m2, m8 ; out2
pmulhrsw m3, m8 ; -out3
pmulhrsw m4, m8 ; out4
pmulhrsw m5, m8 ; -out5
ret
INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity
cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
psubw m9, m10
punpcklwd m8, m6, m4
punpckhwd m6, m4
punpcklwd m4, m7, m5
punpckhwd m7, m5
punpckhwd m5, m3, m1
punpcklwd m3, m1
punpckhwd m1, m2, m0
punpcklwd m2, m0
REPX {pmulhrsw x, m10}, m8, m4, m5, m1
REPX {pmulhrsw x, m9 }, m6, m7, m3, m2
punpcklwd m0, m7, m4
punpckhwd m7, m4
punpckhwd m4, m6, m8
punpcklwd m6, m8
punpckhwd m8, m3, m5
punpcklwd m3, m5
punpcklwd m5, m2, m1
punpckhwd m2, m1
punpckhdq m1, m0, m6
punpckldq m0, m6
punpckldq m6, m7, m4
punpckhdq m7, m4
punpckhdq m4, m3, m5
punpckldq m3, m5
punpckldq m5, m8, m2
punpckhdq m8, m2
vinserti128 m2, m6, xm5, 1
vperm2i128 m6, m6, m5, 0x31
vperm2i128 m5, m1, m4, 0x31
vinserti128 m1, m1, xm4, 1
vperm2i128 m4, m0, m3, 0x31
vinserti128 m0, m0, xm3, 1
vinserti128 m3, m7, xm8, 1
vperm2i128 m7, m7, m8, 0x31
jmp tx2q
.pass2:
call m(iadst_16x8_internal).main
vpbroadcastd m9, [o(pw_2048)]
pxor m8, m8
psubw m8, m9
pmulhrsw m10, m7, m8
pmulhrsw m7, m0, m9
pmulhrsw m0, m6, m9
pmulhrsw m6, m1, m8
pmulhrsw m1, m5, m8
pmulhrsw m5, m2, m9
pmulhrsw m2, m4, m9
pmulhrsw m4, m3, m8
lea r3, [strideq*3]
WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1
WRITE_16X2 1, 2, 0, 1, strideq*2, r3
jmp m(idct_16x8_internal).end3
INV_TXFM_16X8_FN identity, dct, 15
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity
; 16x8 identity transform, pass 1.
; Loads the 16x8 coefficient block as 8 ymm rows (low xmm half from the first
; 8 rows, high half from the second 8), applies the rect2 scale (pw_2896x8),
; the identity-16 scale (<<2 then pmulhrsw pw_5793x4), an 8x8-within-lane
; word transpose, and the pass-1 rounding (pw_16384) before jumping to the
; second-pass handler in tx2q.
cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
mova xm7, [cq+16*0]
mova xm2, [cq+16*1]
add cq, 16*8
vpbroadcastd m3, [o(pw_2896x8)]
vinserti128 m7, m7, [cq+16*0], 1
vinserti128 m2, m2, [cq+16*1], 1
mova xm6, [cq-16*6]
mova xm4, [cq-16*5]
vinserti128 m6, m6, [cq+16*2], 1
vinserti128 m4, m4, [cq+16*3], 1
mova xm8, [cq-16*4]
mova xm5, [cq-16*3]
vinserti128 m8, m8, [cq+16*4], 1
vinserti128 m5, m5, [cq+16*5], 1
mova xm0, [cq-16*2]
mova xm1, [cq-16*1]
vinserti128 m0, m0, [cq+16*6], 1
vinserti128 m1, m1, [cq+16*7], 1
vpbroadcastd m9, [o(pw_5793x4)]
vpbroadcastd m10, [o(pw_16384)]
; rect2 downscale of all input rows
REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
; word-level interleave (transpose step 1)
punpcklwd m3, m7, m2
punpckhwd m7, m2
punpcklwd m2, m6, m4
punpckhwd m6, m4
punpcklwd m4, m8, m5
punpckhwd m8, m5
punpcklwd m5, m0, m1
punpckhwd m0, m1
; identity-16 scaling: (x << 2) * 5793x4 via pmulhrsw
REPX {psllw x, 2}, m3, m7, m2, m6, m4, m8, m5, m0
punpckldq m1, m3, m2
punpckhdq m3, m2
punpckldq m2, m4, m5
punpckhdq m4, m5
punpckldq m5, m7, m6
punpckhdq m7, m6
punpckldq m6, m8, m0
punpckhdq m8, m0
REPX {pmulhrsw x, m9}, m1, m3, m2, m4, m5, m7, m6, m8
; qword interleave (transpose step 3)
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
punpcklqdq m4, m5, m6
punpckhqdq m5, m6
punpcklqdq m6, m7, m8
punpckhqdq m7, m8
; pass-1 rounding
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
; identity second pass: scale by pw_4096 and reuse the idct_16x8 store tail
vpbroadcastd m8, [o(pw_4096)]
jmp m(idct_16x8_internal).end
%define o_base pw_5 + 128
; Declares a 16x16 inv_txfm entry point for a type1/type2 combination.
; %1 = first-pass transform, %2 = second-pass transform,
; %3 = fast-path eob threshold passed through to INV_TXFM_FN (-1 = none).
; The dct_dct, dct_identity and identity_dct combinations get inlined
; dc-only / single-line shortcuts here.
%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x16
%ifidn %1_%2, dct_dct
; dc-only: scale dc coefficient and defer to the 16x4 dconly loop (8 iters)
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
mov r2d, 8
jmp m(inv_txfm_add_dct_dct_16x4).dconly
%elifidn %1_%2, dct_identity
WIN64_SPILL_XMM 7
vpbroadcastd m3, [o(pw_2896x8)]
pmulhrsw m3, [cq]
vpbroadcastd m0, [o(pw_8192)]
vpbroadcastd m1, [o(pw_5793x4)]
vpbroadcastw m4, [o(deint_shuf)] ; pb_0_1
pcmpeqb m5, m5
pxor m6, m6
mova [cq], m6
paddb m5, m5 ; pb_m2
pmulhrsw m3, m0
psrlw m0, 2 ; pw_2048
psllw m3, 2
pmulhrsw m3, m1
pmulhrsw m3, m0
mov r3d, 8
.loop:
; add the broadcast column value to two dst rows at a time
mova xm1, [dstq]
vinserti128 m1, m1, [dstq+strideq*8], 1
pshufb m0, m3, m4
psubb m4, m5 ; += 2
punpckhbw m2, m1, m6
punpcklbw m1, m6
paddw m2, m0
paddw m1, m0
packuswb m1, m2
mova [dstq], xm1
vextracti128 [dstq+strideq*8], m1, 1
add dstq, strideq
dec r3d
jg .loop
RET
%elifidn %1_%2, identity_dct
; identity_dct: gather the first word of each of the 16 input rows,
; apply identity-16 + dct dc scaling, then reuse the 16x4 store tail
movd xm0, [cq+32*0 ]
movd xm2, [cq+32*1 ]
movd xm1, [cq+32*2 ]
movd xm3, [cq+32*3 ]
vinserti128 m0, m0, [cq+32*8 ], 1
vinserti128 m2, m2, [cq+32*9 ], 1
vinserti128 m1, m1, [cq+32*10], 1
vinserti128 m3, m3, [cq+32*11], 1
punpcklwd m0, m2
punpcklwd m1, m3
punpckldq m0, m1
movd xm1, [cq+32*4 ]
movd xm3, [cq+32*5 ]
movd xm2, [cq+32*6 ]
movd xm4, [cq+32*7 ]
vinserti128 m1, m1, [cq+32*12], 1
vinserti128 m3, m3, [cq+32*13], 1
vinserti128 m2, m2, [cq+32*14], 1
vinserti128 m4, m4, [cq+32*15], 1
punpcklwd m1, m3
vpbroadcastd m3, [o(pw_5793x4)]
punpcklwd m2, m4
vpbroadcastd m4, [o(pw_8192)]
punpckldq m1, m2
vpbroadcastd m2, [o(pw_2896x8)]
punpcklqdq m0, m1
psllw m0, 2
pmulhrsw m0, m3
pmulhrsw m0, m4
psrlw m4, 2 ; pw_2048
pmulhrsw m0, m2
pmulhrsw m0, m4
mov r3d, 8
jmp m(inv_txfm_add_identity_dct_16x4).end
%endif
%endmacro
; Load all 16 rows of a 16x16 coefficient block into m0-m15.
; m15 is also spilled to [rsp] because the 1-D mains need a scratch register.
; Leaves cq advanced by 32*8 (callers' later [cq+32*x] stores rely on this).
%macro ITX_16X16_LOAD_COEFS 0
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
add cq, 32*8
mova m4, [cq-32*4]
mova m5, [cq-32*3]
mova m6, [cq-32*2]
mova m7, [cq-32*1]
mova m8, [cq+32*0]
mova m9, [cq+32*1]
mova m10, [cq+32*2]
mova m11, [cq+32*3]
mova m12, [cq+32*4]
mova m13, [cq+32*5]
mova m14, [cq+32*6]
mova m15, [cq+32*7]
mova [rsp], m15
%endmacro
; 16x16 entry points with a DCT first pass
INV_TXFM_16X16_FN dct, dct, 0
INV_TXFM_16X16_FN dct, identity, 15
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
; 16x16 inverse DCT.
; Pass 1: 1-D 16-point IDCT on all 16 rows (.main), round with pw_8192,
; then a full 16x16 word transpose (.pass1_end*, shared with the other
; 16x16 transform types, which jump into these tails).
; Pass 2: 1-D IDCT again, round with pw_2048, and add to dst via WRITE_16X2
; while clearing the coefficient buffer (.end*, also shared).
cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call .main
.pass1_end:
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
vextracti128 [rsp+16*5], m8, 1
mova [rsp+16*1], xm8
.pass1_end2:
vextracti128 [rsp+16*4], m0, 1
mova [rsp+16*0], xm0
REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
pmulhrsw m1, [rsp+32*1]
; gather 128-bit halves so each ymm holds two rows of the same half
vperm2i128 m8, m1, m9, 0x31
vinserti128 m1, m1, xm9, 1
vperm2i128 m9, m2, m10, 0x31
vinserti128 m2, m2, xm10, 1
vperm2i128 m10, m3, m11, 0x31
vinserti128 m3, m3, xm11, 1
vperm2i128 m11, m4, m12, 0x31
vinserti128 m4, m4, xm12, 1
vperm2i128 m12, m5, m13, 0x31
vinserti128 m5, m5, xm13, 1
vperm2i128 m13, m6, m14, 0x31
vinserti128 m6, m6, xm14, 1
vperm2i128 m14, m7, m15, 0x31
vinserti128 m7, m7, xm15, 1
mova m15, [rsp+32*2]
.pass1_end3:
; 16x16 word transpose: wd -> dq -> qdq interleaves on both 8-row groups
punpcklwd m0, m9, m10
punpckhwd m9, m10
punpcklwd m10, m15, m8
punpckhwd m15, m8
punpckhwd m8, m11, m12
punpcklwd m11, m12
punpckhwd m12, m13, m14
punpcklwd m13, m14
punpckhdq m14, m11, m13
punpckldq m11, m13
punpckldq m13, m15, m9
punpckhdq m15, m9
punpckldq m9, m10, m0
punpckhdq m10, m0
punpckhdq m0, m8, m12
punpckldq m8, m12
punpcklqdq m12, m13, m8
punpckhqdq m13, m8
punpcklqdq m8, m9, m11
punpckhqdq m9, m11
punpckhqdq m11, m10, m14
punpcklqdq m10, m14
punpcklqdq m14, m15, m0
punpckhqdq m15, m0
mova m0, [rsp]
mova [rsp], m15
punpckhwd m15, m4, m5
punpcklwd m4, m5
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhwd m1, m6, m7
punpcklwd m6, m7
punpckhwd m7, m2, m3
punpcklwd m2, m3
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
punpckhdq m6, m5, m7
punpckldq m5, m7
punpckldq m7, m15, m1
punpckhdq m15, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
punpcklqdq m4, m5, m7
punpckhqdq m5, m7
punpckhqdq m7, m6, m15
punpcklqdq m6, m15
jmp tx2q
.pass2:
call .main
.end:
vpbroadcastd m1, [o(pw_2048)]
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
mova [rsp], m6
.end2:
REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
pmulhrsw m1, [rsp+32*1]
lea r3, [strideq*3]
WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3
.end3:
; zero the coefficient buffer while storing the remaining rows
pxor m2, m2
REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
lea dstq, [dstq+strideq*4]
WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1
WRITE_16X2 10, 11, 0, 1, strideq*2, r3
REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7
lea dstq, [dstq+strideq*4]
WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1
WRITE_16X2 14, 15, 0, 1, strideq*2, r3
RET
ALIGN function_align
.main:
; 1-D 16-point IDCT: even half via IDCT8_1D on in0..in14, odd half via
; IDCT16_1D_ODDHALF on in1..in15, then butterfly-combine into out0..out15.
; Uses [rsp+gprsize+...] to spill inputs/temporaries (gprsize offsets the
; return address pushed by the call).
vpbroadcastd m15, [o(pd_2048)]
mova [rsp+gprsize+32*1], m1
mova [rsp+gprsize+32*2], m9
IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15
mova m1, [rsp+gprsize+32*2] ; in9
mova [rsp+gprsize+32*2], m14 ; tmp7
mova m9, [rsp+gprsize+32*1] ; in1
mova [rsp+gprsize+32*1], m10 ; tmp5
mova m14, [rsp+gprsize+32*0] ; in15
mova [rsp+gprsize+32*0], m6 ; tmp3
IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
mova m6, [rsp+gprsize+32*1] ; tmp5
psubw m15, m0, m14 ; out15
paddw m0, m14 ; out0
psubw m14, m2, m13 ; out14
paddw m2, m13 ; out1
mova [rsp+gprsize+32*1], m2
psubw m13, m4, m11 ; out13
paddw m2, m4, m11 ; out2
psubw m11, m8, m7 ; out11
paddw m4, m8, m7 ; out4
mova m7, [rsp+gprsize+32*2] ; tmp7
psubw m10, m6, m5 ; out10
paddw m5, m6 ; out5
psubw m8, m7, m9 ; out8
paddw m7, m9 ; out7
psubw m9, m12, m3 ; out9
paddw m6, m12, m3 ; out6
mova m3, [rsp+gprsize+32*0] ; tmp3
psubw m12, m3, m1 ; out12
paddw m3, m1 ; out3
ret
; 16x16 entry points with an ADST first pass
INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst
; 16x16 inverse ADST.
; Both passes run the shared 16-point ADST (.main) and then reuse the
; idct_16x16 transpose/store tails; odd outputs are negated (the main
; produces them as -out1, -out3, ... per its inline comments), which is
; folded into the rounding by multiplying those rows with -pw_8192 /
; -pw_2048 built via pxor+psubw.
cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call .main
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
vextracti128 [rsp+16*5], m8, 1
mova [rsp+16*1], xm8
pxor m8, m8
psubw m1, m8, m1 ; -pw_8192 for the negated odd outputs
jmp m(idct_16x16_internal).pass1_end2
ALIGN function_align
.pass2:
call .main
vpbroadcastd m1, [o(pw_2048)]
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
mova [rsp+32*0], m6
pxor m6, m6
psubw m1, m6, m1 ; -pw_2048
jmp m(idct_16x16_internal).end2
ALIGN function_align
.main:
; 1-D 16-point ADST; inputs in m0-m14 plus [rsp] (in15); spills via
; [rsp+gprsize+...]. Output register comments below give the (possibly
; negated) output index of each value.
vpbroadcastd m15, [o(pd_2048)]
mova [rsp+gprsize+32*1], m0
mova [rsp+gprsize+32*2], m4
ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2
ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6
ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14
psubw m0, m2, m10 ; t10a
paddw m2, m10 ; t2a
psubw m10, m13, m5 ; t11a
paddw m13, m5 ; t3a
psubw m5, m6, m14 ; t14a
paddw m6, m14 ; t6a
psubw m14, m9, m1 ; t15a
paddw m9, m1 ; t7a
ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
psubw m1, m10, m14 ; t14a
paddw m10, m14 ; t10a
psubw m14, m0, m5 ; t15a
paddw m0, m5 ; t11a
psubw m5, m2, m6 ; t6
paddw m2, m6 ; t2
psubw m6, m13, m9 ; t7
paddw m13, m9 ; t3
ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
mova m9, [rsp+gprsize+32*0] ; in15
mova [rsp+gprsize+32*0], m10 ; t10a
mova m4, [rsp+gprsize+32*1] ; in0
mova [rsp+gprsize+32*1], m6 ; t6a
mova m6, [rsp+gprsize+32*2] ; in4
mova [rsp+gprsize+32*2], m2 ; t2
ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0
ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4
ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8
ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
psubw m10, m4, m8 ; t8a
paddw m8, m4 ; t0a
psubw m4, m9, m7 ; t9a
paddw m9, m7 ; t1a
psubw m7, m6, m12 ; t12a
paddw m6, m12 ; t4a
psubw m12, m11, m3 ; t13a
paddw m11, m3 ; t5a
ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8
ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13
psubw m3, m9, m11 ; t5
paddw m9, m11 ; t1
psubw m11, m4, m12 ; t12a
paddw m4, m12 ; t8a
paddw m12, m8, m6 ; t0
psubw m8, m6 ; t4
paddw m6, m10, m7 ; t9a
psubw m10, m7 ; t13a
ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
mova m7, [rsp+gprsize+32*0] ; t10a
mova m2, [rsp+gprsize+32*1] ; t6a
paddw m15, m9, m13 ; -out15
psubw m9, m13 ; t3a
paddw m13, m11, m1 ; -out13
psubw m11, m1 ; t15a
psubw m1, m4, m7 ; t10
paddw m7, m4 ; -out1
psubw m4, m3, m2 ; t6
paddw m3, m2 ; -out3
paddw m2, m10, m14 ; out2
psubw m10, m14 ; t14a
paddw m14, m6, m0 ; out14
psubw m6, m0 ; t11
mova m0, [rsp+gprsize+32*2] ; t2
mova [rsp+gprsize+32*1], m7
psubw m7, m12, m0 ; t2a
paddw m0, m12 ; out0
paddw m12, m8, m5 ; out12
psubw m8, m5 ; t7
paddw m5, m10, m11 ; -out5
psubw m10, m11 ; out10
psubw m11, m4, m8 ; -out11
paddw m4, m8 ; out4
psubw m8, m7, m9 ; out8
paddw m7, m9 ; -out7
psubw m9, m1, m6 ; -out9
paddw m6, m1 ; out6
; final 1/sqrt(2) scaling of the middle eight outputs
vpbroadcastd m1, [o(pw_2896x8)]
REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
ret
; 16x16 entry points with a flipped-ADST first pass
INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst
; 16x16 inverse flip-ADST: runs the shared iadst .main, then applies the
; row reversal by writing outputs in reversed order (with sign-corrected
; rounding constants, since the ADST main negates the odd outputs) before
; reusing the idct_16x16 transpose (.pass1_end3) / store (.end3) tails.
cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call m(iadst_16x16_internal).main
vpbroadcastd m1, [o(pw_8192)]
pmulhrsw m6, m1
mova [rsp+32*2], m6
; shuffle even outputs into their flipped positions while rounding
pmulhrsw m6, m1, m4
pmulhrsw m4, m1, m10
pmulhrsw m10, m1, m12
pmulhrsw m12, m1, m2
pmulhrsw m2, m1, m8
pmulhrsw m8, m1, m14
pmulhrsw m14, m1, m0
pxor m0, m0
psubw m0, m1 ; -pw_8192 for the negated odd outputs
REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
pmulhrsw m1, m0, m9
pmulhrsw m9, m0, m13
pmulhrsw m0, [rsp+32*1]
mova [rsp+16*0], xm15
mova [rsp+16*1], xm7
; pair up 128-bit halves for the transpose tail
vperm2i128 m15, m15, m7, 0x31
vinserti128 m7, m2, xm14, 1
vperm2i128 m14, m2, m14, 0x31
vinserti128 m2, m9, xm5, 1
vperm2i128 m9, m9, m5, 0x31
vinserti128 m5, m4, xm12, 1
vperm2i128 m12, m4, m12, 0x31
vinserti128 m4, m11, xm3, 1
vperm2i128 m11, m11, m3, 0x31
vinserti128 m3, m10, xm6, 1
vperm2i128 m10, m10, m6, 0x31
vinserti128 m6, m1, xm0, 1
vperm2i128 m13, m1, m0, 0x31
vinserti128 m1, m8, [rsp+32*2], 1
vperm2i128 m8, m8, [rsp+32*2], 0x31
jmp m(idct_16x16_internal).pass1_end3
.pass2:
call m(iadst_16x16_internal).main
vpbroadcastd m1, [o(pw_2048)]
pmulhrsw m0, m1
pmulhrsw m8, m1
mova [rsp+32*0], m0
mova [rsp+32*2], m8
pxor m0, m0
psubw m0, m1 ; -pw_2048
; round outputs into reversed row order for the flipped store
pmulhrsw m8, m0, m7
pmulhrsw m7, m0, m9
pmulhrsw m9, m1, m6
pmulhrsw m6, m1, m10
pmulhrsw m10, m0, m5
pmulhrsw m5, m0, m11
pmulhrsw m11, m1, m4
pmulhrsw m4, m1, m12
pmulhrsw m12, m0, m3
pmulhrsw m3, m0, m13
pmulhrsw m13, m1, m2
pmulhrsw m1, m14
pmulhrsw m14, m0, [rsp+32*1]
pmulhrsw m0, m15
lea r3, [strideq*3]
WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1
mova m15, [rsp+32*0]
WRITE_16X2 3, 4, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]
WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1
WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
jmp m(idct_16x16_internal).end3
; 16x16 entry points with an identity first pass
INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity
; 16x16 identity transform.
; Pass 1 loads the block as 16 ymm rows (xmm halves interleaved from the
; two 8-row halves), applies the identity-16 scale (<<2, pmulhrsw
; pw_5793x4) and pw_8192 rounding, then reuses the idct_16x16 transpose
; tail. Pass 2 applies the same scaling and jumps to the idct store tail.
cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
mova xm0, [cq+16*0]
mova xm15, [cq+16*1]
mova xm1, [cq+16*2]
mova xm8, [cq+16*3]
mova xm2, [cq+16*4]
mova xm9, [cq+16*5]
mova xm3, [cq+16*6]
mova xm10, [cq+16*7]
add cq, 16*16
vinserti128 m0, m0, [cq+16*0], 1
vinserti128 m15, m15, [cq+16*1], 1
mova xm4, [cq-16*8]
mova xm11, [cq-16*7]
vinserti128 m1, m1, [cq+16*2], 1
vinserti128 m8, m8, [cq+16*3], 1
mova xm5, [cq-16*6]
mova xm12, [cq-16*5]
vinserti128 m2, m2, [cq+16*4], 1
vinserti128 m9, m9, [cq+16*5], 1
mova xm6, [cq-16*4]
mova xm13, [cq-16*3]
vinserti128 m3, m3, [cq+16*6], 1
vinserti128 m10, m10, [cq+16*7], 1
mova xm7, [cq-16*2]
mova xm14, [cq-16*1]
vinserti128 m4, m4, [cq+16*8], 1
vinserti128 m11, m11, [cq+16*9], 1
vinserti128 m5, m5, [cq+16*10], 1
vinserti128 m12, m12, [cq+16*11], 1
vinserti128 m6, m6, [cq+16*12], 1
vinserti128 m13, m13, [cq+16*13], 1
vinserti128 m7, m7, [cq+16*14], 1
vinserti128 m14, m14, [cq+16*15], 1
; identity-16 scale on all 16 registers, using [rsp] as the 17th slot
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
mova [rsp], m0
vpbroadcastd m0, [o(pw_5793x4)]
REPX {pmulhrsw x, m0}, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulhrsw m0, [rsp]
mova [rsp], m1
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulhrsw m1, [rsp]
mova [rsp], m0
jmp m(idct_16x16_internal).pass1_end3
ALIGN function_align
.pass2:
vpbroadcastd m15, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
mova [rsp+32*1], m1
mova m1, [rsp+32*0]
REPX {psllw x, 2 }, m8, m9, m10, m11, m12, m13, m14, m1
REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
pmulhrsw m15, m1
jmp m(idct_16x16_internal).end
%define o_base iadst4_dconly2a + 128
; Load 8 rows from %1 with stride %2 into m0-m7.
; If %3 (is_rect2), pre-scale each row by pw_2896x8 (clobbers m15).
%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
%if %3
vpbroadcastd m15, [o(pw_2896x8)]
pmulhrsw m0, m15, [%1+%2*0]
pmulhrsw m1, m15, [%1+%2*1]
pmulhrsw m2, m15, [%1+%2*2]
pmulhrsw m3, m15, [%1+%2*3]
pmulhrsw m4, m15, [%1+%2*4]
pmulhrsw m5, m15, [%1+%2*5]
pmulhrsw m6, m15, [%1+%2*6]
pmulhrsw m7, m15, [%1+%2*7]
%else
mova m0, [%1+%2*0]
mova m1, [%1+%2*1]
mova m2, [%1+%2*2]
mova m3, [%1+%2*3]
mova m4, [%1+%2*4]
mova m5, [%1+%2*5]
mova m6, [%1+%2*6]
mova m7, [%1+%2*7]
%endif
%endmacro
; Load 8 rows from %1 with stride %2 into the high registers m8-m15.
; If %3 (is_rect2), pre-scale by pw_2896x8; %3 == 2 means the caller
; already has pw_2896x8 in m15, so the broadcast is skipped.
%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
%if %3
%if %3 == 1
vpbroadcastd m15, [o(pw_2896x8)]
%endif
pmulhrsw m8, m15, [%1+%2*0]
pmulhrsw m9, m15, [%1+%2*1]
pmulhrsw m10, m15, [%1+%2*2]
pmulhrsw m11, m15, [%1+%2*3]
pmulhrsw m12, m15, [%1+%2*4]
pmulhrsw m13, m15, [%1+%2*5]
pmulhrsw m14, m15, [%1+%2*6]
pmulhrsw m15, [%1+%2*7]
%else
mova m8, [%1+%2*0]
mova m9, [%1+%2*1]
mova m10, [%1+%2*2]
mova m11, [%1+%2*3]
mova m12, [%1+%2*4]
mova m13, [%1+%2*5]
mova m14, [%1+%2*6]
mova m15, [%1+%2*7]
%endif
%endmacro
; 8x32 inverse DCT+DCT add.
; eob == 0      -> .dconly (dc-only shortcut via the 8x8 store loop).
; eob <= 106    -> .fast: only the even input rows are transformed, and the
;                  32-point odd half runs .main_fast (bottom half zero).
; eob > 106     -> full path: both row halves are transformed, then .main.
; .main/.main_fast compute the 32-point 1-D IDCT odd half and combine it
; with the 16-point result from idct_8x16's main; these are also reused by
; inv_txfm_add_dct_dct_32x8 below.
cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jz .dconly
PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
%undef cmp
cmp eobd, 106
jle .fast
; transform the odd input rows (1, 3, ..., 15)
LOAD_8ROWS cq+32*1, 32*2
call m(idct_16x8_internal).main
vperm2i128 m11, m0, m4, 0x31
vinserti128 m0, m0, xm4, 1
vperm2i128 m4, m1, m5, 0x31
vinserti128 m1, m1, xm5, 1
vperm2i128 m5, m2, m6, 0x31
vinserti128 m2, m2, xm6, 1
vperm2i128 m6, m3, m7, 0x31
vinserti128 m3, m3, xm7, 1
pxor m7, m7
REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
; 8x8 word transpose of the odd-row results
punpckhwd m7, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpcklwd m3, m11, m4
punpckhwd m11, m4
punpckhwd m4, m5, m6
punpcklwd m5, m6
punpckhdq m6, m0, m2
punpckldq m0, m2
punpckldq m2, m3, m5
punpckhdq m3, m5
punpckhdq m5, m11, m4
punpckldq m11, m4
punpckldq m4, m7, m1
punpckhdq m7, m1
punpckhqdq m12, m6, m0
punpcklqdq m0, m6 ; out4
punpckhqdq m13, m7, m4
punpcklqdq m4, m7 ; out5
punpckhqdq m14, m3, m2
punpcklqdq m2, m3 ; out6
punpckhqdq m15, m5, m11
punpcklqdq m11, m5 ; out7
mova [rsp+32*0], m0
mova [rsp+32*1], m4
mova [rsp+32*2], m2
.fast:
; transform the even input rows (0, 2, ..., 14)
LOAD_8ROWS cq+32*0, 32*2
call m(idct_16x8_internal).main
vperm2i128 m8, m0, m4, 0x31
vinserti128 m0, m0, xm4, 1
vperm2i128 m4, m1, m5, 0x31
vinserti128 m1, m1, xm5, 1
vperm2i128 m5, m2, m6, 0x31
vinserti128 m2, m2, xm6, 1
vperm2i128 m6, m3, m7, 0x31
vinserti128 m3, m3, xm7, 1
vpbroadcastd m9, [o(pw_8192)]
pxor m7, m7
REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
punpckhwd m7, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhwd m3, m8, m4
punpcklwd m8, m4
punpckhwd m4, m5, m6
punpcklwd m5, m6
punpckhdq m6, m0, m2
punpckldq m0, m2
punpckldq m2, m8, m5
punpckhdq m8, m5
punpckhdq m5, m3, m4
punpckldq m3, m4
punpckhdq m4, m7, m1
punpckldq m7, m1
punpcklqdq m1, m7, m4
punpckhqdq m7, m4 ; out9
punpckhqdq m4, m2, m8 ; out10
punpcklqdq m2, m8
punpckhqdq m8, m3, m5
punpcklqdq m3, m5
punpckhqdq m5, m0, m6 ; out8
punpcklqdq m0, m6
REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
cmp eobd, 106
jg .full
mova [rsp+32*0], m5
mova [rsp+32*1], m7
mova [rsp+32*2], m4
pmulhrsw m11, m9, m8
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7 ; upper inputs are zero on the fast path
call .main_fast
jmp .pass2
.dconly:
; dc-only: round the single coefficient and broadcast-add over 32 rows
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
pmulhrsw xm0, xm2
psrlw xm2, 2 ; pw_2048
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
mov r2d, 8
jmp m(inv_txfm_add_dct_dct_8x8).end2
.full:
REPX {pmulhrsw x, m9}, m12, m13, m14, m15
pmulhrsw m6, m9, [rsp+32*2]
mova [rsp+32*2], m4
pmulhrsw m4, m9, [rsp+32*0]
mova [rsp+32*0], m5
pmulhrsw m5, m9, [rsp+32*1]
mova [rsp+32*1], m7
pmulhrsw m7, m9, m11
pmulhrsw m11, m9, m8
call .main
.pass2:
; final rounding and interleaved 8x4 stores of all 32 output rows
vpbroadcastd m12, [o(pw_2048)]
REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m13, m14, m15
pmulhrsw m12, [rsp]
REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
mova [rsp+32*0], m4
mova [rsp+32*1], m6
lea r3, [strideq*3]
WRITE_8X4 0, 1, 4, 6
lea dstq, [dstq+strideq*4]
WRITE_8X4 2, 3, 4, 6
lea dstq, [dstq+strideq*4]
WRITE_8X4 [rsp+32*0], 5, 4, 6
lea dstq, [dstq+strideq*4]
WRITE_8X4 [rsp+32*1], 7, 4, 6
lea dstq, [dstq+strideq*4]
WRITE_8X4 8, 9, 4, 6
lea dstq, [dstq+strideq*4]
WRITE_8X4 10, 11, 4, 6
lea dstq, [dstq+strideq*4]
WRITE_8X4 12, 13, 4, 6
lea dstq, [dstq+strideq*4]
WRITE_8X4 14, 15, 4, 6
RET
ALIGN function_align
.main_fast: ; bottom half is zero
; With in17..in31 zero each initial odd-half butterfly degenerates to a
; single multiply, done via ITX_MULHRSW_SHL3 on duplicated words.
call m(idct_8x16_internal).main
mova m8, [rsp+gprsize+0*32]
mova [rsp+gprsize+0*32], m0
mova m9, [rsp+gprsize+1*32]
mova [rsp+gprsize+1*32], m1
mova m0, [rsp+gprsize+2*32]
mova [rsp+gprsize+2*32], m6
punpcklwd m1, m8, m8
punpckhwd m8, m8
punpcklwd m15, m9, m9
punpckhwd m9, m9
punpcklwd m14, m0, m0
punpckhwd m0, m0
punpcklwd m13, m11, m11
punpckhwd m11, m11
ITX_MULHRSW_SHL3 1, 6, 201, 4091 ; t16a, t31a
ITX_MULHRSW_SHL3 8, 6, m601, 4052 ; t23a, t24a
ITX_MULHRSW_SHL3 15, 6, 995, 3973 ; t20a, t27a
ITX_MULHRSW_SHL3 9, 6, m1380, 3857 ; t19a, t28a
ITX_MULHRSW_SHL3 14, 6, 1751, 3703 ; t18a, t29a
ITX_MULHRSW_SHL3 0, 6, m2106, 3513 ; t21a, t26a
ITX_MULHRSW_SHL3 13, 6, 2440, 3290 ; t22a, t25a
ITX_MULHRSW_SHL3 11, 6, m2751, 3035 ; t17a, t30a
jmp .main2
ALIGN function_align
.main:
; Full 32-point odd half: pairs of inputs are packed per register
; (e.g. "in31 in1") and processed with the packed-multiply helpers.
call m(idct_8x16_internal).main
mova m8, [rsp+gprsize+0*32]
mova [rsp+gprsize+0*32], m0
mova m9, [rsp+gprsize+1*32]
mova [rsp+gprsize+1*32], m1
mova m0, [rsp+gprsize+2*32]
mova [rsp+gprsize+2*32], m6
punpcklwd m1, m15, m8 ; in31 in1
punpckhwd m8, m15 ; in3 in29
punpcklwd m15, m14, m9 ; in27 in5
punpckhwd m9, m14 ; in7 in25
punpcklwd m14, m13, m0 ; in23 in9
punpckhwd m0, m13 ; in11 in21
punpcklwd m13, m12, m11 ; in19 in13
punpckhwd m11, m12 ; in15 in17
ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a
ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a
ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a
ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a
ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a
ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a
ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
.main2:
; butterfly stages of the 32-point odd half, then combine with the
; 16-point even-half outputs spilled by idct_8x16's main
psubw m6, m1, m11 ; t17 t30
paddw m1, m11 ; t16 t31
psubw m11, m9, m14 ; t18 t29
paddw m9, m14 ; t19 t28
psubw m14, m15, m0 ; t21 t26
paddw m15, m0 ; t20 t27
psubw m0, m8, m13 ; t22 t25
paddw m8, m13 ; t23 t24
ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a
ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a
ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a
ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
psubw m13, m1, m9 ; t19a t28a
paddw m1, m9 ; t16a t31a
psubw m9, m8, m15 ; t20a t27a
paddw m8, m15 ; t23a t24a
psubw m15, m6, m11 ; t18 t29
paddw m6, m11 ; t17 t30
psubw m11, m0, m14 ; t21 t26
paddw m0, m14 ; t22 t25
ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 1 ; t18a t29a
ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 1 ; t19 t28
ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 1 ; t20 t27
ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
vbroadcasti128 m12, [o(deint_shuf)]
REPX {pshufb x, m12}, m0, m1, m6, m8
psubw m14, m1, m8 ; t23 t24
paddw m1, m8 ; t16 t31
psubw m8, m6, m0 ; t22a t25a
paddw m6, m0 ; t17a t30a
psubw m0, m15, m11 ; t21 t26
paddw m15, m11 ; t18 t29
psubw m11, m13, m9 ; t20a t27a
paddw m13, m9 ; t19a t28a
vpbroadcastd m12, [o(pw_2896x8)]
punpcklqdq m9, m11, m0 ; t20a t21
punpckhqdq m11, m0 ; t27a t26
punpcklqdq m0, m14, m8 ; t23 t22a
punpckhqdq m14, m8 ; t24 t25a
psubw m8, m11, m9 ; t20 t21a
paddw m11, m9 ; t27 t26a
psubw m9, m14, m0 ; t23a t22
paddw m14, m0 ; t24a t25
REPX {pmulhrsw x, m12}, m8, m9, m14, m11
punpcklqdq m0, m1, m6 ; t16 t17a
punpckhqdq m1, m6 ; t31 t30a
psubw m10, m5, m8 ; out20 out21
paddw m5, m8 ; out11 out10
psubw m6, m3, m14 ; out24 out25
paddw m3, m14 ; out7 out6
psubw m8, m7, m0 ; out16 out17
paddw m7, m0 ; out15 out14
mova m0, [rsp+gprsize+0*32]
punpcklqdq m12, m13, m15 ; t19a t18
punpckhqdq m13, m15 ; t28a t29
psubw m15, m0, m1 ; out31 out30
paddw m0, m1 ; out0 out1
mova m1, [rsp+gprsize+1*32]
mova [rsp+gprsize+0*32], m6
mova m6, [rsp+gprsize+2*32]
psubw m14, m1, m13 ; out28 out29
paddw m1, m13 ; out3 out2
psubw m13, m2, m11 ; out27 out26
paddw m2, m11 ; out4 out5
psubw m11, m4, m9 ; out23 out22
paddw m4, m9 ; out8 out9
psubw m9, m6, m12 ; out19 out18
paddw m6, m12 ; out12 out13
ret
; Load coefficient rows %3 and %4 packed into one ymm register %1
; (row %3 in the low 128 bits, row %4 in the high ones); %2 is scratch.
%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
vbroadcasti128 m%1, [cq+16*%3]
vbroadcasti128 m%2, [cq+16*%4]
shufpd m%1, m%1, m%2, 0x0c
%endmacro
; 32x8 inverse DCT+DCT add.
; eob == 0 -> dc-only broadcast-add loop (.dconly, 8 rows of 32 bytes).
; Otherwise the 32-point first pass reuses the 8x32 .main/.main_fast
; helpers (eob <= 106 takes the zero-upper-half fast path), then the
; 8-point second pass runs idct_16x8's main twice, once per 16-column
; half of the destination.
cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jnz .normal
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
mov r2d, 8
.dconly:
pmulhrsw xm0, xm2
movd xm2, [pw_2048] ; intentionally rip-relative
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
pxor m3, m3
.dconly_loop:
; add the dc value to one full 32-byte row per iteration
mova m1, [dstq]
punpckhbw m2, m1, m3
punpcklbw m1, m3
paddw m2, m0
paddw m1, m0
packuswb m1, m2
mova [dstq], m1
add dstq, strideq
dec r2d
jg .dconly_loop
RET
.normal:
PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
%undef cmp
; load the first 16 input rows packed two per register, zeroing as we go
LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2
LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3
LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6
LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7
pxor m8, m8
REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
add cq, 16*16
LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10
LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11
LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14
LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15
REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
mova [rsp+32*0], m4
mova [rsp+32*1], m5
mova [rsp+32*2], m6
cmp eobd, 106
jg .full
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7 ; rows 16-31 are zero on the fast path
call m(inv_txfm_add_dct_dct_8x32).main_fast
jmp .pass2
.full:
LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18
LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17
LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22
LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21
REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
add cq, 16*8
LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26
LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25
LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30
LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29
pxor m8, m8
REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
call m(inv_txfm_add_dct_dct_8x32).main
.pass2:
; round pass-1 results, transpose the left 16 columns and run the row IDCT
vpbroadcastd m12, [o(pw_8192)]
REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
mova [rsp+32*1], m9
mova [rsp+32*2], m10
punpckhwd m9, m0, m2
punpcklwd m0, m2
punpckhwd m2, m1, m3
punpcklwd m1, m3
punpcklwd m10, m4, m6
punpckhwd m4, m6
punpcklwd m6, m5, m7
punpckhwd m5, m7
punpckhwd m3, m0, m9
punpcklwd m0, m9
punpckhwd m9, m2, m1
punpcklwd m2, m1
punpcklwd m7, m10, m4
punpckhwd m10, m4
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpckhdq m1, m0, m2
punpckldq m0, m2
punpckldq m2, m3, m9
punpckhdq m3, m9
punpckldq m6, m7, m4
punpckhdq m7, m4
punpckldq m9, m10, m5
punpckhdq m10, m5
REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
pmulhrsw m12, [rsp+32*0]
mova [rsp+32*0], m8
vperm2i128 m4, m0, m6, 0x31
vinserti128 m0, m0, xm6, 1
vperm2i128 m5, m1, m7, 0x31
vinserti128 m1, m1, xm7, 1
vperm2i128 m6, m2, m9, 0x31
vinserti128 m2, m2, xm9, 1
vperm2i128 m7, m3, m10, 0x31
vinserti128 m3, m3, xm10, 1
call m(idct_16x8_internal).main
vpbroadcastd m8, [o(pw_2048)]
REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
lea r2, [strideq*3]
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r2
lea r3, [dstq+strideq*4]
%define dstq r3
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 6, 7, 0, 1, strideq*2, r2
; right 16 columns: same transpose + row IDCT on the spilled high outputs
mova m0, [rsp+32*0]
mova m1, [rsp+32*1]
mova m2, [rsp+32*2]
punpckhwd m7, m0, m2
punpcklwd m0, m2
punpckhwd m2, m1, m11
punpcklwd m1, m11
punpckhwd m4, m12, m14
punpcklwd m12, m14
punpckhwd m5, m13, m15
punpcklwd m13, m15
punpckhwd m3, m0, m7
punpcklwd m0, m7
punpckhwd m9, m2, m1
punpcklwd m2, m1
punpcklwd m7, m12, m4
punpckhwd m12, m4
punpcklwd m4, m5, m13
punpckhwd m5, m13
punpckhdq m1, m0, m2
punpckldq m0, m2
punpckldq m2, m3, m9
punpckhdq m3, m9
punpckldq m6, m7, m4
punpckhdq m7, m4
punpckldq m9, m12, m5
punpckhdq m12, m5
vperm2i128 m4, m0, m6, 0x31
vinserti128 m0, m0, xm6, 1
vperm2i128 m5, m1, m7, 0x31
vinserti128 m1, m1, xm7, 1
vperm2i128 m6, m2, m9, 0x31
vinserti128 m2, m2, xm9, 1
vperm2i128 m7, m3, m12, 0x31
vinserti128 m3, m3, xm12, 1
call m(idct_16x8_internal).main2
vpbroadcastd m8, [o(pw_2048)]
REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
add r0, 16
add r3, 16
%define dstq r0
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r2
%define dstq r3
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 6, 7, 0, 1, strideq*2, r2
RET
; 8x32 identity+identity add. The double identity reduces to add-5 and
; shift-right-3 per coefficient, so the block is processed directly:
; load 8x8 tiles, transpose, round, add to dst. Runs the loop once for
; eob < 107 and twice otherwise (the carry trick with 0x80000000 turns
; the sign of (eob - 107) into the iteration count).
cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
vpbroadcastd m9, [pw_5]
lea r4, [strideq*3]
sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
.loop:
mova xm0, [cq+16* 0]
mova xm1, [cq+16* 4]
vinserti128 m0, m0, [cq+16* 1], 1
vinserti128 m1, m1, [cq+16* 5], 1
pxor m8, m8
mova [cq+32*0], m8
mova [cq+32*2], m8
add cq, 16*16
mova xm2, [cq-16* 8]
mova xm3, [cq-16* 4]
vinserti128 m2, m2, [cq-16* 7], 1
vinserti128 m3, m3, [cq-16* 3], 1
mova xm4, [cq+16* 0]
mova xm5, [cq+16* 4]
vinserti128 m4, m4, [cq+16* 1], 1
vinserti128 m5, m5, [cq+16* 5], 1
mova xm6, [cq+16* 8]
mova xm7, [cq+16*12]
vinserti128 m6, m6, [cq+16* 9], 1
vinserti128 m7, m7, [cq+16*13], 1
REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
; identity rounding: (x + 5) >> 3, done around the transpose
REPX {paddw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
call .transpose8x8
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4
add dstq, strideq
WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4
add dstq, strideq
WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4
add dstq, strideq
WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4
add dstq, strideq
sub cq, 16*16-32
lea dstq, [dstq+r4*4]
add eobd, 0x80000000
jnc .loop
RET
ALIGN function_align
.transpose8x8:
; in-register 8x8 word transpose of m0-m7 (m8 used as scratch);
; also used by inv_txfm_add_identity_identity_32x8 below
punpckhwd m8, m4, m5
punpcklwd m4, m5
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhwd m1, m6, m7
punpcklwd m6, m7
punpckhwd m7, m2, m3
punpcklwd m2, m3
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
punpckhdq m6, m5, m7
punpckldq m5, m7
punpckldq m7, m8, m1
punpckhdq m8, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
punpcklqdq m4, m5, m7
punpckhqdq m5, m7
punpckhqdq m7, m6, m8
punpcklqdq m6, m8
ret
; 32x8 identity+identity add: same double-identity shortcut as 8x32 but
; with the scaling done by pmulhrsw pw_4096 and WRITE_16X2 stores; one or
; two loop iterations (left/right 16-column halves) depending on eob.
cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob
add cq, 16*8
vpbroadcastd m9, [pw_4096]
lea r4, [strideq*3]
lea r5, [dstq+strideq*4]
sub eobd, 107
.loop:
mova xm0, [cq-16*8]
mova xm1, [cq-16*7]
vinserti128 m0, m0, [cq+16*0], 1
vinserti128 m1, m1, [cq+16*1], 1
mova xm2, [cq-16*6]
mova xm3, [cq-16*5]
vinserti128 m2, m2, [cq+16*2], 1
vinserti128 m3, m3, [cq+16*3], 1
mova xm4, [cq-16*4]
mova xm5, [cq-16*3]
vinserti128 m4, m4, [cq+16*4], 1
vinserti128 m5, m5, [cq+16*5], 1
mova xm6, [cq-16*2]
mova xm7, [cq-16*1]
vinserti128 m6, m6, [cq+16*6], 1
vinserti128 m7, m7, [cq+16*7], 1
pxor m8, m8
REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r4
%define dstq r5
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 6, 7, 0, 1, strideq*2, r4
add cq, 16*16
add r0, 16
add r5, 16
add eobd, 0x80000000
jnc .loop
RET
%define o_base pw_5 + 128
; Load 16 rows from %1 with stride %2 into m0-m15 (m15 also spilled to
; [rsp]). %3 (is_rect2) pre-scales each row by pw_2896x8; %4 (default 1)
; zeroes the source rows after loading.
%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
%if %3
vpbroadcastd m15, [o(pw_2896x8)]
pmulhrsw m0, m15, [%1+%2* 0]
pmulhrsw m1, m15, [%1+%2* 1]
pmulhrsw m2, m15, [%1+%2* 2]
pmulhrsw m3, m15, [%1+%2* 3]
pmulhrsw m4, m15, [%1+%2* 4]
pmulhrsw m5, m15, [%1+%2* 5]
pmulhrsw m6, m15, [%1+%2* 6]
pmulhrsw m7, m15, [%1+%2* 7]
pmulhrsw m8, m15, [%1+%2* 8]
pmulhrsw m9, m15, [%1+%2* 9]
pmulhrsw m10, m15, [%1+%2*10]
pmulhrsw m11, m15, [%1+%2*11]
pmulhrsw m12, m15, [%1+%2*12]
pmulhrsw m13, m15, [%1+%2*13]
pmulhrsw m14, m15, [%1+%2*14]
pmulhrsw m15, [%1+%2*15]
%else
mova m0, [%1+%2* 0]
mova m1, [%1+%2* 1]
mova m2, [%1+%2* 2]
mova m3, [%1+%2* 3]
mova m4, [%1+%2* 4]
mova m5, [%1+%2* 5]
mova m6, [%1+%2* 6]
mova m7, [%1+%2* 7]
mova m8, [%1+%2* 8]
mova m9, [%1+%2* 9]
mova m10, [%1+%2*10]
mova m11, [%1+%2*11]
mova m12, [%1+%2*12]
mova m13, [%1+%2*13]
mova m14, [%1+%2*14]
mova m15, [%1+%2*15]
%endif
mova [rsp], m15
%if %4
pxor m15, m15
REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \
8, 9, 10, 11, 12, 13, 14, 15
%endif
%endmacro
; Final butterfly + store for one 32-point IDCT output pair:
; combines coefficient register %1 with the spilled value at %2
; (sum -> dst row at offset %6, difference -> r2 row at offset %7),
; rounding with %5 and adding to the existing pixels.
%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
mova m%4, [%2]
paddw m%3, m%1, m%4
psubw m%1, m%4
pmovzxbw m%4, [dstq+%6]
pmulhrsw m%3, m%5
pmulhrsw m%1, m%5
paddw m%3, m%4
pmovzxbw m%4, [r2+%7]
paddw m%1, m%4
packuswb m%3, m%1
vpermq m%3, m%3, q3120
mova [dstq+%6], xm%3
vextracti128 [r2+%7], m%3, 1
%endmacro
cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
base, tmp3
%undef cmp
LOAD_16ROWS cq, 64, 1
call m(idct_16x16_internal).main
lea tmp1q, [rsp+32*7]
lea tmp2q, [tmp1q+32*8]
lea tmp3q, [tmp1q+32*16]
mova m1, [rsp+32*1]
mova [rsp+32*0], m6
mova [rsp+32*1], m7
vpbroadcastd m7, [o(pw_16384)]
call .transpose_2x8x8_round
mova m15, [rsp+32*0]
mova [tmp3q-32*4+ 0], xm0
vextracti128 [tmp3q+32*0+ 0], m0, 1
mova [tmp3q-32*3+ 0], xm2
vextracti128 [tmp3q+32*1+ 0], m2, 1
mova [tmp3q-32*2+ 0], xm4
vextracti128 [tmp3q+32*2+ 0], m4, 1
mova [tmp3q-32*1+ 0], xm6
vextracti128 [tmp3q+32*3+ 0], m6, 1
mova [tmp3q-32*4+16], xm8
vextracti128 [tmp3q+32*0+16], m8, 1
mova [tmp3q-32*3+16], xm10
vextracti128 [tmp3q+32*1+16], m10, 1
mova [tmp3q-32*2+16], xm12
vextracti128 [tmp3q+32*2+16], m12, 1
mova [tmp3q-32*1+16], xm14
vextracti128 [tmp3q+32*3+16], m14, 1
cmp eobd, 150
jg .full
vinserti128 m0, m1, xm9, 1
vperm2i128 m4, m1, m9, 0x31
vinserti128 m2, m5, xm13, 1
vperm2i128 m6, m5, m13, 0x31
vinserti128 m1, m3, xm11, 1
vperm2i128 m5, m3, m11, 0x31
vinserti128 m3, m7, xm15, 1
vperm2i128 m7, m7, m15, 0x31
call .main_oddhalf_fast
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp .idct16
.dconly:
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
mov r2d, 16
jmp m(inv_txfm_add_dct_dct_16x4).dconly
.full:
mova [tmp1q-32*4], m1
mova [tmp1q-32*3], m3
mova [tmp1q-32*2], m5
mova [tmp1q-32*1], m7
mova [tmp1q+32*0], m9
mova [tmp1q+32*1], m11
mova [tmp1q+32*2], m13
mova [tmp1q+32*3], m15
LOAD_16ROWS cq+32, 64, 1
call m(idct_16x16_internal).main
lea r2, [tmp3q+32*8]
mova m1, [rsp+32*1]
mova [rsp+32*0], m6
mova [rsp+32*1], m7
vpbroadcastd m7, [o(pw_16384)]
call .transpose_2x8x8_round
mova m15, [rsp+32*0]
mova [r2-32*4+ 0], xm0
vextracti128 [r2+32*0+ 0], m0, 1
mova [r2-32*3+ 0], xm2
vextracti128 [r2+32*1+ 0], m2, 1
mova [r2-32*2+ 0], xm4
vextracti128 [r2+32*2+ 0], m4, 1
mova [r2-32*1+ 0], xm6
vextracti128 [r2+32*3+ 0], m6, 1
mova [r2-32*4+16], xm8
vextracti128 [r2+32*0+16], m8, 1
mova [r2-32*3+16], xm10
vextracti128 [r2+32*1+16], m10, 1
mova [r2-32*2+16], xm12
vextracti128 [r2+32*2+16], m12, 1
mova [r2-32*1+16], xm14
vextracti128 [r2+32*3+16], m14, 1
vinserti128 m8, m1, xm9, 1
vperm2i128 m12, m1, m9, 0x31
mova xm0, [tmp1q-32*4]
mova xm1, [tmp1q-32*3]
vinserti128 m0, m0, [tmp1q+32*0], 1
vinserti128 m1, m1, [tmp1q+32*1], 1
vinserti128 m10, m5, xm13, 1
vperm2i128 m14, m5, m13, 0x31
mova xm4, [tmp1q-32*4+16]
mova xm5, [tmp1q-32*3+16]
vinserti128 m4, m4, [tmp1q+32*0+16], 1
vinserti128 m5, m5, [tmp1q+32*1+16], 1
vinserti128 m9, m3, xm11, 1
vperm2i128 m13, m3, m11, 0x31
mova xm2, [tmp1q-32*2]
mova xm3, [tmp1q-32*1]
vinserti128 m2, m2, [tmp1q+32*2], 1
vinserti128 m3, m3, [tmp1q+32*3], 1
vinserti128 m11, m7, xm15, 1
vperm2i128 m15, m7, m15, 0x31
mova xm6, [tmp1q-32*2+16]
mova xm7, [tmp1q-32*1+16]
vinserti128 m6, m6, [tmp1q+32*2+16], 1
vinserti128 m7, m7, [tmp1q+32*3+16], 1
call .main_oddhalf
LOAD_8ROWS_H r2-32*4, 32
.idct16:
LOAD_8ROWS tmp3q-32*4, 32
mova [rsp], m15
call m(idct_16x16_internal).main
imul r2, strideq, 19
lea r3, [strideq*3]
add r2, dstq
call .pass2_end
RET
ALIGN function_align
; idct32 odd-half (t16..t31), fast variant: input rows 16-31 are known to be
; all zero, so the first butterfly stage degenerates from full ITX_MULSUB_2W
; rotations into single pmulhrsw multiplies by precomputed +/-(coef*8)
; constants. Falls through (jmp) into the shared .main2 tail below.
; In:  m0-m7 = the odd-indexed input rows; the in31/in17 stack slots are
;      zeroed here, in15 (m7) is spilled to its slot for .main2 to reload.
; m15 is loaded with pd_2048 (ITX_MULSUB_2W rounding) before the jump.
.main_oddhalf_fast: ; lower half is zero
mova [rsp+gprsize+32*1], m7 ; spill in15, reloaded in .main2
pxor m7, m7
mova [rsp+gprsize+32*0], m7 ; in31 slot = 0
mova [rsp+gprsize+32*2], m7 ; in17 slot = 0
vpbroadcastd m11, [o(pw_3703x8)]
vpbroadcastd m7, [o(pw_1751x8)]
vpbroadcastd m12, [o(pw_m1380x8)]
vpbroadcastd m8, [o(pw_3857x8)]
vpbroadcastd m13, [o(pw_3973x8)]
vpbroadcastd m15, [o(pw_995x8)]
pmulhrsw m11, m4 ; t29a
pmulhrsw m4, m7 ; t18a
pmulhrsw m12, m3 ; t19a
pmulhrsw m3, m8 ; t28a
pmulhrsw m13, m2 ; t27a
pmulhrsw m2, m15 ; t20a
vpbroadcastd m10, [o(pw_m2106x8)]
vpbroadcastd m7, [o(pw_3513x8)]
vpbroadcastd m9, [o(pw_3290x8)]
vpbroadcastd m8, [o(pw_2440x8)]
vpbroadcastd m14, [o(pw_m601x8)]
vpbroadcastd m15, [o(pw_4052x8)]
pmulhrsw m10, m5 ; t21a
pmulhrsw m5, m7 ; t26a
pmulhrsw m9, m6 ; t25a
pmulhrsw m6, m8 ; t22a
pmulhrsw m14, m1 ; t23a
pmulhrsw m1, m15 ; t24a
vpbroadcastd m15, [o(pd_2048)] ; rounding constant for ITX_MULSUB_2W in .main2
jmp .main2
ALIGN function_align
; idct32 odd-half (t16..t31), full variant: all odd input rows may be
; nonzero. The in31/in15/in17 operands arrive via the three stack spill
; slots; the first stage uses full ITX_MULSUB_2W rotations.
; Out: t16..t19a side in [tmp1q-32*4..tmp1q-32*1], t20..t23a (2896-scaled)
;      in [tmp1q+32*0..+32*3], t24a..t27 (2896-scaled) in
;      [tmp2q-32*4..-32*1], t28a..t31 in [tmp2q+32*0..+32*3].
.main_oddhalf:
mova [rsp+gprsize+32*0], m15 ; spill in31
mova [rsp+gprsize+32*1], m7 ; spill in15
mova [rsp+gprsize+32*2], m8 ; spill in17
vpbroadcastd m15, [o(pd_2048)] ; ITX_MULSUB_2W rounding
; stage-1 rotations of the odd inputs
ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a
ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a
ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a
ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a
ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a
ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a
; shared tail, also entered from .main_oddhalf_fast
.main2:
; stage-2 sumsubs
psubw m7, m12, m4 ; t18
paddw m12, m4 ; t19
psubw m4, m2, m10 ; t21
paddw m2, m10 ; t20
psubw m10, m14, m6 ; t22
paddw m14, m6 ; t23
psubw m6, m1, m9 ; t25
paddw m1, m9 ; t24
psubw m9, m13, m5 ; t26
paddw m13, m5 ; t27
psubw m5, m3, m11 ; t29
paddw m3, m11 ; t28
; stage-3 rotations
ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a
ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a
ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a
; stage-4 sumsubs
psubw m8, m14, m2 ; t20a
paddw m14, m2 ; t23a
psubw m2, m1, m13 ; t27a
paddw m1, m13 ; t24a
psubw m13, m6, m9 ; t21
paddw m6, m9 ; t22
psubw m9, m10, m4 ; t26
paddw m10, m4 ; t25
ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27
ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a
; swap live t-values into the spill slots and pick up in31/in15/in17
mova m4, [rsp+gprsize+32*0] ; in31
mova [rsp+gprsize+32*0], m6 ; t22
mova m6, [rsp+gprsize+32*1] ; in15
mova [rsp+gprsize+32*1], m14 ; t23a
mova m14, [rsp+gprsize+32*2] ; in17
mova [rsp+gprsize+32*2], m1 ; t24a
ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a
ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a
psubw m1, m0, m14 ; t17
paddw m0, m14 ; t16
psubw m14, m4, m6 ; t30
paddw m4, m6 ; t31
ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a
psubw m6, m0, m12 ; t19a
paddw m0, m12 ; t16a
psubw m12, m4, m3 ; t28a
paddw m4, m3 ; t31a
psubw m3, m14, m5 ; t18
paddw m14, m5 ; t17
psubw m5, m1, m7 ; t29
paddw m1, m7 ; t30
ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a
ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28
; final sumsubs and stores of the outer outputs
psubw m7, m1, m10 ; t25a
paddw m1, m10 ; t30a
psubw m10, m5, m9 ; t21
paddw m5, m9 ; t18
psubw m9, m12, m2 ; t20a
paddw m12, m2 ; t19a
psubw m2, m3, m13 ; t26
paddw m3, m13 ; t29
psubw m13, m6, m8 ; t27a
paddw m6, m8 ; t28a
mova [tmp1q-32*2], m5
mova [tmp1q-32*1], m12
mova [tmp2q+32*0], m6
mova [tmp2q+32*1], m3
mova [tmp2q+32*2], m1
mova m5, [rsp+gprsize+32*0] ; t22
mova m6, [rsp+gprsize+32*1] ; t23
mova m3, [rsp+gprsize+32*2] ; t24a
vpbroadcastd m8, [o(pw_2896x8)]
psubw m1, m14, m5 ; t22a
paddw m14, m5 ; t17a
psubw m5, m0, m6 ; t23
paddw m0, m6 ; t16
psubw m6, m4, m3 ; t24
paddw m4, m3 ; t31
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m14
mova [tmp2q+32*3], m4
psubw m3, m13, m9 ; t20
paddw m13, m9 ; t27
psubw m9, m2, m10 ; t21a
paddw m2, m10 ; t26a
psubw m10, m7, m1 ; t22
paddw m7, m1 ; t25
psubw m1, m6, m5 ; t23a
paddw m6, m5 ; t24a
; the middle eight outputs (t20..t27) get the 1/sqrt(2) scaling here
REPX {pmulhrsw x, m8}, m3, m13, m9, m2, m10, m7, m1, m6
mova [tmp1q+32*0], m3
mova [tmp1q+32*1], m9
mova [tmp1q+32*2], m10
mova [tmp1q+32*3], m1
mova [tmp2q-32*4], m6
mova [tmp2q-32*3], m7
mova [tmp2q-32*2], m2
mova [tmp2q-32*1], m13
ret
ALIGN function_align
; Transpose two 8x8 word matrices in place (first m8-m15, then m0-m7),
; multiplying every row by the pmulhrsw rounding factor the caller loaded
; into m7. The low matrix's m6/m7 rows come from the two stack spill slots;
; because all 16 ymm registers are live, the transposed m15 of the high
; matrix is parked in [rsp+gprsize+32*0] for the caller to reload.
.transpose_2x8x8_round:
punpckhwd m6, m12, m13
punpcklwd m12, m13
punpckhwd m13, m8, m9
punpcklwd m8, m9
punpckhwd m9, m14, m15
punpcklwd m14, m15
punpckhwd m15, m10, m11
punpcklwd m10, m11
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 ; round the low rows while m7 is still the constant
punpckhdq m11, m8, m10
punpckldq m8, m10
punpckldq m10, m12, m14
punpckhdq m12, m14
punpckhdq m14, m13, m15
punpckldq m13, m15
punpckldq m15, m6, m9
punpckhdq m6, m9
punpckhqdq m9, m8, m10
punpcklqdq m8, m10
punpcklqdq m10, m11, m12
punpckhqdq m11, m12
punpcklqdq m12, m13, m15
punpckhqdq m13, m15
punpckhqdq m15, m14, m6
punpcklqdq m14, m6
pmulhrsw m6, m7, [rsp+gprsize+32*0] ; low-matrix row 6 from spill, rounded
REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
pmulhrsw m7, [rsp+gprsize+32*1] ; low-matrix row 7 from spill (consumes the constant)
mova [rsp+gprsize+32*0], m15 ; park transposed m15; no free register left
punpckhwd m15, m4, m5
punpcklwd m4, m5
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhwd m1, m6, m7
punpcklwd m6, m7
punpckhwd m7, m2, m3
punpcklwd m2, m3
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
punpckhdq m6, m5, m7
punpckldq m5, m7
punpckldq m7, m15, m1
punpckhdq m15, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
punpcklqdq m4, m5, m7
punpckhqdq m5, m7
punpckhqdq m7, m6, m15
punpcklqdq m6, m15
ret
ALIGN function_align
; Pass-2 finish: round with pw_2048 and let IDCT32_PASS2_END combine each
; idct16 row with its stored idct32 odd-half counterpart and write pixels.
; dstq walks down from the top of the tile while r2 (set up by the caller
; to point near the bottom) walks up, so each macro invocation writes one
; row pair from each end. m7/m15 are spilled first since the macro needs
; their register numbers as temporaries.
.pass2_end:
mova [rsp+gprsize+32*0], m7 ; spill row 7, restored for the last group
mova [rsp+gprsize+32*2], m15 ; spill row 15, restored for the last group
vpbroadcastd m15, [o(pw_2048)] ; final rounding factor
IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0
add dstq, strideq ; next row from the top...
sub r2, strideq ; ...and from the bottom
mova m1, [rsp+gprsize+32*1]
IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0
add dstq, strideq
sub r2, strideq
IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0
add dstq, strideq
sub r2, strideq
mova m7, [rsp+gprsize+32*0] ; reload spilled row 7
mova m1, [rsp+gprsize+32*2] ; reload spilled row 15 (into m1; m15 is the constant)
IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0
ret
; Perform the final sumsub step and YMM lane shuffling
; Combines one idct16 output row pair (m%1/m%2) with the matching idct32
; odd-half rows stored at tmp2q/tmp1q. The difference rows (the "high"
; outputs) are scattered back into the temp buffers as 128-bit halves,
; while the two sum rows are lane-recombined (vperm2i128/vinserti128) so
; they are laid out correctly for the subsequent transpose.
; %3 is a scratch register, %4 receives the second row's sum before the
; lane shuffle.
%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
mova m%3, [tmp2q+32*( 3-%1)]
psubw m%4, m%1, m%3
paddw m%1, m%3
mova m%3, [tmp1q+32*(11-%2)]
mova [tmp1q+32*(11-%2)+16], xm%4
vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
paddw m%4, m%2, m%3
psubw m%2, m%3
mova [tmp1q+32*(11-%2)], xm%2
vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
vperm2i128 m%2, m%1, m%4, 0x31 ; high lanes of both sum rows
vinserti128 m%1, m%1, xm%4, 1 ; low lanes of both sum rows
%endmacro
; 32x16 inverse DCT-DCT add. eob == 0 takes the dc-only shortcut; otherwise
; pass 1 does the 32-point rows (odd coefficients pre-scaled by 1/sqrt(2)
; via pw_2896x8 for the rectangular transform), zeroes the coefficient
; buffer, and pass 2 runs two 16x16 column transforms over the left and
; right 16-pixel halves.
cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jnz .normal
; dc-only: scale the single coefficient and reuse the 32x8 dconly writer
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
mov r2d, 16
jmp m(inv_txfm_add_dct_dct_32x8).dconly
.normal:
PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
; load the 16 odd rows with the rect2 1/sqrt(2) pre-scale applied
vpbroadcastd m15, [o(pw_2896x8)]
pmulhrsw m0, m15, [cq+32* 1]
pmulhrsw m1, m15, [cq+32* 3]
pmulhrsw m2, m15, [cq+32* 5]
pmulhrsw m3, m15, [cq+32* 7]
pmulhrsw m4, m15, [cq+32* 9]
pmulhrsw m5, m15, [cq+32*11]
pmulhrsw m6, m15, [cq+32*13]
pmulhrsw m7, m15, [cq+32*15]
pmulhrsw m8, m15, [cq+32*17]
pmulhrsw m9, m15, [cq+32*19]
pmulhrsw m10, m15, [cq+32*21]
pmulhrsw m11, m15, [cq+32*23]
pmulhrsw m12, m15, [cq+32*25]
pmulhrsw m13, m15, [cq+32*27]
pmulhrsw m14, m15, [cq+32*29]
pmulhrsw m15, [cq+32*31]
lea tmp1q, [rsp+32*7]
lea tmp2q, [tmp1q+32*8]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
; even rows (rect2-scaled by the LOAD_16ROWS flag) feed the idct16
LOAD_16ROWS cq+32*0, 32*2, 1
pxor m15, m15
mov r3d, 8
.zero_loop: ; clear the whole coefficient buffer (8 x 128 bytes)
mova [cq+32*0], m15
mova [cq+32*1], m15
mova [cq+32*2], m15
mova [cq+32*3], m15
add cq, 32*4
dec r3d
jg .zero_loop
call m(idct_16x16_internal).main
call .pass1_end
lea r2, [strideq*3]
mov r3, dstq ; r3 != 0 also flags "right half still pending"
.pass2:
vpbroadcastd m7, [o(pw_16384)]
call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
call m(idct_16x16_internal).main
mova [rsp+32*2], m15
vpbroadcastd m15, [o(pw_2048)]
REPX {pmulhrsw x, m15}, m2, m3, m0
WRITE_16X2 2, 3, 1, 2, strideq*2, r2
pmulhrsw m1, m15, [rsp+32*1]
WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
lea dstq, [dstq+strideq*4]
REPX {pmulhrsw x, m15}, m4, m5, m6, m7
WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
WRITE_16X2 6, 7, 2, 3, strideq*2, r2
lea dstq, [dstq+strideq*4]
REPX {pmulhrsw x, m15}, m8, m9, m10, m11
WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
WRITE_16X2 10, 11, 2, 3, strideq*2, r2
lea dstq, [dstq+strideq*4]
REPX {pmulhrsw x, m15}, m11, m12, m13, m14
pmulhrsw m15, [rsp+32*2]
WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
WRITE_16X2 14, 15, 2, 3, strideq*2, r2
test r3, r3 ; nonzero after the first (left-half) pass only
jnz .right_half
RET
.right_half:
LOAD_8ROWS tmp1q-32*4, 32
LOAD_8ROWS_H tmp2q-32*4, 32
lea dstq, [r3+16] ; right 16 pixels of each row
xor r3d, r3d ; second pass -> fall through to RET next time
mova [rsp+32*0], m6
mova [rsp+32*1], m7
jmp .pass2
ALIGN function_align
; idct32 pass-1 sumsub of the idct16 outputs against the stored odd half;
; rows 1 and 9 go through the spill slots because the macro needs two
; temp registers per invocation.
.pass1_end:
mova [rsp+gprsize+32*0], m9
IDCT32_PASS1_END 0, 8, 1, 9
IDCT32_PASS1_END 2, 10, 1, 9
IDCT32_PASS1_END 3, 11, 1, 9
IDCT32_PASS1_END 4, 12, 1, 9
IDCT32_PASS1_END 5, 13, 1, 9
IDCT32_PASS1_END 6, 14, 1, 9
IDCT32_PASS1_END 7, 15, 1, 9
mova m1, [rsp+gprsize+32*1]
mova m9, [rsp+gprsize+32*0]
mova [rsp+gprsize+32*0], m6
mova [rsp+gprsize+32*1], m7
IDCT32_PASS1_END 1, 9, 6, 7
ret
; 16x32 identity-identity add. No transform butterflies are needed; each
; 8x16-coefficient tile is scaled (pw_2896x8, <<2, pw_5793x4, then +5 and
; >>3 for the final rounding), transposed, and added to the destination.
; The number of tiles processed (1-3, kept in r4b as count-1) is derived
; from eob thresholds; rax holds the original cq so the matching amount of
; the coefficient buffer can be cleared afterwards.
cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_5793x4)]
vpbroadcastd m11, [o(pw_5)]
cmp eobd, 43 ; if (eob > 43)
setg r4b ; iteration_count++
cmp eobd, 150 ; if (eob > 150)
setg al ; iteration_count++
add eobd, -279 ; if (eob > 278)
adc r4b, al ; iteration_count++ (carry from the add)
lea r3, [strideq*3]
mov rax, cq ; remember the buffer start for the zeroing loops
.loop:
; gather 16 rows x 8 columns (two 8-row groups per ymm register)
mova xm0, [cq+64* 0]
mova xm1, [cq+64* 1]
vinserti128 m0, m0, [cq+64* 8], 1
vinserti128 m1, m1, [cq+64* 9], 1
mova xm2, [cq+64* 2]
mova xm3, [cq+64* 3]
vinserti128 m2, m2, [cq+64*10], 1
vinserti128 m3, m3, [cq+64*11], 1
mova xm4, [cq+64* 4]
mova xm5, [cq+64* 5]
vinserti128 m4, m4, [cq+64*12], 1
vinserti128 m5, m5, [cq+64*13], 1
mova xm6, [cq+64* 6]
mova xm7, [cq+64* 7]
vinserti128 m6, m6, [cq+64*14], 1
vinserti128 m7, m7, [cq+64*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 6, 7, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]
add cq, 16 ; next 8-column tile
dec r4b
jge .loop ; runs iteration_count+1 times
; zero the coefficient data: full rows if we went past the first two
; tiles, otherwise only the left 32 bytes of each 64-byte row were used
sub cq, 32
pxor m0, m0
mov r0d, 8
cmp cq, rax
jg .zero_loop
.zero_loop_half: ; clear the left half of each 64-byte row
mova [rax+64*0], m0
mova [rax+64*1], m0
mova [rax+64*2], m0
mova [rax+64*3], m0
add rax, 64*4
sub r0d, 2
jg .zero_loop_half
RET
.zero_loop: ; clear everything contiguously
mova [rax+32*0], m0
mova [rax+32*1], m0
mova [rax+32*2], m0
mova [rax+32*3], m0
add rax, 32*4
dec r0d
jg .zero_loop
RET
; 32x16 identity-identity add. Same tile scheme as the 16x32 version, but
; the scale chain is pw_2896x8, <<3, pw_5793x4, then pw_2048 rounding.
; r4d = iteration count - 1: +1 if eob > 35, +2 if eob > 150; every other
; tile (`test r4b, 1`) hops to the right 16-pixel half of the rows.
cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_5793x4)]
vpbroadcastd m11, [o(pw_2048)]
cmp eobd, 35 ; if (eob > 35)
setg r4b ; iteration_count++
cmp eobd, 150 ; if (eob > 150)
setg r3b ; iteration_count += 2
lea r4d, [r4+r3*2]
lea r3, [strideq*3]
mov r5, dstq ; row origin, for the right-half hop
mov rax, cq ; buffer start, for the zeroing loop
.loop:
mova xm0, [cq+32* 0]
mova xm1, [cq+32* 1]
vinserti128 m0, m0, [cq+32* 8], 1
vinserti128 m1, m1, [cq+32* 9], 1
mova xm2, [cq+32* 2]
mova xm3, [cq+32* 3]
vinserti128 m2, m2, [cq+32*10], 1
vinserti128 m3, m3, [cq+32*11], 1
mova xm4, [cq+32* 4]
mova xm5, [cq+32* 5]
vinserti128 m4, m4, [cq+32*12], 1
vinserti128 m5, m5, [cq+32*13], 1
mova xm6, [cq+32* 6]
mova xm7, [cq+32* 7]
vinserti128 m6, m6, [cq+32*14], 1
vinserti128 m7, m7, [cq+32*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 6, 7, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]
add cq, 16
dec r4b
jl .ret
test r4b, 1 ; odd remaining count -> continue below current tile
jz .loop
add cq, 32*15 ; skip to the next 16-row band of coefficients
lea dstq, [r5+16] ; ...which maps to the right half of the rows
jmp .loop
.ret:
; clear 4 or 8 groups of 128 bytes depending on how far cq advanced
sub cq, 32
pxor m0, m0
mov r0d, 4
mov r1d, 8
cmp cq, rax
cmovg r0d, r1d
.zero_loop:
mova [rax+32*0], m0
mova [rax+32*1], m0
mova [rax+32*2], m0
mova [rax+32*3], m0
add rax, 32*4
dec r0d
jg .zero_loop
RET
; 32x32 inverse DCT-DCT add. Pass 1 runs twice (left/right 16-column
; halves), each time doing an idct16 on the even rows plus the idct32 odd
; half (full or fast depending on eob), then transposing into a large temp
; area on the stack. Pass 2 reassembles 16-column slices from the temp
; area and writes pixels, iterating right-to-left.
cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jnz .normal
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
mov r2d, 32
jmp m(inv_txfm_add_dct_dct_32x8).dconly
.normal:
PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
base, tmp3, tmp4
%undef cmp
lea tmp1q, [rsp+32*7]
lea tmp2q, [tmp1q+32*8]
sub eobd, 136
mov tmp4d, eobd ; sign of tmp4d: negative = top-left quadrant only
.pass1_loop:
LOAD_8ROWS cq+64*1, 64*2
pxor m8, m8
REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
test tmp4d, tmp4d
jl .fast
; eob >= 136: the bottom rows may be nonzero, run the full odd half
LOAD_8ROWS_H cq+64*17, 64*2
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
LOAD_8ROWS_H cq+64*16, 64*2
pxor m0, m0
REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
24, 25, 26, 27, 28, 29, 30, 31
mova [rsp], m15
jmp .idct16
.fast:
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8 ; zero stands in for row 15 of the idct16 input
.idct16:
LOAD_8ROWS cq+64*0, 64*2
pxor m15, m15
REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
call m(idct_16x16_internal).main
call m(inv_txfm_add_dct_dct_32x16).pass1_end
vpbroadcastd m7, [o(pw_8192)]
call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
lea tmp3q, [tmp1q+32*32]
mova m15, [rsp]
; store the untransposed high half for the second 8x8 transpose below
mova [tmp3q-32*4], m0
mova [tmp3q-32*3], m2
mova [tmp3q-32*2], m4
mova [tmp3q-32*1], m6
mova [tmp3q+32*0], m8
mova [tmp3q+32*1], m10
mova [tmp3q+32*2], m12
mova [tmp3q+32*3], m14
add tmp3q, 32*8
mova [tmp3q-32*4], m1
mova [tmp3q-32*3], m3
mova [tmp3q-32*2], m5
mova [tmp3q-32*1], m7
mova [tmp3q+32*0], m9
mova [tmp3q+32*1], m11
mova [tmp3q+32*2], m13
mova [tmp3q+32*3], m15
; transpose the tmp1q/tmp2q rows (with pw_8192 rounding) back in place
vpbroadcastd m9, [o(pw_8192)]
pmulhrsw m0, m9, [tmp1q-32*4]
pmulhrsw m1, m9, [tmp1q-32*3]
pmulhrsw m2, m9, [tmp1q-32*2]
pmulhrsw m3, m9, [tmp1q-32*1]
pmulhrsw m4, m9, [tmp1q+32*0]
pmulhrsw m5, m9, [tmp1q+32*1]
pmulhrsw m6, m9, [tmp1q+32*2]
pmulhrsw m7, m9, [tmp1q+32*3]
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
mova [tmp1q-32*4], m0
pmulhrsw m0, m9, [tmp2q-32*4]
mova [tmp2q-32*4], m1
pmulhrsw m1, m9, [tmp2q-32*3]
mova [tmp1q-32*3], m2
pmulhrsw m2, m9, [tmp2q-32*2]
mova [tmp2q-32*3], m3
pmulhrsw m3, m9, [tmp2q-32*1]
mova [tmp1q-32*2], m4
pmulhrsw m4, m9, [tmp2q+32*0]
mova [tmp2q-32*2], m5
pmulhrsw m5, m9, [tmp2q+32*1]
mova [tmp1q-32*1], m6
pmulhrsw m6, m9, [tmp2q+32*2]
mova [tmp2q-32*1], m7
pmulhrsw m7, m9, [tmp2q+32*3]
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
mova [tmp1q+32*0], m0
mova [tmp2q+32*0], m1
mova [tmp1q+32*1], m2
mova [tmp2q+32*1], m3
mova [tmp1q+32*2], m4
mova [tmp2q+32*2], m5
mova [tmp1q+32*3], m6
mova [tmp2q+32*3], m7
add cq, 32
add tmp1q, 32*16
add tmp2q, 32*16
; two pass-1 iterations via the sign-bit carry trick: the second add of
; 0x80000000 (or the first, if eobd was already negative) sets CF
add eobd, 0x80000000
jnc .pass1_loop
add tmp1q, 32*24
imul r2, strideq, 19 ; bottom-anchored dst pointer for .pass2_end
lea r3, [strideq*3]
add r2, dstq
test tmp4d, tmp4d
jge .pass2_loop
; top-left-only case: pass 1 ran once, shift the temp pointers
add tmp1q, 32*16
add tmp2q, 32*16
add tmp3q, 32*16
.pass2_loop:
LOAD_8ROWS tmp2q-32*4, 32
test tmp4d, tmp4d
jl .fast2
LOAD_8ROWS_H tmp3q-32*4, 32
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
sub tmp3q, 32*8
LOAD_8ROWS_H tmp3q-32*4, 32
sub tmp3q, 32*16
jmp .pass2_loop_end
.fast2:
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
sub tmp3q, 32*24
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
.pass2_loop_end:
LOAD_8ROWS tmp3q-32*4, 32
mova [rsp], m15
call m(idct_16x16_internal).main
call m(inv_txfm_add_dct_dct_16x32).pass2_end
lea tmp3q, [tmp1q-32*32]
cmp tmp2q, tmp3q
jl .ret
; step to the next 16-column slice (right half), adjust both dst pointers
sub tmp2q, 32*32
sub dstq, r3
lea r2, [r2+r3+16]
add dstq, 16
jmp .pass2_loop
.ret:
RET
; 32x32 identity-identity add. Scale is a single pw_8192 rounding multiply
; per coefficient. eobd is turned into a negative tile counter: -8 for the
; general case (8 8x8 tiles: 4 across, then the right half) or -2 when
; eob < 136 (only the top-left 16x16 can be nonzero). `inc eobd / jz`
; terminates; `test eobd, 3` detects the wrap to the next row band.
cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob
%undef cmp
vpbroadcastd m9, [pw_8192]
sub eobd, 136 ; if (eob < 136)
shr eobd, 30 ; topleft 16x16 only (0 if eob >= 136, else 3)
lea eobd, [eobq*2-8] ; -> -8 or -2 tile counter
lea r4, [strideq*3]
mov r5, dstq ; row origin, for the right-half hop
lea rax, [cq+32] ; end-of-processing sentinel for the zero loops
.loop:
mova xm0, [cq+64* 0]
mova xm1, [cq+64* 1]
vinserti128 m0, m0, [cq+64* 8], 1
vinserti128 m1, m1, [cq+64* 9], 1
mova xm2, [cq+64* 2]
mova xm3, [cq+64* 3]
vinserti128 m2, m2, [cq+64*10], 1
vinserti128 m3, m3, [cq+64*11], 1
mova xm4, [cq+64* 4]
mova xm5, [cq+64* 5]
vinserti128 m4, m4, [cq+64*12], 1
vinserti128 m5, m5, [cq+64*13], 1
mova xm6, [cq+64* 6]
mova xm7, [cq+64* 7]
vinserti128 m6, m6, [cq+64*14], 1
vinserti128 m7, m7, [cq+64*15], 1
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r4
lea dstq, [dstq+strideq*4]
WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
WRITE_16X2 6, 7, 0, 1, strideq*2, r4
lea dstq, [dstq+strideq*4]
add cq, 16
inc eobd
jz .ret
test eobd, 3 ; counter hits a multiple of 4 -> next row band
jnz .loop
add cq, 64*15
lea dstq, [r5+16]
jmp .loop
.ret:
; clear coefficients; in the top-left-only case cq ends exactly at the
; sentinel and only the left 32 bytes of the first 16 rows were used
pxor m0, m0
mov r0d, 16
cmp cq, rax
jne .zero_loop
.zero_loop_topleft: ; left half of every 64-byte row, 16 rows
mova [rax-32*1], m0
mova [rax+32*1], m0
mova [rax+32*3], m0
mova [rax+32*5], m0
add rax, 64*4
sub r0d, 4
jg .zero_loop_topleft
RET
.zero_loop: ; full contiguous clear
mova [rax-32*1], m0
mova [rax+32*0], m0
mova [rax+32*1], m0
mova [rax+32*2], m0
add rax, 32*4
dec r0d
jg .zero_loop
RET
; Final idct64 combine: sums/differences one stored idct16 row and one
; stored idct32 odd-half row (picked from alternating tmp1q/tmp2q slots
; depending on %1's parity) against the idct64 tail values in m%2/m%3,
; producing four output rows: out 0+n, out31-n, out32+n, out63-n.
; With 6 args (pass 1) the results go back into the temp buffers; with 10
; args (pass 2) they are rounded by m14, added to the pixels at the four
; given dst offsets and written back, the dstq/r2 roles swapping with %1's
; parity so the top and bottom halves interleave correctly.
%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
%if %1 & 1
mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
%else
mova m%5, [tmp1q-32*(45-%1)]
mova m%4, [tmp2q-32*(20+%1)]
%endif
psubw m%6, m%5, m%4 ; idct32 out31-n
paddw m%5, m%4 ; idct32 out 0+n
psubw m%4, m%6, m%3 ; out32+n
paddw m%6, m%3 ; out31-n
psubw m%3, m%5, m%2 ; out63-n
paddw m%5, m%2 ; out 0+n
%if %0 == 6 ; pass 1
%if %1 & 1
mova [tmp2q-32*(19-%1)], m%4
mova [tmp1q-32*(14+%1)], m%6
mova [tmp1q+32*(18-%1)], m%3
mova [tmp2q-32*(51-%1)], m%5
%else
mova [tmp1q-32*(13-%1)], m%4
mova [tmp2q-32*(20+%1)], m%6
mova [tmp2q+32*(12-%1)], m%3
mova [tmp1q-32*(45-%1)], m%5
%endif
%else ; pass 2
REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
%if %1 & 1
%define %%d0 r2
%define %%d1 dstq
%else
%define %%d0 dstq
%define %%d1 r2
%endif
pmovzxbw m%2, [%%d0+%9 ]
paddw m%2, m%4
pmovzxbw m%4, [%%d1+%8 ]
paddw m%4, m%6
pmovzxbw m%6, [%%d1+%10]
paddw m%3, m%6
pmovzxbw m%6, [%%d0+%7 ]
paddw m%5, m%6
packuswb m%2, m%4
packuswb m%3, m%5
vpermq m%2, m%2, q3120 ; undo the lane interleave from pmovzxbw
vpermq m%3, m%3, q3120
mova [%%d0+%9 ], xm%2
vextracti128 [%%d1+%8 ], m%2, 1
mova [%%d1+%10], xm%3
vextracti128 [%%d0+%7 ], m%3, 1
%endif
%endmacro
; 16x64 inverse DCT-DCT add. Pass 1: one or two (eob-dependent, r7d sign)
; 16x16 row transforms, transposed with pw_8192 rounding into a large
; stack temp area. Pass 2 assembles the 64-point columns from that area:
; idct16 on every 4th row, the idct32 odd half (fast) on every other even
; row, and two .main_part1 calls on the odd rows, finishing with
; .main_part2_pass2 which adds and writes the pixels.
cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jnz .normal
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
mov r2d, 32
jmp m(inv_txfm_add_dct_dct_16x4).dconly
.normal:
PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
%undef cmp
lea tmp1q, [rsp+32*23]
lea tmp2q, [tmp1q+32*24]
sub eobd, 151
mov r7d, eobd ; r7d < 0 -> only the first 16x16 block is nonzero
.pass1_loop:
LOAD_16ROWS cq, 64
call m(idct_16x16_internal).main
mova m1, [rsp+32*1]
mova [rsp+32*0], m6
mova [rsp+32*1], m7
vpbroadcastd m7, [o(pw_8192)]
call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
mova m15, [rsp+32*0]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m2
mova [tmp1q-32*2], m4
mova [tmp1q-32*1], m6
mova [tmp1q+32*0], m8
mova [tmp1q+32*1], m10
mova [tmp1q+32*2], m12
mova [tmp1q+32*3], m14
mova [tmp2q-32*4], m1
mova [tmp2q-32*3], m3
mova [tmp2q-32*2], m5
mova [tmp2q-32*1], m7
mova [tmp2q+32*0], m9
mova [tmp2q+32*1], m11
mova [tmp2q+32*2], m13
mova [tmp2q+32*3], m15
add cq, 32
add tmp1q, 32*8
add tmp2q, 32*8
add eobd, 0x80000000 ; sign-bit carry trick: up to two iterations
jnc .pass1_loop
; ---- pass 2: gather column groups from the temp area ----
lea r2, [rsp+32*23] ; first pass-1 block
; rows 0,4,8,... -> idct16 even input
mova xm0, [r2-32*4+ 0]
mova xm1, [r2-32*2+ 0]
vinserti128 m0, m0, [r2+32*0+ 0], 1
vinserti128 m1, m1, [r2+32*2+ 0], 1
mova xm2, [r2-32*4+16]
mova xm3, [r2-32*2+16]
vinserti128 m2, m2, [r2+32*0+16], 1
vinserti128 m3, m3, [r2+32*2+16], 1
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
test r7d, r7d
jl .fast ; second pass-1 block is all zero
lea r3, [r2+32*8]
mova xm4, [r3-32*4+ 0]
mova xm5, [r3-32*2+ 0]
vinserti128 m4, m4, [r3+32*0+ 0], 1
vinserti128 m5, m5, [r3+32*2+ 0], 1
mova xm6, [r3-32*4+16]
mova xm7, [r3-32*2+16]
vinserti128 m6, m6, [r3+32*0+16], 1
vinserti128 m7, m7, [r3+32*2+16], 1
.fast:
mova [rsp], m8
lea tmp1q, [rsp+32*7]
call m(idct_16x16_internal).main
mova m1, [rsp+32*1]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m1
mova [tmp1q-32*2], m2
mova [tmp1q-32*1], m3
mova [tmp1q+32*0], m4
mova [tmp1q+32*1], m5
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
add tmp1q, 32*8
mova [tmp1q-32*4], m8
mova [tmp1q-32*3], m9
mova [tmp1q-32*2], m10
mova [tmp1q-32*1], m11
mova [tmp1q+32*0], m12
mova [tmp1q+32*1], m13
mova [tmp1q+32*2], m14
mova [tmp1q+32*3], m15
; rows 2,6,10,... -> idct32 odd half (fast: upper inputs are zero)
mova xm0, [r2-32*3+ 0]
mova xm1, [r2-32*1+ 0]
vinserti128 m0, m0, [r2+32*1+ 0], 1
vinserti128 m1, m1, [r2+32*3+ 0], 1
mova xm2, [r2-32*3+16]
mova xm3, [r2-32*1+16]
vinserti128 m2, m2, [r2+32*1+16], 1
vinserti128 m3, m3, [r2+32*3+16], 1
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
test r7d, r7d
jl .fast2
mova xm4, [r3-32*3+ 0]
mova xm5, [r3-32*1+ 0]
vinserti128 m4, m4, [r3+32*1+ 0], 1
vinserti128 m5, m5, [r3+32*3+ 0], 1
mova xm6, [r3-32*3+16]
mova xm7, [r3-32*1+16]
vinserti128 m6, m6, [r3+32*1+16], 1
vinserti128 m7, m7, [r3+32*3+16], 1
.fast2:
add tmp1q, 32*8
lea tmp2q, [tmp1q+32*8]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
; odd rows -> idct64 tail, first group (in1/31/17/15/9/23/25/7 layout)
add r2, 32*24
vpbroadcastd m15, [o(pd_2048)]
add tmp1q, 32*16
add tmp2q, 32*32
mova xm0, [r2-32*4+ 0]
mova xm3, [r2-32*1+16]
vinserti128 m0, m0, [r2+32*0+ 0], 1
vinserti128 m3, m3, [r2+32*3+16], 1
mova xm4, [r2-32*4+16]
mova xm7, [r2-32*1+ 0]
vinserti128 m4, m4, [r2+32*0+16], 1
vinserti128 m7, m7, [r2+32*3+ 0], 1
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r7d, r7d
jl .fast3
add r3, 32*24
mova xm1, [r3-32*1+16]
mova xm2, [r3-32*4+ 0]
vinserti128 m1, m1, [r3+32*3+16], 1
vinserti128 m2, m2, [r3+32*0+ 0], 1
mova xm5, [r3-32*1+ 0]
mova xm6, [r3-32*4+16]
vinserti128 m5, m5, [r3+32*3+ 0], 1
vinserti128 m6, m6, [r3+32*0+16], 1
.fast3:
add rax, o_idct64_offset ; switch the constant base to the idct64 table
call m(inv_txfm_add_dct_dct_16x64).main_part1
add rax, 8
add tmp1q, 32*8
sub tmp2q, 32*8
; second idct64 input group (in5/27/21/11/13/19/29/3 layout)
mova xm0, [r2-32*2+ 0]
mova xm3, [r2-32*3+16]
vinserti128 m0, m0, [r2+32*2+ 0], 1
vinserti128 m3, m3, [r2+32*1+16], 1
mova xm4, [r2-32*2+16]
mova xm7, [r2-32*3+ 0]
vinserti128 m4, m4, [r2+32*2+16], 1
vinserti128 m7, m7, [r2+32*1+ 0], 1
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r7d, r7d
jl .fast4
mova xm1, [r3-32*3+16]
mova xm2, [r3-32*2+ 0]
vinserti128 m1, m1, [r3+32*1+16], 1
vinserti128 m2, m2, [r3+32*2+ 0], 1
mova xm5, [r3-32*3+ 0]
mova xm6, [r3-32*2+16]
vinserti128 m5, m5, [r3+32*1+ 0], 1
vinserti128 m6, m6, [r3+32*2+16], 1
.fast4:
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
RET
ALIGN function_align
; Constants below are fetched from the idct64_mul table; rax must have
; been advanced by o_idct64_offset before the call.
%define o_base idct64_mul - 8
; idct64 steps 1-5 for one group of eight odd input rows.
; In:  m0-m7 = the eight inputs, m15 = pd_2048 (set by callers).
; Out: 16 t values written to [tmp1q-32*4..+32*3] / [tmp2q-32*4..+32*3].
.main_part1:
; idct64 steps 1-5:
; in1/31/17/15/ 9/23/25/ 7 ->
; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
; in5/27/21/11/13/19/29/ 3 ->
; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
vpbroadcastd m11, [o(idct64_mul+4* 0)]
vpbroadcastd m13, [o(idct64_mul+4* 1)]
vpbroadcastd m10, [o(idct64_mul+4* 4)]
vpbroadcastd m12, [o(idct64_mul+4* 5)]
pmulhrsw m11, m0 ; t63a
pmulhrsw m0, m13 ; t32a
pmulhrsw m10, m1 ; t62a
pmulhrsw m1, m12 ; t33a
vpbroadcastd m9, [o(idct64_mul+4* 8)]
vpbroadcastd m13, [o(idct64_mul+4* 9)]
vpbroadcastd m8, [o(idct64_mul+4*12)]
vpbroadcastd m12, [o(idct64_mul+4*13)]
pmulhrsw m9, m2 ; t61a
pmulhrsw m2, m13 ; t34a
pmulhrsw m8, m3 ; t60a
pmulhrsw m3, m12 ; t35a
psubw m12, m0, m1 ; t33
paddw m0, m1 ; t32
psubw m1, m3, m2 ; t34
paddw m3, m2 ; t35
psubw m2, m8, m9 ; t61
paddw m8, m9 ; t60
psubw m9, m11, m10 ; t62
paddw m11, m10 ; t63
ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a
vpbroadcastd m14, [o(pw_401_4076)]
ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
psubw m10, m0, m3 ; t35a
paddw m0, m3 ; t32a
psubw m3, m11, m8 ; t60a
paddw m11, m8 ; t63a
psubw m8, m9, m2 ; t34
paddw m9, m2 ; t33
psubw m2, m12, m1 ; t61
paddw m12, m1 ; t62
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m9
mova [tmp2q+32*2], m12
mova [tmp2q+32*3], m11
vpbroadcastd m13, [o(pw_m4017_799)]
vpbroadcastd m14, [o(pw_799_4017)]
ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a
ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60
mova [tmp1q-32*2], m2
mova [tmp1q-32*1], m3
mova [tmp2q+32*0], m10
mova [tmp2q+32*1], m8
; second half of the group (inputs m4-m7)
vpbroadcastd m3, [o(idct64_mul+4*16)]
vpbroadcastd m11, [o(idct64_mul+4*17)]
vpbroadcastd m2, [o(idct64_mul+4*20)]
vpbroadcastd m10, [o(idct64_mul+4*21)]
vpbroadcastd m1, [o(idct64_mul+4*24)]
vpbroadcastd m9, [o(idct64_mul+4*25)]
vpbroadcastd m0, [o(idct64_mul+4*28)]
vpbroadcastd m8, [o(idct64_mul+4*29)]
pmulhrsw m3, m4 ; t59a
pmulhrsw m4, m11 ; t36a
pmulhrsw m2, m5 ; t58a
pmulhrsw m5, m10 ; t37a
pmulhrsw m1, m6 ; t57a
pmulhrsw m6, m9 ; t38a
pmulhrsw m0, m7 ; t56a
pmulhrsw m7, m8 ; t39a
psubw m8, m4, m5 ; t37
paddw m4, m5 ; t36
psubw m5, m7, m6 ; t38
paddw m7, m6 ; t39
psubw m6, m0, m1 ; t57
paddw m0, m1 ; t56
psubw m1, m3, m2 ; t58
paddw m3, m2 ; t59
ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a
vpbroadcastd m10, [o(pw_3166_2598)]
ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a
psubw m2, m7, m4 ; t36a
paddw m7, m4 ; t39a
psubw m4, m0, m3 ; t59a
paddw m0, m3 ; t56a
psubw m3, m6, m1 ; t37
paddw m6, m1 ; t38
psubw m1, m5, m8 ; t58
paddw m5, m8 ; t57
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
mova [tmp2q-32*4], m0
mova [tmp2q-32*3], m5
vpbroadcastd m6, [o(pw_m799_m4017)]
vpbroadcastd m7, [o(pw_m4017_799)]
ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59
ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a
mova [tmp1q+32*0], m4
mova [tmp1q+32*1], m1
mova [tmp2q-32*2], m3
mova [tmp2q-32*1], m2
ret
; Constant base is restored to the regular table for the part2 routines.
%define o_base pw_5 + 128
; idct64 steps 6-9 plus the idct16/32/64 sumsub, pass-1 form: results go
; back into the temp buffers via IDCT64_PART2_END's 6-arg variant.
; Loops until tmp1q and tmp2q (moved toward each other by
; .main_part2_internal) meet. Expects m11-m13 rotation-constant pairs and
; m14 = pw_2896x8, loaded here; m15 = pd_2048 from the caller.
.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
sub rax, o_idct64_offset + 8 ; undo the idct64-table base shift
vpbroadcastd m11, [o(pw_1567_3784)]
vpbroadcastd m12, [o(pw_m3784_1567)]
vpbroadcastd m13, [o(pw_m1567_m3784)]
vpbroadcastd m14, [o(pw_2896x8)]
.main_part2_pass1_loop:
call .main_part2_internal
REPX {pmulhrsw x, m14}, m1, m2, m4, m3 ; 1/sqrt(2) scaling of the middle terms
IDCT64_PART2_END 0, 7, 0, 6, 9, 10
IDCT64_PART2_END 7, 8, 5, 0, 6, 7
IDCT64_PART2_END 8, 2, 1, 0, 6, 7
IDCT64_PART2_END 15, 3, 4, 0, 6, 7
cmp tmp1q, tmp2q
jne .main_part2_pass1_loop
ret
; One iteration of steps 6-9: loads one t32/t39/t40/t47/t48/t55/t56/t63
; group, advances tmp1q/tmp2q toward each other, and leaves the eight
; combined values in m0-m8 for the PART2_END macros above.
.main_part2_internal:
mova m0, [tmp1q-32*12] ; t32a
mova m1, [tmp2q-32*13] ; t39a
mova m2, [tmp1q-32* 4] ; t40a
mova m5, [tmp2q+32* 3] ; t55a
add tmp1q, 32
sub tmp2q, 32
mova m4, [tmp1q+32* 3] ; t48a
mova m3, [tmp2q-32* 4] ; t47a
mova m6, [tmp1q+32*11] ; t56a
mova m7, [tmp2q+32*12] ; t63a
psubw m8, m0, m1 ; t39
paddw m0, m1 ; t32
psubw m1, m3, m2 ; t40
paddw m3, m2 ; t47
psubw m2, m4, m5 ; t55
paddw m4, m5 ; t48
psubw m5, m7, m6 ; t56
paddw m7, m6 ; t63
ITX_MULSUB_2W 5, 8, 6, 9, 15, 11, 12 ; t39a, t56a
ITX_MULSUB_2W 2, 1, 6, 9, 15, 12, 13 ; t40a, t55a
psubw m6, m0, m3 ; t47a
paddw m0, m3 ; t32a
psubw m3, m7, m4 ; t48a
paddw m7, m4 ; t63a
psubw m4, m5, m2 ; t40
paddw m5, m2 ; t39
psubw m2, m8, m1 ; t55
paddw m8, m1 ; t56
psubw m1, m2, m4 ; t40a
paddw m2, m4 ; t55a
psubw m4, m3, m6 ; t47
paddw m3, m6 ; t48
ret
; Pass-2 counterpart of .main_part2_pass1: same loop, but m14 holds the
; final pw_2048 rounding and IDCT64_PART2_END's 10-arg form adds the
; results to the pixels. r9/r3/r7/r8 cache stride multiples for the
; offset arguments; dstq walks down from the top while r2 (dst+7*stride)
; walks up, one row pair per iteration.
.main_part2_pass2:
sub rax, o_idct64_offset + 8 ; undo the idct64-table base shift
vpbroadcastd m11, [o(pw_1567_3784)]
vpbroadcastd m12, [o(pw_m3784_1567)]
vpbroadcastd m13, [o(pw_m1567_m3784)]
vpbroadcastd m14, [o(pw_2048)]
lea r9, [strideq*5] ; stride*5
lea r3, [r9+strideq*1] ; stride*6
lea r7, [r9+strideq*2] ; stride*7
lea r8, [r3+strideq*2] ; stride*8
lea r2, [dstq+r7] ; bottom-anchored row pointer
.main_part2_pass2_loop:
call .main_part2_internal
vpbroadcastd m10, [o(pw_2896x8)] ; reloaded each pass; m10 is clobbered above
REPX {pmulhrsw x, m10}, m1, m2, m4, m3
IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8
IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
add dstq, strideq
sub r2, strideq
cmp tmp1q, tmp2q
jne .main_part2_pass2_loop
ret
; 64x16 inverse DCT-DCT add. Pass 1 does the 64-point columns in pieces:
; idct16 on rows 0,4,8,..., the idct32 odd half (fast) on rows 2,6,...,
; and two idct64 .main_part1 groups on the odd rows, combined by
; .main_part2_pass1 into the temp area. Pass 2 then runs four 16-column
; row-transform tiles, writing 16 rows each.
cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jnz .normal
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
mov r2d, 16
; dc-only writer for 64-wide blocks: adds the broadcast dc value to one
; 64-pixel row (two ymm loads/stores) per iteration, r2d rows total.
; Also jumped to by other 64-wide dconly paths.
.dconly:
pmulhrsw xm0, xm2
movd xm2, [o(pw_2048)]
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
pxor m1, m1
.dconly_loop:
mova m2, [dstq+32*0]
mova m3, [dstq+32*1]
punpckhbw m4, m2, m1
punpcklbw m2, m1
punpckhbw m5, m3, m1
punpcklbw m3, m1
paddw m4, m0
paddw m2, m0
paddw m5, m0
paddw m3, m0
packuswb m2, m4
packuswb m3, m5
mova [dstq+32*0], m2
mova [dstq+32*1], m3
add dstq, strideq
dec r2d
jg .dconly_loop
RET
.normal:
PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
; rows 0,4,8,... -> idct16 (upper half of its input is zero)
LOAD_8ROWS cq+32*0, 32*4
pxor m8, m8
REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
lea tmp1q, [rsp+32*7]
call m(idct_16x16_internal).main
mova m1, [rsp+32*1]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m1
mova [tmp1q-32*2], m2
mova [tmp1q-32*1], m3
mova [tmp1q+32*0], m4
mova [tmp1q+32*1], m5
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
add tmp1q, 32*8
mova [tmp1q-32*4], m8
mova [tmp1q-32*3], m9
mova [tmp1q-32*2], m10
mova [tmp1q-32*1], m11
mova [tmp1q+32*0], m12
mova [tmp1q+32*1], m13
mova [tmp1q+32*2], m14
mova [tmp1q+32*3], m15
; rows 2,6,10,... -> idct32 odd half
LOAD_8ROWS cq+32*2, 32*4
pxor m8, m8
REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
add tmp1q, 32*8
lea tmp2q, [tmp1q+32*8]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
vpbroadcastd m15, [o(pd_2048)]
add tmp1q, 32*16
add tmp2q, 32*32
; odd rows, idct64 group 1: in1/31/17/15/9/23/25/7
mova m0, [cq+32* 1]
mova m1, [cq+32*31]
mova m2, [cq+32*17]
mova m3, [cq+32*15]
mova m4, [cq+32* 9]
mova m5, [cq+32*23]
mova m6, [cq+32*25]
mova m7, [cq+32* 7]
pxor m8, m8
REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
add rax, o_idct64_offset ; switch constant base to the idct64 table
call m(inv_txfm_add_dct_dct_16x64).main_part1
add rax, 8
add tmp1q, 32*8
sub tmp2q, 32*8
; odd rows, idct64 group 2: in5/27/21/11/13/19/29/3
mova m0, [cq+32* 5]
mova m1, [cq+32*27]
mova m2, [cq+32*21]
mova m3, [cq+32*11]
mova m4, [cq+32*13]
mova m5, [cq+32*19]
mova m6, [cq+32*29]
mova m7, [cq+32* 3]
pxor m8, m8
REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
sub tmp1q, 32*36
lea r2, [strideq*3]
mov tmp2d, 4 ; four 16-column pass-2 tiles
.pass2_loop:
; interleave two pass-1 row groups (r3 = low halves, tmp1q = high)
lea r3, [tmp1q-32*8]
mova xm0, [r3 -32*4]
mova xm1, [r3 -32*3]
vinserti128 m0, m0, [tmp1q-32*4], 1
vinserti128 m1, m1, [tmp1q-32*3], 1
mova xm2, [r3 -32*2]
mova xm3, [r3 -32*1]
vinserti128 m2, m2, [tmp1q-32*2], 1
vinserti128 m3, m3, [tmp1q-32*1], 1
mova xm4, [r3 +32*0]
mova xm5, [r3 +32*1]
vinserti128 m4, m4, [tmp1q+32*0], 1
vinserti128 m5, m5, [tmp1q+32*1], 1
mova xm6, [r3 +32*2]
mova xm7, [r3 +32*3]
vinserti128 m6, m6, [tmp1q+32*2], 1
vinserti128 m7, m7, [tmp1q+32*3], 1
mova xm8, [r3 -32*4+16]
mova xm9, [r3 -32*3+16]
vinserti128 m8, m8, [tmp1q-32*4+16], 1
vinserti128 m9, m9, [tmp1q-32*3+16], 1
mova xm10, [r3 -32*2+16]
mova xm11, [r3 -32*1+16]
vinserti128 m10, m10, [tmp1q-32*2+16], 1
vinserti128 m11, m11, [tmp1q-32*1+16], 1
mova xm12, [r3 +32*0+16]
mova xm13, [r3 +32*1+16]
vinserti128 m12, m12, [tmp1q+32*0+16], 1
vinserti128 m13, m13, [tmp1q+32*1+16], 1
mova xm14, [r3 +32*2+16]
mova xm15, [r3 +32*3+16]
vinserti128 m14, m14, [tmp1q+32*2+16], 1
vinserti128 m15, m15, [tmp1q+32*3+16], 1
mova [rsp+32*0], m6
mova [rsp+32*1], m7
vpbroadcastd m7, [o(pw_8192)]
call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
call m(idct_16x16_internal).main
mova [rsp+32*0], m15
vpbroadcastd m15, [o(pw_2048)]
REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
WRITE_16X2 2, 3, 1, 2, strideq*2, r2
pmulhrsw m1, m15, [rsp+32*1]
WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
lea r3, [dstq+strideq*4]
; NOTE: assemble-time alias. From here to the end of the loop body the
; WRITE_16X2 expansions address through r3 (advanced every 4 rows),
; while r0 keeps the tile origin (bumped by 16 below). The first
; WRITE_16X2s above textually precede this %define, so they still use
; r0 on every runtime iteration.
%define dstq r3
WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
WRITE_16X2 6, 7, 2, 3, strideq*2, r2
REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
lea r3, [r3+strideq*4]
WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
WRITE_16X2 10, 11, 2, 3, strideq*2, r2
pmulhrsw m15, [rsp+32*0]
lea r3, [r3+strideq*4]
WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
WRITE_16X2 14, 15, 2, 3, strideq*2, r2
add tmp1q, 32*16
add r0, 16 ; next 16-pixel column tile (r0 = real dst)
dec tmp2d
jg .pass2_loop
RET
;-----------------------------------------------------------------------
; 32x64 inverse DCT-DCT add (AVX2).
; In: dst, stride, coeff buffer (cq), eob. Coefficients are consumed and
; zeroed. Pass 1 runs a 32-pt idct over two 16-column groups; pass 2 runs
; the 64-pt idct down the columns and adds the result to dst.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]                          ; rax = base for o() addressing
test eobd, eobd
jnz .normal
; DC-only path (eob == 0): scale the DC value and broadcast it over the
; block via the shared 32-wide dconly writer.
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]                    ; rounding constant for .dconly
mov [cq], eobd                             ; eobd == 0 here: clears the DC
pmulhrsw xm0, xm1                          ; second 2896/32768 scale --
                                           ; presumably the rectangular
                                           ; (1:2) sqrt(2) compensation
mov r2d, 64                                ; 64 output rows
jmp m(inv_txfm_add_dct_dct_32x8).dconly
.normal:
PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
lea tmp1q, [rsp+32*7]
lea r10d, [eobq-136]
sar r10d, 31                               ; r10d = -1 if eob < 136, else 0
.pass1_loop:
lea tmp2q, [tmp1q+32*16]
LOAD_8ROWS cq+64*1, 64*2, 1                ; odd rows 1, 3, ..., 15
pxor m8, m8
REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
test r10b, r10b
jnz .fast                                  ; eob < 136: rows 16-31 are zero
LOAD_8ROWS_H cq+64*17, 64*2, 2             ; odd rows 17, 19, ..., 31
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
LOAD_8ROWS_H cq+64*16, 64*2, 1             ; even rows 16, 18, ..., 30
mova [rsp], m15
pxor m15, m15
REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
24, 25, 26, 27, 28, 29, 30, 31
jmp .idct16
.fast:
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast ; zero-upper variant
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
.idct16:
LOAD_8ROWS cq+64*0, 64*2, 1                ; even rows 0, 2, ..., 14
pxor m15, m15
REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
call m(idct_16x16_internal).main
call m(inv_txfm_add_dct_dct_32x16).pass1_end
vpbroadcastd m7, [o(pw_16384)]             ; pass-1 rounding factor
call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
lea r3, [tmp1q+32*48]
mova m15, [rsp]
; Store even-index registers at r3 and odd-index ones at r3+32*24 so pass 2
; can pick up each half separately.
mova [r3-32*4], m0
mova [r3-32*3], m2
mova [r3-32*2], m4
mova [r3-32*1], m6
mova [r3+32*0], m8
mova [r3+32*1], m10
mova [r3+32*2], m12
mova [r3+32*3], m14
add r3, 32*24
mova [r3-32*4], m1
mova [r3-32*3], m3
mova [r3-32*2], m5
mova [r3-32*1], m7
mova [r3+32*0], m9
mova [r3+32*1], m11
mova [r3+32*2], m13
mova [r3+32*3], m15
; Scale and transpose the two 8x8 quadrants buffered at tmp1q/tmp2q,
; interleaving loads with stores to hide latency.
vpbroadcastd m9, [o(pw_16384)]
pmulhrsw m0, m9, [tmp1q-32*4]
pmulhrsw m1, m9, [tmp1q-32*3]
pmulhrsw m2, m9, [tmp1q-32*2]
pmulhrsw m3, m9, [tmp1q-32*1]
pmulhrsw m4, m9, [tmp1q+32*0]
pmulhrsw m5, m9, [tmp1q+32*1]
pmulhrsw m6, m9, [tmp1q+32*2]
pmulhrsw m7, m9, [tmp1q+32*3]
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
mova [tmp1q-32*4], m0
pmulhrsw m0, m9, [tmp2q-32*4]
mova [tmp2q-32*4], m1
pmulhrsw m1, m9, [tmp2q-32*3]
mova [tmp1q-32*3], m2
pmulhrsw m2, m9, [tmp2q-32*2]
mova [tmp2q-32*3], m3
pmulhrsw m3, m9, [tmp2q-32*1]
mova [tmp1q-32*2], m4
pmulhrsw m4, m9, [tmp2q+32*0]
mova [tmp2q-32*2], m5
pmulhrsw m5, m9, [tmp2q+32*1]
mova [tmp1q-32*1], m6
pmulhrsw m6, m9, [tmp2q+32*2]
mova [tmp2q-32*1], m7
pmulhrsw m7, m9, [tmp2q+32*3]
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
mova [tmp1q+32*0], m0
mova [tmp2q+32*0], m1
mova [tmp1q+32*1], m2
mova [tmp2q+32*1], m3
mova [tmp1q+32*2], m4
mova [tmp2q+32*2], m5
mova [tmp1q+32*3], m6
mova [tmp2q+32*3], m7
add cq, 32                                 ; next 16-column group
add tmp1q, 32*8
add r10d, 0x80000000                       ; carry-driven loop: 2 iterations
jnc .pass1_loop                            ; when eob >= 136, else just 1
lea r2, [rsp+32*55]                        ; even-row buffer for pass 2
lea r7, [r2+32*24]                         ; odd-row buffer
.pass2_loop:
lea r3, [r2+32*8]
lea r8, [r7+32*8]
mova m0, [r2-32*4]
mova m1, [r2-32*2]
mova m2, [r2+32*0]
mova m3, [r2+32*2]
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
test r10b, r10b                            ; fast flag survives in low byte
jnz .fast2
mova m4, [r3-32*4]
mova m5, [r3-32*2]
mova m6, [r3+32*0]
mova m7, [r3+32*2]
.fast2:
mova [rsp], m8
lea tmp1q, [rsp+32*39]
call m(idct_16x16_internal).main           ; 16-pt even part of 64-pt idct
mova m1, [rsp+32*1]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m1
mova [tmp1q-32*2], m2
mova [tmp1q-32*1], m3
mova [tmp1q+32*0], m4
mova [tmp1q+32*1], m5
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
add tmp1q, 32*8
mova [tmp1q-32*4], m8
mova [tmp1q-32*3], m9
mova [tmp1q-32*2], m10
mova [tmp1q-32*1], m11
mova [tmp1q+32*0], m12
mova [tmp1q+32*1], m13
mova [tmp1q+32*2], m14
mova [tmp1q+32*3], m15
mova m0, [r2-32*3]
mova m1, [r2-32*1]
mova m2, [r2+32*1]
mova m3, [r2+32*3]
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
test r10b, r10b
jnz .fast3
mova m4, [r3-32*3]
mova m5, [r3-32*1]
mova m6, [r3+32*1]
mova m7, [r3+32*3]
.fast3:
add tmp1q, 32*8
lea tmp2q, [tmp1q+32*8]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
vpbroadcastd m15, [o(pd_2048)]             ; rounding for main_part1
add tmp1q, 32*16
add tmp2q, 32*32
mova m0, [r7-32*4]
mova m3, [r7+32*3]
mova m4, [r7+32*0]
mova m7, [r7-32*1]
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r10b, r10b
jnz .fast4
mova m1, [r8+32*3]
mova m2, [r8-32*4]
mova m5, [r8-32*1]
mova m6, [r8+32*0]
.fast4:
add rax, o_idct64_offset                   ; switch to idct64 constant table
call m(inv_txfm_add_dct_dct_16x64).main_part1
add rax, 8
add tmp1q, 32*8
sub tmp2q, 32*8
mova m0, [r7-32*2]
mova m3, [r7+32*1]
mova m4, [r7+32*2]
mova m7, [r7-32*3]
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r10b, r10b
jnz .fast5
mova m1, [r8+32*1]
mova m2, [r8-32*2]
mova m5, [r8-32*3]
mova m6, [r8+32*2]
.fast5:
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 ; rounds + adds to dst
add r10d, 0x80000000                       ; 2 column groups; carry set on
jc .ret                                    ; the second pass -> done
lea r2, [rsp+32*7]                         ; buffers for the second group
lea r7, [r2+32*16]
sub dstq, r8                               ; NOTE(review): r8 appears to be
                                           ; repurposed inside
                                           ; main_part2_pass2 -- confirm
lea dstq, [dstq+strideq*4+16]              ; next group: 4 rows down,
                                           ; 16 px right
jmp .pass2_loop
.ret:
RET
;-----------------------------------------------------------------------
; 64x32 inverse DCT-DCT add (AVX2).
; Pass 1 runs the 64-pt idct over the rows (with a 2896/32768 pre-scale of
; the inputs -- presumably the rectangular sqrt(2) compensation), pass 2
; runs a 32-pt idct down the columns in four 16-column strips.
; Includes the .transpose_round_interleave helper shared with 64x64.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]                          ; rax = base for o() addressing
test eobd, eobd
jnz .normal
; DC-only path: double 2896 scale, then the shared 64-wide dconly writer.
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]                    ; rounding constant for .dconly
mov [cq], eobd                             ; eobd == 0: clears the DC
pmulhrsw xm0, xm1
mov r2d, 32                                ; 32 output rows
jmp m(inv_txfm_add_dct_dct_64x16).dconly
.normal:
PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
base, tmp3, tmp4
lea tmp1q, [rsp+32*7]
lea tmp4d, [eobq-136]                      ; negative (bit 30 set) if
                                           ; eob < 136 -> fast paths
.pass1_loop:
LOAD_8ROWS cq+64*0, 64*4, 1                ; rows 0, 4, ..., 28
pxor m8, m8
REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal).main           ; 16-pt even part
mova m1, [rsp+32*1]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m1
mova [tmp1q-32*2], m2
mova [tmp1q-32*1], m3
mova [tmp1q+32*0], m4
mova [tmp1q+32*1], m5
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
add tmp1q, 32*8
mova [tmp1q-32*4], m8
mova [tmp1q-32*3], m9
mova [tmp1q-32*2], m10
mova [tmp1q-32*1], m11
mova [tmp1q+32*0], m12
mova [tmp1q+32*1], m13
mova [tmp1q+32*2], m14
mova [tmp1q+32*3], m15
LOAD_8ROWS cq+64*2, 64*4, 1                ; rows 2, 6, ..., 30
pxor m8, m8
REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
add tmp1q, 32*8
lea tmp2q, [tmp1q+32*8]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
vpbroadcastd m15, [o(pd_2048)]             ; rounding for main_part1
add tmp1q, 32*16
add tmp2q, 32*32
; Odd rows, pre-scaled by 2896/32768 while loading; input order is what
; main_part1 expects.
vpbroadcastd m7, [o(pw_2896x8)]
pmulhrsw m0, m7, [cq+64* 1]
pmulhrsw m1, m7, [cq+64*31]
pmulhrsw m2, m7, [cq+64*17]
pmulhrsw m3, m7, [cq+64*15]
pmulhrsw m4, m7, [cq+64* 9]
pmulhrsw m5, m7, [cq+64*23]
pmulhrsw m6, m7, [cq+64*25]
pmulhrsw m7, [cq+64* 7]
pxor m8, m8
REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
add rax, o_idct64_offset                   ; switch to idct64 constant table
call m(inv_txfm_add_dct_dct_16x64).main_part1
vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] ; rax already moved by
                                           ; o_idct64_offset; compensate in
                                           ; the displacement
add rax, 8
add tmp1q, 32*8
sub tmp2q, 32*8
pmulhrsw m0, m7, [cq+64* 5]
pmulhrsw m1, m7, [cq+64*27]
pmulhrsw m2, m7, [cq+64*21]
pmulhrsw m3, m7, [cq+64*11]
pmulhrsw m4, m7, [cq+64*13]
pmulhrsw m5, m7, [cq+64*19]
pmulhrsw m6, m7, [cq+64*29]
pmulhrsw m7, [cq+64* 3]
pxor m8, m8
REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
sub tmp1q, 32*44                           ; rewind to start of row buffer
vpbroadcastd m10, [o(pw_16384)]            ; scale factor for the transpose
call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
add cq, 32                                 ; next 16-column group
add tmp4d, 0x80000000                      ; carry-driven loop: 2 iterations
jnc .pass1_loop                            ; when eob >= 136, else just 1
lea tmp1q, [rsp+32*15]
imul r2, strideq, 19
lea r3, [strideq*3]
add r2, dstq                               ; r2 = dst + 19 rows; used by
                                           ; pass2_end (NOTE(review):
                                           ; presumably the mirrored bottom
                                           ; half -- confirm)
mov tmp4b, 4                               ; strip counter in the low byte;
                                           ; fast flag stays in bit 30
.pass2_loop:
lea tmp2q, [tmp1q+32*64]
LOAD_8ROWS tmp1q-32*4, 32
test tmp4d, 0x40000000                     ; bit 30: eob < 136 -> fast
jnz .fast
LOAD_8ROWS_H tmp2q-32*4, 32
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
lea tmp3q, [tmp2q-32*8]
LOAD_8ROWS_H tmp3q-32*4, 32
mova [rsp], m15
jmp .idct16
.fast:
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
.idct16:
lea tmp3q, [tmp1q-32*8]
LOAD_8ROWS tmp3q-32*4, 32
call m(idct_16x16_internal).main
call m(inv_txfm_add_dct_dct_16x32).pass2_end ; round, add + store the strip
add tmp1q, 32*16
sub dstq, r3                               ; step back 3 rows (pass2_end
                                           ; presumably leaves dstq
                                           ; advanced -- confirm)
lea r2, [r2+r3+16]                         ; move both write pointers
add dstq, 16                               ; 16 px right for the next strip
dec tmp4b
jg .pass2_loop
RET
ALIGN function_align
; Scale-and-transpose helper, also used by 64x64:
; In:  tmp1q = tile buffer, m10 = pw_* scale factor (caller-supplied:
;      pw_16384 for 64x32, pw_8192 for 64x64)
; Processes 4 iterations; each reads 16 rows split between tmp1q and
; tmp1q+32*8, transposes both 16-byte halves via transpose8x8, and writes
; the results back interleaved. Advances tmp1q by 32*16 per iteration.
; Clobbers m0-m9, tmp2q, tmp3d.
.transpose_round_interleave:
mov tmp3d, 4
.loop:
lea tmp2q, [tmp1q+32*8]
mova xm0, [tmp1q-32*4]
mova xm1, [tmp1q-32*3]
vinserti128 m0, m0, [tmp2q-32*4], 1
vinserti128 m1, m1, [tmp2q-32*3], 1
mova xm2, [tmp1q-32*2]
mova xm3, [tmp1q-32*1]
vinserti128 m2, m2, [tmp2q-32*2], 1
vinserti128 m3, m3, [tmp2q-32*1], 1
mova xm4, [tmp1q+32*0]
mova xm5, [tmp1q+32*1]
vinserti128 m4, m4, [tmp2q+32*0], 1
vinserti128 m5, m5, [tmp2q+32*1], 1
mova xm6, [tmp1q+32*2]
mova xm7, [tmp1q+32*3]
vinserti128 m6, m6, [tmp2q+32*2], 1
vinserti128 m7, m7, [tmp2q+32*3], 1
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
; Load the high 16-byte halves while storing the finished low halves to
; overlap memory traffic.
mova xm8, [tmp1q-32*4+16]
mova xm9, [tmp1q-32*3+16]
vinserti128 m8, m8, [tmp2q-32*4+16], 1
vinserti128 m9, m9, [tmp2q-32*3+16], 1
mova [tmp1q-32*4], m0
mova [tmp2q-32*4], m1
mova [tmp1q-32*3], m2
mova [tmp2q-32*3], m3
mova xm2, [tmp1q-32*2+16]
mova xm3, [tmp1q-32*1+16]
vinserti128 m2, m2, [tmp2q-32*2+16], 1
vinserti128 m3, m3, [tmp2q-32*1+16], 1
mova [tmp1q-32*2], m4
mova [tmp2q-32*2], m5
mova [tmp1q-32*1], m6
mova [tmp2q-32*1], m7
mova xm4, [tmp1q+32*0+16]
mova xm5, [tmp1q+32*1+16]
vinserti128 m4, m4, [tmp2q+32*0+16], 1
vinserti128 m5, m5, [tmp2q+32*1+16], 1
mova xm6, [tmp1q+32*2+16]
mova xm7, [tmp1q+32*3+16]
vinserti128 m6, m6, [tmp2q+32*2+16], 1
vinserti128 m7, m7, [tmp2q+32*3+16], 1
pmulhrsw m0, m8, m10
pmulhrsw m1, m9, m10
REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
mova [tmp1q+32*0], m0
mova [tmp2q+32*0], m1
mova [tmp1q+32*1], m2
mova [tmp2q+32*1], m3
mova [tmp1q+32*2], m4
mova [tmp2q+32*2], m5
mova [tmp1q+32*3], m6
mova [tmp2q+32*3], m7
add tmp1q, 32*16
dec tmp3d
jg .loop
ret
;-----------------------------------------------------------------------
; 64x64 inverse DCT-DCT add (AVX2).
; Square transform: no rectangular pre-scale; pass 1 runs the 64-pt idct on
; two 16-column groups of rows (only the first 32 row/column coefficients
; can be nonzero), pass 2 runs the 64-pt idct down the columns in four
; 16-px strips.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]                          ; rax = base for o() addressing
test eobd, eobd
jnz .normal
; DC-only path: single 2896 scale (square transform), 64 output rows.
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]                     ; rounding constant for .dconly
mov [cq], eobd                             ; eobd == 0: clears the DC
mov r2d, 64
jmp m(inv_txfm_add_dct_dct_64x16).dconly
.normal:
PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
lea tmp1q, [rsp+32*71]
lea r10d, [eobq-136]                       ; negative (bit 30 set) when
                                           ; eob < 136 -> fast paths
.pass1_loop:
LOAD_8ROWS cq+64*0, 64*4                   ; rows 0, 4, ..., 28
pxor m8, m8
REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal).main           ; 16-pt even part
mova m1, [rsp+32*1]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m1
mova [tmp1q-32*2], m2
mova [tmp1q-32*1], m3
mova [tmp1q+32*0], m4
mova [tmp1q+32*1], m5
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
add tmp1q, 32*8
mova [tmp1q-32*4], m8
mova [tmp1q-32*3], m9
mova [tmp1q-32*2], m10
mova [tmp1q-32*1], m11
mova [tmp1q+32*0], m12
mova [tmp1q+32*1], m13
mova [tmp1q+32*2], m14
mova [tmp1q+32*3], m15
LOAD_8ROWS cq+64*2, 64*4                   ; rows 2, 6, ..., 30
pxor m8, m8
REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
add tmp1q, 32*8
lea tmp2q, [tmp1q+32*8]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
vpbroadcastd m15, [o(pd_2048)]             ; rounding for main_part1
add tmp1q, 32*16
add tmp2q, 32*32
; Odd coefficient rows, in the order main_part1 expects.
mova m0, [cq+64* 1]
mova m1, [cq+64*31]
mova m2, [cq+64*17]
mova m3, [cq+64*15]
mova m4, [cq+64* 9]
mova m5, [cq+64*23]
mova m6, [cq+64*25]
mova m7, [cq+64* 7]
pxor m8, m8
REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
add rax, o_idct64_offset                   ; switch to idct64 constant table
call m(inv_txfm_add_dct_dct_16x64).main_part1
add rax, 8
add tmp1q, 32*8
sub tmp2q, 32*8
mova m0, [cq+64* 5]
mova m1, [cq+64*27]
mova m2, [cq+64*21]
mova m3, [cq+64*11]
mova m4, [cq+64*13]
mova m5, [cq+64*19]
mova m6, [cq+64*29]
mova m7, [cq+64* 3]
pxor m8, m8
REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
sub tmp1q, 32*44                           ; rewind to start of row buffer
vpbroadcastd m10, [o(pw_8192)]             ; scale factor for the transpose
call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
add cq, 32                                 ; next 16-column group
add r10d, 0x80000000                       ; carry-driven loop: 2 iterations
jnc .pass1_loop                            ; when eob >= 136, else just 1
lea tmp1q, [rsp+32*7]
mov r10b, 4                                ; strip counter in the low byte;
                                           ; fast flag remains in bit 30
.pass2_loop:
lea r2, [tmp1q+32*64]                      ; first-half rows for this strip
mova m0, [r2-32*4]
mova m1, [r2-32*2]
mova m2, [r2+32*0]
mova m3, [r2+32*2]
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
mova [rsp], m4
test r10d, 0x40000000                      ; bit 30: eob < 136 -> rows
jnz .fast                                  ; 16-31 are all zero
lea r3, [r2+32*64]                         ; second-half rows
mova m4, [r3-32*4]
mova m5, [r3-32*2]
mova m6, [r3+32*0]
mova m7, [r3+32*2]
.fast:
call m(idct_16x16_internal).main           ; 16-pt even part of 64-pt idct
mova m1, [rsp+32*1]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m1
mova [tmp1q-32*2], m2
mova [tmp1q-32*1], m3
mova [tmp1q+32*0], m4
mova [tmp1q+32*1], m5
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
add tmp1q, 32*8
mova [tmp1q-32*4], m8
mova [tmp1q-32*3], m9
mova [tmp1q-32*2], m10
mova [tmp1q-32*1], m11
mova [tmp1q+32*0], m12
mova [tmp1q+32*1], m13
mova [tmp1q+32*2], m14
mova [tmp1q+32*3], m15
mova m0, [r2-32*3]
mova m1, [r2-32*1]
mova m2, [r2+32*1]
mova m3, [r2+32*3]
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
test r10d, 0x40000000
jnz .fast2
mova m4, [r3-32*3]
mova m5, [r3-32*1]
mova m6, [r3+32*1]
mova m7, [r3+32*3]
.fast2:
add tmp1q, 32*8
lea tmp2q, [tmp1q+32*8]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
vpbroadcastd m15, [o(pd_2048)]             ; rounding for main_part1
add r2, 32*8
add r3, 32*8
add tmp1q, 32*16
add tmp2q, 32*32
mova m0, [r2-32*4] ; 1
mova m3, [r2+32*3] ; 15
mova m4, [r2+32*0] ; 9
mova m7, [r2-32*1] ; 7
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r10d, 0x40000000
jnz .fast3
mova m1, [r3+32*3] ; 31
mova m2, [r3-32*4] ; 17
mova m5, [r3-32*1] ; 23
mova m6, [r3+32*0] ; 25
.fast3:
add rax, o_idct64_offset                   ; switch to idct64 constant table
call m(inv_txfm_add_dct_dct_16x64).main_part1
add rax, 8
add tmp1q, 32*8
sub tmp2q, 32*8
mova m0, [r2-32*2] ; 5
mova m3, [r2+32*1] ; 11
mova m4, [r2+32*2] ; 13
mova m7, [r2-32*3] ; 3
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r10d, 0x40000000
jnz .fast4
mova m1, [r3+32*1] ; 27
mova m2, [r3-32*2] ; 21
mova m5, [r3-32*3] ; 19
mova m6, [r3+32*2] ; 29
.fast4:
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 ; rounds + adds to dst
sub tmp1q, 32*28
sub dstq, r8                               ; NOTE(review): r8 appears to be
                                           ; set inside main_part2_pass2 --
                                           ; confirm
lea dstq, [dstq+strideq*4+16]              ; next strip: 4 rows down,
                                           ; 16 px right
dec r10b
jg .pass2_loop
RET
%endif ; ARCH_X86_64