ref: a7ca7b225f71b78f1c67e7f095e00fe207fe69ef
parent: f16b43cdfa2f3f2d5af36185819bebf1ca9c806d
author: Henrik Gramner <gramner@twoorioles.com>
date: Tue Jan 7 19:44:15 EST 2020
x86: Fix SSSE3 inverse identity transform overflow/clipping
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -77,7 +77,6 @@
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_3344x8: times 8 dw 3344*8
-pw_5793x4: times 8 dw 5793*4
pw_8192: times 8 dw 8192
pw_m8192: times 8 dw -8192
pw_5: times 8 dw 5
@@ -276,7 +275,7 @@
mova m0, [o(pw_2896x8)]
pmulhrsw m0, [coeffq]
pmulhrsw m1, m0, [o(pw_1697x8)]
- paddw m0, m1
+ paddsw m0, m1
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
@@ -289,7 +288,7 @@
punpcklwd m0, m1
punpcklqdq m0, m0
pmulhrsw m1, m0, [o(pw_1697x8)]
- paddw m0, m1
+ paddsw m0, m1
pmulhrsw m0, [o(pw_2896x8)]
mova m1, m0
TAIL_CALL m(iadst_4x4_internal).end
@@ -439,8 +438,8 @@
mova m3, [o(pw_1697x8)]
pmulhrsw m2, m0, m3
pmulhrsw m3, m1
- paddw m0, m2
- paddw m1, m3
+ paddsw m0, m2
+ paddsw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2 ;high: in3 ;low :in2
@@ -451,8 +450,8 @@
mova m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
- paddw m0, m2
- paddw m1, m3
+ paddsw m0, m2
+ paddsw m1, m3
jmp m(iadst_4x4_internal).end
%macro IWHT4_1D_PACKED 0
@@ -611,7 +610,7 @@
punpckldq m0, m1
pmulhrsw m0, m2
pmulhrsw m1, m0, [o(pw_1697x8)]
- paddw m0, m1
+ paddsw m0, m1
pmulhrsw m0, m2
pmulhrsw m0, [o(pw_2048)]
punpcklqdq m0, m0
@@ -834,10 +833,10 @@
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
jmp m(iadst_4x8_internal).pass1_end
.pass2:
@@ -1641,10 +1640,10 @@
pmulhrsw m1, m2
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
- paddw m0, m0
- paddw m1, m1
- paddw m0, m2
- paddw m1, m3
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m0, m2
+ paddsw m1, m3
pmulhrsw m0, m4
pmulhrsw m4, m1
punpckhwd m2, m0, m0
@@ -1666,18 +1665,17 @@
punpcklwd m0, [coeffq+32*1]
movd m1, [coeffq+32*2]
punpcklwd m1, [coeffq+32*3]
- mova m3, [o(pw_16384)]
punpckldq m0, m1
pmulhrsw m1, m0, [o(pw_1697x8)]
- mova m4, [o(pw_2896x8)]
- paddw m0, m1
- pmulhrsw m0, m3
- psrlw m3, 3 ; pw_2048
- pmulhrsw m0, m4
- pmulhrsw m0, m3
+ pcmpeqw m2, m2
+ pcmpeqw m2, m0
+ pxor m0, m2
+ pavgw m0, m1
+ pmulhrsw m0, [o(pw_2896x8)]
+ pmulhrsw m0, [o(pw_2048)]
punpcklqdq m0, m0
- pxor m7, m7
- REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3
+ pxor m1, m1
+ REPX {mova [coeffq+32*x], m1}, 0, 1, 2, 3
%elifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
@@ -1886,16 +1884,59 @@
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
-%macro IDTX16 3 ; src/dst, tmp, pw_1697x16
- pmulhrsw m%2, m%3, m%1
- paddw m%1, m%1
- paddw m%1, m%2
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1 ; tmp = pmulhrsw(pw_1697x16, src)
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4 ; tmp = (tmp + 1) >> 1 when arg 4 holds pw_16384
+%else
+ paddsw m%1, m%1 ; src = 2*src, signed-saturating
+%endif
+ paddsw m%1, m%2 ; src += tmp, signed-saturating
%endmacro
cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
- lea r3, [o(m(iidentity_4x8_internal).pass1)]
- jmp m(idct_4x16_internal).pass1
-
+ mova m0, [coeffq+16*1]
+ mova m6, [o(pw_1697x8)]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ pcmpeqw m7, m7
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_2)]
+.pass1:
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+ pavgw m4, m0
+ pcmpeqw m0, m7
+ pavgw m5, m1
+ pcmpeqw m1, m7
+ pandn m0, m4
+ pmulhrsw m4, m6, m2
+ pandn m1, m5
+ pmulhrsw m5, m6, m3
+ pavgw m4, m2
+ pcmpeqw m2, m7
+ pavgw m5, m3
+ pcmpeqw m3, m7
+ pandn m2, m4
+ pandn m3, m5
+ jmp m(iadst_4x8_internal).pass1_end
+.pass1_2:
+ mova [coeffq+16*1], m0
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(.pass1_end)]
+ jmp .pass1
+.pass1_end:
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ jmp r3
.pass2:
mova m7, [o(pw_1697x16)]
mova [coeffq+16*6], m6
@@ -1905,14 +1946,13 @@
mova [coeffq+16*7], m6
mova m6, [coeffq+16*6]
pmulhrsw m7, m6, [o(pw_1697x16)]
- paddw m6, m6
- paddw m6, m7
-
+ paddsw m6, m6
+ paddsw m6, m7
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*4], m4
- jmp m(iadst_4x16_internal).end2
+ jmp m(iadst_4x16_internal).end2
%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
@@ -1925,7 +1965,7 @@
pmulhrsw m3, m0
psrlw m0, 3 ; pw_2048
pmulhrsw m1, m3, [o(pw_1697x8)]
- paddw m3, m1
+ paddsw m3, m1
pmulhrsw m3, m0
punpcklwd m3, m3
pshufd m0, m3, q0000
@@ -1957,7 +1997,7 @@
punpcklqdq m0, m1
pmulhrsw m1, m4, m0
pmulhrsw m1, m5
- paddw m0, m1
+ paddsw m0, m1
pmulhrsw m0, m6
pmulhrsw m0, m7
.end:
@@ -2434,9 +2474,9 @@
pmulhrsw m4, m7
pmulhrsw m3, m7
pmulhrsw m5, m7
- paddw m1, m4
- paddw m0, m3
- paddw m5, m2
+ paddsw m1, m4
+ paddsw m0, m3
+ paddsw m5, m2
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
mova m4, [coeffq+16*4]
@@ -2449,9 +2489,9 @@
pmulhrsw m0, m7
pmulhrsw m1, m7
pmulhrsw m5, m7
- paddw m2, m0
- paddw m3, m1
- paddw m4, m5
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m5
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
pmulhrsw m5, m6, m0
@@ -2458,8 +2498,8 @@
pmulhrsw m6, m1
pmulhrsw m5, m7
pmulhrsw m6, m7
- paddw m0, m5
- paddw m1, m6
+ paddsw m0, m5
+ paddsw m1, m6
mova m6, [coeffq+16*6]
mova m5, [coeffq+16*5]
punpckhwd m7, m0, m2 ;packed out1, out5
@@ -2521,8 +2561,8 @@
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
pmulhrsw m0, m7, [o(pw_1697x16)]
- paddw m7, m7
- paddw m7, m0
+ paddsw m7, m7
+ paddsw m7, m0
pmulhrsw m7, m1
punpcklwd m0, m7, m7
punpckhwd m7, m7
@@ -2858,7 +2898,7 @@
pmulhrsw m0, m4
pmulhrsw m1, m5, m0
pmulhrsw m1, m6
- paddw m0, m1
+ paddsw m0, m1
pmulhrsw m0, m4
pmulhrsw m0, m7
.end:
@@ -3359,35 +3399,35 @@
REPX {pmulhrsw x, m0}, m4, m5, m6, m7
pmulhrsw m1, m2, m4
pmulhrsw m1, m3
- paddw m1, m4 ; 1
+ paddsw m1, m4 ; 1
pmulhrsw m4, m2, m5
pmulhrsw m4, m3
- paddw m4, m5 ; 3
+ paddsw m4, m5 ; 3
pmulhrsw m5, m2, m6
pmulhrsw m5, m3
- paddw m5, m6 ; 5
+ paddsw m5, m6 ; 5
pmulhrsw m6, m2, m7
pmulhrsw m6, m3
- paddw m7, m6 ; 7
+ paddsw m7, m6 ; 7
pmulhrsw m6, m0, [coeffq+16*6]
mova [rsp+gprsize+16*0], m4
pmulhrsw m4, m2, m6
pmulhrsw m4, m3
- paddw m6, m4 ; 6
+ paddsw m6, m4 ; 6
pmulhrsw m4, m0, [coeffq+16*4]
mova [rsp+gprsize+16*1], m6
pmulhrsw m6, m2, m4
pmulhrsw m6, m3
- paddw m4, m6 ; 4
+ paddsw m4, m6 ; 4
pmulhrsw m6, m0, [coeffq+16*2]
pmulhrsw m0, [coeffq+16*0]
pmulhrsw m2, m6
pmulhrsw m2, m3
- paddw m2, m6 ; 2
+ paddsw m2, m6 ; 2
pmulhrsw m6, m0, [o(pw_1697x16)]
pmulhrsw m6, m3
mova m3, [rsp+gprsize+16*0]
- paddw m0, m6
+ paddsw m0, m6
jmp m(idct_8x8_internal).pass1_end3
.pass1_end:
@@ -3435,7 +3475,7 @@
pmulhrsw m2, m3, [coeffq+16*0]
pmulhrsw m3, [coeffq+16*1]
mova m0, [o(pw_8192)]
- mova m1, [o(pw_5793x4)]
+ mova m1, [o(pw_1697x16)]
pshuflw m4, [o(deint_shuf)], q0000 ;pb_0_1
punpcklwd m4, m4
pcmpeqb m5, m5
@@ -3446,10 +3486,12 @@
pmulhrsw m2, m0
pmulhrsw m3, m0
psrlw m0, 2 ;pw_2048
- psllw m2, 2
- psllw m3, 2
- pmulhrsw m2, m1
- pmulhrsw m3, m1
+ pmulhrsw m7, m1, m2
+ pmulhrsw m1, m3
+ paddsw m2, m2
+ paddsw m3, m3
+ paddsw m2, m7
+ paddsw m3, m1
pmulhrsw m2, m0
pmulhrsw m3, m0
mov r3d, 8
@@ -3477,9 +3519,8 @@
RET
%elifidn %1_%2, identity_dct
mova m4, [o(pw_1697x16)]
- mova m5, [o(pw_8192)]
- mova m6, [o(pw_2896x8)]
- psrlw m7, m5, 2 ;pw_2048
+ mova m5, [o(pw_2896x8)]
+ mova m6, [o(pw_2048)]
xor eobd, eobd
lea tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end)]
lea r3, [dstq+8]
@@ -3503,11 +3544,10 @@
punpckldq m1, m2
punpcklqdq m0, m1
pmulhrsw m1, m4, m0
- paddw m0, m0
- paddw m0, m1
+ psraw m1, 1
+ pavgw m0, m1
pmulhrsw m0, m5
pmulhrsw m0, m6
- pmulhrsw m0, m7
mov [coeffq+32*0], eobd
mov [coeffq+32*1], eobd
mov [coeffq+32*2], eobd
@@ -3814,6 +3854,12 @@
jmp m(iflipadst_8x8_internal).end
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+ pmulhrsw m%2, m%3, m%1 ; tmp = pmulhrsw(pw_1697x16, src)
+ psraw m%2, 1 ; tmp >>= 1 (arithmetic shift)
+ pavgw m%1, m%2 ; src = (src + tmp + 1) >> 1 via unsigned word average, so no 16-bit overflow
+%endmacro
+
INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity
@@ -3823,21 +3869,20 @@
lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)]
.pass1:
- mova m7, [o(pw_1697x16)]
- mova m6, [coeffq+32*7]
+ mova m6, [o(pw_1697x16)]
+ mova m7, [coeffq+32*6]
mova m0, [coeffq+32*0]
mova m1, [coeffq+32*1]
mova m2, [coeffq+32*2]
mova m3, [coeffq+32*3]
mova m4, [coeffq+32*4]
- REPX {IDTX16 x, 5, 7}, 6, 0, 1, 2, 3, 4
+ REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
mova m5, [coeffq+32*5]
- mova [rsp+gprsize+16*0], m6
- IDTX16 5, 6, 7
- mova m6, [coeffq+32*6]
- IDTX16 6, 7, 7
- mova m7, [o(pw_8192)]
- jmp m(idct_8x8_internal).pass1_end1
+ mova [rsp+gprsize+16*1], m7
+ IDTX16B 5, 7, 6
+ mova m7, [coeffq+32*7]
+ IDTX16B 7, 6, 6
+ jmp m(idct_8x8_internal).pass1_end3
.pass1_end:
SAVE_8ROWS coeffq, 32
@@ -4555,26 +4600,21 @@
LEA r5, $$
%endif
lea tx2q, [o(m(idct_32x8_internal).end8)]
-
.loop:
LOAD_8ROWS coeffq+16*0, 64
- paddw m6, [o(pw_5)]
+ paddsw m6, [o(pw_5)]
mova [rsp+16*1], m6
mova m6, [o(pw_5)]
- REPX {paddw x, m6}, m0, m1, m2, m3, m4, m5, m7
-
+ REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
call m(idct_8x8_internal).pass1_end3
- REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
-
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
mova [rsp+16*2], m5
mova [rsp+16*1], m6
mova [rsp+16*0], m7
call m(idct_8x8_internal).end3
lea dstq, [dstq+strideq*2]
-
pxor m7, m7
REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
-
add coeffq, 16
dec r3d
jg .loop
@@ -5024,31 +5064,39 @@
.loop:
LOAD_8ROWS coeffq, 64, 1
- REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
mova [rsp+16*1], m6
+ pxor m6, m6
+ REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
lea tx2q, [o(m(idct_32x16_internal).end)]
call m(idct_8x8_internal).pass1_end3
- pmulhrsw m7, [o(pw_5793x4)]
- paddw m7, [o(pw_5)]
- psraw m7, 3
+ mova [rsp+16*0], m2
+ mova [rsp+16*1], m3
+ mova [rsp+16*2], m4
+ mova m3, [o(pw_1697x16)]
+ mova m4, [o(pw_16384)]
+ REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
+ mova m2, [o(pw_8192)]
+ REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1
+ mova m2, [rsp+16*0]
mova [rsp+16*0], m7
- mova m7, [o(pw_5793x4)]
- REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
- mova m7, [o(pw_5)]
- REPX {paddw x, m7}, m0, m1, m2, m3, m4, m5, m6
- REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6
+ IDTX16 2, 7, 3, 4
+ mova m7, [rsp+16*2]
mova [rsp+16*2], m5
+ IDTX16 7, 5, 3, 4
+ mova m5, [rsp+16*1]
mova [rsp+16*1], m6
+ pmulhrsw m3, m5
+ pmulhrsw m3, m4
+ psrlw m4, 1 ; pw_8192
+ paddsw m3, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pmulhrsw m4, m7
call m(idct_8x8_internal).end3
lea dstq, [dstq+strideq*2]
-
- pxor m7, m7
- REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
-
add coeffq, 16
dec r3
jg .loop
-
mov coeffq, [rsp+gprsize*2+16*3]
add coeffq, 64*8
mov r3, [rsp+gprsize+16*3]
@@ -5057,7 +5105,6 @@
mov dstq, [rsp+16*3]
test r3, r3
jnz .loop
-
RET
@@ -5081,40 +5128,26 @@
.loop:
LOAD_8ROWS coeffq, 32, 1
- REPX {psllw x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
mova [rsp+16*1], m6
lea tx2q, [o(m(idct_32x16_internal).end)]
call m(idct_8x8_internal).pass1_end3
- mova [rsp+16*2], m5
- mova [rsp+16*1], m6
- mova m5, [o(pw_1697x8)]
- pmulhrsw m6, m5, m7
- paddw m7, m6
- pmulhrsw m6, m5, m0
- paddw m0, m6
- pmulhrsw m6, m5, m1
- paddw m1, m6
- pmulhrsw m6, m5, m2
- paddw m2, m6
- pmulhrsw m6, m5, m3
- paddw m3, m6
- pmulhrsw m6, m5, m4
+ mova [rsp+16*1], m5
+ mova [rsp+16*2], m6
+ mova m6, [o(pw_1697x16)]
+ REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
pmulhrsw m7, [o(pw_2048)]
- paddw m4, m6
- mova m6, [rsp+16*1]
+ mova m5, [rsp+16*1]
mova [rsp+16*0], m7
- pmulhrsw m7, m5, m6
- paddw m6, m7
+ IDTX16 5, 7, 6
mova m7, [rsp+16*2]
- pmulhrsw m5, m7
- paddw m5, m7
- mova m7, [o(pw_2048)]
- REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ IDTX16 7, 6, 6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
mova [rsp+16*2], m5
- mova [rsp+16*1], m6
+ mova [rsp+16*1], m7
call m(idct_8x8_internal).end3
lea dstq, [dstq+strideq*2]
-
pxor m7, m7
REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7