shithub: dav1d

Download patch

ref: 87a377e990f6cd1db9c214a6d33c15accefe4a32
parent: 6f2f0188f1efb12614164e356cf1f1027e4cbaaa
author: Liwei Wang <liwei@multicorewareinc.com>
date: Tue Dec 4 11:23:06 EST 2018

Add SSSE3 implementation for the 4x4 blocks in itx

Cycle times:
inv_txfm_add_4x4_adst_adst_0_8bpc_c: 445.9
inv_txfm_add_4x4_adst_adst_0_8bpc_ssse3: 23.7
inv_txfm_add_4x4_adst_adst_1_8bpc_c: 443.7
inv_txfm_add_4x4_adst_adst_1_8bpc_ssse3: 52.6
inv_txfm_add_4x4_adst_dct_0_8bpc_c: 474.5
inv_txfm_add_4x4_adst_dct_0_8bpc_ssse3: 23.9
inv_txfm_add_4x4_adst_dct_1_8bpc_c: 482.0
inv_txfm_add_4x4_adst_dct_1_8bpc_ssse3: 51.1
inv_txfm_add_4x4_adst_flipadst_0_8bpc_c: 587.2
inv_txfm_add_4x4_adst_flipadst_0_8bpc_ssse3: 24.0
inv_txfm_add_4x4_adst_flipadst_1_8bpc_c: 457.2
inv_txfm_add_4x4_adst_flipadst_1_8bpc_ssse3: 52.8
inv_txfm_add_4x4_adst_identity_0_8bpc_c: 412.4
inv_txfm_add_4x4_adst_identity_0_8bpc_ssse3: 43.3
inv_txfm_add_4x4_adst_identity_1_8bpc_c: 412.0
inv_txfm_add_4x4_adst_identity_1_8bpc_ssse3: 43.3
inv_txfm_add_4x4_dct_adst_0_8bpc_c: 467.4
inv_txfm_add_4x4_dct_adst_0_8bpc_ssse3: 23.2
inv_txfm_add_4x4_dct_adst_1_8bpc_c: 588.3
inv_txfm_add_4x4_dct_adst_1_8bpc_ssse3: 48.6
inv_txfm_add_4x4_dct_dct_0_8bpc_c: 611.5
inv_txfm_add_4x4_dct_dct_0_8bpc_ssse3: 23.1
inv_txfm_add_4x4_dct_dct_1_8bpc_c: 576.2
inv_txfm_add_4x4_dct_dct_1_8bpc_ssse3: 47.6
inv_txfm_add_4x4_dct_flipadst_0_8bpc_c: 479.5
inv_txfm_add_4x4_dct_flipadst_0_8bpc_ssse3: 23.4
inv_txfm_add_4x4_dct_flipadst_1_8bpc_c: 549.3
inv_txfm_add_4x4_dct_flipadst_1_8bpc_ssse3: 48.3
inv_txfm_add_4x4_dct_identity_0_8bpc_c: 576.9
inv_txfm_add_4x4_dct_identity_0_8bpc_ssse3: 25.4
inv_txfm_add_4x4_dct_identity_1_8bpc_c: 610.7
inv_txfm_add_4x4_dct_identity_1_8bpc_ssse3: 25.1
inv_txfm_add_4x4_flipadst_adst_0_8bpc_c: 532.8
inv_txfm_add_4x4_flipadst_adst_0_8bpc_ssse3: 23.8
inv_txfm_add_4x4_flipadst_adst_1_8bpc_c: 666.7
inv_txfm_add_4x4_flipadst_adst_1_8bpc_ssse3: 61.0
inv_txfm_add_4x4_flipadst_dct_0_8bpc_c: 539.6
inv_txfm_add_4x4_flipadst_dct_0_8bpc_ssse3: 23.8
inv_txfm_add_4x4_flipadst_dct_1_8bpc_c: 484.6
inv_txfm_add_4x4_flipadst_dct_1_8bpc_ssse3: 51.1
inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_c: 503.1
inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_ssse3: 23.9
inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_c: 463.0
inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_ssse3: 54.0
inv_txfm_add_4x4_flipadst_identity_0_8bpc_c: 719.9
inv_txfm_add_4x4_flipadst_identity_0_8bpc_ssse3: 43.0
inv_txfm_add_4x4_flipadst_identity_1_8bpc_c: 456.8
inv_txfm_add_4x4_flipadst_identity_1_8bpc_ssse3: 44.1
inv_txfm_add_4x4_identity_adst_0_8bpc_c: 422.8
inv_txfm_add_4x4_identity_adst_0_8bpc_ssse3: 42.4
inv_txfm_add_4x4_identity_adst_1_8bpc_c: 417.1
inv_txfm_add_4x4_identity_adst_1_8bpc_ssse3: 42.3
inv_txfm_add_4x4_identity_dct_0_8bpc_c: 435.4
inv_txfm_add_4x4_identity_dct_0_8bpc_ssse3: 25.7
inv_txfm_add_4x4_identity_dct_1_8bpc_c: 434.1
inv_txfm_add_4x4_identity_dct_1_8bpc_ssse3: 25.3
inv_txfm_add_4x4_identity_flipadst_0_8bpc_c: 528.1
inv_txfm_add_4x4_identity_flipadst_0_8bpc_ssse3: 40.9
inv_txfm_add_4x4_identity_flipadst_1_8bpc_c: 720.0
inv_txfm_add_4x4_identity_flipadst_1_8bpc_ssse3: 41.8
inv_txfm_add_4x4_identity_identity_0_8bpc_c: 383.2
inv_txfm_add_4x4_identity_identity_0_8bpc_ssse3: 28.3
inv_txfm_add_4x4_identity_identity_1_8bpc_c: 378.9
inv_txfm_add_4x4_identity_identity_1_8bpc_ssse3: 28.2
inv_txfm_add_4x4_wht_wht_0_8bpc_c: 271.5
inv_txfm_add_4x4_wht_wht_0_8bpc_ssse3: 34.0
inv_txfm_add_4x4_wht_wht_1_8bpc_c: 266.0
inv_txfm_add_4x4_wht_wht_1_8bpc_ssse3: 33.9

--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -77,7 +77,7 @@
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
 
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_4x4_ssse3);
+decl_itx17_fns(4, 4, ssse3);
 
 void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -115,8 +115,6 @@
     assign_itx16_fn(pfx, w, h, ext); \
     assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
 
-#define assign_itx_ssse3_fn_8b() \
-    c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_ssse3;
 
     const unsigned flags = dav1d_get_cpu_flags();
 
@@ -123,7 +121,7 @@
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH == 8
-    assign_itx_ssse3_fn_8b();
+    assign_itx17_fn(, 4, 4, ssse3);
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -35,9 +35,22 @@
 qw_1567_m3784:  times 4 dw  1567, -3784
 qw_3784_1567:   times 4 dw  3784,  1567
 
+qw_1321_3803:   times 4 dw  1321,  3803    ;pmaddwd pair: 1321 * in0 + 3803 * in2
+qw_2482_m1321:  times 4 dw  2482, -1321    ;pmaddwd pair: 2482 * in0 - 1321 * in2
+qw_3344_2482:   times 4 dw  3344,  2482    ;pmaddwd pair: 3344 * in1 + 2482 * in3
+qw_3344_m3803:  times 4 dw  3344, -3803    ;pmaddwd pair: 3344 * in1 - 3803 * in3
+qw_m6688_m3803: times 4 dw -6688, -3803    ;pmaddwd pair: -2 * 3344 * in1 - 3803 * in3
+qw_3344x8:      times 8 dw  3344*8         ;pmulhrsw by this gives (x * 3344 + 2048) >> 12
+qw_5793x4:      times 8 dw  5793*4         ;applied to 2*x via pmulhrsw: (x * 5793 + 2048) >> 12
+
 pd_2048:        times 4 dd  2048
 qw_2048:        times 8 dw  2048
 
+iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424    ;8 * {1321, 2482, 3344, 3803}; used when type1 is adst
+iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568    ;same factors reversed; used when type1 is flipadst
+iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856    ;8 * 1321 x4, 8 * 2482 x4; second-stage factors for m0
+iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424    ;8 * 3344 x4, 8 * 3803 x4; second-stage factors for m1
+
 SECTION .text
 
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
@@ -84,7 +97,7 @@
     psrlq                m0, 32
     movd       [%%row_adr4], m0                 ;store dst3 + out3
 
-  RET
+    ret
 %endmacro
 
 
@@ -126,6 +139,34 @@
     paddw                m0, m2              ;high: out1 ;low: out0
 %endmacro
 
+%macro IADST4_1D_PACKED 0 ; in: m0 = in0 in1, m1 = in2 in3 ;out: m0 = out0 out1, m1 low = out2, m2 = out3 out3 ;clobbers m3-m5
+    punpcklwd            m2, m0, m1                ;unpacked in0 in2
+    punpckhwd            m3, m0, m1                ;unpacked in1 in3
+    psubw                m0, m1                    ;low: in0 - in2 ;high: in1 - in3
+    punpckhqdq           m1, m1                    ;low: in3 ;high: in3
+    paddw                m1, m0                    ;low: in0 - in2 + in3
+
+    pmaddwd              m0, m2, [qw_1321_3803]    ;1321 * in0 + 3803 * in2
+    pmaddwd              m2, [qw_2482_m1321]       ;2482 * in0 - 1321 * in2
+    pmaddwd              m4, m3, [qw_3344_2482]    ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m3, [qw_3344_m3803]   ;3344 * in1 - 3803 * in3
+    paddd                m4, m0                    ;t0 + t3
+
+    pmaddwd              m3, [qw_m6688_m3803]      ;-2 * 3344 * in1 - 3803 * in3
+    pmulhrsw             m1, [qw_3344x8]           ;low: out2 = ((in0 - in2 + in3) * 3344 + 2048) >> 12
+    mova                 m0, [pd_2048]             ;rounding bias for the >> 12 below
+    paddd                m2, m0                    ;2482 * in0 - 1321 * in2 + 2048
+    paddd                m0, m4                    ;t0 + t3 + 2048
+    paddd                m5, m2                    ;t1 + t3 + 2048
+    paddd                m2, m4
+    paddd                m2, m3                    ;t0 + t1 - t3 + 2048
+
+    psrad                m0, 12                    ;out0
+    psrad                m5, 12                    ;out1
+    psrad                m2, 12                    ;out3
+    packssdw             m0, m5                    ;high: out1 ;low: out0
+    packssdw             m2, m2                    ;high: out3 ;low: out3
+%endmacro
 
 %macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
 cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
@@ -146,29 +187,57 @@
 
 %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
     INV_TXFM_FN          %1, %2, %3, 4x4
-%ifidn %1_%2, dct_identity
-%elifidn %1_%2, identity_dct
-%elif %3 >= 0
+%ifidn %1_%2, dct_identity
+    mova                 m0, [qw_2896x8]
+    pmulhrsw             m0, [coeffq]              ;only the first 16 bytes of coeff are read
+    paddw                m0, m0                    ;doubled so the next pmulhrsw gives (x * 5793 + 2048) >> 12
+    pmulhrsw             m0, [qw_5793x4]
+    punpcklwd            m0, m0                    ;w0 w0 w1 w1 w2 w2 w3 w3
+    punpckhdq            m1, m0, m0                ;low: w2 x4 ;high: w3 x4
+    punpckldq            m0, m0                    ;low: w0 x4 ;high: w1 x4
+    call m(iadst_4x4_internal).end                 ;zero coeff, then ITX4_END
+    RET
+%elifidn %1_%2, identity_dct
+    mova                 m1, [coeffq+16*0]
+    mova                 m2, [coeffq+16*1]
+    punpcklwd            m0, m1, m2                ;c0 c8 c1 c9 c2 c10 c3 c11
+    punpckhwd            m1, m2                    ;c4 c12 c5 c13 c6 c14 c7 c15
+    punpcklwd            m0, m1                    ;c0 c4 c8 c12 c1 c5 c9 c13
+    punpcklqdq           m0, m0                    ;c0 c4 c8 c12 in both halves
+    paddw                m0, m0
+    pmulhrsw             m0, [qw_5793x4]
+    pmulhrsw             m0, [qw_2896x8]
+    mova                 m1, m0
+    call m(iadst_4x4_internal).end                 ;zero coeff, then ITX4_END
+    RET
+%elif %3 >= 0
     pshuflw              m0, [coeffq], q0000
-    punpcklqdq           m0, m0
-%ifidn %1, dct
-    mova                 m1, [qw_2896x8]
-    pmulhrsw             m0, m1
-%elifidn %1, adst
-%elifidn %1, flipadst
-%endif
-    mov            [coeffq], eobd             ;0
-%ifidn %2, dct
-%ifnidn %1, dct
-    pmulhrsw             m0, [qw_2896x8]
-%else
-    pmulhrsw             m0, m1
-%endif
-    mova                 m1, m0
-    ITX4_END             0, 1, 2, 3
-%else ; adst / flipadst
-%endif
-%endif
+    punpcklqdq           m0, m0
+%ifidn %1, dct
+    mova                 m1, [qw_2896x8]
+    pmulhrsw             m0, m1
+%elifidn %1, adst
+    pmulhrsw             m0, [iadst4_dconly1a]
+%elifidn %1, flipadst
+    pmulhrsw             m0, [iadst4_dconly1b]
+%endif
+    mov            [coeffq], eobd                ;0
+%ifidn %2, dct
+%ifnidn %1, dct
+    pmulhrsw             m0, [qw_2896x8]
+%else
+    pmulhrsw             m0, m1
+%endif
+    mova                 m1, m0
+    call m(iadst_4x4_internal).end2                ;enters after the 32-byte coeff clear at .end
+    RET
+%else ; adst / flipadst
+    pmulhrsw             m1, m0, [iadst4_dconly2b]
+    pmulhrsw             m0, [iadst4_dconly2a]
+    call m(i%2_4x4_internal).end2                  ;enters after the 32-byte coeff clear at .end
+    RET
+%endif
+%endif
 %endmacro
 
 
@@ -197,3 +266,129 @@
     ITX4_END     0, 1, 3, 2
 
 INV_TXFM_4X4_FN dct, dct, 0
+
+cglobal iadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]       ;first 16 bytes of coeff
+    mova                 m1, [coeffq+16*1]       ;next 16 bytes of coeff
+    call .main                                    ;first 1-D pass
+    punpckhwd            m3, m0, m2       ;interleave out1 with out3
+    punpcklwd            m0, m1           ;interleave out0 with out2
+    punpckhwd            m1, m0, m3       ;high: in3 ;low: in2
+    punpcklwd            m0, m3           ;high: in1 ;low: in0
+    jmp                tx2q               ;continue at the address held in tx2
+
+.pass2:
+    call .main
+    punpcklqdq            m1, m2          ;low: out2 ;high: out3
+
+.end:
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2           ;zero the 32 bytes at coeff
+    mova      [coeffq+16*1], m2
+
+.end2:
+    ITX4_END              0, 1, 2, 3
+
+ALIGN function_align
+.main:
+    IADST4_1D_PACKED
+    ret
+
+INV_TXFM_4X4_FN adst, adst, 0
+INV_TXFM_4X4_FN dct,  adst, 0
+INV_TXFM_4X4_FN adst, dct,  0
+
+cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    call m(iadst_4x4_internal).main            ;same 1-D kernel as adst; the flip is done by the shuffles below
+    punpcklwd            m1, m0                ;interleave out2 with out0
+    punpckhwd            m2, m0                ;interleave out3 with out1
+    punpcklwd            m0, m2, m1            ;high: in1 ;low: in0 (each as out3 out2 out1 out0)
+    punpckhwd            m2, m1                ;high: in3 ;low: in2 (each as out3 out2 out1 out0)
+    mova                 m1, m2                ;m1 = in2 in3 for the next pass
+    jmp                tx2q                    ;continue at the address held in tx2
+
+.pass2:
+    call m(iadst_4x4_internal).main
+    punpcklqdq            m1, m2               ;low: out2 ;high: out3
+
+.end:
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2                ;zero the 32 bytes at coeff
+    mova      [coeffq+16*1], m2
+
+.end2:
+    ITX4_END              3, 2, 1, 0
+
+INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, dct,      0
+INV_TXFM_4X4_FN flipadst, adst,     0
+INV_TXFM_4X4_FN dct,      flipadst, 0
+INV_TXFM_4X4_FN adst,     flipadst, 0
+
+cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [qw_5793x4]
+    paddw                m0, m0                ;doubled first: 5793*8 would not fit in a signed word
+    paddw                m1, m1
+    pmulhrsw             m0, m2                ;(x * 5793 + 2048) >> 12, barring 16-bit wrap in the doubling
+    pmulhrsw             m1, m2
+
+    punpckhwd            m2, m0, m1            ;c4 c12 c5 c13 c6 c14 c7 c15
+    punpcklwd            m0, m1                ;c0 c8 c1 c9 c2 c10 c3 c11
+    punpckhwd            m1, m0, m2            ;high: in3 ;low: in2
+    punpcklwd            m0, m2                ;high: in1 ;low: in0
+    jmp                tx2q                    ;continue at the address held in tx2
+
+.pass2:
+    mova                 m2, [qw_5793x4]
+    paddw                m0, m0                ;same doubling + pmulhrsw scaling as the first pass
+    paddw                m1, m1
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    jmp m(iadst_4x4_internal).end              ;reuse the coeff-clearing ITX4_END 0, 1, 2, 3 tail
+
+INV_TXFM_4X4_FN identity, identity
+INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN dct,      identity, 3
+INV_TXFM_4X4_FN adst,     identity
+INV_TXFM_4X4_FN flipadst, identity
+
+%macro IWHT4_1D_PACKED 0 ; in: m0 = in0 in1, m1 = in2 in3 ;out: m0 high = out0, m1 = out2 out1, m2 low = out3 ;clobbers m3
+    punpckhqdq           m3, m0, m1            ;low: in1 high: in3
+    punpcklqdq           m0, m1                ;low: in0 high: in2
+    psubw                m2, m0, m3            ;low: in0 - in1 high: in2 - in3
+    paddw                m0, m3                ;low: in0 + in1 high: in2 + in3
+    punpckhqdq           m2, m2                ;t2 t2
+    punpcklqdq           m0, m0                ;t0 t0
+    psubw                m1, m0, m2            ;t0 - t2 in both halves
+    psraw                m1, 1                 ;t4 t4
+    psubw                m1, m3                ;low: t1/out2 high: t3/out1
+    psubw                m0, m1                ;high: out0
+    paddw                m2, m1                ;low: out3
+%endmacro
+
+cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2                ;zero the 32 bytes at coeff once they are loaded
+    mova      [coeffq+16*1], m2
+    psraw                m0, 2                 ;arithmetic >> 2 on every input word
+    psraw                m1, 2
+
+    IWHT4_1D_PACKED
+
+    punpckhwd            m0, m1                ;interleave out0 with out1
+    punpcklwd            m3, m1, m2            ;interleave out2 with out3
+    punpckhdq            m1, m0, m3            ;high: in3 ;low: in2
+    punpckldq            m0, m3                ;high: in1 ;low: in0
+
+    IWHT4_1D_PACKED
+
+    shufpd               m0, m2, 0x01          ;low: out0 ;high: out3
+    ITX4_END              0, 3, 2, 1, 0