shithub: dav1d

Download patch

ref: 7a222d46af13cebae6695acf49753328d551739c
parent: 1e9c428ae4e3a8a280ca448d32e1989ec37d2f08
author: Liwei Wang <liwei@multicorewareinc.com>
date: Mon Nov 19 10:33:20 EST 2018

Add SSSE3 implementation for dav1d_inv_txfm_add_dct_dct_4x4

Cycle times:
inv_txfm_add_4x4_dct_dct_0_8bpc_c: 492.6
inv_txfm_add_4x4_dct_dct_0_8bpc_ssse3: 22.6
inv_txfm_add_4x4_dct_dct_1_8bpc_c: 494.2
inv_txfm_add_4x4_dct_dct_1_8bpc_ssse3: 48.3

--- a/src/meson.build
+++ b/src/meson.build
@@ -122,6 +122,7 @@
             'x86/looprestoration.asm',
             'x86/mc.asm',
             'x86/mc_ssse3.asm',
+            'x86/itx_ssse3.asm',
         )
 
         # Compile the ASM sources with NASM
--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -77,6 +77,8 @@
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
 
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_4x4_ssse3);
+
 void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
     c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
@@ -113,7 +115,16 @@
     assign_itx16_fn(pfx, w, h, ext); \
     assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
 
+#define assign_itx_ssse3_fn_8b() \
+    c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_ssse3;
+
     const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+    assign_itx_ssse3_fn_8b();
+#endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
--- /dev/null
+++ b/src/x86/itx_ssse3.asm
@@ -1,0 +1,199 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+
+SECTION_RODATA 16
+
+deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15 ; pshufb mask: even-index words to low half, odd-index words to high half
+
+qw_2896x8:      times 8 dw  2896*8 ; 2896*8/32768 = 1/sqrt(2), pre-scaled for pmulhrsw
+qw_1567_m3784:  times 4 dw  1567, -3784 ; dct4 rotation pair, .12 fixed point (1567/4096, 3784/4096)
+qw_3784_1567:   times 4 dw  3784,  1567 ; same rotation, coefficients swapped for the other pmaddwd
+
+pd_2048:        times 4 dd  2048 ; dword rounding bias: 1 << (12 - 1)
+qw_2048:        times 8 dw  2048 ; pmulhrsw by 2048/32768 == round-shift right by 4
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; expand x to its full mangled symbol (project prefix + cpu suffix)
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd -- round m0/m1 (2 output rows per xmm), add to dst rows %1-%4, pack and store back
+%if %5
+    mova                 m2, [qw_%5]            ;load rounding constant (default 2048 -> (x+8)>>4)
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+%endif
+    lea                  r2, [dstq+strideq*2]   ;r2 = &dst[2*stride]
+%assign %%i 1
+%rep 4
+    %if %1 & 2
+        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
+    %else
+        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+    %endif
+    %assign %%i %%i + 1
+    %rotate 1
+%endrep
+
+    movd                 m2, [%%row_adr1]       ;dst0
+    movd                 m4, [%%row_adr2]       ;dst1
+    punpckldq            m2, m4                 ;high: dst1 :low: dst0
+    movd                 m3, [%%row_adr3]       ;dst2
+    movd                 m4, [%%row_adr4]       ;dst3
+    punpckldq            m3, m4                 ;high: dst3 :low: dst2
+
+    pxor                 m4, m4                 ;m4 = 0
+    punpcklbw            m2, m4                 ;extend byte to word
+    punpcklbw            m3, m4                 ;extend byte to word
+
+    paddw                m0, m2                 ;high: dst1 + out1 ;low: dst0 + out0
+    paddw                m1, m3                 ;high: dst3 + out3 ;low: dst2 + out2
+
+    packuswb             m0, m1                 ;clip to u8; high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+
+    movd       [%%row_adr1], m0                 ;store dst0 + out0
+    pshuflw              m1, m0, q1032          ;move dword 1 down to dword 0
+    movd       [%%row_adr2], m1                 ;store dst1 + out1
+    punpckhqdq           m0, m0                 ;bring high qword (rows 2-3) low
+    movd       [%%row_adr3], m0                 ;store dst2 + out2
+    psrlq                m0, 32
+    movd       [%%row_adr4], m0                 ;store dst3 + out3
+
+  RET
+%endmacro
+
+
+; packed-word rotation: per (a,b) pair, result words = [(a*%5 + b*%4 + 2048) >> 12] (low) | [(a*%4 - b*%5 + 2048) >> 12] (high); flags: 1 = swap coef order, 2 = coefs already in regs %4/%5
+%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
+%if %6 & 2
+    pmaddwd              m%2, m%4, m%1          ;coefficients preloaded in registers
+    pmaddwd              m%1, m%5
+%elif %6 & 1
+    pmaddwd              m%2, m%1, [qw_%5_%4]   ;swapped coefficient order
+    pmaddwd              m%1, [qw_%4_m%5]
+%else
+    pmaddwd              m%2, m%1, [qw_%4_m%5]  ;dwords: a*%4 - b*%5
+    pmaddwd              m%1, [qw_%5_%4]        ;dwords: a*%5 + b*%4
+%endif
+    paddd                m%2, m%3               ;add rounding bias (m%3 = pd_2048)
+    paddd                m%1, m%3
+    psrad                m%2, 12
+    psrad                m%1, 12
+    packssdw             m%1, m%2               ;repack dwords to words: low = m%1, high = m%2
+%endmacro
+
+%macro IDCT4_1D_PACKED 0-1   ;[reg holding qw_2896x8] -- 1-D 4-pt idct on two packed rows: m0={in1|in0}, m1={in3|in2} -> m0={out1|out0}, m1={out2|out3}
+    punpckhwd            m2, m0, m1           ;unpacked in1 in3 (interleaved word pairs for the odd rotation)
+    psubw                m3, m0, m1           ;low: in0-in2
+    paddw                m0, m1               ;low: in0+in2
+    punpcklqdq           m0, m3               ;high: in0-in2 ;low: in0+in2
+
+    mova                 m3, [pd_2048]        ;rounding bias for ITX_MUL2X_PACK
+    ITX_MUL2X_PACK 2, 1, 3, 1567, 3784       ;odd-input rotation by (1567, 3784) -> m2
+
+%if %0 == 1
+    pmulhrsw             m0, m%1              ;caller keeps the 1/sqrt(2) constant in a register
+%else
+    pmulhrsw             m0, [qw_2896x8]     ;* 1/sqrt(2); high: t1 ;low: t0
+%endif
+
+    psubw                m1, m0, m2          ;high: out2 ;low: out3
+    paddw                m0, m2              ;high: out1 ;low: out0
+%endmacro
+
+
+%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size -- public entry point: calls the _internal body, or falls through to the caller-appended fast path at %%end when eob is small
+cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+    lea tx2q, [m(i%2_%4_internal).pass2]       ;pass-2 continuation address used by the internal routine
+%if %3 > 0
+    cmp                  eobd, %3
+    jle %%end                                  ;eob <= threshold: take the fast path appended below
+%elif %3 == 0
+    test                 eobd, eobd
+    jz %%end                                   ;eob == 0: DC-only fast path appended below
+%endif
+    call i%1_%4_internal
+    RET
+ALIGN function_align
+%%end:                                         ;fast-path code is emitted here by the instantiating macro
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh -- instantiate a 4x4 entry point; code after INV_TXFM_FN is its eob fast path (DC-only for dct_dct)
+    INV_TXFM_FN          %1, %2, %3, 4x4
+%ifidn %1_%2, dct_identity                     ;not implemented yet
+%elifidn %1_%2, identity_dct                   ;not implemented yet
+%elif %3 >= 0
+    pshuflw              m0, [coeffq], q0000   ;broadcast DC coefficient to 4 words...
+    punpcklqdq           m0, m0                ;...then to all 8 words
+%ifidn %1, dct
+    mova                 m1, [qw_2896x8]
+    pmulhrsw             m0, m1                ;dc * 1/sqrt(2) (pass-1 scale)
+%elifidn %1, adst
+%elifidn %1, flipadst
+%endif
+    mov            [coeffq], eobd             ;coeff[0] = eobd (0 on this path)
+%ifidn %2, dct
+%ifnidn %1, dct
+    pmulhrsw             m0, [qw_2896x8]
+%else
+    pmulhrsw             m0, m1                ;dc * 1/sqrt(2) again (pass-2 scale)
+%endif
+    mova                 m1, m0                ;all four output rows carry the same value
+    ITX4_END             0, 1, 2, 3
+%else ; adst / flipadst
+%endif
+%endif
+%endmacro
+
+
+INIT_XMM ssse3
+
+cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2 ;pass 1 on rows, then jumps to tx2q (.pass2) for columns + store
+    mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
+    mova                 m1, [coeffq+16*1]      ;high: in3 ;low: in2
+
+    IDCT4_1D_PACKED                             ;pass 1: 1-D idct over the rows
+
+    mova                 m2, [deint_shuf]       ;word-deinterleave mask
+    shufps               m3, m0, m1, q1331      ;transpose step: collect one half of the 4x4
+    shufps               m0, m1, q0220          ;transpose step: collect the other half
+    pshufb               m0, m2                 ;high: in1 ;low: in0
+    pshufb               m1, m3, m2             ;high: in3 ;low: in2
+    jmp                tx2q                     ;continue at .pass2 (address set up by the wrapper)
+
+.pass2:
+    IDCT4_1D_PACKED                             ;pass 2: 1-D idct over the columns
+
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2
+    mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
+
+    ITX4_END     0, 1, 3, 2                     ;round, add to dst, store (m1 holds out2|out3, hence row order 0,1,3,2)
+
+INV_TXFM_4X4_FN dct, dct, 0                     ;instantiate dav1d_inv_txfm_add_dct_dct_4x4_ssse3