shithub: dav1d

ref: 3e9f9676400dcaa5416ff5f1864763128381a4d1
parent: 11b725064478436213d86aa468051dcfbd2931c0
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Sat Oct 12 21:30:04 EDT 2019

x86: adapt SSSE3 cdef_filter_{4x4,4x8,8x8} to SSE2

---------------------
x86_64:
------------------------------------------
cdef_filter_4x4_8bpc_c: 1376.0
cdef_filter_4x4_8bpc_sse2: 177.6
cdef_filter_4x4_8bpc_ssse3: 132.5
---------------------
cdef_filter_4x8_8bpc_c: 2725.0
cdef_filter_4x8_8bpc_sse2: 327.6
cdef_filter_4x8_8bpc_ssse3: 234.9
---------------------
cdef_filter_8x8_8bpc_c: 5938.8
cdef_filter_8x8_8bpc_sse2: 556.8
cdef_filter_8x8_8bpc_ssse3: 388.1
------------------------------------------

---------------------
x86_32:
------------------------------------------
cdef_filter_4x4_8bpc_c: 1569.5
cdef_filter_4x4_8bpc_sse2: 201.9
cdef_filter_4x4_8bpc_ssse3: 162.3
---------------------
cdef_filter_4x8_8bpc_c: 3141.6
cdef_filter_4x8_8bpc_sse2: 368.3
cdef_filter_4x8_8bpc_ssse3: 283.4
---------------------
cdef_filter_8x8_8bpc_c: 6534.5
cdef_filter_8x8_8bpc_sse2: 666.7
cdef_filter_8x8_8bpc_ssse3: 503.5
------------------------------------------
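The diff below reuses the existing SSSE3 code paths and swaps the SSSE3-only instructions for SSE2 sequences: the pshufb byte-0 broadcasts become the new PSHUFB_0 macro, pabsb/psignb are rebuilt from an SSE2 sign mask (a sketch of that follows the diff), pmaddubsw is rebuilt from pmullw/pmulhw on the even/odd byte lanes, and the final pmulhrsw rounding becomes paddw with pw_8 plus psraw. As an illustration only (not part of the patch; the function names are invented for the example), the two simplest substitutions look like this in C intrinsics:

    #include <emmintrin.h>  /* SSE2 */
    #include <tmmintrin.h>  /* SSSE3, for the reference variants */

    /* Byte-0 broadcast, as done by the new PSHUFB_0 macro.
     * SSSE3: pshufb with an all-zero shuffle mask. */
    static __m128i broadcast_byte0_ssse3(__m128i v)
    {
        return _mm_shuffle_epi8(v, _mm_setzero_si128());
    }

    /* SSE2: punpcklbw + pshuflw + punpcklqdq. */
    static __m128i broadcast_byte0_sse2(__m128i v)
    {
        v = _mm_unpacklo_epi8(v, v);                         /* b0 b0 b1 b1 ... */
        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(0, 0, 0, 0)); /* low 4 words = b0 b0 */
        return _mm_unpacklo_epi64(v, v);                     /* replicate low qword */
    }

    /* Final rounding shift of the accumulated taps.
     * SSSE3: pmulhrsw by 2048, i.e. (x + 8) >> 4 with rounding. */
    static __m128i round_shift4_ssse3(__m128i x)
    {
        return _mm_mulhrs_epi16(x, _mm_set1_epi16(2048));
    }

    /* SSE2: paddw with pw_8, then psraw by 4; same result for the value
     * range the k-loop accumulator can take. */
    static __m128i round_shift4_sse2(__m128i x)
    {
        return _mm_srai_epi16(_mm_add_epi16(x, _mm_set1_epi16(8)), 4);
    }
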

--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -31,14 +31,17 @@
 decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
 decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
 decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
+decl_cdef_fn(dav1d_cdef_filter_8x8_sse2);
 
 decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
 decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
 decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
+decl_cdef_fn(dav1d_cdef_filter_4x8_sse2);
 
 decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
 decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
 decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
+decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);
 
 decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
 decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
@@ -46,6 +49,14 @@
 
 COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+#if BITDEPTH == 8
+    c->fb[0] = dav1d_cdef_filter_8x8_sse2;
+    c->fb[1] = dav1d_cdef_filter_4x8_sse2;
+    c->fb[2] = dav1d_cdef_filter_4x4_sse2;
+#endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -32,6 +32,7 @@
 pb_0: times 16 db 0
 pb_0xFF: times 16 db 0xFF
 %endif
+pw_8: times 8 dw 8
 pw_128: times 8 dw 128
 pw_256: times 8 dw 256
 pw_2048: times 8 dw 2048
@@ -118,6 +119,36 @@
  %endif
 %endmacro
 
+%macro PMOVZXBW 2-3 0 ; %3 = half
+ %if %3 == 1
+    movd            %1, %2
+ %else
+    movq            %1, %2
+ %endif
+    punpcklbw       %1, m15
+%endmacro
+
+%macro PSHUFB_0 2
+ %if cpuflag(ssse3)
+    pshufb          %1, %2
+ %else
+    punpcklbw       %1, %1
+    pshuflw         %1, %1, q0000
+    punpcklqdq      %1, %1
+ %endif
+%endmacro
+
+%macro LOAD_SEC_TAP 0
+ %if ARCH_X86_64
+    movd            m3, [secq+kq]
+    PSHUFB_0        m3, m15
+ %else
+    movd            m2, [secq+kq]             ; sec_taps
+    pxor            m3, m3
+    PSHUFB_0        m2, m3
+ %endif
+%endmacro
+
 %macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
     ; load p0/p1
     movsx         offq, byte [dirq+kq+%1]       ; off1
@@ -153,13 +184,13 @@
     pmaxsw          m7, m10                     ; max after p1
     pminsw          m8, m6                      ; min after p1
   %else
-    pcmpeqw         m3, m5, OUT_OF_BOUNDS_MEM
-    pandn           m3, m5
-    pmaxsw          m7, m3                      ; max after p0
+    pcmpeqw         m9, m5, OUT_OF_BOUNDS_MEM
+    pandn           m9, m5
+    pmaxsw          m7, m9                      ; max after p0
     pminsw          m8, m5                      ; min after p0
-    pcmpeqw         m3, m6, OUT_OF_BOUNDS_MEM
-    pandn           m3, m6
-    pmaxsw          m7, m3                      ; max after p1
+    pcmpeqw         m9, m6, OUT_OF_BOUNDS_MEM
+    pandn           m9, m6
+    pmaxsw          m7, m9                      ; max after p1
     pminsw          m8, m6                      ; min after p1
   %endif
  %endif
@@ -168,13 +199,24 @@
     psubw           m5, m4          ; diff_p0(p0 - px)
     psubw           m6, m4          ; diff_p1(p1 - px)
     packsswb        m5, m6          ; convert pixel diff to 8-bit
- %if ARCH_X86_64 && cpuflag(sse4)
+ %if cpuflag(ssse3)
+  %if ARCH_X86_64 && cpuflag(sse4)
     pshufb          m5, m14         ; group diffs p0 and p1 into pairs
- %else
+  %else
     pshufb          m5, [PIC_sym(shufb_lohi)]
- %endif
+  %endif
     pabsb           m6, m5
     psignb          m9, %5, m5
+ %else
+    movlhps         m6, m5
+    punpckhbw       m6, m5
+    pxor            m5, m5
+    pcmpgtb         m5, m6
+    paddb           m6, m5
+    pxor            m6, m5
+    paddb           m9, %5, m5
+    pxor            m9, m5
+ %endif
  %if ARCH_X86_64
     psrlw          m10, m6, %2      ; emulate 8-bit shift
     pand           m10, %3
@@ -186,17 +228,18 @@
     pxor            m5, [PIC_sym(pb_0xFF)]
  %endif
     pminub          m5, m6          ; constrain(diff_p)
+ %if cpuflag(ssse3)
     pmaddubsw       m5, m9          ; constrain(diff_p) * taps
-    paddw          m13, m5
-%endmacro
-
-%macro PMOVZXBW 2-3 0 ; %3 = half
- %if %3 == 1
-    movd            %1, %2
  %else
-    movq            %1, %2
+    psrlw           m2, m5, 8
+    psraw           m6, m9, 8
+    psllw           m5, 8
+    psllw           m9, 8
+    pmullw          m2, m6
+    pmulhw          m5, m9
+    paddw           m5, m2
  %endif
-    punpcklbw       %1, m15
+    paddw          m13, m5
 %endmacro
 
 %macro LOAD_BODY 4  ; dst, src, block_width, tmp_stride
@@ -610,8 +653,8 @@
  %endif
     movd            m2, [tableq+pridmpq]
     movd            m3, [tableq+secdmpq]
-    pshufb          m2, m15                     ; pri_shift_mask
-    pshufb          m3, m15                     ; sec_shift_mask
+    PSHUFB_0        m2, m15                     ; pri_shift_mask
+    PSHUFB_0        m3, m15                     ; sec_shift_mask
  %if ARCH_X86_64
     SWAP            m2, m11
     SWAP            m3, m12
@@ -630,13 +673,15 @@
     movd            m0, prid
     movd            m1, secd
  %if ARCH_X86_64
-    pshufb          m0, m15
-    pshufb          m1, m15
+    PSHUFB_0        m0, m15
+    PSHUFB_0        m1, m15
  %else
-    mova            m2, m15
+  %if cpuflag(ssse3)
+    pxor            m2, m2
+  %endif
     mova            m3, [PIC_sym(pb_0xFF)]
-    pshufb          m0, m2
-    pshufb          m1, m2
+    PSHUFB_0        m0, m2
+    PSHUFB_0        m1, m2
     pxor            m0, m3
     pxor            m1, m3
     mova    [esp+0x20], m0
@@ -687,22 +732,29 @@
     mova            m7, m4                      ; max
     mova            m8, m4                      ; min
 .k_loop:
- %if ARCH_X86_64
     movd            m2, [priq+kq]               ; pri_taps
-    movd            m3, [secq+kq]               ; sec_taps
-    pshufb          m2, m15
-    pshufb          m3, m15
+ %if ARCH_X86_64
+    PSHUFB_0        m2, m15
+  %if cpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
     ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
+  %if notcpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
     ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
     ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
  %else
-    movd            m2, [priq+kq]             ; pri_taps
-    pshufb          m2, m15
+  %if cpuflag(ssse3)
+    pxor            m3, m3
+  %endif
+    PSHUFB_0        m2, m3
     ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
-
-    movd            m2, [secq+kq]             ; sec_taps
-    pshufb          m2, m15
+    LOAD_SEC_TAP                                ; sec_taps
     ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+  %if notcpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
     ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
  %endif
 
@@ -709,14 +761,15 @@
     dec             kq
     jge .k_loop
 
- %if cpuflag(sse4)
-    pcmpgtw         m6, m15, m13
- %else
     pxor            m6, m6
     pcmpgtw         m6, m13
- %endif
     paddw          m13, m6
+ %if cpuflag(ssse3)
     pmulhrsw       m13, [PIC_sym(pw_2048)]
+ %else
+    paddw          m13, [PIC_sym(pw_8)]
+    psraw          m13, 4
+ %endif
     paddw           m4, m13
     pminsw          m4, m7
     pmaxsw          m4, m8
@@ -1352,3 +1405,8 @@
 CDEF_FILTER 4, 8, 32
 CDEF_FILTER 4, 4, 32
 CDEF_DIR
+
+INIT_XMM sse2
+CDEF_FILTER 8, 8, 32
+CDEF_FILTER 4, 8, 32
+CDEF_FILTER 4, 4, 32
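
The pabsb/psignb pair in ACCUMULATE_TAP is replaced with the usual SSE2 sign-mask trick; again a purely illustrative C sketch, not part of the patch:

    #include <emmintrin.h>

    /* mask = (diff < 0); (x + mask) ^ mask negates x in the masked lanes
     * and leaves the other lanes untouched. */
    static __m128i abs_epi8_sse2(__m128i diff)
    {
        const __m128i mask = _mm_cmpgt_epi8(_mm_setzero_si128(), diff);
        return _mm_xor_si128(_mm_add_epi8(diff, mask), mask);
    }

    /* Stand-in for psignb(taps, diff): negate taps where diff < 0. Unlike
     * psignb it does not zero the lanes where diff == 0, which appears to
     * be harmless here since constrain(0) == 0 zeroes that product anyway. */
    static __m128i sign_taps_sse2(__m128i taps, __m128i diff)
    {
        const __m128i mask = _mm_cmpgt_epi8(_mm_setzero_si128(), diff);
        return _mm_xor_si128(_mm_add_epi8(taps, mask), mask);
    }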