ref: 4866abab1f9fcdc9a2a4934c4facd94e4ffddb1e
parent: 0526e1eacb98424212715b711bfac9ce5da86f3a
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Mon Oct 21 07:12:07 EDT 2019
x86: adapt SSSE3 warp_affine_8x8{,t} to SSE2

---------------------
x86_64:
------------------------------------------
warp_8x8_8bpc_c: 1761.5
warp_8x8_8bpc_sse2: 583.0
warp_8x8_8bpc_ssse3: 329.3
---------------------
warp_8x8t_8bpc_c: 1694.3
warp_8x8t_8bpc_sse2: 577.6
warp_8x8t_8bpc_ssse3: 334.1
------------------------------------------

---------------------
x86_32:
------------------------------------------
warp_8x8_8bpc_c: 1842.6
warp_8x8_8bpc_sse2: 677.1
warp_8x8_8bpc_ssse3: 394.9
---------------------
warp_8x8t_8bpc_c: 1741.1
warp_8x8t_8bpc_sse2: 648.5
warp_8x8t_8bpc_ssse3: 372.6
------------------------------------------
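
Note on the new rounding constants: since SSE2 has no pmulhrsw, the final
rounding is done with paddd/psrad against pd_16384 (warp_8x8t store path)
and pd_262144 (warp_8x8 dst path), and pmaddubsw/phaddw in the horizontal
filter are emulated with psllw/psrlw/psraw + pmullw/paddw and packssdw.
Below is a minimal standalone C sketch of the rounding equivalence (not
part of the patch), assuming arithmetic right shifts for negative values
and intermediates small enough that pmulhrsw's 16-bit lanes never
saturate, as is the case for the warp intermediates; the helper names are
illustrative only:

    /* SSSE3: psrad x, SH        ; pmulhrsw with pw_8192
     * SSE2:  paddd x, 1<<(SH+1) ; psrad x, SH+2
     * SH = 13 for warp_8x8t (pd_16384) and 17 for warp_8x8 (pd_262144). */
    #include <assert.h>
    #include <stdint.h>

    static int32_t round_ssse3(int32_t x, int sh) {
        const int32_t a = x >> sh;                 /* psrad */
        return (a * 8192 + 0x4000) >> 15;          /* pmulhrsw with 8192 */
    }

    static int32_t round_sse2(int32_t x, int sh) {
        return (x + (1 << (sh + 1))) >> (sh + 2);  /* paddd + psrad */
    }

    int main(void) {
        for (int32_t x = -(1 << 20); x < (1 << 20); x++) {
            assert(round_ssse3(x, 13) == round_sse2(x, 13)); /* 8x8t */
            assert(round_ssse3(x, 17) == round_sse2(x, 17)); /* 8x8  */
        }
        return 0;
    }
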
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -90,9 +90,11 @@
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2);
decl_emu_edge_fn(dav1d_emu_edge_avx2);
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
@@ -104,6 +106,13 @@
c->mct[type] = dav1d_prep_##name##_##suffix
const unsigned flags = dav1d_get_cpu_flags();
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
+ return;
+
+#if BITDEPTH == 8
+ c->warp8x8 = dav1d_warp_affine_8x8_sse2;
+ c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
+#endif
if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
return;
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -68,7 +68,9 @@
pw_8192: times 8 dw 8192
pd_32: times 4 dd 32
pd_512: times 4 dd 512
+pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
+pd_262144:times 4 dd 262144
pw_258: times 2 dw 258
@@ -3385,6 +3387,8 @@
%define m14 m6
%define m15 m7
%define m11 m7
+ %endif
+ %if notcpuflag(ssse3) || ARCH_X86_32
pxor m11, m11
%endif
lea tmp1d, [myq+deltaq*4]
@@ -3483,6 +3487,7 @@
mova m14, [esp+0xE0]
mova m15, [esp+0xF0]
%endif
+%if cpuflag(ssse3)
psrad m12, 13
psrad m13, 13
psrad m14, 13
@@ -3492,6 +3497,22 @@
mova m13, [PIC_sym(pw_8192)]
pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
pmulhrsw m14, m13
+%else
+ %if ARCH_X86_32
+ %define m10 m0
+ %endif
+ mova m10, [PIC_sym(pd_16384)]
+ paddd m12, m10
+ paddd m13, m10
+ paddd m14, m10
+ paddd m15, m10
+ psrad m12, 15 ; (x + (1 << 6)) >> 7
+ psrad m13, 15
+ psrad m14, 15
+ psrad m15, 15
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
mova [tmpq+tsq*0], m12
mova [tmpq+tsq*2], m14
dec counterd
@@ -3554,11 +3575,16 @@
call .main2
lea dstq, [dstq+dsq*2]
.start:
-%if cpuflag(ssse3)
+%if notcpuflag(sse4)
+ %if cpuflag(ssse3)
+ %define roundval pw_8192
+ %else
+ %define roundval pd_262144
+ %endif
%if ARCH_X86_64
- mova m10, [PIC_sym(pw_8192)]
+ mova m10, [PIC_sym(roundval)]
%else
- %define m10 [PIC_sym(pw_8192)]
+ %define m10 [PIC_sym(roundval)]
%endif
%endif
%if ARCH_X86_32
@@ -3577,10 +3603,18 @@
packusdw m12, m13
pavgw m12, m11 ; (x + (1 << 10)) >> 11
%else
+ %if cpuflag(ssse3)
psrad m12, 17
psrad m13, 17
packssdw m12, m13
- pmulhrsw m12, m10 ; (x + (1 << 10)) >> 11
+ pmulhrsw m12, m10
+ %else
+ paddd m12, m10
+ paddd m13, m10
+ psrad m12, 19 ; (x + (1 << 10)) >> 11
+ psrad m13, 19
+ packssdw m12, m13
+ %endif
%endif
%if ARCH_X86_32
%define m14 m6
@@ -3594,10 +3628,18 @@
packusdw m14, m15
pavgw m14, m11 ; (x + (1 << 10)) >> 11
%else
+ %if cpuflag(ssse3)
psrad m14, 17
psrad m15, 17
packssdw m14, m15
- pmulhrsw m14, m10 ; (x + (1 << 10)) >> 11
+ pmulhrsw m14, m10
+ %else
+ paddd m14, m10
+ paddd m15, m10
+ psrad m14, 19 ; (x + (1 << 10)) >> 11
+ psrad m15, 19
+ packssdw m14, m15
+ %endif
%endif
packuswb m12, m14
movq [dstq+dsq*0], m12
@@ -3647,12 +3689,17 @@
lea filterq, [PIC_sym(mc_warp_filter)]
%if ARCH_X86_64
mov myd, r6m
+ %if cpuflag(ssse3)
pxor m11, m11
+ %endif
%endif
call .h
psrld m2, m0, 16
psrld m3, m1, 16
%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova [esp+gprsize+0x00], m2
+ %endif
mova [esp+gprsize+0x10], m3
%endif
call .h
@@ -3666,6 +3713,9 @@
%if ARCH_X86_64
%define blendmask [rsp+gprsize+0x80]
%else
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
mova m3, [esp+gprsize+0x10]
%define blendmask [esp+gprsize+0x120]
%define m10 m7
@@ -3689,6 +3739,9 @@
mova [rsp+gprsize+0x30], m5
call .h
%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
mova m3, [esp+gprsize+0x10]
%define m10 m5
%endif
@@ -3848,6 +3901,7 @@
lea tmp2d, [mxq+alphaq*1]
shr mxd, 10
shr tmp1d, 10
+%if cpuflag(ssse3)
movq m14, [filterq+mxq *8] ; 2 X
movq m9, [filterq+tmp1q*8] ; 6 X
lea tmp1d, [tmp2q+alphaq*4]
@@ -3864,10 +3918,99 @@
pmaddubsw m15, m14
pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
pmaddubsw m10, m9
- mova m14, [PIC_sym(pw_8192)]
- mova m9, [PIC_sym(pd_32768)]
phaddw m0, m15
phaddw m1, m10
+%else
+ %if ARCH_X86_32
+ %define m11 m2
+ %endif
+ pcmpeqw m0, m0
+ psrlw m14, m0, 8
+ psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
+ pand m14, m10 ; 00 02 04 06 08 10 12 14
+ packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
+ psrldq m9, m0, 4
+ pshufd m0, m14, q0220
+ pand m0, m9
+ psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
+ pslldq m15, m14, 12
+ por m0, m15 ; shufA
+ psrlw m15, m0, 8
+ psraw m11, m1, 8
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m0, 8
+ psraw m1, 8
+ pmullw m15, m11
+ pmullw m0, m1
+ paddw m0, m15 ; pmaddubsw m0, m1
+ pshufd m15, m14, q0220
+ pand m15, m9
+ psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
+ pslldq m1, m14, 12
+ por m15, m1 ; shufC
+ pshufd m1, m14, q0220
+ pand m1, m9
+ psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
+ pslldq m11, m14, 12
+ por m1, m11 ; shufB
+ pshufd m10, m14, q0220
+ pand m10, m9
+ psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
+ pslldq m14, m14, 12
+ por m10, m14 ; shufD
+ psrlw m9, m1, 8
+ psraw m11, m8, 8
+ psllw m1, 8
+ psllw m8, 8
+ psrlw m1, 8
+ psraw m8, 8
+ pmullw m9, m11
+ pmullw m1, m8
+ paddw m1, m9 ; pmaddubsw m1, m8
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ psrlw m8, m15, 8
+ psraw m11, m14, 8
+ psllw m15, 8
+ psllw m14, 8
+ psrlw m15, 8
+ psraw m14, 8
+ pmullw m8, m11
+ pmullw m15, m14
+ paddw m15, m8 ; pmaddubsw m15, m14
+ psrlw m8, m10, 8
+ psraw m11, m9, 8
+ psllw m10, 8
+ psllw m9, 8
+ psrlw m10, 8
+ psraw m9, 8
+ pmullw m8, m11
+ pmullw m10, m9
+ paddw m10, m8 ; pmaddubsw m10, m9
+ pslld m8, m0, 16
+ pslld m9, m1, 16
+ pslld m14, m15, 16
+ pslld m11, m10, 16
+ paddw m0, m8
+ paddw m1, m9
+ paddw m15, m14
+ paddw m10, m11
+ psrad m0, 16
+ psrad m1, 16
+ psrad m15, 16
+ psrad m10, 16
+ packssdw m0, m15 ; phaddw m0, m15
+ packssdw m1, m10 ; phaddw m1, m10
+%endif
+ mova m14, [PIC_sym(pw_8192)]
+ mova m9, [PIC_sym(pd_32768)]
pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
pmaddwd m1, m14
paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
@@ -3882,6 +4025,12 @@
INIT_XMM ssse3
WARP_AFFINE_8X8
WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM ssse3
%if WIN64
DECLARE_REG_TMP 6, 4