ref: 36d615d1204c126ad6a9fc560cde94dcd7bafb6d
parent: 4866abab1f9fcdc9a2a4934c4facd94e4ffddb1e
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Tue Oct 22 09:57:35 EDT 2019
x86: adapt SSSE3 wiener filter to SSE2

Also slightly optimized the 32-bit SSSE3, especially by the removal of
an XMM store/load.

--------------------- x86_64: ------------------------------------------
wiener_chroma_8bpc_c:     193155.1
wiener_chroma_8bpc_sse2:   48973.4
wiener_chroma_8bpc_ssse3:  31486.3
---------------------
wiener_luma_8bpc_c:       192787.5
wiener_luma_8bpc_sse2:     48674.9
wiener_luma_8bpc_ssse3:    30446.3
------------------------------------------
--------------------- x86_32: ------------------------------------------
wiener_chroma_8bpc_c:     309861.0
wiener_chroma_8bpc_sse2:   52345.9
wiener_chroma_8bpc_ssse3:  32983.2
---------------------
wiener_luma_8bpc_c:       317909.1
wiener_luma_8bpc_sse2:     52522.1
wiener_luma_8bpc_ssse3:    33323.1
------------------------------------------
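For reference, a minimal C-intrinsics sketch of the two SSE2 fallbacks that the
patch introduces as asm macros (PALIGNR and PMADDUBSW). The helper names and
the pre-widened coefficient operand are illustrative only, not part of the
patch:

    #include <emmintrin.h>  /* SSE2 only */

    /* palignr emulation: low 16 bytes of (hi:lo) >> (8*SHIFT).
     * SHIFT must be a compile-time constant in 1..15, mirroring the
     * psrldq/pslldq/por sequence in the PALIGNR macro. */
    #define PALIGNR_SSE2(hi, lo, SHIFT) \
        _mm_or_si128(_mm_srli_si128((lo), (SHIFT)), \
                     _mm_slli_si128((hi), 16 - (SHIFT)))

    /* pmaddubsw emulation for a constant filter: the signed byte
     * coefficients are widened to 16-bit words once up front (coef_w),
     * so each call only zero-extends the pixel bytes, does two pmaddwd
     * and repacks with signed saturation. */
    static inline __m128i pmaddubsw_sse2(__m128i px_u8, __m128i coef_w)
    {
        const __m128i zero = _mm_setzero_si128();
        __m128i lo = _mm_unpacklo_epi8(px_u8, zero); /* u8 -> 16-bit */
        __m128i hi = _mm_unpackhi_epi8(px_u8, zero);
        lo = _mm_madd_epi16(lo, coef_w);             /* pairwise dot products */
        hi = _mm_madd_epi16(hi, coef_w);
        return _mm_packs_epi32(lo, hi);              /* saturate back to int16 */
    }

The same split into unpack + pmaddwd + packssdw is what makes the extra zero
and tmp registers (and the reset_zero flag) necessary in the asm macro below.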
--- a/src/x86/looprestoration_init_tmpl.c
+++ b/src/x86/looprestoration_init_tmpl.c
@@ -204,6 +204,7 @@
SGR_FILTER(ext)
#if BITDEPTH == 8
+WIENER_FILTER(sse2)
DEF_LR_FILTERS(ssse3)
# if ARCH_X86_64
DEF_LR_FILTERS(avx2)
@@ -212,6 +213,11 @@
COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+#if BITDEPTH == 8
+ c->wiener = wiener_filter_sse2;
+#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
--- a/src/x86/looprestoration_ssse3.asm
+++ b/src/x86/looprestoration_ssse3.asm
@@ -43,8 +43,6 @@
pb_0_1: times 8 db 0, 1
pb_6_7: times 8 db 6, 7
pb_14_15: times 8 db 14, 15
-pb_0_1_2_3: times 4 db 0, 1, 2, 3
-pb_4_5_6_7: times 4 db 4, 5, 6, 7
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_128: times 8 dw 128
@@ -97,47 +95,84 @@
%define PIC_sym(sym) (sym)
%endif
+%macro PALIGNR 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+ palignr %1, %2, %3, %4
+ %else
+ %assign %%i regnumof%+%1 + 1
+ %define %%tmp m %+ %%i
+ psrldq %1, %3, %4
+ pslldq %%tmp, %2, 16-%4
+ por %1, %%tmp
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+ pmaddubsw %1, %2
+ %else
+ %if %5 == 1
+ pxor %3, %3
+ %endif
+ punpckhbw %4, %1, %3
+ punpcklbw %1, %3
+ pmaddwd %4, %2
+ pmaddwd %1, %2
+ packssdw %1, %4
+ %endif
+%endmacro
+
;;;;;;;;;;;;;;;;;;;;;;
;; wiener ;;
;;;;;;;;;;;;;;;;;;;;;;
-INIT_XMM ssse3
+%macro WIENER_H 0
%if ARCH_X86_64
cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
mov edged, edgem
movifnidn wd, wm
mov hd, hm
- movq m15, [fhq]
- pshufb m12, m15, [pb_6_7]
- pshufb m13, m15, [pb_4]
- pshufb m14, m15, [pb_2]
- pshufb m15, m15, [pb_0]
- mova m11, [pw_2048]
- mova m10, [pw_16380]
- lea r11, [pb_right_ext_mask]
-
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
%else
-cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
- mov wd, edgem
- mov [esp+12], wd
+cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
+ mov r5, edgem
+ mov [esp+12], r5
mov wd, wm
mov hd, hm
SETUP_PIC hd
- movq m0, [fhq]
- pshufb m3, m0, [PIC_sym(pb_6_7)]
- pshufb m2, m0, [PIC_sym(pb_4)]
- pshufb m1, m0, [PIC_sym(pb_2)]
- pshufb m0, m0, [PIC_sym(pb_0)]
+ %define m15 m0
+ %define m14 m1
+ %define m13 m2
+ %define m12 m3
+%endif
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge
+ movq m15, [fhq]
+%if cpuflag(ssse3)
+ pshufb m12, m15, [PIC_sym(pb_6_7)]
+ pshufb m13, m15, [PIC_sym(pb_4)]
+ pshufb m14, m15, [PIC_sym(pb_2)]
+ pshufb m15, m15, [PIC_sym(pb_0)]
+%else
+ pshuflw m12, m15, q3333
+ punpcklbw m15, m15
+ pshufhw m13, m15, q0000
+ pshuflw m14, m15, q2222
+ pshuflw m15, m15, q0000
+ punpcklqdq m12, m12
+ punpckhqdq m13, m13
+ punpcklqdq m14, m14
+ punpcklqdq m15, m15
+ psraw m13, 8
+ psraw m14, 8
+ psraw m15, 8
+%endif
- %define srcptrq srcq
- %define dstptrq dstq
- %define hd dword [esp]
- %define edged dword [esp+12]
- %define xlimd dword [esp+16]
+%if ARCH_X86_64
+ mova m11, [pw_2048]
+ mova m10, [pw_16380]
+ lea r11, [pb_right_ext_mask]
+ DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+%else
%define m10 [PIC_sym(pw_16380)]
%define m11 [PIC_sym(pw_2048)]
%define m12 [esp+0x14]
@@ -144,11 +179,17 @@
%define m13 [esp+0x24]
%define m14 [esp+0x34]
%define m15 [esp+0x44]
-
- mova m15, m0
- mova m14, m1
- mova m13, m2
mova m12, m3
+ mova m13, m2
+ mova m14, m1
+ mova m15, m0
+
+ DEFINE_ARGS dst, left, src, stride, x, w, h, edge
+ %define srcptrq srcq
+ %define dstptrq dstq
+ %define hd dword [esp+ 0]
+ %define edged dword [esp+12]
+ %define xlimd dword [esp+16]
%endif
; if (edge & has_right) align_w_to_16
@@ -196,7 +237,16 @@
jmp .left_load_done
.emu_left:
movd m0, [srcq]
+%if cpuflag(ssse3)
pshufb m0, [PIC_sym(pb_14x0_1_2)]
+%else
+ pslldq m1, m0, 13
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ psrldq m0, 2
+ por m0, m1
+%endif
; load right edge pixels
.left_load_done:
@@ -208,19 +258,39 @@
; for very small images (w=[1-2]), edge-extend the original cache,
; ugly, but only runs in very odd cases
+%if cpuflag(ssse3)
add wd, wd
-%if ARCH_X86_64
+ %if ARCH_X86_64
pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
-%else
+ %else
pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
-%endif
+ %endif
shr wd, 1
+%else
+ shl wd, 4
+ pcmpeqd m2, m2
+ movd m3, wd
+ psrldq m2, 2
+ punpckhbw m1, m0, m0
+ pshufhw m1, m1, q1122
+ psllq m1, m3
+ pand m0, m2
+ pandn m2, m1
+ por m0, m2
+ shr wd, 4
+%endif
; main x loop, mostly this starts in .main_load
.splat_right:
; no need to load new pixels, just extend them from the (possibly previously
; extended) previous load into m0
+%if cpuflag(ssse3)
pshufb m1, m0, [PIC_sym(pb_15)]
+%else
+ punpckhbw m1, m0, m0
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+%endif
jmp .main_loop
.load_and_splat:
; load new pixels and extend edge for right-most
@@ -235,7 +305,13 @@
add PIC_reg, xd
%endif
movd m3, [srcptrq+2+xq]
+%if cpuflag(ssse3)
pshufb m3, [PIC_sym(pb_0)]
+%else
+ punpcklbw m3, m3
+ pshuflw m3, m3, q0000
+ punpcklqdq m3, m3
+%endif
pand m1, m2
pxor m2, [PIC_sym(pb_right_ext_mask)]
pand m3, m2
@@ -246,17 +322,14 @@
; load subsequent line
movu m1, [srcptrq+3]
.main_loop:
- palignr m2, m1, m0, 10
- palignr m3, m1, m0, 11
- palignr m4, m1, m0, 12
- palignr m5, m1, m0, 13
- palignr m6, m1, m0, 14
- palignr m7, m1, m0, 15
-%if ARCH_X86_32
- mova [esp+0x54], m1
- %define m8 m1
-%endif
+%if ARCH_X86_64
+ PALIGNR m2, m1, m0, 10
+ PALIGNR m3, m1, m0, 11
+ PALIGNR m4, m1, m0, 12
+ PALIGNR m5, m1, m0, 13
+ PALIGNR m6, m1, m0, 14
+ PALIGNR m7, m1, m0, 15
punpcklbw m0, m2, m1
punpckhbw m2, m1
punpcklbw m8, m3, m7
@@ -263,29 +336,31 @@
punpckhbw m3, m7
punpcklbw m7, m4, m6
punpckhbw m4, m6
- pmaddubsw m0, m15
- pmaddubsw m2, m15
- pmaddubsw m8, m14
- pmaddubsw m3, m14
- pmaddubsw m7, m13
- pmaddubsw m4, m13
+ PMADDUBSW m0, m15, m6, m9, 1
+ PMADDUBSW m2, m15, m6, m9, 0
+ PMADDUBSW m8, m14, m6, m9, 0
+ PMADDUBSW m3, m14, m6, m9, 0
+ PMADDUBSW m7, m13, m6, m9, 0
+ PMADDUBSW m4, m13, m6, m9, 0
paddw m0, m8
paddw m2, m3
- pxor m3, m3
- punpcklbw m6, m5, m3
- punpckhbw m5, m3
- psllw m8, m6, 7
- psllw m3, m5, 7
+ %if cpuflag(ssse3)
+ pxor m6, m6
+ %endif
+ punpcklbw m3, m5, m6
+ punpckhbw m5, m6
+ psllw m8, m3, 7
+ psllw m6, m5, 7
psubw m8, m10
- psubw m3, m10
- pmullw m6, m12
+ psubw m6, m10
+ pmullw m3, m12
pmullw m5, m12
paddw m0, m7
paddw m2, m4
- paddw m0, m6
+ paddw m0, m3
paddw m2, m5
paddsw m0, m8
- paddsw m2, m3
+ paddsw m2, m6
psraw m0, 3
psraw m2, 3
paddw m0, m11
@@ -292,12 +367,53 @@
paddw m2, m11
mova [dstptrq+ 0], m0
mova [dstptrq+16], m2
-
-%if ARCH_X86_64
- mova m0, m1
%else
- mova m0, [esp+0x54]
+ PALIGNR m2, m1, m0, 10
+ punpcklbw m3, m2, m1
+ punpckhbw m2, m1
+ PMADDUBSW m3, m15, m4, m5, 1
+ PMADDUBSW m2, m15, m4, m5, 0
+ PALIGNR m4, m1, m0, 11
+ PALIGNR m5, m1, m0, 15
+ punpcklbw m6, m4, m5
+ punpckhbw m4, m5
+ PMADDUBSW m6, m14, m5, m7, 1
+ PMADDUBSW m4, m14, m5, m7, 0
+ paddw m3, m6
+ paddw m2, m4
+ PALIGNR m4, m1, m0, 12
+ PALIGNR m5, m1, m0, 14
+ punpcklbw m6, m4, m5
+ punpckhbw m4, m5
+ PMADDUBSW m6, m13, m5, m7, 1
+ PMADDUBSW m4, m13, m5, m7, 0
+ paddw m3, m6
+ paddw m2, m4
+ PALIGNR m6, m1, m0, 13
+ %if cpuflag(ssse3)
+ pxor m5, m5
+ %endif
+ punpcklbw m4, m6, m5
+ punpckhbw m6, m5
+ psllw m5, m4, 7
+ psllw m7, m6, 7
+ psubw m5, m10
+ psubw m7, m10
+ pmullw m4, m12
+ pmullw m6, m12
+ paddw m3, m4
+ paddw m2, m6
+ paddsw m3, m5
+ paddsw m2, m7
+ psraw m3, 3
+ psraw m2, 3
+ paddw m3, m11
+ paddw m2, m11
+ mova [dstptrq+ 0], m3
+ mova [dstptrq+16], m2
%endif
+
+ mova m0, m1
add srcptrq, 16
add dstptrq, 32
sub xd, 16
@@ -317,7 +433,9 @@
dec hd
jg .loop
RET
+%endmacro
+%macro WIENER_V 0
%if ARCH_X86_64
cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
mov edged, edgem
@@ -324,11 +442,10 @@
movifnidn fvq, fvmp
movifnidn hd, hm
movq m15, [fvq]
- pshufb m14, m15, [pb_4_5_6_7]
- pshufb m15, m15, [pb_0_1_2_3]
+ pshufd m14, m15, q1111
+ pshufd m15, m15, q0000
paddw m14, [pw_0_128]
- movd m12, [pd_1024]
- pshufd m12, m12, 0
+ mova m12, [pd_1024]
DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
@@ -351,8 +468,8 @@
SETUP_PIC edged
movq m0, [fvq]
- pshufb m1, m0, [PIC_sym(pb_4_5_6_7)]
- pshufb m0, m0, [PIC_sym(pb_0_1_2_3)]
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
paddw m1, [PIC_sym(pw_0_128)]
mova [esp+0x50], m0
mova [esp+0x40], m1
@@ -504,6 +621,15 @@
sub wd, 8
jg .loop_x
RET
+%endmacro
+
+INIT_XMM sse2
+WIENER_H
+WIENER_V
+
+INIT_XMM ssse3
+WIENER_H
+WIENER_V
;;;;;;;;;;;;;;;;;;;;;;;;;;
;; self-guided ;;