shithub: dav1d

ref: 1f32abd286557fc2fb1bee3dbf31c2dcce337c17
parent: 1d7754830ec78b9124c4c8be198aa802669675db
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Tue Oct 16 04:46:04 EDT 2018

Add infrastructure for LR SIMD and unit tests.

wiener_luma_8bpc_c: 326272.1
wiener_luma_8bpc_avx2: 19841.5

Decoding time of first 1000 frames of Chimera-8bit-1920x1080.ivf goes
from 27.471 to 23.558 seconds.
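
The AVX2 path added below splits the Wiener filter into two passes: dav1d_wiener_filter_h_avx2() writes 16-bit intermediates into a 384-wide scratch buffer (body rows start at row offset 2; two extra rows above and below are filled from the lpf copy when LR_HAVE_TOP/LR_HAVE_BOTTOM are set), and dav1d_wiener_filter_v_avx2() reads them back and writes pixels. The scalar sketch below restates that flow for 8 bpc. It is not dav1d's reference implementation (that lives in src/looprestoration.c); the rounding is reconstructed from the constants visible in the assembly (the implicit 128x center weight, +4 then >>3 horizontally, +1024 then >>11 vertically), and names such as MID_STRIDE and wiener_sketch_* are illustrative only.

/*
 * Scalar sketch of the two-pass flow implemented by the AVX2 code below.
 * Not dav1d's reference implementation; see src/looprestoration.c for that.
 * MID_STRIDE and the wiener_sketch_* names are illustrative only.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MID_STRIDE 384

static int clip(const int v, const int lo, const int hi) {
    return v < lo ? lo : v > hi ? hi : v;
}

/* Horizontal 7-tap pass over h rows of w pixels. Pixels outside [0, w) are
 * edge-replicated here for simplicity; the real code uses the left[] column
 * and explicit right-edge extension instead. */
static void wiener_sketch_h(int16_t *mid, const uint8_t *src,
                            const ptrdiff_t stride, const int w, const int h,
                            const int16_t fh[7])
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = 128 * src[x]; /* implicit extra weight on the center tap */
            for (int k = 0; k < 7; k++)
                sum += fh[k] * src[clip(x + k - 3, 0, w - 1)];
            /* +4 >> 3 mirrors the asm's 16380/2048 bias pair for 8 bpc */
            mid[x] = (int16_t) clip((sum + 4) >> 3, 0, (1 << 13) - 1);
        }
        src += stride;
        mid += MID_STRIDE;
    }
}

/* Vertical 7-tap pass. Rows outside the body fall back on the two padding
 * rows above/below when present, or on edge replication when they are not. */
static void wiener_sketch_v(uint8_t *dst, const ptrdiff_t stride,
                            const int16_t *mid, const int w, const int h,
                            const int16_t fv[7],
                            const int have_top, const int have_bottom)
{
    const int ymin = have_top ? -2 : 0;
    const int ymax = have_bottom ? h + 1 : h - 1;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = 128 * mid[y * MID_STRIDE + x];
            for (int k = 0; k < 7; k++)
                sum += fv[k] * mid[clip(y + k - 3, ymin, ymax) * MID_STRIDE + x];
            dst[x] = (uint8_t) clip((sum + 1024) >> 11, 0, 255);
        }
        dst += stride;
    }
}

/* Mirrors the wiener_filter_avx2() wrapper below: body rows land at row
 * offset 2 in the scratch buffer, the optional lpf rows fill rows 0-1 and
 * h+2..h+3. The real wrapper keeps this buffer on the stack, 32-byte aligned. */
static void wiener_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                          const uint8_t *lpf, const ptrdiff_t lpf_stride,
                          const int w, const int h,
                          const int16_t fh[7], const int16_t fv[7],
                          const int have_top, const int have_bottom)
{
    static int16_t mid[68 * MID_STRIDE];

    wiener_sketch_h(&mid[2 * MID_STRIDE], dst, dst_stride, w, h, fh);
    if (have_top)
        wiener_sketch_h(mid, lpf, lpf_stride, w, 2, fh);
    if (have_bottom)
        wiener_sketch_h(&mid[(2 + h) * MID_STRIDE],
                        lpf + 6 * lpf_stride, lpf_stride, w, 2, fh);

    wiener_sketch_v(dst, dst_stride, &mid[2 * MID_STRIDE], w, h, fv,
                    have_top, have_bottom);
}

int main(void) {
    enum { W = 64, H = 32 };
    static uint8_t dst[H * W], lpf[8 * W];
    /* same normalization the checkasm test uses: tap[3] = -2*(t0+t1+t2) */
    static const int16_t fh[7] = { -1, 5, 15, -38, 15, 5, -1 };
    static const int16_t fv[7] = {  2, -9, 20, -26, 20, -9,  2 };

    for (size_t i = 0; i < sizeof(dst); i++) dst[i] = (uint8_t) (rand() & 0xff);
    for (size_t i = 0; i < sizeof(lpf); i++) lpf[i] = (uint8_t) (rand() & 0xff);

    wiener_sketch(dst, W, lpf, W, W, H, fh, fv, 1, 1);
    printf("filtered top-left pixel: %d\n", dst[0]);
    return 0;
}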

--- a/src/looprestoration.c
+++ b/src/looprestoration.c
@@ -65,6 +65,11 @@
         pixel_copy(dst_l, p, unit_w);
         pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
         pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+        if (have_left) {
+            pixel_copy(dst_l, &left[0][1], 3);
+            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+        }
     }
 
     pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
@@ -81,6 +86,11 @@
         pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
         pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
         pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+        if (have_left) {
+            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+        }
     }
 
     // Inner UNIT_WxSTRIPE_H
@@ -560,4 +570,8 @@
 void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
     c->wiener = wiener_c;
     c->selfguided = selfguided_c;
+
+#if ARCH_X86 && BITDEPTH == 8
+    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
 }
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -40,21 +40,31 @@
     LR_HAVE_BOTTOM = 1 << 3,
 };
 
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row)[4];
+#else
+typedef const void *const_left_pixel_row;
+#endif
+
 // Although the spec applies restoration filters over 4x4 blocks, the wiener
 // filter can be applied to a bigger surface.
 //    * w is constrained by the restoration unit size (w <= 256)
 //    * h is constrained by the stripe height (h <= 64)
-typedef void (*wienerfilter_fn)(pixel *dst, ptrdiff_t dst_stride,
-                                const void *left /*const pixel (*left)[4]*/,
-                                const pixel *lpf, ptrdiff_t lpf_stride,
-                                int w, int h, const int16_t filterh[7],
-                                const int16_t filterv[7], enum LrEdgeFlags edges);
+#define decl_wiener_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const_left_pixel_row left, \
+            const pixel *lpf, ptrdiff_t lpf_stride, \
+            int w, int h, const int16_t filterh[7], \
+            const int16_t filterv[7], enum LrEdgeFlags edges)
+typedef decl_wiener_filter_fn(*wienerfilter_fn);
 
-typedef void (*selfguided_fn)(pixel *dst, ptrdiff_t dst_stride,
-                              const void *left /*const pixel (*left)[4]*/,
-                              const pixel *lpf, ptrdiff_t lpf_stride,
-                              int w, int h, int sgr_idx, const int16_t sgr_w[2],
-                              const enum LrEdgeFlags edges);
+#define decl_selfguided_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const_left_pixel_row left, \
+            const pixel *lpf, ptrdiff_t lpf_stride, \
+            int w, int h, int sgr_idx, const int16_t sgr_w[2], \
+            const enum LrEdgeFlags edges)
+typedef decl_selfguided_filter_fn(*selfguided_fn);
 
 typedef struct Dav1dLoopRestorationDSPContext {
     wienerfilter_fn wiener;
@@ -63,5 +73,8 @@
 
 void dav1d_loop_restoration_dsp_init_8bpc(Dav1dLoopRestorationDSPContext *c);
 void dav1d_loop_restoration_dsp_init_10bpc(Dav1dLoopRestorationDSPContext *c);
+
+void dav1d_loop_restoration_dsp_init_x86_8bpc(Dav1dLoopRestorationDSPContext *c);
+void dav1d_loop_restoration_dsp_init_x86_10bpc(Dav1dLoopRestorationDSPContext *c);
 
 #endif /* __DAV1D_SRC_LOOPRESTORATION_H__ */
--- a/src/meson.build
+++ b/src/meson.build
@@ -101,6 +101,7 @@
             'x86/ipred_init.c',
             'x86/itx_init.c',
             'x86/loopfilter_init.c',
+            'x86/looprestoration_init.c',
             'x86/mc_init.c',
         )
 
@@ -110,6 +111,7 @@
             'x86/ipred.asm',
             'x86/itx.asm',
             'x86/loopfilter.asm',
+            'x86/looprestoration.asm',
             'x86/mc.asm',
         )
 
--- /dev/null
+++ b/src/x86/looprestoration.asm
@@ -1,0 +1,303 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_right_ext_mask: times 32 db 0xff
+                   times 32 db 0
+pb_14x0_1_2: times 14 db 0
+             db 1, 2
+pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
+                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_15: times 16 db 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+pw_16380: times 2 dw 16380
+pw_0_128: dw 0, 128
+pd_1024: dd 1024
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
+    vpbroadcastb m15, [fhq+0]
+    vpbroadcastb m14, [fhq+2]
+    vpbroadcastb m13, [fhq+4]
+    vpbroadcastw m12, [fhq+6]
+    vpbroadcastd  m9, [pw_128]
+    paddw        m12, m9
+    vpbroadcastd m11, [pw_2048]
+    vpbroadcastd m10, [pw_16380]
+    lea          r11, [pb_right_ext_mask]
+
+    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+
+    ; if (edge & has_right) align_w_to_32
+    ; else w -= 3, and use that as limit in x loop
+    test       edged, 2 ; has_right
+    jnz .align
+    mov        xlimq, -3
+    jmp .loop
+.align:
+    add           wd, 31
+    and           wd, ~31
+    xor        xlimd, xlimd
+
+    ; main y loop for horizontal filter
+.loop:
+    mov      srcptrq, srcq
+    mov      dstptrq, dstq
+    lea           xq, [wq+xlimq]
+
+    ; load left edge pixels
+    test       edged, 1 ; have_left
+    jz .emu_left
+    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
+    jz .load_left_combined
+    movd         xm0, [leftq]
+    pinsrd       xm0, [srcq], 1
+    pslldq       xm0, 9
+    jmp .left_load_done
+.load_left_combined:
+    movq         xm0, [srcq-5]
+    jmp .left_load_done
+.emu_left:
+    movd         xm0, [srcq]
+    pshufb       xm0, [pb_14x0_1_2]
+
+    ; load right edge pixels
+.left_load_done:
+    cmp           xd, 32
+    jg .main_load
+    test          xd, xd
+    jg .load_and_splat
+    je .splat_right
+
+    ; for very small images (w=[1-2]), edge-extend the original cache,
+    ; ugly, but only runs in very odd cases
+    add           wd, wd
+    pshufb       xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
+    shr           wd, 1
+
+    ; main x loop, mostly this starts in .main_load
+.splat_right:
+    ; no need to load new pixels, just extend them from the (possibly previously
+    ; extended) previous load into m0
+    pshufb       xm1, xm0, [pb_15]
+    jmp .main_loop
+.load_and_splat:
+    ; load new pixels and extend edge for right-most
+    movu          m1, [srcptrq+3]
+    sub          r11, xq
+    movu          m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
+    add          r11, xq
+    vpbroadcastb  m3, [srcptrq+2+xq]
+    pand          m1, m2
+    pandn         m3, m2, m3
+    por           m1, m3
+    jmp .main_loop
+.main_load:
+    ; load subsequent line
+    movu          m1, [srcptrq+3]
+.main_loop:
+    vinserti128   m0, xm1, 1
+
+    palignr       m2, m1, m0, 10
+    palignr       m3, m1, m0, 11
+    palignr       m4, m1, m0, 12
+    palignr       m5, m1, m0, 13
+    palignr       m6, m1, m0, 14
+    palignr       m7, m1, m0, 15
+
+    punpcklbw     m0, m2, m1
+    punpckhbw     m2, m1
+    punpcklbw     m8, m3, m7
+    punpckhbw     m3, m7
+    punpcklbw     m7, m4, m6
+    punpckhbw     m4, m6
+    pxor          m9, m9
+    punpcklbw     m6, m5, m9
+    punpckhbw     m5, m9
+
+    pmaddubsw     m0, m15
+    pmaddubsw     m2, m15
+    pmaddubsw     m8, m14
+    pmaddubsw     m3, m14
+    pmaddubsw     m7, m13
+    pmaddubsw     m4, m13
+    pmullw        m6, m12
+    pmullw        m5, m12
+    ; note that m6/5 are unsigned here, whereas the others are signed
+    psubw         m0, m10
+    psubw         m2, m10
+    paddw         m0, m8
+    paddw         m2, m3
+    paddw         m0, m7
+    paddw         m2, m4
+    paddw         m0, m6
+    paddw         m2, m5
+    psraw         m0, 3
+    psraw         m2, 3
+    paddw         m0, m11
+    paddw         m2, m11
+    mova   [dstptrq], xm0
+    mova [dstptrq+16], xm2
+    vextracti128 [dstptrq+32], m0, 1
+    vextracti128 [dstptrq+48], m2, 1
+    vextracti128 xm0, m1, 1
+    add      srcptrq, 32
+    add      dstptrq, 64
+    sub           xq, 32
+    cmp           xd, 32
+    jg .main_load
+    test          xd, xd
+    jg .load_and_splat
+    cmp           xd, xlimd
+    jg .splat_right
+
+    add         srcq, strideq
+    add         dstq, 384*2
+    dec           hd
+    jg .loop
+    RET
+
+cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
+    vpbroadcastd m14, [fvq+4]
+    vpbroadcastd m15, [fvq]
+    vpbroadcastd m13, [pw_0_128]
+    paddw        m14, m13
+    vpbroadcastd m12, [pd_1024]
+
+    DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
+    mov        ylimd, edged
+    and        ylimd, 8 ; have_bottom
+    shr        ylimd, 2
+    sub        ylimd, 3
+
+    ; main x loop for vertical filter, does one column of 16 pixels
+.loop_x:
+    mova          m3, [midq] ; middle line
+
+    ; load top pixels
+    test       edged, 4 ; have_top
+    jz .emu_top
+    mova          m0, [midq-384*4]
+    mova          m2, [midq-384*2]
+    mova          m1, m0
+    jmp .load_bottom_pixels
+.emu_top:
+    mova          m0, m3
+    mova          m1, m3
+    mova          m2, m3
+
+    ; load bottom pixels
+.load_bottom_pixels:
+    mov           yd, hd
+    mov        mptrq, midq
+    mov      dstptrq, dstq
+    add           yd, ylimd
+    jg .load_threelines
+
+    ; the remainder here is somewhat messy but only runs in very weird
+    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
+    ; so performance is not terribly important here...
+    je .load_twolines
+    cmp           yd, -1
+    je .load_oneline
+    ; h == 1 case
+    mova          m5, m3
+    mova          m4, m3
+    mova          m6, m3
+    jmp .loop
+.load_oneline:
+    ; h == 2 case
+    mova          m4, [midq+384*2]
+    mova          m5, m4
+    mova          m6, m4
+    jmp .loop
+.load_twolines:
+    ; h == 3 case
+    mova          m4, [midq+384*2]
+    mova          m5, [midq+384*4]
+    mova          m6, m5
+    jmp .loop
+.load_threelines:
+    ; h > 3 case
+    mova          m4, [midq+384*2]
+    mova          m5, [midq+384*4]
+    ; third line loaded in main loop below
+
+    ; main y loop for vertical filter
+.loop_load:
+    ; load one line into m6. if that pixel is no longer available, do
+    ; nothing, since m6 still has the data from the previous line in it. We
+    ; try to structure the loop so that the common case is evaluated fastest
+    mova          m6, [mptrq+384*6]
+.loop:
+    paddw         m7, m0, m6
+    paddw         m8, m1, m5
+    paddw         m9, m2, m4
+    punpcklwd    m10, m7, m8
+    punpckhwd     m7, m8
+    punpcklwd    m11, m9, m3
+    punpckhwd     m9, m3
+    pmaddwd      m10, m15
+    pmaddwd       m7, m15
+    pmaddwd      m11, m14
+    pmaddwd       m9, m14
+    paddd        m10, m11
+    paddd         m7, m9
+    paddd        m10, m12
+    paddd         m7, m12
+    psrad        m10, 11
+    psrad         m7, 11
+    packssdw     m10, m7
+    packuswb     m10, m10
+    vpermq       m10, m10, q3120
+    mova   [dstptrq], xm10
+    ; shift pixels one position
+    mova          m0, m1
+    mova          m1, m2
+    mova          m2, m3
+    mova          m3, m4
+    mova          m4, m5
+    mova          m5, m6
+    add      dstptrq, strideq
+    add        mptrq, 384*2
+    dec           yd
+    jg .loop_load
+    ; for the bottom pixels, continue using m6 (as extended edge)
+    cmp           yd, ylimd
+    jg .loop
+
+    add         dstq, 16
+    add         midq, 32
+    sub           wd, 16
+    jg .loop_x
+    RET
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/looprestoration_init.c
@@ -1,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if BITDEPTH == 8 && ARCH_X86_64
+void dav1d_wiener_filter_h_avx2(int16_t *dst, const pixel (*left)[4],
+                                const pixel *src, ptrdiff_t stride,
+                                const int16_t fh[7], const intptr_t w,
+                                int h, enum LrEdgeFlags edges);
+void dav1d_wiener_filter_v_avx2(pixel *dst, ptrdiff_t stride,
+                                const int16_t *mid, int w, int h,
+                                const int16_t fv[7], enum LrEdgeFlags edges);
+
+// Future potential optimizations:
+// - special chroma versions which don't filter [0]/[6];
+// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
+//   to bottom) instead of scanline-ordered should be faster since then the
+//   if (have_left) and similar conditions run only once instead of per line;
+// - filter_v_avx2 currently runs 16 pixels per iteration, it should be possible
+//   to run 32 (like filter_h_avx2), and then all vpermqs can go;
+// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2,
+//   since then the have_left condition can be inlined;
+// - consider having the wrapper (wiener_filter_avx2) also in hand-written
+//   assembly, so the setup overhead is minimized.
+
+static void wiener_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride,
+                               const pixel (*const left)[4],
+                               const pixel *lpf, const ptrdiff_t lpf_stride,
+                               const int w, const int h, const int16_t fh[7],
+                               const int16_t fv[7], const enum LrEdgeFlags edges)
+{
+    ALIGN_STK_32(int16_t, mid, 68 * 384,);
+
+    // horizontal filter
+    dav1d_wiener_filter_h_avx2(&mid[2 * 384], left, dst, dst_stride,
+                               fh, w, h, edges);
+    if (edges & LR_HAVE_TOP)
+        dav1d_wiener_filter_h_avx2(mid, NULL, lpf, lpf_stride,
+                                   fh, w, 2, edges);
+    if (edges & LR_HAVE_BOTTOM)
+        dav1d_wiener_filter_h_avx2(&mid[(2 + h) * 384], NULL,
+                                   lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
+                                   fh, w, 2, edges);
+
+    dav1d_wiener_filter_v_avx2(dst, dst_stride, &mid[2*384], w, h, fv, edges);
+}
+#endif
+
+void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->wiener = wiener_filter_avx2;
+#endif
+}
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -67,6 +67,8 @@
     { "itx_10bpc", checkasm_check_itx_10bpc },
     { "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
     { "loopfilter_10bpc", checkasm_check_loopfilter_10bpc },
+    { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
+    { "looprestoration_10bpc", checkasm_check_looprestoration_10bpc },
     { "mc_8bpc", checkasm_check_mc_8bpc },
     { "mc_10bpc", checkasm_check_mc_10bpc },
     { 0 }
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -45,6 +45,9 @@
 void checkasm_check_loopfilter_8bpc(void);
 void checkasm_check_loopfilter_10bpc(void);
 
+void checkasm_check_looprestoration_8bpc(void);
+void checkasm_check_looprestoration_10bpc(void);
+
 void checkasm_check_mc_8bpc(void);
 void checkasm_check_mc_10bpc(void);
 
--- /dev/null
+++ b/tests/checkasm/looprestoration.c
@@ -1,0 +1,127 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+
+#include "src/levels.h"
+#include "src/looprestoration.h"
+
+static void init_tmp(pixel *buf, const ptrdiff_t stride,
+                     const int w, const int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            buf[x] = rand() & ((1 << BITDEPTH) - 1);
+        buf += PXSTRIDE(stride);
+    }
+}
+
+static int cmp2d(const pixel *a, const pixel *b, const ptrdiff_t stride,
+                 const int w, const int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            if (a[x] != b[x]) return (y << 16) | x;
+        a += PXSTRIDE(stride);
+        b += PXSTRIDE(stride);
+    }
+    return -1;
+}
+
+static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
+    ALIGN_STK_32(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_32(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+    pixel left[64][4];
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+                 const pixel (*const left)[4],
+                 const pixel *lpf, ptrdiff_t lpf_stride,
+                 int w, int h, const int16_t filterh[7],
+                 const int16_t filterv[7], enum LrEdgeFlags edges);
+
+    init_tmp(c_dst, 448 * sizeof(pixel), 448, 64);
+    init_tmp(h_edge, 448 * sizeof(pixel), 448, 8);
+    init_tmp(left, 4 * sizeof(pixel), 4, 64);
+
+    for (int pl = 0; pl < 2; pl++) {
+        if (check_func(c->wiener, "wiener_%s_%dbpc",
+                       pl ? "chroma" : "luma", BITDEPTH))
+        {
+            int16_t filter[2][3], filter_v[7], filter_h[7];
+
+            filter[0][0] = pl ? 0 : (rand() & 15) - 5;
+            filter[0][1] = (rand() & 31) - 23;
+            filter[0][2] = (rand() & 63) - 17;
+            filter[1][0] = pl ? 0 : (rand() & 15) - 5;
+            filter[1][1] = (rand() & 31) - 23;
+            filter[1][2] = (rand() & 63) - 17;
+
+            filter_h[0] = filter_h[6] = filter[0][0];
+            filter_h[1] = filter_h[5] = filter[0][1];
+            filter_h[2] = filter_h[4] = filter[0][2];
+            filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2);
+
+            filter_v[0] = filter_v[6] = filter[1][0];
+            filter_v[1] = filter_v[5] = filter[1][1];
+            filter_v[2] = filter_v[4] = filter[1][2];
+            filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2);
+
+            const int base_w = 1 + (rand() % 384);
+            const int base_h = 1 + (rand() & 63);
+            for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
+                const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
+                const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
+
+                memcpy(a_dst, c_dst, sizeof(c_dst));
+
+                call_ref(c_dst + 32, 448 * sizeof(pixel), left,
+                         h_edge + 32, 448 * sizeof(pixel),
+                         w, h, filter_h, filter_v, edges);
+                call_new(a_dst + 32, 448 * sizeof(pixel), left,
+                         h_edge + 32, 448 * sizeof(pixel),
+                         w, h, filter_h, filter_v, edges);
+                const int res = cmp2d(c_dst + 32, a_dst + 32, 448 * sizeof(pixel), w, h);
+                if (res != -1) fail();
+            }
+            bench_new(a_dst + 32, 448 * sizeof(pixel), left,
+                      h_edge + 32, 448 * sizeof(pixel),
+                      256, 64, filter_h, filter_v, 0xf);
+        }
+    }
+    report("wiener");
+}
+
+void bitfn(checkasm_check_looprestoration)(void) {
+    Dav1dLoopRestorationDSPContext c;
+
+    bitfn(dav1d_loop_restoration_dsp_init)(&c);
+
+    check_wiener(&c);
+}
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -38,6 +38,7 @@
         'checkasm/ipred.c',
         'checkasm/itx.c',
         'checkasm/loopfilter.c',
+        'checkasm/looprestoration.c',
         'checkasm/mc.c',
     )