shithub: dav1d

ref: e1be33b9c8cb20c62b26b9e3f02d206ddf54a80e
parent: c58e9d576c4eaf393f6751ea6375803acd5dec81
author: Martin Storsjö <martin@martin.st>
date: Sun Feb 9 18:39:11 EST 2020

arm32: looprestoration: Prepare for 16 bpc by splitting code to separate files

looprestoration_common.S contains functions that can be used as-is, with
a single instantiation of each function serving both 8 and 16 bpc. This
file will be built once, regardless of which bitdepths are enabled.
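
The functions collected there operate on the intermediate int32_t sumsq /
int16_t sum buffers rather than on pixel data, which is why one instantiation
suffices; their prototypes, as documented in the assembly comments further
down in this patch, are:

    // void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
    //                            const int w, const int h,
    //                            const enum LrEdgeFlags edges);
    // void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
    //                            const int w, const int h,
    //                            const enum LrEdgeFlags edges);
    // void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
    //                              const int w, const int h, const int strength);
    // void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
    //                              const int w, const int h, const int strength);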

looprestoration_tmpl.S contains functions where the source can be shared
and templated between 8 and 16 bpc. It will be included by the separate
8 and 16 bpc implementation files.
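
As a rough sketch of the intended use (the 16 bpc file name and contents
below are hypothetical, not part of this patch), a 16 bpc implementation
file would mirror the 8 bpc one: define SUM_STRIDE, provide its
bitdepth-specific functions, and then pull in the shared template, just as
looprestoration.S does in the hunk below:

    // src/arm/32/looprestoration16.S (hypothetical)
    #include "src/arm/asm.S"
    #include "util.S"

    #define SUM_STRIDE (384+16)

    // ... 16 bpc versions of the bitdepth-specific functions ...

    #include "looprestoration_tmpl.S"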

--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -676,6 +676,8 @@
 
 #define SUM_STRIDE (384+16)
 
+#include "looprestoration_tmpl.S"
+
 // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
 //                                 const pixel (*left)[4],
 //                                 const pixel *src, const ptrdiff_t stride,
@@ -1236,863 +1238,4 @@
         vpop            {q4-q7}
         pop             {r4-r11,pc}
 .purgem add5
-endfunc
-
-// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
-//                            const int w, const int h,
-//                            const enum LrEdgeFlags edges);
-function sgr_box3_v_neon, export=1
-        push            {r4-r9,lr}
-        ldr             r4,  [sp, #28]
-        add             r12, r3,  #2 // Number of output rows to move back
-        mov             lr,  r3      // Number of input rows to move back
-        add             r2,  r2,  #2 // Actual summed width
-        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
-        mov             r8,       #(2*SUM_STRIDE) // sum stride
-        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
-        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
-
-        tst             r4,  #4 // LR_HAVE_TOP
-        beq             0f
-        // If have top, read from row -2.
-        sub             r5,  r0,  #(4*SUM_STRIDE)
-        sub             r6,  r1,  #(2*SUM_STRIDE)
-        add             lr,  lr,  #2
-        b               1f
-0:
-        // !LR_HAVE_TOP
-        // If we don't have top, read from row 0 even if
-        // we start writing to row -1.
-        add             r5,  r0,  #(4*SUM_STRIDE)
-        add             r6,  r1,  #(2*SUM_STRIDE)
-1:
-
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        beq             1f
-        // LR_HAVE_BOTTOM
-        add             r3,  r3,  #2  // Sum all h+2 lines with the main loop
-        add             lr,  lr,  #2
-1:
-        mov             r9,  r3       // Backup of h for next loops
-
-1:
-        // Start of horizontal loop; start one vertical filter slice.
-        // Start loading rows into q8-q13 and q0-q2 taking top
-        // padding into consideration.
-        tst             r4,  #4 // LR_HAVE_TOP
-        vld1.32         {q8,  q9},  [r5, :128], r7
-        vld1.16         {q0},       [r6, :128], r8
-        beq             2f
-        // LR_HAVE_TOP
-        vld1.32         {q10, q11}, [r5, :128], r7
-        vld1.16         {q1},       [r6, :128], r8
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q2},       [r6, :128], r8
-        b               3f
-2:      // !LR_HAVE_TOP
-        vmov            q10, q8
-        vmov            q11, q9
-        vmov            q1,  q0
-        vmov            q12, q8
-        vmov            q13, q9
-        vmov            q2,  q0
-
-3:
-        subs            r3,  r3,  #1
-.macro add3
-        vadd.i32        q8,  q8,  q10
-        vadd.i32        q9,  q9,  q11
-        vadd.i16        q0,  q0,  q1
-        vadd.i32        q8,  q8,  q12
-        vadd.i32        q9,  q9,  q13
-        vadd.i16        q0,  q0,  q2
-        vst1.32         {q8, q9}, [r0, :128], r7
-        vst1.16         {q0},     [r1, :128], r8
-.endm
-        add3
-        vmov            q8,  q10
-        vmov            q9,  q11
-        vmov            q0,  q1
-        vmov            q10, q12
-        vmov            q11, q13
-        vmov            q1,  q2
-        ble             4f
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q2},       [r6, :128], r8
-        b               3b
-
-4:
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        bne             5f
-        // !LR_HAVE_BOTTOM
-        // Produce two more rows, extending the already loaded rows.
-        add3
-        vmov            q8,  q10
-        vmov            q9,  q11
-        vmov            q0,  q1
-        add3
-
-5:      // End of one vertical slice.
-        subs            r2,  r2,  #8
-        ble             0f
-        // Move pointers back up to the top and loop horizontally.
-        // Input pointers
-        mls             r5,  r7,  lr,  r5
-        mls             r6,  r8,  lr,  r6
-        // Output pointers
-        mls             r0,  r7,  r12, r0
-        mls             r1,  r8,  r12, r1
-        add             r0,  r0,  #32
-        add             r1,  r1,  #16
-        add             r5,  r5,  #32
-        add             r6,  r6,  #16
-        mov             r3,  r9
-        b               1b
-
-0:
-        pop             {r4-r9,pc}
-.purgem add3
-endfunc
-
-// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
-//                            const int w, const int h,
-//                            const enum LrEdgeFlags edges);
-function sgr_box5_v_neon, export=1
-        push            {r4-r9,lr}
-        vpush           {q5-q7}
-        ldr             r4,  [sp, #76]
-        add             r12, r3,  #2 // Number of output rows to move back
-        mov             lr,  r3      // Number of input rows to move back
-        add             r2,  r2,  #8 // Actual summed width
-        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
-        mov             r8,       #(2*SUM_STRIDE) // sum stride
-        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
-        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
-
-        tst             r4,  #4 // LR_HAVE_TOP
-        beq             0f
-        // If have top, read from row -2.
-        sub             r5,  r0,  #(4*SUM_STRIDE)
-        sub             r6,  r1,  #(2*SUM_STRIDE)
-        add             lr,  lr,  #2
-        b               1f
-0:
-        // !LR_HAVE_TOP
-        // If we don't have top, read from row 0 even if
-        // we start writing to row -1.
-        add             r5,  r0,  #(4*SUM_STRIDE)
-        add             r6,  r1,  #(2*SUM_STRIDE)
-1:
-
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        beq             0f
-        // LR_HAVE_BOTTOM
-        add             r3,  r3,  #2  // Handle h+2 lines with the main loop
-        add             lr,  lr,  #2
-        b               1f
-0:
-        // !LR_HAVE_BOTTOM
-        sub             r3,  r3,  #1  // Handle h-1 lines with the main loop
-1:
-        mov             r9,  r3       // Backup of h for next loops
-
-1:
-        // Start of horizontal loop; start one vertical filter slice.
-        // Start loading rows into q6-q15 and q0-q3,q5 taking top
-        // padding into consideration.
-        tst             r4,  #4 // LR_HAVE_TOP
-        vld1.32         {q6,  q7},  [r5, :128], r7
-        vld1.16         {q0},       [r6, :128], r8
-        beq             2f
-        // LR_HAVE_TOP
-        vld1.32         {q10, q11}, [r5, :128], r7
-        vld1.16         {q2},       [r6, :128], r8
-        vmov            q8,  q6
-        vmov            q9,  q7
-        vmov            q1,  q0
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q3},       [r6, :128], r8
-        b               3f
-2:      // !LR_HAVE_TOP
-        vmov            q8,  q6
-        vmov            q9,  q7
-        vmov            q1,  q0
-        vmov            q10, q6
-        vmov            q11, q7
-        vmov            q2,  q0
-        vmov            q12, q6
-        vmov            q13, q7
-        vmov            q3,  q0
-
-3:
-        cmp             r3,  #0
-        beq             4f
-        vld1.32         {q14, q15}, [r5, :128], r7
-        vld1.16         {q5},       [r6, :128], r8
-
-3:
-        // Start of vertical loop
-        subs            r3,  r3,  #2
-.macro add5
-        vadd.i32        q6,  q6,  q8
-        vadd.i32        q7,  q7,  q9
-        vadd.i16        q0,  q0,  q1
-        vadd.i32        q6,  q6,  q10
-        vadd.i32        q7,  q7,  q11
-        vadd.i16        q0,  q0,  q2
-        vadd.i32        q6,  q6,  q12
-        vadd.i32        q7,  q7,  q13
-        vadd.i16        q0,  q0,  q3
-        vadd.i32        q6,  q6,  q14
-        vadd.i32        q7,  q7,  q15
-        vadd.i16        q0,  q0,  q5
-        vst1.32         {q6, q7}, [r0, :128], r7
-        vst1.16         {q0},     [r1, :128], r8
-.endm
-        add5
-.macro shift2
-        vmov            q6,  q10
-        vmov            q7,  q11
-        vmov            q0,  q2
-        vmov            q8,  q12
-        vmov            q9,  q13
-        vmov            q1,  q3
-        vmov            q10, q14
-        vmov            q11, q15
-        vmov            q2,  q5
-.endm
-        shift2
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        ble             5f
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q3},       [r6, :128], r8
-        vld1.32         {q14, q15}, [r5, :128], r7
-        vld1.16         {q5},       [r6, :128], r8
-        b               3b
-
-4:
-        // h == 1, !LR_HAVE_BOTTOM.
-        // Pad the last row with the only content row, and add.
-        vmov            q14, q12
-        vmov            q15, q13
-        vmov            q5,  q3
-        add5
-        shift2
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        add5
-        b               6f
-
-5:
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        bne             6f
-        // !LR_HAVE_BOTTOM
-        cmp             r3,  #0
-        bne             5f
-        // The intended three edge rows left; output the one at h-2 and
-        // the past edge one at h.
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q3},       [r6, :128], r8
-        // Pad the past-edge row from the last content row.
-        vmov            q14, q12
-        vmov            q15, q13
-        vmov            q5,  q3
-        add5
-        shift2
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        // The last two rows are already padded properly here.
-        add5
-        b               6f
-
-5:
-        // r3 == -1, two rows left, output one.
-        // Pad the last two rows from the mid one.
-        vmov            q12, q10
-        vmov            q13, q11
-        vmov            q3,  q2
-        vmov            q14, q10
-        vmov            q15, q11
-        vmov            q5,  q2
-        add5
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        b               6f
-
-6:      // End of one vertical slice.
-        subs            r2,  r2,  #8
-        ble             0f
-        // Move pointers back up to the top and loop horizontally.
-        // Input pointers
-        mls             r5,  r7,  lr,  r5
-        mls             r6,  r8,  lr,  r6
-        // Output pointers
-        mls             r0,  r7,  r12, r0
-        mls             r1,  r8,  r12, r1
-        add             r0,  r0,  #32
-        add             r1,  r1,  #16
-        add             r5,  r5,  #32
-        add             r6,  r6,  #16
-        mov             r3,  r9
-        b               1b
-
-0:
-        vpop            {q5-q7}
-        pop             {r4-r9,pc}
-.purgem add5
-endfunc
-
-// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
-//                              const int w, const int h, const int strength);
-// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
-//                              const int w, const int h, const int strength);
-function sgr_calc_ab1_neon, export=1
-        push            {r4-r5,lr}
-        vpush           {q4-q7}
-        ldr             r4,  [sp, #76]
-        add             r3,  r3,  #2   // h += 2
-        vmov.i32        q15, #9        // n
-        movw            r5,  #455
-        mov             lr,  #SUM_STRIDE
-        b               sgr_calc_ab_neon
-endfunc
-
-function sgr_calc_ab2_neon, export=1
-        push            {r4-r5,lr}
-        vpush           {q4-q7}
-        ldr             r4,  [sp, #76]
-        add             r3,  r3,  #3   // h += 3
-        asr             r3,  r3,  #1   // h /= 2
-        vmov.i32        q15, #25       // n
-        mov             r5,  #164
-        mov             lr,  #(2*SUM_STRIDE)
-endfunc
-
-function sgr_calc_ab_neon
-        movrel          r12, X(sgr_x_by_x)
-        vld1.8          {q8, q9}, [r12, :128]!
-        vmov.i8         q11, #5
-        vmov.i8         d10, #55       // idx of last 5
-        vld1.8          {q10},    [r12, :128]
-        vmov.i8         d11, #72       // idx of last 4
-        vmov.i8         d12, #101      // idx of last 3
-        vmov.i8         d13, #169      // idx of last 2
-        vmov.i8         d14, #254      // idx of last 1
-        vmov.i8         d15, #32       // elements consumed in first vtbl
-        add             r2,  r2,  #2   // w += 2
-        add             r12, r2,  #7
-        bic             r12, r12, #7   // aligned w
-        sub             r12, lr,  r12  // increment between rows
-        vmov.i16        q13, #256
-        vdup.32         q12, r4
-        vdup.32         q14, r5        // one_by_x
-        sub             r0,  r0,  #(4*(SUM_STRIDE))
-        sub             r1,  r1,  #(2*(SUM_STRIDE))
-        mov             r4,  r2        // backup of w
-        vsub.i8         q8,  q8,  q11
-        vsub.i8         q9,  q9,  q11
-        vsub.i8         q10, q10, q11
-1:
-        subs            r2,  r2,  #8
-        vld1.32         {q0, q1}, [r0, :128] // a
-        vld1.16         {q2},     [r1, :128] // b
-        vmul.i32        q0,  q0,  q15  // a * n
-        vmul.i32        q1,  q1,  q15  // a * n
-        vmull.u16       q3,  d4,  d4   // b * b
-        vmull.u16       q4,  d5,  d5   // b * b
-        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
-        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
-        vmul.i32        q0,  q0,  q12  // p * s
-        vmul.i32        q1,  q1,  q12  // p * s
-        vqshrn.u32      d0,  q0,  #16
-        vqshrn.u32      d1,  q1,  #16
-        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)
-
-        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
-        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
-        vtbl.8          d1,  {q8, q9}, d0
-        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
-        vsub.i8         d9,  d0,  d15  // indices for vtbx
-        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
-        vadd.i8         d2,  d2,  d3
-        vtbx.8          d1,  {q10}, d9
-        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
-        vadd.i8         d6,  d6,  d7
-        vadd.i8         d8,  d8,  d22
-        vadd.i8         d2,  d2,  d6
-        vadd.i8         d1,  d1,  d8
-        vadd.i8         d1,  d1,  d2
-        vmovl.u8        q0,  d1        // x
-
-        vmull.u16       q1,  d0,  d4   // x * BB[i]
-        vmull.u16       q2,  d1,  d5   // x * BB[i]
-        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
-        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
-        vrshr.s32       q1,  q1,  #12  // AA[i]
-        vrshr.s32       q2,  q2,  #12  // AA[i]
-        vsub.i16        q0,  q13, q0   // 256 - x
-
-        vst1.32         {q1, q2}, [r0, :128]!
-        vst1.16         {q0},     [r1, :128]!
-        bgt             1b
-
-        subs            r3,  r3,  #1
-        ble             0f
-        add             r0,  r0,  r12, lsl #2
-        add             r1,  r1,  r12, lsl #1
-        mov             r2,  r4
-        b               1b
-0:
-        vpop            {q4-q7}
-        pop             {r4-r5,pc}
-endfunc
-
-#define FILTER_OUT_STRIDE 384
-
-// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
-//                                         const pixel *src, const ptrdiff_t stride,
-//                                         const int32_t *a, const int16_t *b,
-//                                         const int w, const int h);
-function sgr_finish_filter1_8bpc_neon, export=1
-        push            {r4-r11,lr}
-        vpush           {q4-q7}
-        ldrd            r4,  r5,  [sp, #100]
-        ldr             r6,  [sp, #108]
-        sub             r7,  r3,  #(4*SUM_STRIDE)
-        add             r8,  r3,  #(4*SUM_STRIDE)
-        sub             r9,  r4,  #(2*SUM_STRIDE)
-        add             r10, r4,  #(2*SUM_STRIDE)
-        mov             r11, #SUM_STRIDE
-        mov             r12, #FILTER_OUT_STRIDE
-        add             lr,  r5,  #3
-        bic             lr,  lr,  #3 // Aligned width
-        sub             r2,  r2,  lr
-        sub             r12, r12, lr
-        sub             r11, r11, lr
-        sub             r11, r11, #4 // We read 4 extra elements from both a and b
-        mov             lr,  r5
-        vmov.i16        q14, #3
-        vmov.i32        q15, #3
-1:
-        vld1.16         {q0},       [r9,  :128]!
-        vld1.16         {q1},       [r4,  :128]!
-        vld1.16         {q2},       [r10, :128]!
-        vld1.32         {q8,  q9},  [r7,  :128]!
-        vld1.32         {q10, q11}, [r3,  :128]!
-        vld1.32         {q12, q13}, [r8,  :128]!
-
-2:
-        subs            r5,  r5,  #4
-        vext.8          d6,  d0,  d1,  #2  // -stride
-        vext.8          d7,  d2,  d3,  #2  // 0
-        vext.8          d8,  d4,  d5,  #2  // +stride
-        vext.8          d9,  d0,  d1,  #4  // +1-stride
-        vext.8          d10, d2,  d3,  #4  // +1
-        vext.8          d11, d4,  d5,  #4  // +1+stride
-        vadd.i16        d2,  d2,  d6       // -1, -stride
-        vadd.i16        d7,  d7,  d8       // 0, +stride
-        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
-        vadd.i16        d2,  d2,  d7
-        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
-        vadd.i16        d2,  d2,  d10      // +1
-        vadd.i16        d0,  d0,  d4
-
-        vext.8          q3,  q8,  q9,  #4  // -stride
-        vshl.i16        d2,  d2,  #2
-        vext.8          q4,  q8,  q9,  #8  // +1-stride
-        vext.8          q5,  q10, q11, #4  // 0
-        vext.8          q6,  q10, q11, #8  // +1
-        vmla.i16        d2,  d0,  d28      // * 3 -> a
-        vadd.i32        q3,  q3,  q10      // -stride, -1
-        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
-        vadd.i32        q5,  q5,  q6       // 0, +1
-        vadd.i32        q8,  q8,  q12      // -1+stride
-        vadd.i32        q3,  q3,  q5
-        vext.8          q7,  q12, q13, #4  // +stride
-        vext.8          q10, q12, q13, #8  // +1+stride
-        vld1.32         {d24[0]}, [r1, :32]! // src
-        vadd.i32        q3,  q3,  q7       // +stride
-        vadd.i32        q8,  q8,  q10      // +1+stride
-        vshl.i32        q3,  q3,  #2
-        vmla.i32        q3,  q8,  q15      // * 3 -> b
-        vmovl.u8        q12, d24           // src
-        vmov            d0,  d1
-        vmlal.u16       q3,  d2,  d24      // b + a * src
-        vmov            d2,  d3
-        vrshrn.i32      d6,  q3,  #9
-        vmov            d4,  d5
-        vst1.16         {d6}, [r0]!
-
-        ble             3f
-        vmov            q8,  q9
-        vmov            q10, q11
-        vmov            q12, q13
-        vld1.16         {d1},  [r9,  :64]!
-        vld1.16         {d3},  [r4,  :64]!
-        vld1.16         {d5},  [r10, :64]!
-        vld1.32         {q9},  [r7,  :128]!
-        vld1.32         {q11}, [r3,  :128]!
-        vld1.32         {q13}, [r8,  :128]!
-        b               2b
-
-3:
-        subs            r6,  r6,  #1
-        ble             0f
-        mov             r5,  lr
-        add             r0,  r0,  r12, lsl #1
-        add             r1,  r1,  r2
-        add             r3,  r3,  r11, lsl #2
-        add             r7,  r7,  r11, lsl #2
-        add             r8,  r8,  r11, lsl #2
-        add             r4,  r4,  r11, lsl #1
-        add             r9,  r9,  r11, lsl #1
-        add             r10, r10, r11, lsl #1
-        b               1b
-0:
-        vpop            {q4-q7}
-        pop             {r4-r11,pc}
-endfunc
-
-// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
-//                                         const pixel *src, const ptrdiff_t stride,
-//                                         const int32_t *a, const int16_t *b,
-//                                         const int w, const int h);
-function sgr_finish_filter2_8bpc_neon, export=1
-        push            {r4-r11,lr}
-        vpush           {q4-q7}
-        ldrd            r4,  r5,  [sp, #100]
-        ldr             r6,  [sp, #108]
-        add             r7,  r3,  #(4*(SUM_STRIDE))
-        sub             r3,  r3,  #(4*(SUM_STRIDE))
-        add             r8,  r4,  #(2*(SUM_STRIDE))
-        sub             r4,  r4,  #(2*(SUM_STRIDE))
-        mov             r9,  #(2*SUM_STRIDE)
-        mov             r10, #FILTER_OUT_STRIDE
-        add             r11, r5,  #7
-        bic             r11, r11, #7 // Aligned width
-        sub             r2,  r2,  r11
-        sub             r10, r10, r11
-        sub             r9,  r9,  r11
-        sub             r9,  r9,  #4 // We read 4 extra elements from a
-        sub             r12, r9,  #4 // We read 8 extra elements from b
-        mov             lr,  r5
-
-1:
-        vld1.16         {q0,  q1},  [r4, :128]!
-        vld1.16         {q2,  q3},  [r8, :128]!
-        vld1.32         {q8,  q9},  [r3, :128]!
-        vld1.32         {q11, q12}, [r7, :128]!
-        vld1.32         {q10},      [r3, :128]!
-        vld1.32         {q13},      [r7, :128]!
-
-2:
-        vmov.i16        q14, #5
-        vmov.i16        q15, #6
-        subs            r5,  r5,  #8
-        vext.8          q4,  q0,  q1,  #4  // +1-stride
-        vext.8          q5,  q2,  q3,  #4  // +1+stride
-        vext.8          q6,  q0,  q1,  #2  // -stride
-        vext.8          q7,  q2,  q3,  #2  // +stride
-        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
-        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
-        vadd.i16        q2,  q6,  q7       // -stride, +stride
-        vadd.i16        q0,  q0,  q5
-
-        vext.8          q4,  q8,  q9,  #8  // +1-stride
-        vext.8          q5,  q9,  q10, #8
-        vext.8          q6,  q11, q12, #8  // +1+stride
-        vext.8          q7,  q12, q13, #8
-        vmul.i16        q0,  q0,  q14      // * 5
-        vmla.i16        q0,  q2,  q15      // * 6
-        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
-        vadd.i32        q5,  q5,  q9
-        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
-        vadd.i32        q7,  q7,  q12
-        vadd.i32        q4,  q4,  q6
-        vadd.i32        q5,  q5,  q7
-        vext.8          q6,  q8,  q9,  #4  // -stride
-        vext.8          q7,  q9,  q10, #4
-        vext.8          q8,  q11, q12, #4  // +stride
-        vext.8          q11, q12, q13, #4
-
-        vld1.8          {d4}, [r1, :64]!
-
-        vmov.i32        q14, #5
-        vmov.i32        q15, #6
-
-        vadd.i32        q6,  q6,  q8       // -stride, +stride
-        vadd.i32        q7,  q7,  q11
-        vmul.i32        q4,  q4,  q14      // * 5
-        vmla.i32        q4,  q6,  q15      // * 6
-        vmul.i32        q5,  q5,  q14      // * 5
-        vmla.i32        q5,  q7,  q15      // * 6
-
-        vmovl.u8        q2,  d4
-        vmlal.u16       q4,  d0,  d4       // b + a * src
-        vmlal.u16       q5,  d1,  d5       // b + a * src
-        vmov            q0,  q1
-        vrshrn.i32      d8,  q4,  #9
-        vrshrn.i32      d9,  q5,  #9
-        vmov            q2,  q3
-        vst1.16         {q4}, [r0, :128]!
-
-        ble             3f
-        vmov            q8,  q10
-        vmov            q11, q13
-        vld1.16         {q1},       [r4, :128]!
-        vld1.16         {q3},       [r8, :128]!
-        vld1.32         {q9,  q10}, [r3, :128]!
-        vld1.32         {q12, q13}, [r7, :128]!
-        b               2b
-
-3:
-        subs            r6,  r6,  #1
-        ble             0f
-        mov             r5,  lr
-        add             r0,  r0,  r10, lsl #1
-        add             r1,  r1,  r2
-        add             r3,  r3,  r9,  lsl #2
-        add             r7,  r7,  r9,  lsl #2
-        add             r4,  r4,  r12, lsl #1
-        add             r8,  r8,  r12, lsl #1
-
-        vld1.32         {q8, q9}, [r3, :128]!
-        vld1.16         {q0, q1}, [r4, :128]!
-        vld1.32         {q10},    [r3, :128]!
-
-        vmov.i16        q12, #5
-        vmov.i16        q13, #6
-
-4:
-        subs            r5,  r5,  #8
-        vext.8          q3,  q0,  q1,  #4  // +1
-        vext.8          q2,  q0,  q1,  #2  // 0
-        vadd.i16        q0,  q0,  q3       // -1, +1
-
-        vext.8          q4,  q8,  q9,  #4  // 0
-        vext.8          q5,  q9,  q10, #4
-        vext.8          q6,  q8,  q9,  #8  // +1
-        vext.8          q7,  q9,  q10, #8
-        vmul.i16        q2,  q2,  q13      // * 6
-        vmla.i16        q2,  q0,  q12      // * 5 -> a
-        vld1.8          {d22}, [r1, :64]!
-        vadd.i32        q8,  q8,  q6       // -1, +1
-        vadd.i32        q9,  q9,  q7
-        vmovl.u8        q11, d22
-        vmul.i32        q4,  q4,  q15      // * 6
-        vmla.i32        q4,  q8,  q14      // * 5 -> b
-        vmul.i32        q5,  q5,  q15      // * 6
-        vmla.i32        q5,  q9,  q14      // * 5 -> b
-
-        vmlal.u16       q4,  d4,  d22      // b + a * src
-        vmlal.u16       q5,  d5,  d23
-        vmov            q0,  q1
-        vrshrn.i32      d8,  q4,  #8
-        vrshrn.i32      d9,  q5,  #8
-        vmov            q8,  q10
-        vst1.16         {q4}, [r0, :128]!
-
-        ble             5f
-        vld1.16         {q1},      [r4, :128]!
-        vld1.32         {q9, q10}, [r3, :128]!
-        b               4b
-
-5:
-        subs            r6,  r6,  #1
-        ble             0f
-        mov             r5,  lr
-        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
-        sub             r4,  r4,  r11, lsl #1
-        add             r0,  r0,  r10, lsl #1
-        add             r1,  r1,  r2
-        sub             r3,  r3,  #16
-        sub             r4,  r4,  #16
-        b               1b
-0:
-        vpop            {q4-q7}
-        pop             {r4-r11,pc}
-endfunc
-
-// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
-//                                    const pixel *src, const ptrdiff_t src_stride,
-//                                    const int16_t *t1, const int w, const int h,
-//                                    const int wt);
-function sgr_weighted1_8bpc_neon, export=1
-        push            {r4-r9,lr}
-        ldrd            r4,  r5,  [sp, #28]
-        ldrd            r6,  r7,  [sp, #36]
-        vdup.16         d31, r7
-        cmp             r6,  #2
-        add             r9,  r0,  r1
-        add             r12, r2,  r3
-        add             lr,  r4,  #2*FILTER_OUT_STRIDE
-        mov             r7,  #(4*FILTER_OUT_STRIDE)
-        lsl             r1,  r1,  #1
-        lsl             r3,  r3,  #1
-        add             r8,  r5,  #7
-        bic             r8,  r8,  #7 // Aligned width
-        sub             r1,  r1,  r8
-        sub             r3,  r3,  r8
-        sub             r7,  r7,  r8, lsl #1
-        mov             r8,  r5
-        blt             2f
-1:
-        vld1.8          {d0},  [r2,  :64]!
-        vld1.8          {d16}, [r12, :64]!
-        vld1.16         {q1},  [r4,  :128]!
-        vld1.16         {q9},  [lr,  :128]!
-        subs            r5,  r5,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vshll.u8        q8,  d16, #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vsub.i16        q9,  q9,  q8     // t1 - u
-        vshll.u16       q2,  d0,  #7     // u << 7
-        vshll.u16       q3,  d1,  #7     // u << 7
-        vshll.u16       q10, d16, #7     // u << 7
-        vshll.u16       q11, d17, #7     // u << 7
-        vmlal.s16       q2,  d2,  d31    // v
-        vmlal.s16       q3,  d3,  d31    // v
-        vmlal.s16       q10, d18, d31    // v
-        vmlal.s16       q11, d19, d31    // v
-        vrshrn.i32      d4,  q2,  #11
-        vrshrn.i32      d5,  q3,  #11
-        vrshrn.i32      d20, q10, #11
-        vrshrn.i32      d21, q11, #11
-        vqmovun.s16     d4,  q2
-        vqmovun.s16     d20, q10
-        vst1.8          {d4},  [r0]!
-        vst1.8          {d20}, [r9]!
-        bgt             1b
-
-        sub             r6,  r6,  #2
-        cmp             r6,  #1
-        blt             0f
-        mov             r5,  r8
-        add             r0,  r0,  r1
-        add             r9,  r9,  r1
-        add             r2,  r2,  r3
-        add             r12, r12, r3
-        add             r4,  r4,  r7
-        add             lr,  lr,  r7
-        beq             2f
-        b               1b
-
-2:
-        vld1.8          {d0}, [r2, :64]!
-        vld1.16         {q1}, [r4, :128]!
-        subs            r5,  r5,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vshll.u16       q2,  d0,  #7     // u << 7
-        vshll.u16       q3,  d1,  #7     // u << 7
-        vmlal.s16       q2,  d2,  d31    // v
-        vmlal.s16       q3,  d3,  d31    // v
-        vrshrn.i32      d4,  q2,  #11
-        vrshrn.i32      d5,  q3,  #11
-        vqmovun.s16     d2,  q2
-        vst1.8          {d2}, [r0]!
-        bgt             2b
-0:
-        pop             {r4-r9,pc}
-endfunc
-
-// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-//                                    const pixel *src, const ptrdiff_t src_stride,
-//                                    const int16_t *t1, const int16_t *t2,
-//                                    const int w, const int h,
-//                                    const int16_t wt[2]);
-function sgr_weighted2_8bpc_neon, export=1
-        push            {r4-r11,lr}
-        ldrd            r4,  r5,  [sp, #36]
-        ldrd            r6,  r7,  [sp, #44]
-        ldr             r8,  [sp, #52]
-        cmp             r7,  #2
-        add             r10, r0,  r1
-        add             r11, r2,  r3
-        add             r12, r4,  #2*FILTER_OUT_STRIDE
-        add             lr,  r5,  #2*FILTER_OUT_STRIDE
-        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
-        mov             r8,  #4*FILTER_OUT_STRIDE
-        lsl             r1,  r1,  #1
-        lsl             r3,  r3,  #1
-        add             r9,  r6,  #7
-        bic             r9,  r9,  #7 // Aligned width
-        sub             r1,  r1,  r9
-        sub             r3,  r3,  r9
-        sub             r8,  r8,  r9, lsl #1
-        mov             r9,  r6
-        blt             2f
-1:
-        vld1.8          {d0},  [r2,  :64]!
-        vld1.8          {d16}, [r11, :64]!
-        vld1.16         {q1},  [r4,  :128]!
-        vld1.16         {q9},  [r12, :128]!
-        vld1.16         {q2},  [r5,  :128]!
-        vld1.16         {q10}, [lr,  :128]!
-        subs            r6,  r6,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vshll.u8        q8,  d16, #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vsub.i16        q2,  q2,  q0     // t2 - u
-        vsub.i16        q9,  q9,  q8     // t1 - u
-        vsub.i16        q10, q10, q8     // t2 - u
-        vshll.u16       q3,  d0,  #7     // u << 7
-        vshll.u16       q0,  d1,  #7     // u << 7
-        vshll.u16       q11, d16, #7     // u << 7
-        vshll.u16       q8,  d17, #7     // u << 7
-        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
-        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
-        vmlal.s16       q11, d18, d30    // wt[0] * (t1 - u)
-        vmlal.s16       q11, d20, d31    // wt[1] * (t2 - u)
-        vmlal.s16       q8,  d19, d30    // wt[0] * (t1 - u)
-        vmlal.s16       q8,  d21, d31    // wt[1] * (t2 - u)
-        vrshrn.i32      d6,  q3,  #11
-        vrshrn.i32      d7,  q0,  #11
-        vrshrn.i32      d22, q11, #11
-        vrshrn.i32      d23, q8,  #11
-        vqmovun.s16     d6,  q3
-        vqmovun.s16     d22, q11
-        vst1.8          {d6},  [r0]!
-        vst1.8          {d22}, [r10]!
-        bgt             1b
-
-        subs            r7,  r7,  #2
-        cmp             r7,  #1
-        blt             0f
-        mov             r6,  r9
-        add             r0,  r0,  r1
-        add             r10, r10, r1
-        add             r2,  r2,  r3
-        add             r11, r11, r3
-        add             r4,  r4,  r8
-        add             r12, r12, r8
-        add             r5,  r5,  r8
-        add             lr,  lr,  r8
-        beq             2f
-        b               1b
-
-2:
-        vld1.8          {d0}, [r2, :64]!
-        vld1.16         {q1}, [r4, :128]!
-        vld1.16         {q2}, [r5, :128]!
-        subs            r6,  r6,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vsub.i16        q2,  q2,  q0     // t2 - u
-        vshll.u16       q3,  d0,  #7     // u << 7
-        vshll.u16       q0,  d1,  #7     // u << 7
-        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
-        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
-        vrshrn.i32      d6,  q3,  #11
-        vrshrn.i32      d7,  q0,  #11
-        vqmovun.s16     d6,  q3
-        vst1.8          {d6}, [r0]!
-        bgt             1b
-0:
-        pop             {r4-r11,pc}
 endfunc
--- /dev/null
+++ b/src/arm/32/looprestoration_common.S
@@ -1,0 +1,441 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28]
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #2 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE)
+        sub             r6,  r1,  #(2*SUM_STRIDE)
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE)
+        add             r6,  r1,  #(2*SUM_STRIDE)
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Sum all h+2 lines with the main loop
+        add             lr,  lr,  #2
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q8-q13 and q0-q2 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP
+        vld1.32         {q8,  q9},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q1},       [r6, :128], r8
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q10, q8
+        vmov            q11, q9
+        vmov            q1,  q0
+        vmov            q12, q8
+        vmov            q13, q9
+        vmov            q2,  q0
+
+3:
+        subs            r3,  r3,  #1
+.macro add3
+        vadd.i32        q8,  q8,  q10
+        vadd.i32        q9,  q9,  q11
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q8,  q8,  q12
+        vadd.i32        q9,  q9,  q13
+        vadd.i16        q0,  q0,  q2
+        vst1.32         {q8, q9}, [r0, :128], r7
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add3
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q0,  q1
+        vmov            q10, q12
+        vmov            q11, q13
+        vmov            q1,  q2
+        ble             4f
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        b               3b
+
+4:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             5f
+        // !LR_HAVE_BOTTOM
+        // Produce two more rows, extending the already loaded rows.
+        add3
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q0,  q1
+        add3
+
+5:      // End of one vertical slice.
+        subs            r2,  r2,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5
+        mls             r6,  r8,  lr,  r6
+        // Output pointers
+        mls             r0,  r7,  r12, r0
+        mls             r1,  r8,  r12, r1
+        add             r0,  r0,  #32
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9
+        b               1b
+
+0:
+        pop             {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+        push            {r4-r9,lr}
+        vpush           {q5-q7}
+        ldr             r4,  [sp, #76]
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #8 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE)
+        sub             r6,  r1,  #(2*SUM_STRIDE)
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE)
+        add             r6,  r1,  #(2*SUM_STRIDE)
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             0f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Handle h+2 lines with the main loop
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_BOTTOM
+        sub             r3,  r3,  #1  // Handle h-1 lines with the main loop
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q6-q15 and q0-q3,q5 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP
+        vld1.32         {q6,  q7},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vmov            q10, q6
+        vmov            q11, q7
+        vmov            q2,  q0
+        vmov            q12, q6
+        vmov            q13, q7
+        vmov            q3,  q0
+
+3:
+        cmp             r3,  #0
+        beq             4f
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+
+3:
+        // Start of vertical loop
+        subs            r3,  r3,  #2
+.macro add5
+        vadd.i32        q6,  q6,  q8
+        vadd.i32        q7,  q7,  q9
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q6,  q6,  q10
+        vadd.i32        q7,  q7,  q11
+        vadd.i16        q0,  q0,  q2
+        vadd.i32        q6,  q6,  q12
+        vadd.i32        q7,  q7,  q13
+        vadd.i16        q0,  q0,  q3
+        vadd.i32        q6,  q6,  q14
+        vadd.i32        q7,  q7,  q15
+        vadd.i16        q0,  q0,  q5
+        vst1.32         {q6, q7}, [r0, :128], r7
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add5
+.macro shift2
+        vmov            q6,  q10
+        vmov            q7,  q11
+        vmov            q0,  q2
+        vmov            q8,  q12
+        vmov            q9,  q13
+        vmov            q1,  q3
+        vmov            q10, q14
+        vmov            q11, q15
+        vmov            q2,  q5
+.endm
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        ble             5f
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+        b               3b
+
+4:
+        // h == 1, !LR_HAVE_BOTTOM.
+        // Pad the last row with the only content row, and add.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        add5
+        b               6f
+
+5:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             6f
+        // !LR_HAVE_BOTTOM
+        cmp             r3,  #0
+        bne             5f
+        // The intended three edge rows left; output the one at h-2 and
+        // the past edge one at h.
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        // Pad the past-edge row from the last content row.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        // The last two rows are already padded properly here.
+        add5
+        b               6f
+
+5:
+        // r3 == -1, two rows left, output one.
+        // Pad the last two rows from the mid one.
+        vmov            q12, q10
+        vmov            q13, q11
+        vmov            q3,  q2
+        vmov            q14, q10
+        vmov            q15, q11
+        vmov            q5,  q2
+        add5
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        b               6f
+
+6:      // End of one vertical slice.
+        subs            r2,  r2,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5
+        mls             r6,  r8,  lr,  r6
+        // Output pointers
+        mls             r0,  r7,  r12, r0
+        mls             r1,  r8,  r12, r1
+        add             r0,  r0,  #32
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9
+        b               1b
+
+0:
+        vpop            {q5-q7}
+        pop             {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+        push            {r4-r5,lr}
+        vpush           {q4-q7}
+        ldr             r4,  [sp, #76]
+        add             r3,  r3,  #2   // h += 2
+        vmov.i32        q15, #9        // n
+        movw            r5,  #455
+        mov             lr,  #SUM_STRIDE
+        b               sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+        push            {r4-r5,lr}
+        vpush           {q4-q7}
+        ldr             r4,  [sp, #76]
+        add             r3,  r3,  #3   // h += 3
+        asr             r3,  r3,  #1   // h /= 2
+        vmov.i32        q15, #25       // n
+        mov             r5,  #164
+        mov             lr,  #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+        movrel          r12, X(sgr_x_by_x)
+        vld1.8          {q8, q9}, [r12, :128]!
+        vmov.i8         q11, #5
+        vmov.i8         d10, #55       // idx of last 5
+        vld1.8          {q10},    [r12, :128]
+        vmov.i8         d11, #72       // idx of last 4
+        vmov.i8         d12, #101      // idx of last 3
+        vmov.i8         d13, #169      // idx of last 2
+        vmov.i8         d14, #254      // idx of last 1
+        vmov.i8         d15, #32       // elements consumed in first vtbl
+        add             r2,  r2,  #2   // w += 2
+        add             r12, r2,  #7
+        bic             r12, r12, #7   // aligned w
+        sub             r12, lr,  r12  // increment between rows
+        vmov.i16        q13, #256
+        vdup.32         q12, r4
+        vdup.32         q14, r5        // one_by_x
+        sub             r0,  r0,  #(4*(SUM_STRIDE))
+        sub             r1,  r1,  #(2*(SUM_STRIDE))
+        mov             r4,  r2        // backup of w
+        vsub.i8         q8,  q8,  q11
+        vsub.i8         q9,  q9,  q11
+        vsub.i8         q10, q10, q11
+1:
+        subs            r2,  r2,  #8
+        vld1.32         {q0, q1}, [r0, :128] // a
+        vld1.16         {q2},     [r1, :128] // b
+        vmul.i32        q0,  q0,  q15  // a * n
+        vmul.i32        q1,  q1,  q15  // a * n
+        vmull.u16       q3,  d4,  d4   // b * b
+        vmull.u16       q4,  d5,  d5   // b * b
+        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
+        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
+        vmul.i32        q0,  q0,  q12  // p * s
+        vmul.i32        q1,  q1,  q12  // p * s
+        vqshrn.u32      d0,  q0,  #16
+        vqshrn.u32      d1,  q1,  #16
+        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)
+
+        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
+        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
+        vtbl.8          d1,  {q8, q9}, d0
+        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
+        vsub.i8         d9,  d0,  d15  // indices for vtbx
+        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
+        vadd.i8         d2,  d2,  d3
+        vtbx.8          d1,  {q10}, d9
+        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
+        vadd.i8         d6,  d6,  d7
+        vadd.i8         d8,  d8,  d22
+        vadd.i8         d2,  d2,  d6
+        vadd.i8         d1,  d1,  d8
+        vadd.i8         d1,  d1,  d2
+        vmovl.u8        q0,  d1        // x
+
+        vmull.u16       q1,  d0,  d4   // x * BB[i]
+        vmull.u16       q2,  d1,  d5   // x * BB[i]
+        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
+        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
+        vrshr.s32       q1,  q1,  #12  // AA[i]
+        vrshr.s32       q2,  q2,  #12  // AA[i]
+        vsub.i16        q0,  q13, q0   // 256 - x
+
+        vst1.32         {q1, q2}, [r0, :128]!
+        vst1.16         {q0},     [r1, :128]!
+        bgt             1b
+
+        subs            r3,  r3,  #1
+        ble             0f
+        add             r0,  r0,  r12, lsl #2
+        add             r1,  r1,  r12, lsl #1
+        mov             r2,  r4
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r5,pc}
+endfunc
--- /dev/null
+++ b/src/arm/32/looprestoration_tmpl.S
@@ -1,0 +1,477 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
+function sgr_finish_filter1_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        sub             r7,  r3,  #(4*SUM_STRIDE)
+        add             r8,  r3,  #(4*SUM_STRIDE)
+        sub             r9,  r4,  #(2*SUM_STRIDE)
+        add             r10, r4,  #(2*SUM_STRIDE)
+        mov             r11, #SUM_STRIDE
+        mov             r12, #FILTER_OUT_STRIDE
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3 // Aligned width
+        sub             r2,  r2,  lr
+        sub             r12, r12, lr
+        sub             r11, r11, lr
+        sub             r11, r11, #4 // We read 4 extra elements from both a and b
+        mov             lr,  r5
+        vmov.i16        q14, #3
+        vmov.i32        q15, #3
+1:
+        vld1.16         {q0},       [r9,  :128]!
+        vld1.16         {q1},       [r4,  :128]!
+        vld1.16         {q2},       [r10, :128]!
+        vld1.32         {q8,  q9},  [r7,  :128]!
+        vld1.32         {q10, q11}, [r3,  :128]!
+        vld1.32         {q12, q13}, [r8,  :128]!
+
+2:
+        subs            r5,  r5,  #4
+        vext.8          d6,  d0,  d1,  #2  // -stride
+        vext.8          d7,  d2,  d3,  #2  // 0
+        vext.8          d8,  d4,  d5,  #2  // +stride
+        vext.8          d9,  d0,  d1,  #4  // +1-stride
+        vext.8          d10, d2,  d3,  #4  // +1
+        vext.8          d11, d4,  d5,  #4  // +1+stride
+        vadd.i16        d2,  d2,  d6       // -1, -stride
+        vadd.i16        d7,  d7,  d8       // 0, +stride
+        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
+        vadd.i16        d2,  d2,  d7
+        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
+        vadd.i16        d2,  d2,  d10      // +1
+        vadd.i16        d0,  d0,  d4
+
+        vext.8          q3,  q8,  q9,  #4  // -stride
+        vshl.i16        d2,  d2,  #2
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q10, q11, #4  // 0
+        vext.8          q6,  q10, q11, #8  // +1
+        vmla.i16        d2,  d0,  d28      // * 3 -> a
+        vadd.i32        q3,  q3,  q10      // -stride, -1
+        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q6       // 0, +1
+        vadd.i32        q8,  q8,  q12      // -1+stride
+        vadd.i32        q3,  q3,  q5
+        vext.8          q7,  q12, q13, #4  // +stride
+        vext.8          q10, q12, q13, #8  // +1+stride
+        vld1.32         {d24[0]}, [r1, :32]! // src
+        vadd.i32        q3,  q3,  q7       // +stride
+        vadd.i32        q8,  q8,  q10      // +1+stride
+        vshl.i32        q3,  q3,  #2
+        vmla.i32        q3,  q8,  q15      // * 3 -> b
+        vmovl.u8        q12, d24           // src
+        vmov            d0,  d1
+        vmlal.u16       q3,  d2,  d24      // b + a * src
+        vmov            d2,  d3
+        vrshrn.i32      d6,  q3,  #9
+        vmov            d4,  d5
+        vst1.16         {d6}, [r0]!
+
+        ble             3f
+        vmov            q8,  q9
+        vmov            q10, q11
+        vmov            q12, q13
+        vld1.16         {d1},  [r9,  :64]!
+        vld1.16         {d3},  [r4,  :64]!
+        vld1.16         {d5},  [r10, :64]!
+        vld1.32         {q9},  [r7,  :128]!
+        vld1.32         {q11}, [r3,  :128]!
+        vld1.32         {q13}, [r8,  :128]!
+        b               2b
+
+3:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        add             r0,  r0,  r12, lsl #1
+        add             r1,  r1,  r2
+        add             r3,  r3,  r11, lsl #2
+        add             r7,  r7,  r11, lsl #2
+        add             r8,  r8,  r11, lsl #2
+        add             r4,  r4,  r11, lsl #1
+        add             r9,  r9,  r11, lsl #1
+        add             r10, r10, r11, lsl #1
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
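+//
+// Two output rows are produced per outer iteration. For the full rows
+// handled below, only the a/b rows at -stride and +stride contribute:
+// weight 5 on the four diagonal neighbours, 6 directly above and below.
+// A rough, illustrative sketch (names are ours, s is the sum stride):
+//
+//   a16 = 5*(b[-1-s] + b[1-s] + b[-1+s] + b[1+s]) + 6*(b[-s] + b[s]);
+//   b32 = the same weighting applied to the int32_t a[] plane;
+//   tmp[x] = (b32 + a16 * src[x] + (1 << 8)) >> 9;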
+function sgr_finish_filter2_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        add             r7,  r3,  #(4*(SUM_STRIDE))
+        sub             r3,  r3,  #(4*(SUM_STRIDE))
+        add             r8,  r4,  #(2*(SUM_STRIDE))
+        sub             r4,  r4,  #(2*(SUM_STRIDE))
+        mov             r9,  #(2*SUM_STRIDE)
+        mov             r10, #FILTER_OUT_STRIDE
+        add             r11, r5,  #7
+        bic             r11, r11, #7 // Aligned width
+        sub             r2,  r2,  r11
+        sub             r10, r10, r11
+        sub             r9,  r9,  r11
+        sub             r9,  r9,  #4 // We read 4 extra elements from a
+        sub             r12, r9,  #4 // We read 8 extra elements from b
+        mov             lr,  r5
+
+1:
+        vld1.16         {q0,  q1},  [r4, :128]!
+        vld1.16         {q2,  q3},  [r8, :128]!
+        vld1.32         {q8,  q9},  [r3, :128]!
+        vld1.32         {q11, q12}, [r7, :128]!
+        vld1.32         {q10},      [r3, :128]!
+        vld1.32         {q13},      [r7, :128]!
+
+2:
+        vmov.i16        q14, #5
+        vmov.i16        q15, #6
+        subs            r5,  r5,  #8
+        vext.8          q4,  q0,  q1,  #4  // +1-stride
+        vext.8          q5,  q2,  q3,  #4  // +1+stride
+        vext.8          q6,  q0,  q1,  #2  // -stride
+        vext.8          q7,  q2,  q3,  #2  // +stride
+        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
+        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
+        vadd.i16        q2,  q6,  q7       // -stride, +stride
+        vadd.i16        q0,  q0,  q5
+
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q9,  q10, #8
+        vext.8          q6,  q11, q12, #8  // +1+stride
+        vext.8          q7,  q12, q13, #8
+        vmul.i16        q0,  q0,  q14      // * 5
+        vmla.i16        q0,  q2,  q15      // * 6
+        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q9
+        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
+        vadd.i32        q7,  q7,  q12
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q7
+        vext.8          q6,  q8,  q9,  #4  // -stride
+        vext.8          q7,  q9,  q10, #4
+        vext.8          q8,  q11, q12, #4  // +stride
+        vext.8          q11, q12, q13, #4
+
+        vld1.8          {d4}, [r1, :64]!
+
+        vmov.i32        q14, #5
+        vmov.i32        q15, #6
+
+        vadd.i32        q6,  q6,  q8       // -stride, +stride
+        vadd.i32        q7,  q7,  q11
+        vmul.i32        q4,  q4,  q14      // * 5
+        vmla.i32        q4,  q6,  q15      // * 6
+        vmul.i32        q5,  q5,  q14      // * 5
+        vmla.i32        q5,  q7,  q15      // * 6
+
+        vmovl.u8        q2,  d4
+        vmlal.u16       q4,  d0,  d4       // b + a * src
+        vmlal.u16       q5,  d1,  d5       // b + a * src
+        vmov            q0,  q1
+        vrshrn.i32      d8,  q4,  #9
+        vrshrn.i32      d9,  q5,  #9
+        vmov            q2,  q3
+        vst1.16         {q4}, [r0, :128]!
+
+        ble             3f
+        vmov            q8,  q10
+        vmov            q11, q13
+        vld1.16         {q1},       [r4, :128]!
+        vld1.16         {q3},       [r8, :128]!
+        vld1.32         {q9,  q10}, [r3, :128]!
+        vld1.32         {q12, q13}, [r7, :128]!
+        b               2b
+
+3:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        add             r3,  r3,  r9,  lsl #2
+        add             r7,  r7,  r9,  lsl #2
+        add             r4,  r4,  r12, lsl #1
+        add             r8,  r8,  r12, lsl #1
+
+        vld1.32         {q8, q9}, [r3, :128]!
+        vld1.16         {q0, q1}, [r4, :128]!
+        vld1.32         {q10},    [r3, :128]!
+
+        vmov.i16        q12, #5
+        vmov.i16        q13, #6
+
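+// The in-between output rows handled here only use the current a/b row:
+// weight 6 on the centre, 5 on the left/right neighbours, with a final
+// rounding shift of 8 instead of 9 (rough sketch, names are ours):
+//
+//   a16 = 6*b[0] + 5*(b[-1] + b[1]);
+//   b32 = 6*a[0] + 5*(a[-1] + a[1]);
+//   tmp[x] = (b32 + a16 * src[x] + (1 << 7)) >> 8;
+//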
+4:
+        subs            r5,  r5,  #8
+        vext.8          q3,  q0,  q1,  #4  // +1
+        vext.8          q2,  q0,  q1,  #2  // 0
+        vadd.i16        q0,  q0,  q3       // -1, +1
+
+        vext.8          q4,  q8,  q9,  #4  // 0
+        vext.8          q5,  q9,  q10, #4
+        vext.8          q6,  q8,  q9,  #8  // +1
+        vext.8          q7,  q9,  q10, #8
+        vmul.i16        q2,  q2,  q13      // * 6
+        vmla.i16        q2,  q0,  q12      // * 5 -> a
+        vld1.8          {d22}, [r1, :64]!
+        vadd.i32        q8,  q8,  q6       // -1, +1
+        vadd.i32        q9,  q9,  q7
+        vmovl.u8        q11, d22
+        vmul.i32        q4,  q4,  q15      // * 6
+        vmla.i32        q4,  q8,  q14      // * 5 -> b
+        vmul.i32        q5,  q5,  q15      // * 6
+        vmla.i32        q5,  q9,  q14      // * 5 -> b
+
+        vmlal.u16       q4,  d4,  d22      // b + a * src
+        vmlal.u16       q5,  d5,  d23
+        vmov            q0,  q1
+        vrshrn.i32      d8,  q4,  #8
+        vrshrn.i32      d9,  q5,  #8
+        vmov            q8,  q10
+        vst1.16         {q4}, [r0, :128]!
+
+        ble             5f
+        vld1.16         {q1},      [r4, :128]!
+        vld1.32         {q9, q10}, [r3, :128]!
+        b               4b
+
+5:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
+        sub             r4,  r4,  r11, lsl #1
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        sub             r3,  r3,  #16
+        sub             r4,  r4,  #16
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int w, const int h,
+//                                    const int wt);
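+//
+// Per pixel this blends the filtered value t1 back with the source,
+// roughly (illustrative sketch, not the reference C):
+//
+//   u      = src[x] << 4;
+//   dst[x] = clip_pixel(((u << 7) + wt * (t1[x] - u) + (1 << 10)) >> 11);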
+function sgr_weighted1_8bpc_neon, export=1
+        push            {r4-r9,lr}
+        ldrd            r4,  r5,  [sp, #28]
+        ldrd            r6,  r7,  [sp, #36]
+        vdup.16         d31, r7
+        cmp             r6,  #2
+        add             r9,  r0,  r1
+        add             r12, r2,  r3
+        add             lr,  r4,  #2*FILTER_OUT_STRIDE
+        mov             r7,  #(4*FILTER_OUT_STRIDE)
+        lsl             r1,  r1,  #1
+        lsl             r3,  r3,  #1
+        add             r8,  r5,  #7
+        bic             r8,  r8,  #7 // Aligned width
+        sub             r1,  r1,  r8
+        sub             r3,  r3,  r8
+        sub             r7,  r7,  r8, lsl #1
+        mov             r8,  r5
+        blt             2f
+1:
+        vld1.8          {d0},  [r2,  :64]!
+        vld1.8          {d16}, [r12, :64]!
+        vld1.16         {q1},  [r4,  :128]!
+        vld1.16         {q9},  [lr,  :128]!
+        subs            r5,  r5,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vshll.u16       q10, d16, #7     // u << 7
+        vshll.u16       q11, d17, #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+        vmlal.s16       q10, d18, d31    // v
+        vmlal.s16       q11, d19, d31    // v
+        vrshrn.i32      d4,  q2,  #11
+        vrshrn.i32      d5,  q3,  #11
+        vrshrn.i32      d20, q10, #11
+        vrshrn.i32      d21, q11, #11
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d20, q10
+        vst1.8          {d4},  [r0]!
+        vst1.8          {d20}, [r9]!
+        bgt             1b
+
+        sub             r6,  r6,  #2
+        cmp             r6,  #1
+        blt             0f
+        mov             r5,  r8
+        add             r0,  r0,  r1
+        add             r9,  r9,  r1
+        add             r2,  r2,  r3
+        add             r12, r12, r3
+        add             r4,  r4,  r7
+        add             lr,  lr,  r7
+        beq             2f
+        b               1b
+
+2:
+        vld1.8          {d0}, [r2, :64]!
+        vld1.16         {q1}, [r4, :128]!
+        subs            r5,  r5,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+        vrshrn.i32      d4,  q2,  #11
+        vrshrn.i32      d5,  q3,  #11
+        vqmovun.s16     d2,  q2
+        vst1.8          {d2}, [r0]!
+        bgt             2b
+0:
+        pop             {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int16_t *t2,
+//                                    const int w, const int h,
+//                                    const int16_t wt[2]);
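+//
+// Same idea as sgr_weighted1, but blending two filtered planes
+// (illustrative sketch):
+//
+//   u      = src[x] << 4;
+//   dst[x] = clip_pixel(((u << 7) + wt[0] * (t1[x] - u)
+//                                 + wt[1] * (t2[x] - u) + (1 << 10)) >> 11);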
+function sgr_weighted2_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+        ldr             r8,  [sp, #52]
+        cmp             r7,  #2
+        add             r10, r0,  r1
+        add             r11, r2,  r3
+        add             r12, r4,  #2*FILTER_OUT_STRIDE
+        add             lr,  r5,  #2*FILTER_OUT_STRIDE
+        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
+        mov             r8,  #4*FILTER_OUT_STRIDE
+        lsl             r1,  r1,  #1
+        lsl             r3,  r3,  #1
+        add             r9,  r6,  #7
+        bic             r9,  r9,  #7 // Aligned width
+        sub             r1,  r1,  r9
+        sub             r3,  r3,  r9
+        sub             r8,  r8,  r9, lsl #1
+        mov             r9,  r6
+        blt             2f
+1:
+        vld1.8          {d0},  [r2,  :64]!
+        vld1.8          {d16}, [r11, :64]!
+        vld1.16         {q1},  [r4,  :128]!
+        vld1.16         {q9},  [r12, :128]!
+        vld1.16         {q2},  [r5,  :128]!
+        vld1.16         {q10}, [lr,  :128]!
+        subs            r6,  r6,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vsub.i16        q10, q10, q8     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vshll.u16       q11, d16, #7     // u << 7
+        vshll.u16       q8,  d17, #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q11, d18, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q11, d20, d31    // wt[1] * (t2 - u)
+        vmlal.s16       q8,  d19, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q8,  d21, d31    // wt[1] * (t2 - u)
+        vrshrn.i32      d6,  q3,  #11
+        vrshrn.i32      d7,  q0,  #11
+        vrshrn.i32      d22, q11, #11
+        vrshrn.i32      d23, q8,  #11
+        vqmovun.s16     d6,  q3
+        vqmovun.s16     d22, q11
+        vst1.8          {d6},  [r0]!
+        vst1.8          {d22}, [r10]!
+        bgt             1b
+
+        subs            r7,  r7,  #2
+        cmp             r7,  #1
+        blt             0f
+        mov             r6,  r9
+        add             r0,  r0,  r1
+        add             r10, r10, r1
+        add             r2,  r2,  r3
+        add             r11, r11, r3
+        add             r4,  r4,  r8
+        add             r12, r12, r8
+        add             r5,  r5,  r8
+        add             lr,  lr,  r8
+        beq             2f
+        b               1b
+
+2:
+        vld1.8          {d0}, [r2, :64]!
+        vld1.16         {q1}, [r4, :128]!
+        vld1.16         {q2}, [r5, :128]!
+        subs            r6,  r6,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+        vrshrn.i32      d6,  q3,  #11
+        vrshrn.i32      d7,  q0,  #11
+        vqmovun.s16     d6,  q3
+        vst1.8          {d6}, [r0]!
+        bgt             2b
+0:
+        pop             {r4-r11,pc}
+endfunc
--- a/src/meson.build
+++ b/src/meson.build
@@ -132,6 +132,7 @@
             endif
         elif host_machine.cpu_family().startswith('arm')
             libdav1d_sources_asm = files(
+                'arm/32/looprestoration_common.S',
                 'arm/32/msac.S',
             )