ref: 7cf5d7535f44d7c2d00e368575d0d26b66c73121
parent: 32e265a86e535b5fad47bcac9b54f83e1e5eab33
author: Martin Storsjö <martin@martin.st>
date: Sun Feb 9 18:39:11 EST 2020
arm64: looprestoration: Prepare for 16 bpc by splitting code to separate files. looprestoration_common.S contains functions that can be used as is with one single instantiation of the functions for both 8 and 16 bpc. This file will be built once, regardless of which bitdepths are enabled. looprestoration_tmpl.S contains functions where the source can be shared and templated between 8 and 16 bpc. This will be included by the separate 8/16bpc implementation files.
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -617,6 +617,8 @@
#define SUM_STRIDE (384+16)
+#include "looprestoration_tmpl.S"
+
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
@@ -1145,841 +1147,4 @@
0:
ret
.purgem add5
-endfunc
-
-// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box3_v_neon, export=1
- add w10, w3, #2 // Number of output rows to move back
- mov w11, w3 // Number of input rows to move back
- add w2, w2, #2 // Actual summed width
- mov x7, #(4*SUM_STRIDE) // sumsq stride
- mov x8, #(2*SUM_STRIDE) // sum stride
- sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
- sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
-
- tst w4, #4 // LR_HAVE_TOP
- b.eq 0f
- // If have top, read from row -2.
- sub x5, x0, #(4*SUM_STRIDE)
- sub x6, x1, #(2*SUM_STRIDE)
- add w11, w11, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add x5, x0, #(4*SUM_STRIDE)
- add x6, x1, #(2*SUM_STRIDE)
-1:
-
- tst w4, #8 // LR_HAVE_BOTTOM
- b.eq 1f
- // LR_HAVE_BOTTOM
- add w3, w3, #2 // Sum all h+2 lines with the main loop
- add w11, w11, #2
-1:
- mov w9, w3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into v16-v21 and v24-v26 taking top
- // padding into consideration.
- tst w4, #4 // LR_HAVE_TOP
- ld1 {v16.4s, v17.4s}, [x5], x7
- ld1 {v24.8h}, [x6], x8
- b.eq 2f
- // LR_HAVE_TOP
- ld1 {v18.4s, v19.4s}, [x5], x7
- ld1 {v25.8h}, [x6], x8
- ld1 {v20.4s, v21.4s}, [x5], x7
- ld1 {v26.8h}, [x6], x8
- b 3f
-2: // !LR_HAVE_TOP
- mov v18.16b, v16.16b
- mov v19.16b, v17.16b
- mov v25.16b, v24.16b
- mov v20.16b, v16.16b
- mov v21.16b, v17.16b
- mov v26.16b, v24.16b
-
-3:
- subs w3, w3, #1
-.macro add3
- add v16.4s, v16.4s, v18.4s
- add v17.4s, v17.4s, v19.4s
- add v24.8h, v24.8h, v25.8h
- add v16.4s, v16.4s, v20.4s
- add v17.4s, v17.4s, v21.4s
- add v24.8h, v24.8h, v26.8h
- st1 {v16.4s, v17.4s}, [x0], x7
- st1 {v24.8h}, [x1], x8
-.endm
- add3
- mov v16.16b, v18.16b
- mov v17.16b, v19.16b
- mov v24.16b, v25.16b
- mov v18.16b, v20.16b
- mov v19.16b, v21.16b
- mov v25.16b, v26.16b
- b.le 4f
- ld1 {v20.4s, v21.4s}, [x5], x7
- ld1 {v26.8h}, [x6], x8
- b 3b
-
-4:
- tst w4, #8 // LR_HAVE_BOTTOM
- b.ne 5f
- // !LR_HAVE_BOTTOM
- // Produce two more rows, extending the already loaded rows.
- add3
- mov v16.16b, v18.16b
- mov v17.16b, v19.16b
- mov v24.16b, v25.16b
- add3
-
-5: // End of one vertical slice.
- subs w2, w2, #8
- b.le 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- msub x5, x7, x11, x5
- msub x6, x8, x11, x6
- // Output pointers
- msub x0, x7, x10, x0
- msub x1, x8, x10, x1
- add x0, x0, #32
- add x1, x1, #16
- add x5, x5, #32
- add x6, x6, #16
- mov w3, w9
- b 1b
-
-0:
- ret
-.purgem add3
-endfunc
-
-// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box5_v_neon, export=1
- add w10, w3, #2 // Number of output rows to move back
- mov w11, w3 // Number of input rows to move back
- add w2, w2, #8 // Actual summed width
- mov x7, #(4*SUM_STRIDE) // sumsq stride
- mov x8, #(2*SUM_STRIDE) // sum stride
- sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
- sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
-
- tst w4, #4 // LR_HAVE_TOP
- b.eq 0f
- // If have top, read from row -2.
- sub x5, x0, #(4*SUM_STRIDE)
- sub x6, x1, #(2*SUM_STRIDE)
- add w11, w11, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add x5, x0, #(4*SUM_STRIDE)
- add x6, x1, #(2*SUM_STRIDE)
-1:
-
- tst w4, #8 // LR_HAVE_BOTTOM
- b.eq 0f
- // LR_HAVE_BOTTOM
- add w3, w3, #2 // Handle h+2 lines with the main loop
- add w11, w11, #2
- b 1f
-0:
- // !LR_HAVE_BOTTOM
- sub w3, w3, #1 // Handle h-1 lines with the main loop
-1:
- mov w9, w3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into v16-v25 and v26-v30 taking top
- // padding into consideration.
- tst w4, #4 // LR_HAVE_TOP
- ld1 {v16.4s, v17.4s}, [x5], x7
- ld1 {v26.8h}, [x6], x8
- b.eq 2f
- // LR_HAVE_TOP
- ld1 {v20.4s, v21.4s}, [x5], x7
- ld1 {v28.8h}, [x6], x8
- mov v18.16b, v16.16b
- mov v19.16b, v17.16b
- mov v27.16b, v26.16b
- ld1 {v22.4s, v23.4s}, [x5], x7
- ld1 {v29.8h}, [x6], x8
- b 3f
-2: // !LR_HAVE_TOP
- mov v18.16b, v16.16b
- mov v19.16b, v17.16b
- mov v27.16b, v26.16b
- mov v20.16b, v16.16b
- mov v21.16b, v17.16b
- mov v28.16b, v26.16b
- mov v22.16b, v16.16b
- mov v23.16b, v17.16b
- mov v29.16b, v26.16b
-
-3:
- cbz w3, 4f
- ld1 {v24.4s, v25.4s}, [x5], x7
- ld1 {v30.8h}, [x6], x8
-
-3:
- // Start of vertical loop
- subs w3, w3, #2
-.macro add5
- add v16.4s, v16.4s, v18.4s
- add v17.4s, v17.4s, v19.4s
- add v26.8h, v26.8h, v27.8h
- add v0.4s, v20.4s, v22.4s
- add v1.4s, v21.4s, v23.4s
- add v2.8h, v28.8h, v29.8h
- add v16.4s, v16.4s, v24.4s
- add v17.4s, v17.4s, v25.4s
- add v26.8h, v26.8h, v30.8h
- add v16.4s, v16.4s, v0.4s
- add v17.4s, v17.4s, v1.4s
- add v26.8h, v26.8h, v2.8h
- st1 {v16.4s, v17.4s}, [x0], x7
- st1 {v26.8h}, [x1], x8
-.endm
- add5
-.macro shift2
- mov v16.16b, v20.16b
- mov v17.16b, v21.16b
- mov v26.16b, v28.16b
- mov v18.16b, v22.16b
- mov v19.16b, v23.16b
- mov v27.16b, v29.16b
- mov v20.16b, v24.16b
- mov v21.16b, v25.16b
- mov v28.16b, v30.16b
-.endm
- shift2
- add x0, x0, x7
- add x1, x1, x8
- b.le 5f
- ld1 {v22.4s, v23.4s}, [x5], x7
- ld1 {v29.8h}, [x6], x8
- ld1 {v24.4s, v25.4s}, [x5], x7
- ld1 {v30.8h}, [x6], x8
- b 3b
-
-4:
- // h == 1, !LR_HAVE_BOTTOM.
- // Pad the last row with the only content row, and add.
- mov v24.16b, v22.16b
- mov v25.16b, v23.16b
- mov v30.16b, v29.16b
- add5
- shift2
- add x0, x0, x7
- add x1, x1, x8
- add5
- b 6f
-
-5:
- tst w4, #8 // LR_HAVE_BOTTOM
- b.ne 6f
- // !LR_HAVE_BOTTOM
- cbnz w3, 5f
- // The intended three edge rows left; output the one at h-2 and
- // the past edge one at h.
- ld1 {v22.4s, v23.4s}, [x5], x7
- ld1 {v29.8h}, [x6], x8
- // Pad the past-edge row from the last content row.
- mov v24.16b, v22.16b
- mov v25.16b, v23.16b
- mov v30.16b, v29.16b
- add5
- shift2
- add x0, x0, x7
- add x1, x1, x8
- // The last two rows are already padded properly here.
- add5
- b 6f
-
-5:
- // w3 == -1, two rows left, output one.
- // Pad the last two rows from the mid one.
- mov v22.16b, v20.16b
- mov v23.16b, v21.16b
- mov v29.16b, v28.16b
- mov v24.16b, v20.16b
- mov v25.16b, v21.16b
- mov v30.16b, v28.16b
- add5
- add x0, x0, x7
- add x1, x1, x8
- b 6f
-
-6: // End of one vertical slice.
- subs w2, w2, #8
- b.le 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- msub x5, x7, x11, x5
- msub x6, x8, x11, x6
- // Output pointers
- msub x0, x7, x10, x0
- msub x1, x8, x10, x1
- add x0, x0, #32
- add x1, x1, #16
- add x5, x5, #32
- add x6, x6, #16
- mov w3, w9
- b 1b
-
-0:
- ret
-.purgem add5
-endfunc
-
-// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-function sgr_calc_ab1_neon, export=1
- add x3, x3, #2 // h += 2
- movi v31.4s, #9 // n
- mov x5, #455
- mov x8, #SUM_STRIDE
- b sgr_calc_ab_neon
-endfunc
-
-function sgr_calc_ab2_neon, export=1
- add x3, x3, #3 // h += 3
- asr x3, x3, #1 // h /= 2
- movi v31.4s, #25 // n
- mov x5, #164
- mov x8, #(2*SUM_STRIDE)
-endfunc
-
-function sgr_calc_ab_neon
- movrel x12, X(sgr_x_by_x)
- ld1 {v16.16b, v17.16b, v18.16b}, [x12]
- movi v19.16b, #5
- movi v20.8b, #55 // idx of last 5
- movi v21.8b, #72 // idx of last 4
- movi v22.8b, #101 // idx of last 3
- movi v23.8b, #169 // idx of last 2
- movi v24.8b, #254 // idx of last 1
- add x2, x2, #2 // w += 2
- add x7, x2, #7
- bic x7, x7, #7 // aligned w
- sub x7, x8, x7 // increment between rows
- movi v29.8h, #1, lsl #8
- dup v28.4s, w4
- dup v30.4s, w5 // one_by_x
- sub x0, x0, #(4*(SUM_STRIDE))
- sub x1, x1, #(2*(SUM_STRIDE))
- mov x6, x2 // backup of w
- sub v16.16b, v16.16b, v19.16b
- sub v17.16b, v17.16b, v19.16b
- sub v18.16b, v18.16b, v19.16b
-1:
- subs x2, x2, #8
- ld1 {v0.4s, v1.4s}, [x0] // a
- ld1 {v2.8h}, [x1] // b
- mul v0.4s, v0.4s, v31.4s // a * n
- mul v1.4s, v1.4s, v31.4s // a * n
- umull v3.4s, v2.4h, v2.4h // b * b
- umull2 v4.4s, v2.8h, v2.8h // b * b
- uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
- uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
- mul v0.4s, v0.4s, v28.4s // p * s
- mul v1.4s, v1.4s, v28.4s // p * s
- uqshrn v0.4h, v0.4s, #16
- uqshrn2 v0.8h, v1.4s, #16
- uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
-
- cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
- cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
- tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
- cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
- cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
- add v25.8b, v25.8b, v26.8b
- cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
- add v27.8b, v27.8b, v5.8b
- add v6.8b, v6.8b, v19.8b
- add v25.8b, v25.8b, v27.8b
- add v1.8b, v1.8b, v6.8b
- add v1.8b, v1.8b, v25.8b
- uxtl v1.8h, v1.8b // x
-
- umull v3.4s, v1.4h, v2.4h // x * BB[i]
- umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
- mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
- mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
- srshr v3.4s, v3.4s, #12 // AA[i]
- srshr v4.4s, v4.4s, #12 // AA[i]
- sub v2.8h, v29.8h, v1.8h // 256 - x
-
- st1 {v3.4s, v4.4s}, [x0], #32
- st1 {v2.8h}, [x1], #16
- b.gt 1b
-
- subs x3, x3, #1
- b.le 0f
- add x0, x0, x7, lsl #2
- add x1, x1, x7, lsl #1
- mov x2, x6
- b 1b
-0:
- ret
-endfunc
-
-#define FILTER_OUT_STRIDE 384
-
-// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter1_8bpc_neon, export=1
- sub x7, x3, #(4*SUM_STRIDE)
- add x8, x3, #(4*SUM_STRIDE)
- sub x9, x4, #(2*SUM_STRIDE)
- add x10, x4, #(2*SUM_STRIDE)
- mov x11, #SUM_STRIDE
- mov x12, #FILTER_OUT_STRIDE
- add x13, x5, #7
- bic x13, x13, #7 // Aligned width
- sub x2, x2, x13
- sub x12, x12, x13
- sub x11, x11, x13
- sub x11, x11, #4 // We read 4 extra elements from a
- sub x14, x11, #4 // We read 8 extra elements from b
- mov x13, x5
- movi v6.8h, #3
- movi v7.4s, #3
-1:
- ld1 {v0.8h, v1.8h}, [x9], #32
- ld1 {v2.8h, v3.8h}, [x4], #32
- ld1 {v4.8h, v5.8h}, [x10], #32
- ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
- ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
- ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
-
-2:
- subs x5, x5, #8
- ext v25.16b, v0.16b, v1.16b, #2 // -stride
- ext v26.16b, v2.16b, v3.16b, #2 // 0
- ext v27.16b, v4.16b, v5.16b, #2 // +stride
- ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
- ext v29.16b, v2.16b, v3.16b, #4 // +1
- ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
- add v2.8h, v2.8h, v25.8h // -1, -stride
- add v26.8h, v26.8h, v27.8h // 0, +stride
- add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
- add v2.8h, v2.8h, v26.8h
- add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
- add v2.8h, v2.8h, v29.8h // +1
- add v0.8h, v0.8h, v4.8h
-
- ext v25.16b, v16.16b, v17.16b, #4 // -stride
- ext v26.16b, v17.16b, v18.16b, #4
- shl v2.8h, v2.8h, #2
- ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
- ext v28.16b, v17.16b, v18.16b, #8
- ext v29.16b, v19.16b, v20.16b, #4 // 0
- ext v30.16b, v20.16b, v21.16b, #4
- mla v2.8h, v0.8h, v6.8h // * 3 -> a
- add v25.4s, v25.4s, v19.4s // -stride, -1
- add v26.4s, v26.4s, v20.4s
- add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
- add v17.4s, v17.4s, v28.4s
- ext v27.16b, v19.16b, v20.16b, #8 // +1
- ext v28.16b, v20.16b, v21.16b, #8
- add v16.4s, v16.4s, v22.4s // -1+stride
- add v17.4s, v17.4s, v23.4s
- add v29.4s, v29.4s, v27.4s // 0, +1
- add v30.4s, v30.4s, v28.4s
- add v25.4s, v25.4s, v29.4s
- add v26.4s, v26.4s, v30.4s
- ext v27.16b, v22.16b, v23.16b, #4 // +stride
- ext v28.16b, v23.16b, v24.16b, #4
- ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
- ext v30.16b, v23.16b, v24.16b, #8
- ld1 {v19.8b}, [x1], #8 // src
- add v25.4s, v25.4s, v27.4s // +stride
- add v26.4s, v26.4s, v28.4s
- add v16.4s, v16.4s, v29.4s // +1+stride
- add v17.4s, v17.4s, v30.4s
- shl v25.4s, v25.4s, #2
- shl v26.4s, v26.4s, #2
- mla v25.4s, v16.4s, v7.4s // * 3 -> b
- mla v26.4s, v17.4s, v7.4s
- uxtl v19.8h, v19.8b // src
- mov v0.16b, v1.16b
- umlal v25.4s, v2.4h, v19.4h // b + a * src
- umlal2 v26.4s, v2.8h, v19.8h
- mov v2.16b, v3.16b
- rshrn v25.4h, v25.4s, #9
- rshrn2 v25.8h, v26.4s, #9
- mov v4.16b, v5.16b
- st1 {v25.8h}, [x0], #16
-
- b.le 3f
- mov v16.16b, v18.16b
- mov v19.16b, v21.16b
- mov v22.16b, v24.16b
- ld1 {v1.8h}, [x9], #16
- ld1 {v3.8h}, [x4], #16
- ld1 {v5.8h}, [x10], #16
- ld1 {v17.4s, v18.4s}, [x7], #32
- ld1 {v20.4s, v21.4s}, [x3], #32
- ld1 {v23.4s, v24.4s}, [x8], #32
- b 2b
-
-3:
- subs x6, x6, #1
- b.le 0f
- mov x5, x13
- add x0, x0, x12, lsl #1
- add x1, x1, x2
- add x3, x3, x11, lsl #2
- add x7, x7, x11, lsl #2
- add x8, x8, x11, lsl #2
- add x4, x4, x14, lsl #1
- add x9, x9, x14, lsl #1
- add x10, x10, x14, lsl #1
- b 1b
-0:
- ret
-endfunc
-
-// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter2_8bpc_neon, export=1
- add x7, x3, #(4*(SUM_STRIDE))
- sub x3, x3, #(4*(SUM_STRIDE))
- add x8, x4, #(2*(SUM_STRIDE))
- sub x4, x4, #(2*(SUM_STRIDE))
- mov x9, #(2*SUM_STRIDE)
- mov x10, #FILTER_OUT_STRIDE
- add x11, x5, #7
- bic x11, x11, #7 // Aligned width
- sub x2, x2, x11
- sub x10, x10, x11
- sub x9, x9, x11
- sub x9, x9, #4 // We read 4 extra elements from a
- sub x12, x9, #4 // We read 8 extra elements from b
- mov x11, x5
- movi v4.8h, #5
- movi v5.4s, #5
- movi v6.8h, #6
- movi v7.4s, #6
-1:
- ld1 {v0.8h, v1.8h}, [x4], #32
- ld1 {v2.8h, v3.8h}, [x8], #32
- ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
- ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
-
-2:
- subs x5, x5, #8
- ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
- ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
- ext v22.16b, v0.16b, v1.16b, #2 // -stride
- ext v23.16b, v2.16b, v3.16b, #2 // +stride
- add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
- add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
- add v2.8h, v22.8h, v23.8h // -stride, +stride
- add v0.8h, v0.8h, v25.8h
-
- ext v22.16b, v16.16b, v17.16b, #4 // -stride
- ext v23.16b, v17.16b, v18.16b, #4
- ext v24.16b, v19.16b, v20.16b, #4 // +stride
- ext v25.16b, v20.16b, v21.16b, #4
- ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
- ext v27.16b, v17.16b, v18.16b, #8
- ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
- ext v29.16b, v20.16b, v21.16b, #8
- mul v0.8h, v0.8h, v4.8h // * 5
- mla v0.8h, v2.8h, v6.8h // * 6
- ld1 {v31.8b}, [x1], #8
- add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
- add v17.4s, v17.4s, v27.4s
- add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
- add v20.4s, v20.4s, v29.4s
- add v16.4s, v16.4s, v19.4s
- add v17.4s, v17.4s, v20.4s
-
- add v22.4s, v22.4s, v24.4s // -stride, +stride
- add v23.4s, v23.4s, v25.4s
- // This is, surprisingly, faster than other variants where the
- // mul+mla pairs are further apart, on Cortex A53.
- mul v16.4s, v16.4s, v5.4s // * 5
- mla v16.4s, v22.4s, v7.4s // * 6
- mul v17.4s, v17.4s, v5.4s // * 5
- mla v17.4s, v23.4s, v7.4s // * 6
-
- uxtl v31.8h, v31.8b
- umlal v16.4s, v0.4h, v31.4h // b + a * src
- umlal2 v17.4s, v0.8h, v31.8h
- mov v0.16b, v1.16b
- rshrn v16.4h, v16.4s, #9
- rshrn2 v16.8h, v17.4s, #9
- mov v2.16b, v3.16b
- st1 {v16.8h}, [x0], #16
-
- b.le 3f
- mov v16.16b, v18.16b
- mov v19.16b, v21.16b
- ld1 {v1.8h}, [x4], #16
- ld1 {v3.8h}, [x8], #16
- ld1 {v17.4s, v18.4s}, [x3], #32
- ld1 {v20.4s, v21.4s}, [x7], #32
- b 2b
-
-3:
- subs x6, x6, #1
- b.le 0f
- mov x5, x11
- add x0, x0, x10, lsl #1
- add x1, x1, x2
- add x3, x3, x9, lsl #2
- add x7, x7, x9, lsl #2
- add x4, x4, x12, lsl #1
- add x8, x8, x12, lsl #1
- mov x13, x3
- mov x14, x4
-
- ld1 {v0.8h, v1.8h}, [x4], #32
- ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
-
-4:
- subs x5, x5, #8
- ext v23.16b, v0.16b, v1.16b, #4 // +1
- ext v22.16b, v0.16b, v1.16b, #2 // 0
- add v0.8h, v0.8h, v23.8h // -1, +1
-
- ext v24.16b, v16.16b, v17.16b, #4 // 0
- ext v25.16b, v17.16b, v18.16b, #4
- ext v26.16b, v16.16b, v17.16b, #8 // +1
- ext v27.16b, v17.16b, v18.16b, #8
- mul v2.8h, v22.8h, v6.8h // * 6
- mla v2.8h, v0.8h, v4.8h // * 5 -> a
- ld1 {v31.8b}, [x1], #8
- add v16.4s, v16.4s, v26.4s // -1, +1
- add v17.4s, v17.4s, v27.4s
- uxtl v31.8h, v31.8b
- // This is, surprisingly, faster than other variants where the
- // mul+mla pairs are further apart, on Cortex A53.
- mul v24.4s, v24.4s, v7.4s // * 6
- mla v24.4s, v16.4s, v5.4s // * 5 -> b
- mul v25.4s, v25.4s, v7.4s // * 6
- mla v25.4s, v17.4s, v5.4s // * 5 -> b
-
- umlal v24.4s, v2.4h, v31.4h // b + a * src
- umlal2 v25.4s, v2.8h, v31.8h
- mov v0.16b, v1.16b
- rshrn v24.4h, v24.4s, #8
- rshrn2 v24.8h, v25.4s, #8
- mov v16.16b, v18.16b
- st1 {v24.8h}, [x0], #16
-
- b.le 5f
- ld1 {v1.8h}, [x4], #16
- ld1 {v17.4s, v18.4s}, [x3], #32
- b 4b
-
-5:
- subs x6, x6, #1
- b.le 0f
- mov x5, x11
- add x0, x0, x10, lsl #1
- add x1, x1, x2
- mov x3, x13 // Rewind x3/x4 to where they started
- mov x4, x14
- b 1b
-0:
- ret
-endfunc
-
-// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int w, const int h,
-// const int wt);
-function sgr_weighted1_8bpc_neon, export=1
- dup v31.8h, w7
- cmp x6, #2
- add x9, x0, x1
- add x10, x2, x3
- add x11, x4, #2*FILTER_OUT_STRIDE
- mov x7, #(4*FILTER_OUT_STRIDE)
- lsl x1, x1, #1
- lsl x3, x3, #1
- add x8, x5, #7
- bic x8, x8, #7 // Aligned width
- sub x1, x1, x8
- sub x3, x3, x8
- sub x7, x7, x8, lsl #1
- mov x8, x5
- b.lt 2f
-1:
- ld1 {v0.8b}, [x2], #8
- ld1 {v4.8b}, [x10], #8
- ld1 {v1.8h}, [x4], #16
- ld1 {v5.8h}, [x11], #16
- subs x5, x5, #8
- ushll v0.8h, v0.8b, #4 // u
- ushll v4.8h, v4.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- sub v5.8h, v5.8h, v4.8h // t1 - u
- ushll v2.4s, v0.4h, #7 // u << 7
- ushll2 v3.4s, v0.8h, #7 // u << 7
- ushll v6.4s, v4.4h, #7 // u << 7
- ushll2 v7.4s, v4.8h, #7 // u << 7
- smlal v2.4s, v1.4h, v31.4h // v
- smlal2 v3.4s, v1.8h, v31.8h // v
- smlal v6.4s, v5.4h, v31.4h // v
- smlal2 v7.4s, v5.8h, v31.8h // v
- rshrn v2.4h, v2.4s, #11
- rshrn2 v2.8h, v3.4s, #11
- rshrn v6.4h, v6.4s, #11
- rshrn2 v6.8h, v7.4s, #11
- sqxtun v2.8b, v2.8h
- sqxtun v6.8b, v6.8h
- st1 {v2.8b}, [x0], #8
- st1 {v6.8b}, [x9], #8
- b.gt 1b
-
- sub x6, x6, #2
- cmp x6, #1
- b.lt 0f
- mov x5, x8
- add x0, x0, x1
- add x9, x9, x1
- add x2, x2, x3
- add x10, x10, x3
- add x4, x4, x7
- add x11, x11, x7
- b.eq 2f
- b 1b
-
-2:
- ld1 {v0.8b}, [x2], #8
- ld1 {v1.8h}, [x4], #16
- subs x5, x5, #8
- ushll v0.8h, v0.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- ushll v2.4s, v0.4h, #7 // u << 7
- ushll2 v3.4s, v0.8h, #7 // u << 7
- smlal v2.4s, v1.4h, v31.4h // v
- smlal2 v3.4s, v1.8h, v31.8h // v
- rshrn v2.4h, v2.4s, #11
- rshrn2 v2.8h, v3.4s, #11
- sqxtun v2.8b, v2.8h
- st1 {v2.8b}, [x0], #8
- b.gt 2b
-0:
- ret
-endfunc
-
-// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int16_t *t2,
-// const int w, const int h,
-// const int16_t wt[2]);
-function sgr_weighted2_8bpc_neon, export=1
- ldr x8, [sp]
- cmp x7, #2
- add x10, x0, x1
- add x11, x2, x3
- add x12, x4, #2*FILTER_OUT_STRIDE
- add x13, x5, #2*FILTER_OUT_STRIDE
- ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
- mov x8, #4*FILTER_OUT_STRIDE
- lsl x1, x1, #1
- lsl x3, x3, #1
- add x9, x6, #7
- bic x9, x9, #7 // Aligned width
- sub x1, x1, x9
- sub x3, x3, x9
- sub x8, x8, x9, lsl #1
- mov x9, x6
- b.lt 2f
-1:
- ld1 {v0.8b}, [x2], #8
- ld1 {v16.8b}, [x11], #8
- ld1 {v1.8h}, [x4], #16
- ld1 {v17.8h}, [x12], #16
- ld1 {v2.8h}, [x5], #16
- ld1 {v18.8h}, [x13], #16
- subs x6, x6, #8
- ushll v0.8h, v0.8b, #4 // u
- ushll v16.8h, v16.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- sub v2.8h, v2.8h, v0.8h // t2 - u
- sub v17.8h, v17.8h, v16.8h // t1 - u
- sub v18.8h, v18.8h, v16.8h // t2 - u
- ushll v3.4s, v0.4h, #7 // u << 7
- ushll2 v4.4s, v0.8h, #7 // u << 7
- ushll v19.4s, v16.4h, #7 // u << 7
- ushll2 v20.4s, v16.8h, #7 // u << 7
- smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
- smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
- smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
- smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
- smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
- smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
- smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
- smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
- rshrn v3.4h, v3.4s, #11
- rshrn2 v3.8h, v4.4s, #11
- rshrn v19.4h, v19.4s, #11
- rshrn2 v19.8h, v20.4s, #11
- sqxtun v3.8b, v3.8h
- sqxtun v19.8b, v19.8h
- st1 {v3.8b}, [x0], #8
- st1 {v19.8b}, [x10], #8
- b.gt 1b
-
- subs x7, x7, #2
- cmp x7, #1
- b.lt 0f
- mov x6, x9
- add x0, x0, x1
- add x10, x10, x1
- add x2, x2, x3
- add x11, x11, x3
- add x4, x4, x8
- add x12, x12, x8
- add x5, x5, x8
- add x13, x13, x8
- b.eq 2f
- b 1b
-
-2:
- ld1 {v0.8b}, [x2], #8
- ld1 {v1.8h}, [x4], #16
- ld1 {v2.8h}, [x5], #16
- subs x6, x6, #8
- ushll v0.8h, v0.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- sub v2.8h, v2.8h, v0.8h // t2 - u
- ushll v3.4s, v0.4h, #7 // u << 7
- ushll2 v4.4s, v0.8h, #7 // u << 7
- smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
- smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
- smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
- smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
- rshrn v3.4h, v3.4s, #11
- rshrn2 v3.8h, v4.4s, #11
- sqxtun v3.8b, v3.8h
- st1 {v3.8b}, [x0], #8
- b.gt 1b
-0:
- ret
endfunc
--- /dev/null
+++ b/src/arm/64/looprestoration_common.S
@@ -1,0 +1,422 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h, const enum LrEdgeFlags edges);
+// Works in place on 8 columns at a time: each stored row is the sum of three vertically adjacent rows.
+function sgr_box3_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #2 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride in bytes (int32_t elements)
+ mov x8, #(2*SUM_STRIDE) // sum stride in bytes (int16_t elements)
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride; output starts at row -1
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride; output starts at row -1
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE) // x5 = sumsq read pointer
+ sub x6, x1, #(2*SUM_STRIDE) // x6 = sum read pointer
+ add w11, w11, #2 // Two more input rows are read at the top
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE) // x5 = sumsq read pointer
+ add x6, x1, #(2*SUM_STRIDE) // x6 = sum read pointer
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 1f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Sum all h+2 lines with the main loop
+ add w11, w11, #2 // Two more input rows are read at the bottom
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v21 (sumsq) and v24-v26 (sum) taking top
+ // padding into consideration. Each slice covers 8 columns.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7 // First sumsq row; ld1 leaves the flags from tst intact
+ ld1 {v24.8h}, [x6], x8 // First sum row
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v18.4s, v19.4s}, [x5], x7 // Second sumsq row
+ ld1 {v25.8h}, [x6], x8 // Second sum row
+ ld1 {v20.4s, v21.4s}, [x5], x7 // Third sumsq row
+ ld1 {v26.8h}, [x6], x8 // Third sum row
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b // Replicate the first loaded row into the
+ mov v19.16b, v17.16b // other two window slots
+ mov v25.16b, v24.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v26.16b, v24.16b
+
+3:
+ subs w3, w3, #1 // Row counter; the flags are consumed by b.le below
+.macro add3
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v24.8h, v24.8h, v25.8h
+ add v16.4s, v16.4s, v20.4s
+ add v17.4s, v17.4s, v21.4s
+ add v24.8h, v24.8h, v26.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v24.8h}, [x1], x8
+.endm
+ add3
+ mov v16.16b, v18.16b // Slide the three-row window down by one row
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v25.16b, v26.16b
+ b.le 4f // Main loop done (flags from subs above; vector ops do not touch them)
+ ld1 {v20.4s, v21.4s}, [x5], x7 // Next sumsq row enters the window
+ ld1 {v26.8h}, [x6], x8 // Next sum row enters the window
+ b 3b
+
+4:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ mov v16.16b, v18.16b // Window now holds three copies of the last loaded row
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ add3
+
+5: // End of one vertical slice.
+ subs w2, w2, #8 // 8 columns done
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5 // x5 -= sumsq stride * input rows read
+ msub x6, x8, x11, x6 // x6 -= sum stride * input rows read
+ // Output pointers
+ msub x0, x7, x10, x0 // x0 -= sumsq stride * output rows written
+ msub x1, x8, x10, x1 // x1 -= sum stride * output rows written
+ add x0, x0, #32 // Step 8 int32_t columns to the right
+ add x1, x1, #16 // Step 8 int16_t columns to the right
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9 // Restore the row counter
+ b 1b
+
+0:
+ ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h, const enum LrEdgeFlags edges);
+// Works in place on 8 columns at a time: sums five vertically adjacent rows, storing only every second row.
+function sgr_box5_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #8 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride in bytes (int32_t elements)
+ mov x8, #(2*SUM_STRIDE) // sum stride in bytes (int16_t elements)
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride; output starts at row -1
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride; output starts at row -1
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE) // x5 = sumsq read pointer
+ sub x6, x1, #(2*SUM_STRIDE) // x6 = sum read pointer
+ add w11, w11, #2 // Two more input rows are read at the top
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE) // x5 = sumsq read pointer
+ add x6, x1, #(2*SUM_STRIDE) // x6 = sum read pointer
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 0f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Handle h+2 lines with the main loop
+ add w11, w11, #2 // Two more input rows are read at the bottom
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub w3, w3, #1 // Handle h-1 lines with the main loop
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v25 (sumsq) and v26-v30 (sum) taking top
+ // padding into consideration. Each slice covers 8 columns.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7 // First sumsq row; ld1 leaves the flags from tst intact
+ ld1 {v26.8h}, [x6], x8 // First sum row
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v20.4s, v21.4s}, [x5], x7 // Second loaded row goes to window slot 3
+ ld1 {v28.8h}, [x6], x8
+ mov v18.16b, v16.16b // Window slot 2 duplicates the first loaded row
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ ld1 {v22.4s, v23.4s}, [x5], x7 // Third loaded row goes to window slot 4
+ ld1 {v29.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b // Replicate the first loaded row into
+ mov v19.16b, v17.16b // window slots 2, 3 and 4
+ mov v27.16b, v26.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v28.16b, v26.16b
+ mov v22.16b, v16.16b
+ mov v23.16b, v17.16b
+ mov v29.16b, v26.16b
+
+3:
+ cbz w3, 4f // No rows for the main loop (see label 4)
+ ld1 {v24.4s, v25.4s}, [x5], x7 // Fifth window slot
+ ld1 {v30.8h}, [x6], x8
+
+3:
+ // Start of vertical loop; every iteration consumes two rows.
+ subs w3, w3, #2
+.macro add5
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v26.8h, v26.8h, v27.8h
+ add v0.4s, v20.4s, v22.4s
+ add v1.4s, v21.4s, v23.4s
+ add v2.8h, v28.8h, v29.8h
+ add v16.4s, v16.4s, v24.4s
+ add v17.4s, v17.4s, v25.4s
+ add v26.8h, v26.8h, v30.8h
+ add v16.4s, v16.4s, v0.4s
+ add v17.4s, v17.4s, v1.4s
+ add v26.8h, v26.8h, v2.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v26.8h}, [x1], x8
+.endm
+ add5
+.macro shift2
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v26.16b, v28.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v27.16b, v29.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v28.16b, v30.16b
+.endm
+ shift2
+ add x0, x0, x7 // Skip one output row: only every second row is written
+ add x1, x1, x8
+ b.le 5f // Main loop done (flags from subs above; vector ops do not touch them)
+ ld1 {v22.4s, v23.4s}, [x5], x7 // Two new rows refill window slots 4 and 5
+ ld1 {v29.8h}, [x6], x8
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7 // Skip one output row, as in the main loop
+ add x1, x1, x8
+ add5
+ b 6f
+
+5:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 6f
+ // !LR_HAVE_BOTTOM
+ cbnz w3, 5f // w3 is 0 or -1 here; -1 is handled at the next label 5
+ // The intended three edge rows left; output the one at h-2 and
+ // the past edge one at h.
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ // Pad the past-edge row from the last content row.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7 // Skip one output row, as in the main loop
+ add x1, x1, x8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // w3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ mov v22.16b, v20.16b
+ mov v23.16b, v21.16b
+ mov v29.16b, v28.16b
+ mov v24.16b, v20.16b
+ mov v25.16b, v21.16b
+ mov v30.16b, v28.16b
+ add5
+ add x0, x0, x7 // Skip one output row, as in the main loop
+ add x1, x1, x8
+ b 6f
+
+6: // End of one vertical slice.
+ subs w2, w2, #8 // 8 columns done
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5 // x5 -= sumsq stride * input rows read
+ msub x6, x8, x11, x6 // x6 -= sum stride * input rows read
+ // Output pointers
+ msub x0, x7, x10, x0 // x0 -= sumsq stride * output rows stepped over
+ msub x1, x8, x10, x1 // x1 -= sum stride * output rows stepped over
+ add x0, x0, #32 // Step 8 int32_t columns to the right
+ add x1, x1, #16 // Step 8 int16_t columns to the right
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9 // Restore the row counter
+ b 1b
+
+0:
+ ret
+.purgem add5 // NOTE(review): shift2 is not purged here and stays defined for the rest of the file
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+ add x3, x3, #2 // h += 2
+ movi v31.4s, #9 // n
+ mov x5, #455 // one_by_x for sgr_calc_ab_neon (455 == round(4096 / 9))
+ mov x8, #SUM_STRIDE // Row stride in elements: every row is processed
+ b sgr_calc_ab_neon // Continue in the shared implementation, which returns to our caller
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ add x3, x3, #3 // h += 3
+ asr x3, x3, #1 // h /= 2
+ movi v31.4s, #25 // n
+ mov x5, #164 // one_by_x for sgr_calc_ab_neon (164 == round(4096 / 25))
+ mov x8, #(2*SUM_STRIDE) // Row stride in elements: every second row. No branch/ret follows: runs on into sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab_neon
+ movrel x12, X(sgr_x_by_x)
+ ld1 {v16.16b, v17.16b, v18.16b}, [x12] // First 48 bytes of the sgr_x_by_x table
+ movi v19.16b, #5 // Bias removed from the table below and re-added per lookup
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ movi v24.8b, #254 // idx of last 1
+ add x2, x2, #2 // w += 2
+ add x7, x2, #7
+ bic x7, x7, #7 // aligned w
+ sub x7, x8, x7 // increment between rows (x8 = row stride in elements, set by the entry points)
+ movi v29.8h, #1, lsl #8 // 256
+ dup v28.4s, w4 // strength
+ dup v30.4s, w5 // one_by_x (set by the entry points)
+ sub x0, x0, #(4*(SUM_STRIDE)) // a -= one row
+ sub x1, x1, #(2*(SUM_STRIDE)) // b -= one row
+ mov x6, x2 // backup of w
+ sub v16.16b, v16.16b, v19.16b // Table entries - 5, so that a tbl miss (0) can stand for 5
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+1:
+ subs x2, x2, #8 // 8 columns per iteration; flags are consumed by b.gt below
+ ld1 {v0.4s, v1.4s}, [x0] // a
+ ld1 {v2.8h}, [x1] // b
+ mul v0.4s, v0.4s, v31.4s // a * n (v31 = n, set by the entry points)
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v2.4h, v2.4h // b * b
+ umull2 v4.4s, v2.8h, v2.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ uqshrn v0.4h, v0.4s, #16 // >> 16, saturating to 16 bits
+ uqshrn2 v0.8h, v1.4s, #16
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b // sgr_x_by_x[z] - 5; tbl gives 0 for z >= 48
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ add v25.8b, v25.8b, v26.8b
+ cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v5.8b
+ add v6.8b, v6.8b, v19.8b // + 5, undoing the bias subtracted from the table
+ add v25.8b, v25.8b, v27.8b
+ add v1.8b, v1.8b, v6.8b
+ add v1.8b, v1.8b, v25.8b // Lookup + 5 - (number of thresholds exceeded)
+ uxtl v1.8h, v1.8b // x
+
+ umull v3.4s, v1.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v2.8h, v29.8h, v1.8h // 256 - x
+
+ st1 {v3.4s, v4.4s}, [x0], #32 // Overwrite a in place with AA[i]
+ st1 {v2.8h}, [x1], #16 // Overwrite b in place with 256 - x
+ b.gt 1b // More columns in this row (flags from subs x2 above; vector ops do not touch them)
+
+ subs x3, x3, #1 // One row done
+ b.le 0f
+ add x0, x0, x7, lsl #2 // a += row increment (int32_t elements)
+ add x1, x1, x7, lsl #1 // b += row increment (int16_t elements)
+ mov x2, x6 // Restore w
+ b 1b
+0:
+ ret
+endfunc
--- /dev/null
+++ b/src/arm/64/looprestoration_tmpl.S
@@ -1,0 +1,474 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+// 3x3 pass: tmp = (W(a) + W(b) * src + 256) >> 9, W = 3x3 sum weighted 4 (centre, edges) / 3 (corners).
+// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, const pixel *src,
+// const ptrdiff_t stride, const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter1_8bpc_neon, export=1
+ sub x7, x3, #(4*SUM_STRIDE) // x7 = a, one row up (int32 elements)
+ add x8, x3, #(4*SUM_STRIDE) // x8 = a, one row down
+ sub x9, x4, #(2*SUM_STRIDE) // x9 = b, one row up (int16 elements)
+ add x10, x4, #(2*SUM_STRIDE) // x10 = b, one row down
+ mov x11, #SUM_STRIDE
+ mov x12, #FILTER_OUT_STRIDE
+ add x13, x5, #7
+ bic x13, x13, #7 // Aligned width (w rounded up to a multiple of 8)
+ sub x2, x2, x13 // src increment at the end of a row, in bytes
+ sub x12, x12, x13 // tmp increment at the end of a row, in elements
+ sub x11, x11, x13
+ sub x11, x11, #4 // We read 4 extra elements from a
+ sub x14, x11, #4 // We read 8 extra elements from b
+ mov x13, x5 // Backup of w, restored for every row
+ movi v6.8h, #3 // Diagonal weight for the 16 bit sums
+ movi v7.4s, #3 // Diagonal weight for the 32 bit sums
+1: // Start of a row: preload the first vectors of all three rows
+ ld1 {v0.8h, v1.8h}, [x9], #32 // b, row above
+ ld1 {v2.8h, v3.8h}, [x4], #32 // b, current row
+ ld1 {v4.8h, v5.8h}, [x10], #32 // b, row below
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 // a, row above
+ ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 // a, current row
+ ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 // a, row below
+
+2: // Inner loop: 8 output elements per iteration
+ subs x5, x5, #8 // Flags are consumed by the b.le after the store
+ ext v25.16b, v0.16b, v1.16b, #2 // -stride
+ ext v26.16b, v2.16b, v3.16b, #2 // 0
+ ext v27.16b, v4.16b, v5.16b, #2 // +stride
+ ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v29.16b, v2.16b, v3.16b, #4 // +1
+ ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
+ add v2.8h, v2.8h, v25.8h // -1, -stride
+ add v26.8h, v26.8h, v27.8h // 0, +stride
+ add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
+ add v2.8h, v2.8h, v26.8h // -1, -stride, 0, +stride
+ add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
+ add v2.8h, v2.8h, v29.8h // +1
+ add v0.8h, v0.8h, v4.8h // Sum of the four diagonals
+
+ ext v25.16b, v16.16b, v17.16b, #4 // -stride
+ ext v26.16b, v17.16b, v18.16b, #4
+ shl v2.8h, v2.8h, #2 // (centre + edge neighbours) * 4
+ ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v28.16b, v17.16b, v18.16b, #8
+ ext v29.16b, v19.16b, v20.16b, #4 // 0
+ ext v30.16b, v20.16b, v21.16b, #4
+ mla v2.8h, v0.8h, v6.8h // + diagonals * 3 -> a (weighted sum over the b rows)
+ add v25.4s, v25.4s, v19.4s // -stride, -1
+ add v26.4s, v26.4s, v20.4s
+ add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v28.4s
+ ext v27.16b, v19.16b, v20.16b, #8 // +1
+ ext v28.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // -1+stride
+ add v17.4s, v17.4s, v23.4s
+ add v29.4s, v29.4s, v27.4s // 0, +1
+ add v30.4s, v30.4s, v28.4s
+ add v25.4s, v25.4s, v29.4s
+ add v26.4s, v26.4s, v30.4s
+ ext v27.16b, v22.16b, v23.16b, #4 // +stride
+ ext v28.16b, v23.16b, v24.16b, #4
+ ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
+ ext v30.16b, v23.16b, v24.16b, #8
+ ld1 {v19.8b}, [x1], #8 // src; v19 is free, its last use as an a row is above
+ add v25.4s, v25.4s, v27.4s // +stride
+ add v26.4s, v26.4s, v28.4s
+ add v16.4s, v16.4s, v29.4s // +1+stride
+ add v17.4s, v17.4s, v30.4s
+ shl v25.4s, v25.4s, #2 // (centre + edge neighbours) * 4
+ shl v26.4s, v26.4s, #2
+ mla v25.4s, v16.4s, v7.4s // * 3 -> b (weighted sum over the a rows)
+ mla v26.4s, v17.4s, v7.4s
+ uxtl v19.8h, v19.8b // src
+ mov v0.16b, v1.16b // Slide the b window: second vector becomes the first
+ umlal v25.4s, v2.4h, v19.4h // b + a * src
+ umlal2 v26.4s, v2.8h, v19.8h
+ mov v2.16b, v3.16b
+ rshrn v25.4h, v25.4s, #9 // Rounding shift, narrowed to 16 bit
+ rshrn2 v25.8h, v26.4s, #9
+ mov v4.16b, v5.16b
+ st1 {v25.8h}, [x0], #16 // Store 8 int16 results
+
+ b.le 3f // Row finished (flags from the subs at the top of the loop)
+ mov v16.16b, v18.16b // Slide the a window: last vector becomes the first
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ ld1 {v1.8h}, [x9], #16 // Load the next 8 elements of every row
+ ld1 {v3.8h}, [x4], #16
+ ld1 {v5.8h}, [x10], #16
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x3], #32
+ ld1 {v23.4s, v24.4s}, [x8], #32
+ b 2b
+
+3: // End of a row
+ subs x6, x6, #1 // One output row done
+ b.le 0f
+ mov x5, x13 // Restore w
+ add x0, x0, x12, lsl #1 // Advance tmp; x12 counts int16 elements
+ add x1, x1, x2 // Advance src to the next row
+ add x3, x3, x11, lsl #2 // Advance the a rows; x11 counts int32 elements
+ add x7, x7, x11, lsl #2
+ add x8, x8, x11, lsl #2
+ add x4, x4, x14, lsl #1 // Advance the b rows; x14 counts int16 elements
+ add x9, x9, x14, lsl #1
+ add x10, x10, x14, lsl #1
+ b 1b
+0:
+ ret
+endfunc
+
+// Alternating-row pass: one output row from the a/b rows above and below, the next from its own a/b row.
+// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, const pixel *src,
+// const ptrdiff_t stride, const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter2_8bpc_neon, export=1
+ add x7, x3, #(4*(SUM_STRIDE)) // x7 = a, one row down
+ sub x3, x3, #(4*(SUM_STRIDE)) // x3 = a, one row up
+ add x8, x4, #(2*(SUM_STRIDE)) // x8 = b, one row down
+ sub x4, x4, #(2*(SUM_STRIDE)) // x4 = b, one row up
+ mov x9, #(2*SUM_STRIDE) // The a/b pointers advance two rows at a time
+ mov x10, #FILTER_OUT_STRIDE
+ add x11, x5, #7
+ bic x11, x11, #7 // Aligned width (w rounded up to a multiple of 8)
+ sub x2, x2, x11 // src increment at the end of a row, in bytes
+ sub x10, x10, x11 // tmp increment at the end of a row, in elements
+ sub x9, x9, x11
+ sub x9, x9, #4 // We read 4 extra elements from a
+ sub x12, x9, #4 // We read 8 extra elements from b
+ mov x11, x5 // Backup of w, restored for every row
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1: // Row using the rows above and below: preload both
+ ld1 {v0.8h, v1.8h}, [x4], #32 // b, row above
+ ld1 {v2.8h, v3.8h}, [x8], #32 // b, row below
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 // a, row above
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 // a, row below
+
+2: // Inner loop: 8 output elements per iteration
+ subs x5, x5, #8 // Flags are consumed by the b.le after the store
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h // Sum of the four diagonals
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+ ld1 {v31.8b}, [x1], #8 // src
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s // Sum of the four diagonals
+ add v17.4s, v17.4s, v20.4s
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+ uxtl v31.8h, v31.8b
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ mov v0.16b, v1.16b // Slide the b window: second vector becomes the first
+ rshrn v16.4h, v16.4s, #9 // Rounding shift, narrowed to 16 bit
+ rshrn2 v16.8h, v17.4s, #9
+ mov v2.16b, v3.16b
+ st1 {v16.8h}, [x0], #16 // Store 8 int16 results
+
+ b.le 3f // Row finished (flags from the subs at the top of the loop)
+ mov v16.16b, v18.16b // Slide the a window: last vector becomes the first
+ mov v19.16b, v21.16b
+ ld1 {v1.8h}, [x4], #16 // Load the next 8 elements of every row
+ ld1 {v3.8h}, [x8], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+3: // End of a two-neighbour row; the next row only uses its own a/b row
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11 // Restore w
+ add x0, x0, x10, lsl #1 // Advance tmp; x10 counts int16 elements
+ add x1, x1, x2 // Advance src to the next row
+ add x3, x3, x9, lsl #2 // Advance the a pointers by two rows
+ add x7, x7, x9, lsl #2
+ add x4, x4, x12, lsl #1 // Advance the b pointers by two rows
+ add x8, x8, x12, lsl #1
+ mov x13, x3 // Remember the row start; restored at label 5
+ mov x14, x4
+
+ ld1 {v0.8h, v1.8h}, [x4], #32 // b, this row
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 // a, this row
+
+4: // Inner loop for a single-row line: 8 output elements per iteration
+ subs x5, x5, #8 // Flags are consumed by the b.le after the store
+ ext v23.16b, v0.16b, v1.16b, #4 // +1
+ ext v22.16b, v0.16b, v1.16b, #2 // 0
+ add v0.8h, v0.8h, v23.8h // -1, +1
+
+ ext v24.16b, v16.16b, v17.16b, #4 // 0
+ ext v25.16b, v17.16b, v18.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1
+ ext v27.16b, v17.16b, v18.16b, #8
+ mul v2.8h, v22.8h, v6.8h // * 6
+ mla v2.8h, v0.8h, v4.8h // * 5 -> a
+ ld1 {v31.8b}, [x1], #8 // src
+ add v16.4s, v16.4s, v26.4s // -1, +1
+ add v17.4s, v17.4s, v27.4s
+ uxtl v31.8h, v31.8b
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v24.4s, v24.4s, v7.4s // * 6
+ mla v24.4s, v16.4s, v5.4s // * 5 -> b
+ mul v25.4s, v25.4s, v7.4s // * 6
+ mla v25.4s, v17.4s, v5.4s // * 5 -> b
+
+ umlal v24.4s, v2.4h, v31.4h // b + a * src
+ umlal2 v25.4s, v2.8h, v31.8h
+ mov v0.16b, v1.16b // Slide the b window
+ rshrn v24.4h, v24.4s, #8 // Rounding shift by 8 here, not 9 as in loop 2
+ rshrn2 v24.8h, v25.4s, #8
+ mov v16.16b, v18.16b // Slide the a window
+ st1 {v24.8h}, [x0], #16 // Store 8 int16 results
+
+ b.le 5f // Row finished (flags from the subs at the top of the loop)
+ ld1 {v1.8h}, [x4], #16 // Load the next 8 elements
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ b 4b
+
+5: // End of a single-row line
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11 // Restore w
+ add x0, x0, x10, lsl #1 // Advance tmp
+ add x1, x1, x2 // Advance src to the next row
+ mov x3, x13 // Rewind x3/x4 to where they started
+ mov x4, x14 // This row is the "row above" of the next output row
+ b 1b
+0:
+ ret
+endfunc
+
+// dst = sat_u8(((u << 7) + wt * (t1 - u) + 1024) >> 11) with u = src << 4; two rows per pass while h >= 2.
+// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h, const int wt);
+function sgr_weighted1_8bpc_neon, export=1
+ dup v31.8h, w7 // wt in every 16 bit lane; x7 is reused below
+ cmp x6, #2 // Flags are consumed by the b.lt before label 1
+ add x9, x0, x1 // x9 = second dst row
+ add x10, x2, x3 // x10 = second src row
+ add x11, x4, #2*FILTER_OUT_STRIDE // x11 = second t1 row (int16 elements)
+ mov x7, #(4*FILTER_OUT_STRIDE) // Two t1 rows, in bytes
+ lsl x1, x1, #1 // Two dst rows
+ lsl x3, x3, #1 // Two src rows
+ add x8, x5, #7
+ bic x8, x8, #7 // Aligned width
+ sub x1, x1, x8 // dst increment at the end of a row pair
+ sub x3, x3, x8 // src increment at the end of a row pair
+ sub x7, x7, x8, lsl #1 // t1 increment at the end of a row pair, in bytes
+ mov x8, x5 // Backup of w
+ b.lt 2f // h < 2: only the single-row loop
+1: // Two rows at a time, 8 pixels per iteration
+ ld1 {v0.8b}, [x2], #8 // src, first row
+ ld1 {v4.8b}, [x10], #8 // src, second row
+ ld1 {v1.8h}, [x4], #16 // t1, first row
+ ld1 {v5.8h}, [x11], #16 // t1, second row
+ subs x5, x5, #8 // Flags are consumed by the b.gt after the stores
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v4.8h, v4.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v5.8h, v5.8h, v4.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ ushll v6.4s, v4.4h, #7 // u << 7
+ ushll2 v7.4s, v4.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ smlal v6.4s, v5.4h, v31.4h // v
+ smlal2 v7.4s, v5.8h, v31.8h // v
+ rshrn v2.4h, v2.4s, #11 // Rounding shift, narrowed to 16 bit
+ rshrn2 v2.8h, v3.4s, #11
+ rshrn v6.4h, v6.4s, #11
+ rshrn2 v6.8h, v7.4s, #11
+ sqxtun v2.8b, v2.8h // Saturate to unsigned 8 bit
+ sqxtun v6.8b, v6.8h
+ st1 {v2.8b}, [x0], #8
+ st1 {v6.8b}, [x9], #8
+ b.gt 1b
+
+ sub x6, x6, #2 // Two rows done
+ cmp x6, #1 // Flags are consumed by b.lt and by b.eq further down
+ b.lt 0f // No rows left
+ mov x5, x8 // Restore w
+ add x0, x0, x1 // Advance all pointers by two rows
+ add x9, x9, x1
+ add x2, x2, x3
+ add x10, x10, x3
+ add x4, x4, x7
+ add x11, x11, x7
+ b.eq 2f // Exactly one row left
+ b 1b
+
+2: // Single row, 8 pixels per iteration
+ ld1 {v0.8b}, [x2], #8 // src
+ ld1 {v1.8h}, [x4], #16 // t1
+ subs x5, x5, #8 // Flags are consumed by the b.gt after the store
+ ushll v0.8h, v0.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ rshrn v2.4h, v2.4s, #11 // Rounding shift, narrowed to 16 bit
+ rshrn2 v2.8h, v3.4s, #11
+ sqxtun v2.8b, v2.8h // Saturate to unsigned 8 bit
+ st1 {v2.8b}, [x0], #8
+ b.gt 2b
+0:
+ ret
+endfunc
+
+// dst = sat_u8(((u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u) + 1024) >> 11) with u = src << 4.
+// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h, const int16_t wt[2]);
+function sgr_weighted2_8bpc_neon, export=1
+ ldr x8, [sp] // wt pointer: ninth argument, passed on the stack
+ cmp x7, #2 // Flags are consumed by the b.lt before label 1
+ add x10, x0, x1 // x10 = second dst row
+ add x11, x2, x3 // x11 = second src row
+ add x12, x4, #2*FILTER_OUT_STRIDE // x12 = second t1 row (int16 elements)
+ add x13, x5, #2*FILTER_OUT_STRIDE // x13 = second t2 row
+ ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+ mov x8, #4*FILTER_OUT_STRIDE // Two t1/t2 rows, in bytes
+ lsl x1, x1, #1 // Two dst rows
+ lsl x3, x3, #1 // Two src rows
+ add x9, x6, #7
+ bic x9, x9, #7 // Aligned width
+ sub x1, x1, x9 // dst increment at the end of a row pair
+ sub x3, x3, x9 // src increment at the end of a row pair
+ sub x8, x8, x9, lsl #1 // t1/t2 increment at the end of a row pair, in bytes
+ mov x9, x6 // Backup of w
+ b.lt 2f // h < 2: only the single-row loop
+1: // Two rows at a time, 8 pixels per iteration
+ ld1 {v0.8b}, [x2], #8 // src, first row
+ ld1 {v16.8b}, [x11], #8 // src, second row
+ ld1 {v1.8h}, [x4], #16 // t1, first row
+ ld1 {v17.8h}, [x12], #16 // t1, second row
+ ld1 {v2.8h}, [x5], #16 // t2, first row
+ ld1 {v18.8h}, [x13], #16 // t2, second row
+ subs x6, x6, #8 // Flags are consumed by the b.gt after the stores
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v16.8h, v16.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ sub v17.8h, v17.8h, v16.8h // t1 - u
+ sub v18.8h, v18.8h, v16.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ ushll v19.4s, v16.4h, #7 // u << 7
+ ushll2 v20.4s, v16.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+ rshrn v3.4h, v3.4s, #11 // Rounding shift, narrowed to 16 bit
+ rshrn2 v3.8h, v4.4s, #11
+ rshrn v19.4h, v19.4s, #11
+ rshrn2 v19.8h, v20.4s, #11
+ sqxtun v3.8b, v3.8h // Saturate to unsigned 8 bit
+ sqxtun v19.8b, v19.8h
+ st1 {v3.8b}, [x0], #8
+ st1 {v19.8b}, [x10], #8
+ b.gt 1b
+
+ sub x7, x7, #2 // Two rows done; the cmp below sets the flags
+ cmp x7, #1 // Flags are consumed by b.lt and by b.eq further down
+ b.lt 0f // No rows left
+ mov x6, x9 // Restore w
+ add x0, x0, x1 // Advance all pointers by two rows
+ add x10, x10, x1
+ add x2, x2, x3
+ add x11, x11, x3
+ add x4, x4, x8
+ add x12, x12, x8
+ add x5, x5, x8
+ add x13, x13, x8
+ b.eq 2f // Exactly one row left
+ b 1b
+
+2: // Single row, 8 pixels per iteration
+ ld1 {v0.8b}, [x2], #8 // src
+ ld1 {v1.8h}, [x4], #16 // t1
+ ld1 {v2.8h}, [x5], #16 // t2
+ subs x6, x6, #8 // Flags are consumed by the b.gt after the store
+ ushll v0.8h, v0.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ rshrn v3.4h, v3.4s, #11 // Rounding shift, narrowed to 16 bit
+ rshrn2 v3.8h, v4.4s, #11
+ sqxtun v3.8b, v3.8h // Saturate to unsigned 8 bit
+ st1 {v3.8b}, [x0], #8
+ b.gt 2b // Stay in the single-row loop; 1b would also touch the row after the last one
+0:
+ ret
+endfunc
--- a/src/meson.build
+++ b/src/meson.build
@@ -102,6 +102,7 @@
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
+ 'arm/64/looprestoration_common.S',
'arm/64/msac.S',
)