ref: 513dfa990804496780a7fa9ee0707b84e1976c13
parent: b6bb8536ad299d52a5ff49a4f0317b923ce6b8bb
author: Martin Storsjö <martin@martin.st>
date: Thu Nov 15 11:15:30 EST 2018
arm64: looprestoration: NEON optimized wiener filter

The relative speedup compared to GCC's autovectorized C code is around 4.2x on a Cortex A53 and 5.1x on a Snapdragon 835, 6-7x compared to GCC's output without autovectorization, and ~8x compared to clang's output (which doesn't seem to try to vectorize this function).
--- /dev/null
+++ b/src/arm/64/looprestoration.S
@@ -1,0 +1,627 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges);
+function wiener_filter_h_neon, export=1
+ mov w8, w5
+ ld1 {v0.8h}, [x4]
+ mov w9, #(1 << 14) - (1 << 2)
+ dup v30.8h, w9
+ movi v31.8h, #8, lsl #8
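+ // v30 = (1 << 14) - (1 << 2) and v31 = 2048 correspond to the
+ // "- (1 << (BITDEPTH + 6)) + rounding_off_h" and "+ 2048" terms in the
+ // C model at the top of src/arm/looprestoration_init_tmpl.c.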
+ // Calculate mid_stride
+ add w10, w5, #7
+ bic w10, w10, #7
+ lsl w10, w10, #1
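+ // w10 = mid_stride in bytes; mid rows are int16_t, padded to a
+ // multiple of 8 elements.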
+
+ // Clear the last unused element of v0, to allow filtering a single
+ // pixel with one plain mul+addv.
+ ins v0.h[7], wzr
+
+ // Set up pointers for reading/writing alternate rows
+ add x12, x0, x10
+ lsl w10, w10, #1
+ add x13, x2, x3
+ lsl x3, x3, #1
+
+ // Subtract the width from mid_stride
+ sub x10, x10, w5, uxtw #1
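+ // x10 is added at the end of each row pair (label 9) to advance
+ // x0/x12 to the next two output rows.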
+
+ // For w >= 8, we read ((w+5)&~7)+8 pixels; for w < 8 we read 16 pixels.
+ cmp w5, #8
+ add w11, w5, #13
+ bic w11, w11, #7
+ b.ge 1f
+ mov w11, #16
+1:
+ sub x3, x3, w11, uxtw
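+ // x3 now holds 2*stride minus the number of pixels read per row; it is
+ // added to x2/x13 at the end of each row pair (with a left-edge
+ // adjustment applied below).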
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x1, 0f
+ // left == NULL
+ sub x2, x2, #3
+ sub x13, x13, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x3, x3, #3
+
+
+1: // Loop vertically
+ ld1 {v3.16b}, [x2], #16
+ ld1 {v5.16b}, [x13], #16
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x1, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.s}[3], [x1], #4
+ // Move x2/x13 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x2, x2, #3
+ sub x13, x13, #3
+ ld1 {v4.s}[3], [x1], #4
+ ext v3.16b, v2.16b, v3.16b, #13
+ ext v5.16b, v4.16b, v5.16b, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ dup v4.16b, v5.b[0]
+ // Move x2/x13 back to account for the last 3 bytes we loaded before,
+ // which we'll shift out.
+ sub x2, x2, #3
+ sub x13, x13, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ ext v5.16b, v4.16b, v5.16b, #13
+
+2:
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w9, w5, #14
+ ldr b28, [x2, w9, sxtw]
+ ldr b29, [x13, w9, sxtw]
+ // Fill v28/v29 with the right padding pixel
+ dup v28.8b, v28.b[0]
+ dup v29.8b, v29.b[0]
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+ cmp w5, #7
+ b.ge 5f // If w >= 7, we can filter 4 pixels
+ b 6f
+
+4: // Loop horizontally
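+ // The filter macro below computes, per lane: the 7 tap FIR with the
+ // taps in v0, plus the saturating (pixel << 7) - v30 term, then >> 3
+ // and + v31 (2048) - i.e. the horizontal pass described in
+ // src/arm/looprestoration_init_tmpl.c.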
+.macro filter wd
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ mul v6\wd, v2\wd, v0.h[0]
+ mla v6\wd, v16\wd, v0.h[1]
+ mla v6\wd, v17\wd, v0.h[2]
+ mla v6\wd, v18\wd, v0.h[3]
+ mla v6\wd, v19\wd, v0.h[4]
+ mla v6\wd, v20\wd, v0.h[5]
+ mla v6\wd, v21\wd, v0.h[6]
+ ext v22.16b, v4.16b, v5.16b, #2
+ ext v23.16b, v4.16b, v5.16b, #4
+ ext v24.16b, v4.16b, v5.16b, #6
+ ext v25.16b, v4.16b, v5.16b, #8
+ ext v26.16b, v4.16b, v5.16b, #10
+ ext v27.16b, v4.16b, v5.16b, #12
+ mul v7\wd, v4\wd, v0.h[0]
+ mla v7\wd, v22\wd, v0.h[1]
+ mla v7\wd, v23\wd, v0.h[2]
+ mla v7\wd, v24\wd, v0.h[3]
+ mla v7\wd, v25\wd, v0.h[4]
+ mla v7\wd, v26\wd, v0.h[5]
+ mla v7\wd, v27\wd, v0.h[6]
+
+ shl v18\wd, v18\wd, #7
+ shl v24\wd, v24\wd, #7
+ sub v18\wd, v18\wd, v30\wd
+ sub v24\wd, v24\wd, v30\wd
+ sqadd v6\wd, v6\wd, v18\wd
+ sqadd v7\wd, v7\wd, v24\wd
+ sshr v6\wd, v6\wd, #3
+ sshr v7\wd, v7\wd, #3
+ add v6\wd, v6\wd, v31\wd
+ add v7\wd, v7\wd, v31\wd
+.endm
+ filter .8h
+ st1 {v6.8h}, [x0], #16
+ st1 {v7.8h}, [x12], #16
+
+ subs w5, w5, #8
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v2.16b, v3.16b
+ mov v4.16b, v5.16b
+ ld1 {v3.8b}, [x2], #8
+ ld1 {v5.8b}, [x13], #8
+ uxtl v3.8h, v3.8b
+ uxtl v5.8h, v5.8b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Filter 4 pixels, 7 <= w < 11
+ filter .4h
+ st1 {v6.4h}, [x0], #8
+ st1 {v7.4h}, [x12], #8
+
+ subs w5, w5, #4 // 3 <= w < 7
+ ext v2.16b, v2.16b, v3.16b, #8
+ ext v3.16b, v3.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v5.16b, #8
+ ext v5.16b, v5.16b, v5.16b, #8
+
+6: // Pad the right edge and filter the last few pixels.
+ // w < 7, w+3 pixels valid in v2-v3
+ cmp w5, #5
+ b.lt 7f
+ b.gt 8f
+ // w == 5, 8 pixels valid in v2, v3 invalid
+ mov v3.16b, v28.16b
+ mov v5.16b, v29.16b
+ b 88f
+
+7: // 1 <= w < 5, 4-7 pixels valid in v2
+ sub w9, w5, #1
+ // w9 = (pixels valid - 4)
+ adr x11, L(variable_shift_tbl)
+ ldrh w9, [x11, w9, uxtw #1]
+ sub x11, x11, w9, uxth
+ mov v3.16b, v28.16b
+ mov v5.16b, v29.16b
+ br x11
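+ // The table below holds 16-bit offsets of the 44b-77b handlers back
+ // from L(variable_shift_tbl); the sub above turns the loaded offset
+ // into the branch target in x11.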
+ // Shift v2 right, shifting out invalid pixels,
+ // shift v2 left to the original offset, shifting in padding pixels.
+44: // 4 pixels valid
+ ext v2.16b, v2.16b, v2.16b, #8
+ ext v2.16b, v2.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v4.16b, #8
+ ext v4.16b, v4.16b, v5.16b, #8
+ b 88f
+55: // 5 pixels valid
+ ext v2.16b, v2.16b, v2.16b, #10
+ ext v2.16b, v2.16b, v3.16b, #6
+ ext v4.16b, v4.16b, v4.16b, #10
+ ext v4.16b, v4.16b, v5.16b, #6
+ b 88f
+66: // 6 pixels valid
+ ext v2.16b, v2.16b, v2.16b, #12
+ ext v2.16b, v2.16b, v3.16b, #4
+ ext v4.16b, v4.16b, v4.16b, #12
+ ext v4.16b, v4.16b, v5.16b, #4
+ b 88f
+77: // 7 pixels valid
+ ext v2.16b, v2.16b, v2.16b, #14
+ ext v2.16b, v2.16b, v3.16b, #2
+ ext v4.16b, v4.16b, v4.16b, #14
+ ext v4.16b, v4.16b, v5.16b, #2
+ b 88f
+
+L(variable_shift_tbl):
+ .hword L(variable_shift_tbl) - 44b
+ .hword L(variable_shift_tbl) - 55b
+ .hword L(variable_shift_tbl) - 66b
+ .hword L(variable_shift_tbl) - 77b
+
+8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3
+ ins v28.h[0], v3.h[0]
+ ins v29.h[0], v5.h[0]
+ mov v3.16b, v28.16b
+ mov v5.16b, v29.16b
+
+88:
+ // w < 7, v2-v3 padded properly
+ cmp w5, #4
+ b.lt 888f
+
+ // w >= 4, filter 4 pixels
+ filter .4h
+ st1 {v6.4h}, [x0], #8
+ st1 {v7.4h}, [x12], #8
+ subs w5, w5, #4 // 0 <= w < 4
+ ext v2.16b, v2.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v5.16b, #8
+ b.eq 9f
+888: // 1 <= w < 4, filter 1 pixel at a time
+ mul v6.8h, v2.8h, v0.8h
+ mul v7.8h, v4.8h, v0.8h
+ addv h6, v6.8h
+ addv h7, v7.8h
+ dup v16.4h, v2.h[3]
+ dup v17.4h, v4.h[3]
+ shl v16.4h, v16.4h, #7
+ shl v17.4h, v17.4h, #7
+ sub v16.4h, v16.4h, v30.4h
+ sub v17.4h, v17.4h, v30.4h
+ sqadd v6.4h, v6.4h, v16.4h
+ sqadd v7.4h, v7.4h, v17.4h
+ sshr v6.4h, v6.4h, #3
+ sshr v7.4h, v7.4h, #3
+ add v6.4h, v6.4h, v31.4h
+ add v7.4h, v7.4h, v31.4h
+ st1 {v6.h}[0], [x0], #2
+ st1 {v7.h}[0], [x12], #2
+ subs w5, w5, #1
+ ext v2.16b, v2.16b, v3.16b, #2
+ ext v4.16b, v4.16b, v5.16b, #2
+ b.gt 888b
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next pair of rows and loop over them horizontally
+ add x0, x0, x10
+ add x12, x12, x10
+ add x2, x2, x3
+ add x13, x13, x3
+ mov w5, w8
+ b 1b
+0:
+ ret
+.purgem filter
+endfunc
+
+// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride);
+function wiener_filter_v_neon, export=1
+ mov w8, w4
+ ld1 {v0.8h}, [x5]
+ mov w9, #128
+ dup v1.8h, w9
+ add v1.8h, v1.8h, v0.8h
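+ // v1 = fv + 128 in every lane; only the centre tap v1.h[3] is used
+ // below, matching the "fv[3] += 128" in the C model in
+ // src/arm/looprestoration_init_tmpl.c.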
+
+ // Calculate the number of rows to move back when looping vertically
+ mov w11, w4
+ tst w6, #4 // LR_HAVE_TOP
+ b.eq 0f
+ sub x2, x2, x7, lsl #1
+ add w11, w11, #2
+0:
+ tst w6, #8 // LR_HAVE_BOTTOM
+ b.eq 1f
+ add w11, w11, #2
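+ // w11 now holds the total number of mid rows read per vertical slice
+ // (h, plus 2 for each available edge).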
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into v16-v19 and pad properly.
+ tst w6, #4 // LR_HAVE_TOP
+ ld1 {v16.8h}, [x2], x7
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v18.8h}, [x2], x7
+ mov v17.16b, v16.16b
+ ld1 {v19.8h}, [x2], x7
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v17.16b, v16.16b
+ mov v18.16b, v16.16b
+ mov v19.16b, v16.16b
+
+3:
+ cmp w4, #4
+ b.lt 5f
+ // Start filtering normally; fill in v20-v22 with unique rows.
+ ld1 {v20.8h}, [x2], x7
+ ld1 {v21.8h}, [x2], x7
+ ld1 {v22.8h}, [x2], x7
+
+4:
+.macro filter compare
+ subs w4, w4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ smull v2.4s, v16.4h, v0.h[0]
+ smlal v2.4s, v17.4h, v0.h[1]
+ smlal v2.4s, v18.4h, v0.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v0.h[4]
+ smlal v2.4s, v21.4h, v0.h[5]
+ smlal v2.4s, v22.4h, v0.h[6]
+ smull2 v3.4s, v16.8h, v0.h[0]
+ smlal2 v3.4s, v17.8h, v0.h[1]
+ smlal2 v3.4s, v18.8h, v0.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v0.h[4]
+ smlal2 v3.4s, v21.8h, v0.h[5]
+ smlal2 v3.4s, v22.8h, v0.h[6]
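+ // Narrow with rounding: (sum + (1 << 10)) >> 11, saturated, then
+ // clamp to the 8 bit pixel range.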
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqxtun v2.8b, v2.8h
+ st1 {v2.8b}, [x0], x1
+.if \compare
+ cmp w4, #4
+.else
+ b.le 9f
+.endif
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ mov v21.16b, v22.16b
+.endm
+ filter 1
+ b.lt 7f
+ ld1 {v22.8h}, [x2], x7
+ b 4b
+
+5: // Less than 4 rows in total; v20-v22 are not filled yet.
+ tst w6, #8 // LR_HAVE_BOTTOM
+ b.eq 6f
+ // LR_HAVE_BOTTOM
+ cmp w4, #2
+ // We load at least 2 rows in all cases.
+ ld1 {v20.8h}, [x2], x7
+ ld1 {v21.8h}, [x2], x7
+ b.gt 53f // 3 rows in total
+ b.eq 52f // 2 rows in total
+51: // 1 row in total, v19 already loaded, load edge into v20-v22.
+ mov v22.16b, v21.16b
+ b 8f
+52: // 2 rows in total, v19 already loaded, load v20 with content data
+ // and 2 rows of edge.
+ ld1 {v22.8h}, [x2], x7
+ mov v23.16b, v22.16b
+ b 8f
+53:
+ // 3 rows in total, v19 already loaded, load v20 and v21 with content
+ // and 2 rows of edge.
+ ld1 {v22.8h}, [x2], x7
+ ld1 {v23.8h}, [x2], x7
+ mov v24.16b, v23.16b
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp w4, #2
+ b.gt 63f // 3 rows in total
+ b.eq 62f // 2 rows in total
+61: // 1 row in total, v19 already loaded, pad that into v20-v22.
+ mov v20.16b, v19.16b
+ mov v21.16b, v19.16b
+ mov v22.16b, v19.16b
+ b 8f
+62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23.
+ ld1 {v20.8h}, [x2], x7
+ mov v21.16b, v20.16b
+ mov v22.16b, v20.16b
+ mov v23.16b, v20.16b
+ b 8f
+63:
+ // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
+ ld1 {v20.8h}, [x2], x7
+ ld1 {v21.8h}, [x2], x7
+ mov v22.16b, v21.16b
+ mov v23.16b, v21.16b
+ mov v24.16b, v21.16b
+ b 8f
+
+7:
+ // Exactly 3 valid rows left; all registers up to v21 are filled
+ // already. Fill in padding for the rows below and filter the last
+ // few rows.
+ tst w6, #8 // LR_HAVE_BOTTOM
+ b.eq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ ld1 {v22.8h}, [x2], x7
+ ld1 {v23.8h}, [x2], x7
+ mov v24.16b, v23.16b
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ mov v22.16b, v21.16b
+ mov v23.16b, v21.16b
+ mov v24.16b, v21.16b
+
+8: // At this point, the registers up to v22, v23 or v24 (depending on how
+ // many rows are left) are loaded with content, edge or padding rows.
+ filter 0 // This branches to 9f when done
+ mov v22.16b, v23.16b
+ mov v23.16b, v24.16b
+ b 8b
+
+9: // End of one vertical slice.
+ subs w3, w3, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ msub x0, x1, x8, x0
+ msub x2, x7, x11, x2
+ add x0, x0, #8
+ add x2, x2, #16
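+ // Step one 8 pixel slice to the right: 8 bytes in dst, 16 bytes
+ // (8 int16_t) in mid.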
+ mov w4, w8
+ b 1b
+
+0:
+ ret
+.purgem filter
+endfunc
+
+// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
+// const pixel *src, int w, int h);
+function copy_narrow_neon, export=1
+ adr x5, L(copy_narrow_tbl)
+ ldrh w6, [x5, w3, uxtw #1]
+ sub x5, x5, w6, uxth
+ br x5
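+ // Each handler below (10: through 70:) copies one width, w == 1-7;
+ // the table at the end maps w to the handler.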
+10:
+ add x7, x0, x1
+ lsl x1, x1, #1
+18:
+ cmp w4, #8
+ b.lt 110f
+ subs w4, w4, #8
+ ld1 {v0.8b}, [x2], #8
+ st1 {v0.b}[0], [x0], x1
+ st1 {v0.b}[1], [x7], x1
+ st1 {v0.b}[2], [x0], x1
+ st1 {v0.b}[3], [x7], x1
+ st1 {v0.b}[4], [x0], x1
+ st1 {v0.b}[5], [x7], x1
+ st1 {v0.b}[6], [x0], x1
+ st1 {v0.b}[7], [x7], x1
+ b.le 0f
+ b 18b
+110:
+ asr x1, x1, #1
+11:
+ subs w4, w4, #1
+ ld1 {v0.b}[0], [x2], #1
+ st1 {v0.b}[0], [x0], x1
+ b.ge 11b
+0:
+ ret
+
+20:
+ add x7, x0, x1
+ lsl x1, x1, #1
+24:
+ cmp w4, #4
+ b.lt 210f
+ subs w4, w4, #4
+ ld1 {v0.4h}, [x2], #8
+ st1 {v0.h}[0], [x0], x1
+ st1 {v0.h}[1], [x7], x1
+ st1 {v0.h}[2], [x0], x1
+ st1 {v0.h}[3], [x7], x1
+ b.le 0f
+ b 24b
+210:
+ asr x1, x1, #1
+22:
+ subs w4, w4, #1
+ ld1 {v0.h}[0], [x2], #2
+ st1 {v0.h}[0], [x0], x1
+ b.ge 22b
+0:
+ ret
+
+30:
+ ldrh w5, [x2]
+ ldrb w6, [x2, #2]
+ add x2, x2, #3
+ subs w4, w4, #1
+ strh w5, [x0]
+ strb w6, [x0, #2]
+ add x0, x0, x1
+ b.gt 30b
+ ret
+
+40:
+ add x7, x0, x1
+ lsl x1, x1, #1
+42:
+ cmp w4, #2
+ b.lt 41f
+ subs w4, w4, #2
+ ld1 {v0.2s}, [x2], #8
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[1], [x7], x1
+ b.le 0f
+ b 42b
+41:
+ ld1 {v0.s}[0], [x2]
+ st1 {v0.s}[0], [x0]
+0:
+ ret
+
+50:
+ ldr w5, [x2]
+ ldrb w6, [x2, #4]
+ add x2, x2, #5
+ subs w4, w4, #1
+ str w5, [x0]
+ strb w6, [x0, #4]
+ add x0, x0, x1
+ b.gt 50b
+ ret
+
+60:
+ ldr w5, [x2]
+ ldrh w6, [x2, #4]
+ add x2, x2, #6
+ subs w4, w4, #1
+ str w5, [x0]
+ strh w6, [x0, #4]
+ add x0, x0, x1
+ b.gt 60b
+ ret
+
+70:
+ ldr w5, [x2]
+ ldrh w6, [x2, #4]
+ ldrb w7, [x2, #6]
+ add x2, x2, #7
+ subs w4, w4, #1
+ str w5, [x0]
+ strh w6, [x0, #4]
+ strb w7, [x0, #6]
+ add x0, x0, x1
+ b.gt 70b
+ ret
+
+L(copy_narrow_tbl):
+ .hword 0
+ .hword L(copy_narrow_tbl) - 10b
+ .hword L(copy_narrow_tbl) - 20b
+ .hword L(copy_narrow_tbl) - 30b
+ .hword L(copy_narrow_tbl) - 40b
+ .hword L(copy_narrow_tbl) - 50b
+ .hword L(copy_narrow_tbl) - 60b
+ .hword L(copy_narrow_tbl) - 70b
+endfunc
--- /dev/null
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -1,0 +1,106 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#include "common/attributes.h"
+#include "common/intops.h"
+#include "src/tables.h"
+
+#if BITDEPTH == 8 && ARCH_AARCH64
+// This calculates things slightly differently than the reference C version.
+// This version calculates roughly this:
+// int16_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += src[idx] * fh[i];
+// int16_t sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
+// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
+// sum += 2048;
+void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[7], const intptr_t w,
+ int h, enum LrEdgeFlags edges);
+// This calculates things slightly differently than the reference C version.
+// This version calculates roughly this:
+// fv[3] += 128;
+// int32_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += mid[idx] * fv[i];
+// sum = (sum + rounding_off_v) >> round_bits_v;
+// This function assumes that the width is a multiple of 8.
+void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[7], enum LrEdgeFlags edges,
+ ptrdiff_t mid_stride);
+void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
+ const pixel *src, int w, int h);
+
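+// Illustration only (not used by the build): a rough scalar model of the
+// horizontal pass per output pixel, with round_bits_h = 3 and
+// rounding_off_h = 1 << 2 read off the assembly constants; the iclip()
+// models the saturating sqadd. The helper name is purely illustrative.
+#if 0
+static int16_t wiener_h_scalar_model(const pixel *const src, const int x,
+ const int16_t fh[7])
+{
+ int sum = 0;
+ for (int i = 0; i < 7; i++)
+ sum += src[x + i - 3] * fh[i];
+ const int sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + (1 << 2);
+ sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> 3;
+ return sum + 2048;
+}
+#endif
+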
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, const ptrdiff_t lpf_stride,
+ const int w, const int h, const int16_t fh[7],
+ const int16_t fv[7], const enum LrEdgeFlags edges)
+{
+ ALIGN_STK_32(int16_t, mid, 68 * 384,);
+ int mid_stride = (w + 7) & ~7;
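+ // mid reserves 2 rows above and 2 rows below the h filtered rows
+ // (filled from lpf when the corresponding edge is available), each row
+ // mid_stride int16_t elements wide.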
+
+ // Horizontal filter
+ dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
+ fh, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
+ fh, w, 2, edges);
+ if (edges & LR_HAVE_BOTTOM)
+ dav1d_wiener_filter_h_neon(&mid[(2 + h) * mid_stride], NULL,
+ lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
+ fh, w, 2, edges);
+
+ // Vertical filter
+ if (w >= 8)
+ dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2*mid_stride],
+ w & ~7, h, fv, edges, mid_stride * sizeof(*mid));
+ if (w & 7) {
+ // For uneven widths, do a full 8 pixel wide filtering into a temp
+ // buffer and copy out the narrow slice of pixels separately into dest.
+ ALIGN_STK_16(pixel, tmp, 64 * 8,);
+ dav1d_wiener_filter_v_neon(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
+ w & 7, h, fv, edges, mid_stride * sizeof(*mid));
+ dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
+ }
+}
+#endif
+
+void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 && ARCH_AARCH64
+ c->wiener = wiener_filter_neon;
+#endif
+}
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -74,6 +74,8 @@
void dav1d_loop_restoration_dsp_init_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_10bpc(Dav1dLoopRestorationDSPContext *c);
+void dav1d_loop_restoration_dsp_init_arm_8bpc(Dav1dLoopRestorationDSPContext *c);
+void dav1d_loop_restoration_dsp_init_arm_10bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_x86_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_x86_10bpc(Dav1dLoopRestorationDSPContext *c);
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -573,7 +573,11 @@
c->wiener = wiener_c;
c->selfguided = selfguided_c;
-#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
+#elif ARCH_X86
bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
#endif
}
--- a/src/meson.build
+++ b/src/meson.build
@@ -84,10 +84,12 @@
'arm/cpu.c',
)
libdav1d_tmpl_sources += files(
+ 'arm/looprestoration_init_tmpl.c',
'arm/mc_init_tmpl.c',
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
+ 'arm/64/looprestoration.S',
'arm/64/mc.S',
)
elif host_machine.cpu_family().startswith('arm')