ref: 2e3fd4d6c9fe649a032218093e50e55577528dde
parent: dfa24594cbd5c2e8dee253abf7bd04edc3bb0612
parent: 802790f181a30f02d93aa83ae364f81b341c9b4a
author: Sigrid Haflínudóttir <sigrid@gloot.com>
date: Mon Dec 14 14:22:44 EST 2020
Merge remote-tracking branch 'upstream/master'
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -464,9 +464,12 @@
-Dtestdata_tests=true
-Dlogging=false
-Db_sanitize=address
- -Denable_asm=false
- ninja -C build
- - cd build && time meson test -v --setup=sanitizer
+ - cd build
+ - exit_code=0
+ - time meson test -v --setup=sanitizer --test-args "--cpumask 0" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?))
+ - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
test-debian-msan:
extends:
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,18 @@
+Changes for 0.8.0 'Eurasian hobby":
+-----------------------------------
+
+0.8.0 is a major update for dav1d:
+ - Improve the performance by using a picture buffer pool;
+   the improvements can reach 10% in some cases on Windows.
+ - Support for Apple ARM Silicon
+ - ARM32 optimizations for 8bit bitdepth for ipred paeth, smooth, cfl
+ - ARM32 optimizations for 10/12/16bit bitdepth for mc_avg/mask/w_avg,
+ put/prep 8tap/bilin, wiener and CDEF filters
+ - ARM64 optimizations for cfl_ac 444 for all bitdepths
+ - x86 optimizations for MC 8-tap, mc_scaled in AVX2
+ - x86 optimizations for CDEF in SSE and {put/prep}_{8tap/bilin} in SSSE3
+
+
Changes for 0.7.1 'Frigatebird':
------------------------------
--- a/THANKS.md
+++ b/THANKS.md
@@ -16,13 +16,16 @@
And all the dav1d Authors (git shortlog -sn), including:
-Janne Grunau, Ronald S. Bultje, Martin Storsjö, Henrik Gramner, James Almer,
-Marvin Scholz, Luc Trudeau, Jean-Baptiste Kempf, Victorien Le Couviour--Tuffet,
-David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Nathan E. Egge,
-Francois Cartegnie, Konstantin Pavlov, Liwei Wang, Xuefeng Jiang,
-Derek Buitenhuis, Raphaël Zumer, Niklas Haas, Michael Bradshaw, Kyle Siefring,
-Raphael Zumer, Boyuan Xiao, Thierry Foucu, Matthias Dressel, Thomas Daede,
-Rupert Swarbrick, Jan Beich, Dale Curtis, SmilingWolf, Tristan Laurent,
-Vittorio Giovara, Rostislav Pehlivanov, Shiz, skal, Steinar Midtskogen,
-Luca Barbato, Justin Bull, Jean-Yves Avenard, Timo Gurr, Fred Barbier,
-Anisse Astier, Pablo Stebler, Nicolas Frattaroli, Mark Shuttleworth.
+Martin Storsjö, Janne Grunau, Henrik Gramner, Ronald S. Bultje, James Almer,
+Marvin Scholz, Luc Trudeau, Victorien Le Couviour--Tuffet, Jean-Baptiste Kempf,
+Hugo Beauzée-Luyssen, Matthias Dressel, Konstantin Pavlov, David Michael Barr,
+Steve Lhomme, Niklas Haas, B Krishnan Iyer, Francois Cartegnie, Liwei Wang,
+Nathan E. Egge, Derek Buitenhuis, Michael Bradshaw, Raphaël Zumer,
+Xuefeng Jiang, Luca Barbato, Jan Beich, Wan-Teh Chang, Justin Bull, Boyuan Xiao,
+Dale Curtis, Kyle Siefring, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
+Thomas Daede, Colin Lee, Emmanuel Gil Peyrot, Lynne, Michail Alvanos,
+Nico Weber, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier,
+Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard,
+Mark Shuttleworth, Matthieu Bouron, Nicolas Frattaroli, Pablo Stebler,
+Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvestre Ledru, Timo Gurr,
+Tristan Matthews, Xavier Claessens, Xu Guangxin, kossh1 and skal.
--- a/include/common/mem.h
+++ /dev/null
@@ -1,84 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef DAV1D_COMMON_MEM_H
-#define DAV1D_COMMON_MEM_H
-
-#include <stdlib.h>
-
-#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
-#include <malloc.h>
-#endif
-
-#include "common/attributes.h"
-
-/*
- * Allocate align-byte aligned memory. The return value can be released
- * by calling the dav1d_free_aligned() function.
- */
-static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
- assert(!(align & (align - 1)));
-#ifdef HAVE_POSIX_MEMALIGN
- void *ptr;
- if (posix_memalign(&ptr, align, sz)) return NULL;
- return ptr;
-#elif defined(HAVE_ALIGNED_MALLOC)
- return _aligned_malloc(sz, align);
-#elif defined(HAVE_MEMALIGN)
- return memalign(align, sz);
-#else
-#error Missing aligned alloc implementation
-#endif
-}
-
-static inline void dav1d_free_aligned(void* ptr) {
-#ifdef HAVE_POSIX_MEMALIGN
- free(ptr);
-#elif defined(HAVE_ALIGNED_MALLOC)
- _aligned_free(ptr);
-#elif defined(HAVE_MEMALIGN)
- free(ptr);
-#endif
-}
-
-static inline void dav1d_freep_aligned(void* ptr) {
- void **mem = (void **) ptr;
- if (*mem) {
- dav1d_free_aligned(*mem);
- *mem = NULL;
- }
-}
-
-static inline void freep(void *ptr) {
- void **mem = (void **) ptr;
- if (*mem) {
- free(*mem);
- *mem = NULL;
- }
-}
-
-#endif /* DAV1D_COMMON_MEM_H */
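
The doc comment above spells out the contract of the removed helpers (power-of-two alignment, release through dav1d_free_aligned/dav1d_freep_aligned). A minimal usage sketch of that pairing, as a hypothetical caller that is not part of this commit and that assumes the declarations remain reachable through whatever header replaces common/mem.h:

    #include <stdint.h>
    #include <string.h>

    static uint8_t *alloc_scratch(const size_t size) {
        // align must be a power of two, as enforced by the assert in dav1d_alloc_aligned()
        uint8_t *const buf = dav1d_alloc_aligned(size, 64);
        if (buf) memset(buf, 0, size);
        return buf;
    }

    static void free_scratch(uint8_t **const buf) {
        // frees *buf with the matching deallocator and resets the pointer to NULL
        dav1d_freep_aligned(buf);
    }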
--- a/include/dav1d/meson.build
+++ b/include/dav1d/meson.build
@@ -31,13 +31,13 @@
output: 'version.h',
configuration: version_h_data)
-dav1d_api_headers = files(
+dav1d_api_headers = [
'common.h',
'data.h',
'dav1d.h',
'headers.h',
'picture.h',
- )
+]
# install headers
install_headers(dav1d_api_headers,
--- a/meson.build
+++ b/meson.build
@@ -23,12 +23,12 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
- version: '0.7.1',
+ version: '0.8.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
- meson_version: '>= 0.47.0')
+ meson_version: '>= 0.49.0')
dav1d_soname_version = '5.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
@@ -118,6 +118,17 @@
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
rt_dependency = []
+
+ rc_version_array = meson.project_version().split('.')
+ winmod = import('windows')
+ rc_data = configuration_data()
+ rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
+ rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
+ rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
+ rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
+ rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
+ rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
+ rc_data.set('COPYRIGHT_YEARS', '2020')
else
thread_dependency = dependency('threads')
thread_compat_dep = []
@@ -227,7 +238,7 @@
# Compiler flags that should be set,
# but if the compiler does not support them
# it is not an error and they are silently tolerated
-if cc.get_id() != 'msvc'
+if cc.get_argument_syntax() != 'msvc'
optional_arguments += [
'-Wundef',
'-Werror=vla',
@@ -426,6 +437,28 @@
])
endif
+use_gaspp = false
+if (is_asm_enabled and
+ (host_machine.cpu_family() == 'aarch64' or
+ host_machine.cpu_family().startswith('arm')) and
+ cc.get_argument_syntax() == 'msvc')
+ gaspp = find_program('gas-preprocessor.pl')
+ use_gaspp = true
+ gaspp_gen = generator(gaspp,
+ output: '@BASENAME@.obj',
+ arguments: [
+ '-as-type', 'armasm',
+ '-arch', host_machine.cpu_family(),
+ '--',
+ host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
+ '-nologo',
+ '-I@0@'.format(dav1d_src_root),
+ '-I@0@/'.format(meson.current_build_dir()),
+ '@INPUT@',
+ '-c',
+ '-o', '@OUTPUT@'
+ ])
+endif
# Generate config.h
config_h_target = configure_file(output: 'config.h', configuration: cdata)
--- a/src/arm/32/cdef.S
+++ b/src/arm/32/cdef.S
@@ -27,6 +27,7 @@
#include "src/arm/asm.S"
#include "util.S"
+#include "cdef_tmpl.S"
// n1 = s0/d0
// w1 = d0/q0
@@ -190,11 +191,9 @@
beq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
- ldrh r12, [r3], #2
- vldr \n1, [r1]
- vdup.16 d2, r12
+ vld1.16 {d2[]}, [r3, :16]!
ldrh r12, [r1, #\w]
- add r1, r1, r2
+ load_n_incr d0, r1, r2, \w
subs r5, r5, #1
vmov.16 d2[1], r12
vmovl.u8 q0, d0
@@ -207,9 +206,8 @@
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
- ldrh r12, [r3], #2
+ vld1.16 {d2[]}, [r3, :16]!
load_n_incr d0, r1, r2, \w
- vdup.16 d2, r12
subs r5, r5, #1
vmovl.u8 q0, d0
vmovl.u8 q1, d2
@@ -327,231 +325,13 @@
padding_func_edged 8, 16, d0, 64
padding_func_edged 4, 8, s0, 32
-.macro dir_table w, stride
-const directions\w
- .byte -1 * \stride + 1, -2 * \stride + 2
- .byte 0 * \stride + 1, -1 * \stride + 2
- .byte 0 * \stride + 1, 0 * \stride + 2
- .byte 0 * \stride + 1, 1 * \stride + 2
- .byte 1 * \stride + 1, 2 * \stride + 2
- .byte 1 * \stride + 0, 2 * \stride + 1
- .byte 1 * \stride + 0, 2 * \stride + 0
- .byte 1 * \stride + 0, 2 * \stride - 1
-// Repeated, to avoid & 7
- .byte -1 * \stride + 1, -2 * \stride + 2
- .byte 0 * \stride + 1, -1 * \stride + 2
- .byte 0 * \stride + 1, 0 * \stride + 2
- .byte 0 * \stride + 1, 1 * \stride + 2
- .byte 1 * \stride + 1, 2 * \stride + 2
- .byte 1 * \stride + 0, 2 * \stride + 1
-endconst
-.endm
+tables
-dir_table 8, 16
-dir_table 4, 8
+filter 8, 8
+filter 4, 8
-const pri_taps
- .byte 4, 2, 3, 3
-endconst
+find_dir 8
-.macro load_px d11, d12, d21, d22, w
-.if \w == 8
- add r6, r2, r9, lsl #1 // x + off
- sub r9, r2, r9, lsl #1 // x - off
- vld1.16 {\d11,\d12}, [r6] // p0
- vld1.16 {\d21,\d22}, [r9] // p1
-.else
- add r6, r2, r9, lsl #1 // x + off
- sub r9, r2, r9, lsl #1 // x - off
- vld1.16 {\d11}, [r6] // p0
- add r6, r6, #2*8 // += stride
- vld1.16 {\d21}, [r9] // p1
- add r9, r9, #2*8 // += stride
- vld1.16 {\d12}, [r6] // p0
- vld1.16 {\d22}, [r9] // p1
-.endif
-.endm
-.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
-.if \min
- vmin.u16 q2, q2, \s1
- vmax.s16 q3, q3, \s1
- vmin.u16 q2, q2, \s2
- vmax.s16 q3, q3, \s2
-.endif
- vabd.u16 q8, q0, \s1 // abs(diff)
- vabd.u16 q11, q0, \s2 // abs(diff)
- vshl.u16 q9, q8, \shift // abs(diff) >> shift
- vshl.u16 q12, q11, \shift // abs(diff) >> shift
- vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
- vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
- vsub.i16 q10, \s1, q0 // diff = p0 - px
- vsub.i16 q13, \s2, q0 // diff = p1 - px
- vneg.s16 q8, q9 // -clip
- vneg.s16 q11, q12 // -clip
- vmin.s16 q10, q10, q9 // imin(diff, clip)
- vmin.s16 q13, q13, q12 // imin(diff, clip)
- vdup.16 q9, \tap // taps[k]
- vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
- vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
- vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
- vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
-.endm
-
-// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
-// const uint16_t *tmp, int pri_strength,
-// int sec_strength, int dir, int damping,
-// int h, size_t edges);
-.macro filter_func w, pri, sec, min, suffix
-function cdef_filter\w\suffix\()_neon
- cmp r8, #0xf
- beq cdef_filter\w\suffix\()_edged_neon
-.if \pri
- movrel_local r8, pri_taps
- and r9, r3, #1
- add r8, r8, r9, lsl #1
-.endif
- movrel_local r9, directions\w
- add r5, r9, r5, lsl #1
- vmov.u16 d17, #15
- vdup.16 d16, r6 // damping
-
-.if \pri
- vdup.16 q5, r3 // threshold
-.endif
-.if \sec
- vdup.16 q7, r4 // threshold
-.endif
- vmov.16 d8[0], r3
- vmov.16 d8[1], r4
- vclz.i16 d8, d8 // clz(threshold)
- vsub.i16 d8, d17, d8 // ulog2(threshold)
- vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
- vneg.s16 d8, d8 // -shift
-.if \sec
- vdup.16 q6, d8[1]
-.endif
-.if \pri
- vdup.16 q4, d8[0]
-.endif
-
-1:
-.if \w == 8
- vld1.16 {q0}, [r2, :128] // px
-.else
- add r12, r2, #2*8
- vld1.16 {d0}, [r2, :64] // px
- vld1.16 {d1}, [r12, :64] // px
-.endif
-
- vmov.u16 q1, #0 // sum
-.if \min
- vmov.u16 q2, q0 // min
- vmov.u16 q3, q0 // max
-.endif
-
- // Instead of loading sec_taps 2, 1 from memory, just set it
- // to 2 initially and decrease for the second round.
- // This is also used as loop counter.
- mov lr, #2 // sec_taps[0]
-
-2:
-.if \pri
- ldrsb r9, [r5] // off1
-
- load_px d28, d29, d30, d31, \w
-.endif
-
-.if \sec
- add r5, r5, #4 // +2*2
- ldrsb r9, [r5] // off2
-.endif
-
-.if \pri
- ldrb r12, [r8] // *pri_taps
-
- handle_pixel q14, q15, q5, q4, r12, \min
-.endif
-
-.if \sec
- load_px d28, d29, d30, d31, \w
-
- add r5, r5, #8 // +2*4
- ldrsb r9, [r5] // off3
-
- handle_pixel q14, q15, q7, q6, lr, \min
-
- load_px d28, d29, d30, d31, \w
-
- handle_pixel q14, q15, q7, q6, lr, \min
-
- sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
-.else
- add r5, r5, #1 // r5 += 1
-.endif
- subs lr, lr, #1 // sec_tap-- (value)
-.if \pri
- add r8, r8, #1 // pri_taps++ (pointer)
-.endif
- bne 2b
-
- vshr.s16 q14, q1, #15 // -(sum < 0)
- vadd.i16 q1, q1, q14 // sum - (sum < 0)
- vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
- vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
-.if \min
- vmin.s16 q0, q0, q3
- vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
-.endif
- vmovn.u16 d0, q0
-.if \w == 8
- add r2, r2, #2*16 // tmp += tmp_stride
- subs r7, r7, #1 // h--
- vst1.8 {d0}, [r0, :64], r1
-.else
- vst1.32 {d0[0]}, [r0, :32], r1
- add r2, r2, #2*16 // tmp += 2*tmp_stride
- subs r7, r7, #2 // h -= 2
- vst1.32 {d0[1]}, [r0, :32], r1
-.endif
-
- // Reset pri_taps and directions back to the original point
- sub r5, r5, #2
-.if \pri
- sub r8, r8, #2
-.endif
-
- bgt 1b
- vpop {q4-q7}
- pop {r4-r9,pc}
-endfunc
-.endm
-
-.macro filter w
-filter_func \w, pri=1, sec=0, min=0, suffix=_pri
-filter_func \w, pri=0, sec=1, min=0, suffix=_sec
-filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
-
-function cdef_filter\w\()_8bpc_neon, export=1
- push {r4-r9,lr}
- vpush {q4-q7}
- ldrd r4, r5, [sp, #92]
- ldrd r6, r7, [sp, #100]
- ldr r8, [sp, #108]
- cmp r3, #0 // pri_strength
- bne 1f
- b cdef_filter\w\()_sec_neon // only sec
-1:
- cmp r4, #0 // sec_strength
- bne 1f
- b cdef_filter\w\()_pri_neon // only pri
-1:
- b cdef_filter\w\()_pri_sec_neon // both pri and sec
-endfunc
-.endm
-
-filter 8
-filter 4
-
.macro load_px_8 d11, d12, d21, d22, w
.if \w == 8
add r6, r2, r9 // x + off
@@ -756,219 +536,3 @@
filter_8 8
filter_8 4
-
-const div_table, align=4
- .short 840, 420, 280, 210, 168, 140, 120, 105
-endconst
-
-const alt_fact, align=4
- .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
-endconst
-
-// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
-// unsigned *const var)
-function cdef_find_dir_8bpc_neon, export=1
- push {lr}
- vpush {q4-q7}
- sub sp, sp, #32 // cost
- mov r3, #8
- vmov.u16 q1, #0 // q0-q1 sum_diag[0]
- vmov.u16 q3, #0 // q2-q3 sum_diag[1]
- vmov.u16 q5, #0 // q4-q5 sum_hv[0-1]
- vmov.u16 q8, #0 // q6,d16 sum_alt[0]
- // q7,d17 sum_alt[1]
- vmov.u16 q9, #0 // q9,d22 sum_alt[2]
- vmov.u16 q11, #0
- vmov.u16 q10, #0 // q10,d23 sum_alt[3]
-
-
-.irpc i, 01234567
- vld1.8 {d30}, [r0, :64], r1
- vmov.u8 d31, #128
- vsubl.u8 q15, d30, d31 // img[x] - 128
- vmov.u16 q14, #0
-
-.if \i == 0
- vmov q0, q15 // sum_diag[0]
-.else
- vext.8 q12, q14, q15, #(16-2*\i)
- vext.8 q13, q15, q14, #(16-2*\i)
- vadd.i16 q0, q0, q12 // sum_diag[0]
- vadd.i16 q1, q1, q13 // sum_diag[0]
-.endif
- vrev64.16 q13, q15
- vswp d26, d27 // [-x]
-.if \i == 0
- vmov q2, q13 // sum_diag[1]
-.else
- vext.8 q12, q14, q13, #(16-2*\i)
- vext.8 q13, q13, q14, #(16-2*\i)
- vadd.i16 q2, q2, q12 // sum_diag[1]
- vadd.i16 q3, q3, q13 // sum_diag[1]
-.endif
-
- vpadd.u16 d26, d30, d31 // [(x >> 1)]
- vmov.u16 d27, #0
- vpadd.u16 d24, d26, d28
- vpadd.u16 d24, d24, d28 // [y]
- vmov.u16 r12, d24[0]
- vadd.i16 q5, q5, q15 // sum_hv[1]
-.if \i < 4
- vmov.16 d8[\i], r12 // sum_hv[0]
-.else
- vmov.16 d9[\i-4], r12 // sum_hv[0]
-.endif
-
-.if \i == 0
- vmov.u16 q6, q13 // sum_alt[0]
-.else
- vext.8 q12, q14, q13, #(16-2*\i)
- vext.8 q14, q13, q14, #(16-2*\i)
- vadd.i16 q6, q6, q12 // sum_alt[0]
- vadd.i16 d16, d16, d28 // sum_alt[0]
-.endif
- vrev64.16 d26, d26 // [-(x >> 1)]
- vmov.u16 q14, #0
-.if \i == 0
- vmov q7, q13 // sum_alt[1]
-.else
- vext.8 q12, q14, q13, #(16-2*\i)
- vext.8 q13, q13, q14, #(16-2*\i)
- vadd.i16 q7, q7, q12 // sum_alt[1]
- vadd.i16 d17, d17, d26 // sum_alt[1]
-.endif
-
-.if \i < 6
- vext.8 q12, q14, q15, #(16-2*(3-(\i/2)))
- vext.8 q13, q15, q14, #(16-2*(3-(\i/2)))
- vadd.i16 q9, q9, q12 // sum_alt[2]
- vadd.i16 d22, d22, d26 // sum_alt[2]
-.else
- vadd.i16 q9, q9, q15 // sum_alt[2]
-.endif
-.if \i == 0
- vmov q10, q15 // sum_alt[3]
-.elseif \i == 1
- vadd.i16 q10, q10, q15 // sum_alt[3]
-.else
- vext.8 q12, q14, q15, #(16-2*(\i/2))
- vext.8 q13, q15, q14, #(16-2*(\i/2))
- vadd.i16 q10, q10, q12 // sum_alt[3]
- vadd.i16 d23, d23, d26 // sum_alt[3]
-.endif
-.endr
-
- vmov.u32 q15, #105
-
- vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0]
- vmlal.s16 q12, d9, d9
- vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1]
- vmlal.s16 q13, d11, d11
- vadd.s32 d8, d24, d25
- vadd.s32 d9, d26, d27
- vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17)
- vmul.i32 d8, d8, d30 // cost[2,6] *= 105
-
- vrev64.16 q1, q1
- vrev64.16 q3, q3
- vext.8 q1, q1, q1, #10 // sum_diag[0][14-n]
- vext.8 q3, q3, q3, #10 // sum_diag[1][14-n]
-
- vstr s16, [sp, #2*4] // cost[2]
- vstr s17, [sp, #6*4] // cost[6]
-
- movrel_local r12, div_table
- vld1.16 {q14}, [r12, :128]
-
- vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0]
- vmull.s16 q12, d1, d1
- vmlal.s16 q5, d2, d2
- vmlal.s16 q12, d3, d3
- vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1]
- vmull.s16 q1, d5, d5
- vmlal.s16 q0, d6, d6
- vmlal.s16 q1, d7, d7
- vmovl.u16 q13, d28 // div_table
- vmovl.u16 q14, d29
- vmul.i32 q5, q5, q13 // cost[0]
- vmla.i32 q5, q12, q14
- vmul.i32 q0, q0, q13 // cost[4]
- vmla.i32 q0, q1, q14
- vadd.i32 d10, d10, d11
- vadd.i32 d0, d0, d1
- vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1
-
- movrel_local r12, alt_fact
- vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
-
- vstr s0, [sp, #0*4] // cost[0]
- vstr s1, [sp, #4*4] // cost[4]
-
- vmovl.u16 q13, d29 // div_table[2*m+1] + 105
- vmovl.u16 q14, d30
- vmovl.u16 q15, d31
-
-.macro cost_alt dest, s1, s2, s3, s4, s5, s6
- vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n]
- vmull.s16 q2, \s2, \s2
- vmull.s16 q3, \s3, \s3
- vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n]
- vmull.s16 q12, \s5, \s5
- vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here
- vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact
- vmla.i32 q1, q2, q14
- vmla.i32 q1, q3, q15
- vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact
- vmla.i32 q5, q12, q14
- vmla.i32 q5, q6, q15
- vadd.i32 d2, d2, d3
- vadd.i32 d3, d10, d11
- vpadd.i32 \dest, d2, d3 // *cost_ptr
-.endm
- cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
- cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
- vstr s28, [sp, #1*4] // cost[1]
- vstr s29, [sp, #3*4] // cost[3]
-
- mov r0, #0 // best_dir
- vmov.32 r1, d0[0] // best_cost
- mov r3, #1 // n
-
- vstr s30, [sp, #5*4] // cost[5]
- vstr s31, [sp, #7*4] // cost[7]
-
- vmov.32 r12, d14[0]
-
-.macro find_best s1, s2, s3
-.ifnb \s2
- vmov.32 lr, \s2
-.endif
- cmp r12, r1 // cost[n] > best_cost
- itt gt
- movgt r0, r3 // best_dir = n
- movgt r1, r12 // best_cost = cost[n]
-.ifnb \s2
- add r3, r3, #1 // n++
- cmp lr, r1 // cost[n] > best_cost
- vmov.32 r12, \s3
- itt gt
- movgt r0, r3 // best_dir = n
- movgt r1, lr // best_cost = cost[n]
- add r3, r3, #1 // n++
-.endif
-.endm
- find_best d14[0], d8[0], d14[1]
- find_best d14[1], d0[1], d15[0]
- find_best d15[0], d8[1], d15[1]
- find_best d15[1]
-
- eor r3, r0, #4 // best_dir ^4
- ldr r12, [sp, r3, lsl #2]
- sub r1, r1, r12 // best_cost - cost[best_dir ^ 4]
- lsr r1, r1, #10
- str r1, [r2] // *var
-
- add sp, sp, #32
- vpop {q4-q7}
- pop {pc}
-endfunc
--- /dev/null
+++ b/src/arm/32/cdef16.S
@@ -1,0 +1,232 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret
+ tst r6, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r6, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4]
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s9, [\s1, #2*\w]
+ vldr s10, [\s2, #-4]
+ vld1.16 {\r2}, [\s2, :\align]
+ vldr s11, [\s2, #2*\w]
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s10, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s11, [r0, #2*\w]
+.if \ret
+ pop {r4-r7,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4]
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s9, [\s2, #-4]
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s9, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r7,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst r6, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s8, [\s1, #2*\w]
+ vld1.16 {\r2}, [\s2, :\align]
+ vldr s9, [\s2, #2*\w]
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s8, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+.if \ret
+ pop {r4-r7,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r7,pc}
+.else
+ add r0, r0, #2*\stride
+.endif
+3:
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top, int h,
+// enum CdefEdgeFlags edges);
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro padding_func_16 w, stride, r1, r2, align
+function cdef_padding\w\()_16bpc_neon, export=1
+ push {r4-r7,lr}
+ ldrd r4, r5, [sp, #20]
+ ldr r6, [sp, #28]
+ vmov.i16 q3, #0x8000
+ tst r6, #4 // CDEF_HAVE_TOP
+ bne 1f
+ // !CDEF_HAVE_TOP
+ sub r12, r0, #2*(2*\stride+2)
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add r7, r4, r2
+ sub r0, r0, #2*(2*\stride)
+ pad_top_bot_16 r4, r7, \w, \stride, \r1, \r2, \align, 0
+
+ // Middle section
+3:
+ tst r6, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r6, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vld1.32 {d2[]}, [r3, :32]!
+ vldr s5, [r1, #2*\w]
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r5, r5, #1
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s5, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.32 {d2[]}, [r3, :32]!
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r5, r5, #1
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+ b 3f
+2:
+ tst r6, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vldr s4, [r1, #2*\w]
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r5, r5, #1
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s4, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r5, r5, #1
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+
+3:
+ tst r6, #8 // CDEF_HAVE_BOTTOM
+ bne 1f
+ // !CDEF_HAVE_BOTTOM
+ sub r12, r0, #4
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ pop {r4-r7,pc}
+1:
+ // CDEF_HAVE_BOTTOM
+ add r7, r1, r2
+ pad_top_bot_16 r1, r7, \w, \stride, \r1, \r2, \align, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q0, q1, 128
+padding_func_16 4, 8, d0, d2, 64
+
+tables
+
+filter 8, 16
+filter 4, 16
+
+find_dir 16
--- /dev/null
+++ b/src/arm/32/cdef_tmpl.S
@@ -1,0 +1,515 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro dir_table w, stride
+const directions\w
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+ .byte 1 * \stride + 0, 2 * \stride + 0
+ .byte 1 * \stride + 0, 2 * \stride - 1
+// Repeated, to avoid & 7
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+endconst
+.endm
+
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+ .byte 4, 2, 3, 3
+endconst
+.endm
+
+.macro load_px d11, d12, d21, d22, w
+.if \w == 8
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11,\d12}, [r6] // p0
+ vld1.16 {\d21,\d22}, [r9] // p1
+.else
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11}, [r6] // p0
+ add r6, r6, #2*8 // += stride
+ vld1.16 {\d21}, [r9] // p1
+ add r9, r9, #2*8 // += stride
+ vld1.16 {\d12}, [r6] // p0
+ vld1.16 {\d22}, [r9] // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+ vmin.u16 q2, q2, \s1
+ vmax.s16 q3, q3, \s1
+ vmin.u16 q2, q2, \s2
+ vmax.s16 q3, q3, \s2
+.endif
+ vabd.u16 q8, q0, \s1 // abs(diff)
+ vabd.u16 q11, q0, \s2 // abs(diff)
+ vshl.u16 q9, q8, \shift // abs(diff) >> shift
+ vshl.u16 q12, q11, \shift // abs(diff) >> shift
+ vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+ vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+ vsub.i16 q10, \s1, q0 // diff = p0 - px
+ vsub.i16 q13, \s2, q0 // diff = p1 - px
+ vneg.s16 q8, q9 // -clip
+ vneg.s16 q11, q12 // -clip
+ vmin.s16 q10, q10, q9 // imin(diff, clip)
+ vmin.s16 q13, q13, q12 // imin(diff, clip)
+ vdup.16 q9, \tap // taps[k]
+ vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
+ vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
+ vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
+ vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+ cmp r8, #0xf
+ beq cdef_filter\w\suffix\()_edged_neon
+.endif
+.if \pri
+.if \bpc == 16
+ clz r9, r9
+ sub r9, r9, #24 // -bitdepth_min_8
+ neg r9, r9 // bitdepth_min_8
+.endif
+ movrel_local r8, pri_taps
+.if \bpc == 16
+ lsr r9, r3, r9 // pri_strength >> bitdepth_min_8
+ and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1
+.else
+ and r9, r3, #1
+.endif
+ add r8, r8, r9, lsl #1
+.endif
+ movrel_local r9, directions\w
+ add r5, r9, r5, lsl #1
+ vmov.u16 d17, #15
+ vdup.16 d16, r6 // damping
+
+.if \pri
+ vdup.16 q5, r3 // threshold
+.endif
+.if \sec
+ vdup.16 q7, r4 // threshold
+.endif
+ vmov.16 d8[0], r3
+ vmov.16 d8[1], r4
+ vclz.i16 d8, d8 // clz(threshold)
+ vsub.i16 d8, d17, d8 // ulog2(threshold)
+ vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
+ vneg.s16 d8, d8 // -shift
+.if \sec
+ vdup.16 q6, d8[1]
+.endif
+.if \pri
+ vdup.16 q4, d8[0]
+.endif
+
+1:
+.if \w == 8
+ vld1.16 {q0}, [r2, :128] // px
+.else
+ add r12, r2, #2*8
+ vld1.16 {d0}, [r2, :64] // px
+ vld1.16 {d1}, [r12, :64] // px
+.endif
+
+ vmov.u16 q1, #0 // sum
+.if \min
+ vmov.u16 q2, q0 // min
+ vmov.u16 q3, q0 // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov lr, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrsb r9, [r5] // off1
+
+ load_px d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+ add r5, r5, #4 // +2*2
+ ldrsb r9, [r5] // off2
+.endif
+
+.if \pri
+ ldrb r12, [r8] // *pri_taps
+
+ handle_pixel q14, q15, q5, q4, r12, \min
+.endif
+
+.if \sec
+ load_px d28, d29, d30, d31, \w
+
+ add r5, r5, #8 // +2*4
+ ldrsb r9, [r5] // off3
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ load_px d28, d29, d30, d31, \w
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
+ subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
+ add r8, r8, #1 // pri_taps++ (pointer)
+.endif
+ bne 2b
+
+ vshr.s16 q14, q1, #15 // -(sum < 0)
+ vadd.i16 q1, q1, q14 // sum - (sum < 0)
+ vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
+ vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
+.if \min
+ vmin.s16 q0, q0, q3
+ vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+ vmovn.u16 d0, q0
+.endif
+.if \w == 8
+ add r2, r2, #2*16 // tmp += tmp_stride
+ subs r7, r7, #1 // h--
+.if \bpc == 8
+ vst1.8 {d0}, [r0, :64], r1
+.else
+ vst1.16 {q0}, [r0, :128], r1
+.endif
+.else
+.if \bpc == 8
+ vst1.32 {d0[0]}, [r0, :32], r1
+.else
+ vst1.16 {d0}, [r0, :64], r1
+.endif
+ add r2, r2, #2*16 // tmp += 2*tmp_stride
+ subs r7, r7, #2 // h -= 2
+.if \bpc == 8
+ vst1.32 {d0[1]}, [r0, :32], r1
+.else
+ vst1.16 {d1}, [r0, :64], r1
+.endif
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub r5, r5, #2
+.if \pri
+ sub r8, r8, #2
+.endif
+
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #92]
+ ldrd r6, r7, [sp, #100]
+.if \bpc == 16
+ ldrd r8, r9, [sp, #108]
+.else
+ ldr r8, [sp, #108]
+.endif
+ cmp r3, #0 // pri_strength
+ bne 1f
+ b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+ cmp r4, #0 // sec_strength
+ bne 1f
+ b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
+const div_table, align=4
+ .short 840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact, align=4
+ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+.macro cost_alt dest, s1, s2, s3, s4, s5, s6
+ vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q2, \s2, \s2
+ vmull.s16 q3, \s3, \s3
+ vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q12, \s5, \s5
+ vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here
+ vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact
+ vmla.i32 q1, q2, q14
+ vmla.i32 q1, q3, q15
+ vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact
+ vmla.i32 q5, q12, q14
+ vmla.i32 q5, q6, q15
+ vadd.i32 d2, d2, d3
+ vadd.i32 d3, d10, d11
+ vpadd.i32 \dest, d2, d3 // *cost_ptr
+.endm
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+ vmov.32 lr, \s2
+.endif
+ cmp r12, r1 // cost[n] > best_cost
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, r12 // best_cost = cost[n]
+.ifnb \s2
+ add r3, r3, #1 // n++
+ cmp lr, r1 // cost[n] > best_cost
+ vmov.32 r12, \s3
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, lr // best_cost = cost[n]
+ add r3, r3, #1 // n++
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+// unsigned *const var)
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+ push {lr}
+ vpush {q4-q7}
+.if \bpc == 16
+ clz r3, r3 // clz(bitdepth_max)
+ sub lr, r3, #24 // -bitdepth_min_8
+.endif
+ sub sp, sp, #32 // cost
+ mov r3, #8
+ vmov.u16 q1, #0 // q0-q1 sum_diag[0]
+ vmov.u16 q3, #0 // q2-q3 sum_diag[1]
+ vmov.u16 q5, #0 // q4-q5 sum_hv[0-1]
+ vmov.u16 q8, #0 // q6,d16 sum_alt[0]
+ // q7,d17 sum_alt[1]
+ vmov.u16 q9, #0 // q9,d22 sum_alt[2]
+ vmov.u16 q11, #0
+ vmov.u16 q10, #0 // q10,d23 sum_alt[3]
+
+
+.irpc i, 01234567
+.if \bpc == 8
+ vld1.8 {d30}, [r0, :64], r1
+ vmov.u8 d31, #128
+ vsubl.u8 q15, d30, d31 // img[x] - 128
+.else
+ vld1.16 {q15}, [r0, :128], r1
+ vdup.16 q14, lr // -bitdepth_min_8
+ vshl.u16 q15, q15, q14
+ vmov.u16 q14, #128
+ vsub.i16 q15, q15, q14 // img[x] - 128
+.endif
+ vmov.u16 q14, #0
+
+.if \i == 0
+ vmov q0, q15 // sum_diag[0]
+.else
+ vext.8 q12, q14, q15, #(16-2*\i)
+ vext.8 q13, q15, q14, #(16-2*\i)
+ vadd.i16 q0, q0, q12 // sum_diag[0]
+ vadd.i16 q1, q1, q13 // sum_diag[0]
+.endif
+ vrev64.16 q13, q15
+ vswp d26, d27 // [-x]
+.if \i == 0
+ vmov q2, q13 // sum_diag[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q2, q2, q12 // sum_diag[1]
+ vadd.i16 q3, q3, q13 // sum_diag[1]
+.endif
+
+ vpadd.u16 d26, d30, d31 // [(x >> 1)]
+ vmov.u16 d27, #0
+ vpadd.u16 d24, d26, d28
+ vpadd.u16 d24, d24, d28 // [y]
+ vmov.u16 r12, d24[0]
+ vadd.i16 q5, q5, q15 // sum_hv[1]
+.if \i < 4
+ vmov.16 d8[\i], r12 // sum_hv[0]
+.else
+ vmov.16 d9[\i-4], r12 // sum_hv[0]
+.endif
+
+.if \i == 0
+ vmov.u16 q6, q13 // sum_alt[0]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q14, q13, q14, #(16-2*\i)
+ vadd.i16 q6, q6, q12 // sum_alt[0]
+ vadd.i16 d16, d16, d28 // sum_alt[0]
+.endif
+ vrev64.16 d26, d26 // [-(x >> 1)]
+ vmov.u16 q14, #0
+.if \i == 0
+ vmov q7, q13 // sum_alt[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q7, q7, q12 // sum_alt[1]
+ vadd.i16 d17, d17, d26 // sum_alt[1]
+.endif
+
+.if \i < 6
+ vext.8 q12, q14, q15, #(16-2*(3-(\i/2)))
+ vext.8 q13, q15, q14, #(16-2*(3-(\i/2)))
+ vadd.i16 q9, q9, q12 // sum_alt[2]
+ vadd.i16 d22, d22, d26 // sum_alt[2]
+.else
+ vadd.i16 q9, q9, q15 // sum_alt[2]
+.endif
+.if \i == 0
+ vmov q10, q15 // sum_alt[3]
+.elseif \i == 1
+ vadd.i16 q10, q10, q15 // sum_alt[3]
+.else
+ vext.8 q12, q14, q15, #(16-2*(\i/2))
+ vext.8 q13, q15, q14, #(16-2*(\i/2))
+ vadd.i16 q10, q10, q12 // sum_alt[3]
+ vadd.i16 d23, d23, d26 // sum_alt[3]
+.endif
+.endr
+
+ vmov.u32 q15, #105
+
+ vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0]
+ vmlal.s16 q12, d9, d9
+ vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1]
+ vmlal.s16 q13, d11, d11
+ vadd.s32 d8, d24, d25
+ vadd.s32 d9, d26, d27
+ vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17)
+ vmul.i32 d8, d8, d30 // cost[2,6] *= 105
+
+ vrev64.16 q1, q1
+ vrev64.16 q3, q3
+ vext.8 q1, q1, q1, #10 // sum_diag[0][14-n]
+ vext.8 q3, q3, q3, #10 // sum_diag[1][14-n]
+
+ vstr s16, [sp, #2*4] // cost[2]
+ vstr s17, [sp, #6*4] // cost[6]
+
+ movrel_local r12, div_table
+ vld1.16 {q14}, [r12, :128]
+
+ vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0]
+ vmull.s16 q12, d1, d1
+ vmlal.s16 q5, d2, d2
+ vmlal.s16 q12, d3, d3
+ vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1]
+ vmull.s16 q1, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q1, d7, d7
+ vmovl.u16 q13, d28 // div_table
+ vmovl.u16 q14, d29
+ vmul.i32 q5, q5, q13 // cost[0]
+ vmla.i32 q5, q12, q14
+ vmul.i32 q0, q0, q13 // cost[4]
+ vmla.i32 q0, q1, q14
+ vadd.i32 d10, d10, d11
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1
+
+ movrel_local r12, alt_fact
+ vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
+
+ vstr s0, [sp, #0*4] // cost[0]
+ vstr s1, [sp, #4*4] // cost[4]
+
+ vmovl.u16 q13, d29 // div_table[2*m+1] + 105
+ vmovl.u16 q14, d30
+ vmovl.u16 q15, d31
+
+ cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
+ cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
+ vstr s28, [sp, #1*4] // cost[1]
+ vstr s29, [sp, #3*4] // cost[3]
+
+ mov r0, #0 // best_dir
+ vmov.32 r1, d0[0] // best_cost
+ mov r3, #1 // n
+
+ vstr s30, [sp, #5*4] // cost[5]
+ vstr s31, [sp, #7*4] // cost[7]
+
+ vmov.32 r12, d14[0]
+
+ find_best d14[0], d8[0], d14[1]
+ find_best d14[1], d0[1], d15[0]
+ find_best d15[0], d8[1], d15[1]
+ find_best d15[1]
+
+ eor r3, r0, #4 // best_dir ^4
+ ldr r12, [sp, r3, lsl #2]
+ sub r1, r1, r12 // best_cost - cost[best_dir ^ 4]
+ lsr r1, r1, #10
+ str r1, [r2] // *var
+
+ add sp, sp, #32
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+.endm
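
The handle_pixel macro above annotates each step of the CDEF tap computation (clip, constrain, accumulation into sum). Written out as scalar C, those comments amount to the following reference sketch; this is derived from the comments only, not code from this commit, and imin/imax are the usual integer min/max helpers:

    #include <stdlib.h>

    static inline int imin(const int a, const int b) { return a < b ? a : b; }
    static inline int imax(const int a, const int b) { return a > b ? a : b; }

    // Per-tap value that handle_pixel accumulates, i.e. for each of p0/p1:
    //   clip        = imax(0, threshold - (abs(diff) >> shift))
    //   constrain() = imax(imin(diff, clip), -clip)
    //   sum        += taps[k] * constrain(p - px, threshold, shift)
    static inline int constrain(const int diff, const int threshold, const int shift) {
        const int clip = imax(0, threshold - (abs(diff) >> shift));
        return imax(imin(diff, clip), -clip);
    }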
--- a/src/arm/32/loopfilter.S
+++ b/src/arm/32/loopfilter.S
@@ -515,7 +515,7 @@
lpf_8_wd8
sub r10, r0, r1, lsl #1
- sub r10, r10, r1
+ sub r10, r10, r1
vst1.8 {d21}, [r10, :64], r1 // p2
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d22}, [r10, :64], r1 // p1
@@ -783,11 +783,11 @@
vld1.8 {d6[]}, [r5] // sharp[1]
sub r5, r5, #8
vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0]
+ vtst.32 d2, d1, d2 // L != 0
vmul.i32 d1, d1, d4 // L
.ifc \type, y
vdup.32 d15, r2 // vmask[2]
.endif
- vtst.32 d2, d1, d2 // L != 0
vdup.32 d14, r7 // vmask[1]
vmov r10, r11, d2
orrs r10, r10, r11
--- /dev/null
+++ b/src/arm/32/loopfilter16.S
@@ -1,0 +1,860 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro loop_filter wd
+function lpf_4_wd\wd\()_neon
+ vabd.u16 d0, d22, d23 // abs(p1 - p0)
+ vabd.u16 d1, d25, d24 // abs(q1 - q0)
+ vabd.u16 d2, d23, d24 // abs(p0 - q0)
+ vabd.u16 d3, d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+ vabd.u16 d4, d21, d22 // abs(p2 - p1)
+ vabd.u16 d5, d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ vabd.u16 d6, d20, d21 // abs(p3 - p2)
+ vabd.u16 d7, d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ vmax.u16 d4, d4, d5
+.endif
+ vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vshr.u16 d3, d3, #1
+.if \wd >= 8
+ vmax.u16 d4, d4, d6
+.endif
+ vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
+ vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ vmax.u16 d4, d0, d4
+ vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
+.else
+ vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ vand d1, d1, d2 // fm && wd >= 4 (implicit)
+.if \wd >= 6
+ vmov d14, d1 // fm && wd > 4 (implicit)
+.endif
+.if \wd >= 16
+ vmov d15, d1 // fm && wd == 16 (implicit)
+.endif
+
+ vmov r10, r11, d1
+ orrs r10, r10, r11
+ beq 9f // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+ vmov.i16 d10, #1
+ vabd.u16 d2, d21, d23 // abs(p2 - p0)
+ vabd.u16 d3, d22, d23 // abs(p1 - p0)
+ vabd.u16 d4, d25, d24 // abs(q1 - q0)
+ vabd.u16 d5, d26, d24 // abs(q2 - q0)
+ vdup.16 d9, r9 // bitdepth_min_8
+.if \wd >= 8
+ vabd.u16 d6, d20, d23 // abs(p3 - p0)
+ vabd.u16 d7, d27, d24 // abs(q3 - q0)
+.endif
+ vmax.u16 d2, d2, d3
+ vmax.u16 d4, d4, d5
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vmax.u16 d2, d2, d4
+ vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8
+.if \wd >= 8
+ vmax.u16 d2, d2, d6
+.endif
+
+.if \wd == 16
+ vabd.u16 d3, d17, d23 // abs(p6 - p0)
+ vabd.u16 d4, d18, d23 // abs(p5 - p0)
+ vabd.u16 d5, d19, d23 // abs(p4 - p0)
+.endif
+ vcge.u16 d2, d10, d2 // flat8in
+.if \wd == 16
+ vabd.u16 d6, d28, d24 // abs(q4 - q0)
+ vabd.u16 d7, d29, d24 // abs(q5 - q0)
+ vabd.u16 d8, d30, d24 // abs(q6 - q0)
+.endif
+ vand d14, d2, d14 // flat8in && fm && wd > 4
+ vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ vmax.u16 d3, d3, d4
+ vmax.u16 d5, d5, d6
+.endif
+ vmov r10, r11, d1
+.if \wd == 16
+ vmax.u16 d7, d7, d8
+ vmax.u16 d3, d3, d5
+ vmax.u16 d3, d3, d7
+ vcge.u16 d3, d10, d3 // flat8out
+.endif
+ orrs r10, r10, r11
+.if \wd == 16
+ vand d15, d15, d3 // flat8out && fm && wd == 16
+ vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+ vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ beq 1f // skip wd == 4 case
+.endif
+
+ vdup.16 d3, r8 // bitdepth_max
+ vsub.u16 d2, d22, d25 // p1 - q1
+ vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1
+ vcgt.u16 d0, d0, d12 // hev
+ vmvn d9, d3 // - 128 * (1 << bitdepth_min_8)
+ vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1)
+ vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1)
+ vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
+ vsub.u16 d2, d24, d23
+ vmov.i16 d6, #3
+ vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
+ vmul.i16 d2, d2, d6
+ vmov.i16 d6, #4
+ vadd.i16 d2, d2, d4
+ vmin.s16 d2, d2, d3 // f = iclip_diff()
+ vmov.i16 d7, #3
+ vmax.s16 d2, d2, d9 // f = iclip_diff()
+ vqadd.s16 d4, d6, d2 // f + 4
+ vqadd.s16 d5, d7, d2 // f + 3
+ vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1)
+ vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1)
+ vshr.s16 d4, d4, #3 // f1
+ vshr.s16 d5, d5, #3 // f2
+ vmov.i16 d9, #0
+ vdup.16 d3, r8 // bitdepth_max
+ vqadd.s16 d2, d23, d5 // p0 + f2
+ vqsub.s16 d6, d24, d4 // q0 - f1
+ vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1
+ vmin.s16 d2, d2, d3 // out p0 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q0 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p0 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q0 = iclip_pixel()
+ vbit d23, d2, d1 // if (fm && wd >= 4)
+ vbit d24, d6, d1 // if (fm && wd >= 4)
+ vqadd.s16 d2, d22, d4 // p1 + f
+ vqsub.s16 d6, d25, d4 // q1 - f
+ vmin.s16 d2, d2, d3 // out p1 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q1 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p1 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q1 = iclip_pixel()
+ vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
+ vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 2f // skip if there's no flat8in
+
+ vadd.i16 d0, d21, d21 // p2 * 2
+ vadd.i16 d2, d21, d22 // p2 + p1
+ vadd.i16 d4, d22, d23 // p1 + p0
+ vadd.i16 d6, d23, d24 // p0 + q0
+ vadd.i16 d8, d0, d2
+ vadd.i16 d10, d4, d6
+ vadd.i16 d12, d24, d25 // q0 + q1
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d0
+ vadd.i16 d10, d25, d26 // q1 + q2
+ vrshr.u16 d0, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d12
+ vsub.i16 d10, d10, d2
+ vadd.i16 d12, d26, d26 // q2 + q2
+ vrshr.u16 d1, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d4
+ vrshr.u16 d2, d8, #3 // out q0
+
+ vbit d22, d0, d14 // p1 if (flat8in)
+ vadd.i16 d8, d8, d12
+ vbit d23, d1, d14 // p0 if (flat8in)
+ vrshr.u16 d3, d8, #3 // out q1
+ vbit d24, d2, d14 // q0 if (flat8in)
+ vbit d25, d3, d14 // q1 if (flat8in)
+.elseif \wd >= 8
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+.if \wd == 8
+ beq 8f // skip if there's no flat8in
+.else
+ beq 2f // skip if there's no flat8in
+.endif
+
+ vadd.i16 d0, d20, d21 // p3 + p2
+ vadd.i16 d2, d22, d25 // p1 + q1
+ vadd.i16 d4, d20, d22 // p3 + p1
+ vadd.i16 d6, d23, d26 // p0 + q2
+ vadd.i16 d8, d0, d0 // 2 * (p3 + p2)
+ vadd.i16 d9, d23, d24 // p0 + q0
+ vadd.i16 d8, d8, d4 // + p3 + p1
+ vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2
+ vadd.i16 d8, d8, d9 // + p0 + q0
+ vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1
+ vrshr.u16 d10, d8, #3 // out p2
+
+ vadd.i16 d8, d8, d2
+ vadd.i16 d0, d20, d23 // p3 + p0
+ vadd.i16 d2, d24, d27 // q0 + q3
+ vrshr.u16 d11, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0
+ vadd.i16 d4, d21, d24 // p2 + q0
+ vadd.i16 d6, d25, d27 // q1 + q3
+ vrshr.u16 d12, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d2
+ vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0
+ vadd.i16 d0, d22, d25 // p1 + q1
+ vadd.i16 d2, d26, d27 // q2 + q3
+ vrshr.u16 d13, d8, #3 // out q0
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1
+ vrshr.u16 d0, d8, #3 // out q1
+
+ vadd.i16 d8, d8, d2
+
+ vbit d21, d10, d14
+ vbit d22, d11, d14
+ vbit d23, d12, d14
+ vrshr.u16 d1, d8, #3 // out q2
+ vbit d24, d13, d14
+ vbit d25, d0, d14
+ vbit d26, d1, d14
+.endif
+2:
+.if \wd == 16
+ vmov r10, r11, d15
+ orrs r10, r10, r11
+ bne 1f // check if flat8out is needed
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ vadd.i16 d2, d17, d17 // p6 + p6
+ vadd.i16 d4, d17, d18 // p6 + p5
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vadd.i16 d12, d2, d4
+ vadd.i16 d10, d6, d8
+ vadd.i16 d6, d17, d21 // p6 + p2
+ vadd.i16 d12, d12, d10
+ vadd.i16 d8, d17, d22 // p6 + p1
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vadd.i16 d6, d6, d8
+ vadd.i16 d8, d19, d24 // p4 + q0
+ vadd.i16 d12, d12, d6
+ vadd.i16 d10, d10, d8
+ vadd.i16 d6, d20, d25 // p3 + q1
+ vadd.i16 d12, d12, d10
+ vsub.i16 d6, d6, d2
+ vadd.i16 d2, d21, d26 // p2 + q2
+ vrshr.u16 d0, d12, #4 // out p5
+ vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1)
+ vsub.i16 d2, d2, d4
+ vadd.i16 d4, d22, d27 // p1 + q3
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vrshr.u16 d1, d12, #4 // out p4
+ vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2)
+ vsub.i16 d4, d4, d6
+ vadd.i16 d6, d23, d28 // p0 + q4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vrshr.u16 d2, d12, #4 // out p3
+ vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3)
+ vsub.i16 d6, d6, d8
+ vadd.i16 d8, d24, d29 // q0 + q5
+ vadd.i16 d4, d17, d21 // p6 + p2
+ vrshr.u16 d3, d12, #4 // out p2
+ vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4)
+ vsub.i16 d8, d8, d4
+ vadd.i16 d6, d25, d30 // q1 + q6
+ vadd.i16 d10, d17, d22 // p6 + p1
+ vrshr.u16 d4, d12, #4 // out p1
+ vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5)
+ vsub.i16 d6, d6, d10
+ vadd.i16 d8, d26, d30 // q2 + q6
+ vbif d0, d18, d15 // out p5
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vrshr.u16 d5, d12, #4 // out p0
+ vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6)
+ vsub.i16 d8, d8, d10
+ vadd.i16 d10, d27, d30 // q3 + q6
+ vbif d1, d19, d15 // out p4
+ vadd.i16 d18, d19, d24 // p4 + q0
+ vrshr.u16 d6, d12, #4 // out q0
+ vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6)
+ vsub.i16 d10, d10, d18
+ vadd.i16 d8, d28, d30 // q4 + q6
+ vbif d2, d20, d15 // out p3
+ vadd.i16 d18, d20, d25 // p3 + q1
+ vrshr.u16 d7, d12, #4 // out q1
+ vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6)
+ vsub.i16 d18, d8, d18
+ vadd.i16 d10, d29, d30 // q5 + q6
+ vbif d3, d21, d15 // out p2
+ vadd.i16 d20, d21, d26 // p2 + q2
+ vrshr.u16 d8, d12, #4 // out q2
+ vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6)
+ vsub.i16 d10, d10, d20
+ vadd.i16 d18, d30, d30 // q6 + q6
+ vbif d4, d22, d15 // out p1
+ vadd.i16 d20, d22, d27 // p1 + q3
+ vrshr.u16 d9, d12, #4 // out q3
+ vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6)
+ vsub.i16 d18, d18, d20
+ vbif d5, d23, d15 // out p0
+ vrshr.u16 d10, d12, #4 // out q4
+ vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6)
+ vrshr.u16 d11, d12, #4 // out q5
+ vbif d6, d24, d15 // out q0
+ vbif d7, d25, d15 // out q1
+ vbif d8, d26, d15 // out q2
+ vbif d9, d27, d15 // out q3
+ vbif d10, d28, d15 // out q4
+ vbif d11, d29, d15 // out q5
+.endif
+
+ bx lr
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ bx r6
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ bx r7
+.endif
+9:
+ // Return directly without writing back any pixels
+ bx r12
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_4_wd16
+ adr r6, 7f + CONFIG_THUMB
+ adr r7, 8f + CONFIG_THUMB
+ bl lpf_4_wd16_neon
+.endm
+
+.macro lpf_4_wd8
+ adr r7, 8f + CONFIG_THUMB
+ bl lpf_4_wd8_neon
+.endm
+
+.macro lpf_4_wd6
+ bl lpf_4_wd6_neon
+.endm
+
+.macro lpf_4_wd4
+ bl lpf_4_wd4_neon
+.endm
+
+function lpf_v_4_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_4_4_neon
+ mov r12, lr
+ sub r10, r0, #4
+ add r0, r10, r1, lsl #1
+ vld1.16 {d22}, [r10], r1
+ vld1.16 {d24}, [r0], r1
+ vld1.16 {d23}, [r10], r1
+ vld1.16 {d25}, [r0], r1
+ add r0, r0, #4
+
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_6_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+
+ lpf_4_wd6
+
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_6_4_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.16 {d20}, [r10, :64], r1
+ vld1.16 {d24}, [r0, :64], r1
+ vld1.16 {d21}, [r10, :64], r1
+ vld1.16 {d25}, [r0, :64], r1
+ vld1.16 {d22}, [r10, :64], r1
+ vld1.16 {d26}, [r0, :64], r1
+ vld1.16 {d23}, [r10, :64], r1
+ vld1.16 {d27}, [r0, :64], r1
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ lpf_4_wd6
+
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_8_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #2
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ sub r0, r0, r1, lsl #2
+
+ lpf_4_wd8
+
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_8_4_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.16 {d20}, [r10, :64], r1
+ vld1.16 {d24}, [r0, :64], r1
+ vld1.16 {d21}, [r10, :64], r1
+ vld1.16 {d25}, [r0, :64], r1
+ vld1.16 {d22}, [r10, :64], r1
+ vld1.16 {d26}, [r0, :64], r1
+ vld1.16 {d23}, [r10, :64], r1
+ vld1.16 {d27}, [r0, :64], r1
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ lpf_4_wd8
+
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ sub r10, r0, #8
+
+ vst1.16 {d20}, [r10, :64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r10, :64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r10, :64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r10, :64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ bx r12
+8:
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_16_4_neon
+ mov r12, lr
+
+ sub r10, r0, r1, lsl #3
+ add r10, r10, r1
+ vld1.16 {d17}, [r10, :64], r1 // p6
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d18}, [r10, :64], r1 // p5
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d19}, [r10, :64], r1 // p4
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d28}, [r0, :64], r1 // q4
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d29}, [r0, :64], r1 // q5
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d30}, [r0, :64], r1 // q6
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+ lpf_4_wd16
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, r1, lsl #1
+ vst1.16 {d0}, [r10, :64], r1 // p5
+ vst1.16 {d6}, [r0, :64], r1 // q0
+ vst1.16 {d1}, [r10, :64], r1 // p4
+ vst1.16 {d7}, [r0, :64], r1 // q1
+ vst1.16 {d2}, [r10, :64], r1 // p3
+ vst1.16 {d8}, [r0, :64], r1 // q2
+ vst1.16 {d3}, [r10, :64], r1 // p2
+ vst1.16 {d9}, [r0, :64], r1 // q3
+ vst1.16 {d4}, [r10, :64], r1 // p1
+ vst1.16 {d10}, [r0, :64], r1 // q4
+ vst1.16 {d5}, [r10, :64], r1 // p0
+ vst1.16 {d11}, [r0, :64], r1 // q5
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ bx r12
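+        // 7: and 8: below are entered from the lpf_4_wd16 macro when only a
+        // narrower filter ended up being applied for this edge, so only the
+        // rows modified by that filter are stored back (6 rows for the wd8
+        // case, 4 rows for the wd4 case). (Summary inferred from the store
+        // patterns; the macro itself is defined earlier in the file.)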
+7:
+ sub r10, r0, r1
+ sub r10, r10, r1, lsl #1
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_16_4_neon
+ mov r12, lr
+ sub r10, r0, #16
+ sub r0, r0, #8
+ vld1.16 {d16}, [r10, :64], r1
+ vld1.16 {d20}, [r0, :64], r1
+ vld1.16 {d17}, [r10, :64], r1
+ vld1.16 {d21}, [r0, :64], r1
+ vld1.16 {d18}, [r10, :64], r1
+ vld1.16 {d22}, [r0, :64], r1
+ vld1.16 {d19}, [r10, :64], r1
+ vld1.16 {d23}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16
+ add r0, r0, #16
+ vld1.16 {d24}, [r10, :64], r1
+ vld1.16 {d28}, [r0, :64], r1
+ vld1.16 {d25}, [r10, :64], r1
+ vld1.16 {d29}, [r0, :64], r1
+ vld1.16 {d26}, [r10, :64], r1
+ vld1.16 {d30}, [r0, :64], r1
+ vld1.16 {d27}, [r10, :64], r1
+ vld1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+ lpf_4_wd16
+
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q8, q0, d16, d17, d0, d1
+ transpose_4x4h q1, q2, d2, d3, d4, d5
+ transpose_4x4h q3, q4, d6, d7, d8, d9
+ transpose_4x4h q5, q15, d10, d11, d30, d31
+ sub r10, r0, #16
+ sub r0, r0, #8
+
+ vst1.16 {d16}, [r10, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d17}, [r10, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ vst1.16 {d0}, [r10, :64], r1
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d1}, [r10, :64], r1
+ vst1.16 {d5}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16
+ add r0, r0, #16
+ vst1.16 {d6}, [r10, :64], r1
+ vst1.16 {d10}, [r0, :64], r1
+ vst1.16 {d7}, [r10, :64], r1
+ vst1.16 {d11}, [r0, :64], r1
+ vst1.16 {d8}, [r10, :64], r1
+ vst1.16 {d30}, [r0, :64], r1
+ vst1.16 {d9}, [r10, :64], r1
+ vst1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8
+
+ bx r12
+
+7:
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ sub r10, r0, #8
+
+ vst1.16 {d20}, [r10, :64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r10, :64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r10, :64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r10, :64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ bx r12
+8:
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w,
+// const int bitdepth_max)
+
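+// Informal overview of the sb functions below: they walk the vmask bitmasks
+// one edge (4 pixels) at a time and pick the widest applicable filter per
+// edge: vmask[2] selects wd16 (y only), vmask[1] selects wd8 for y or wd6
+// for uv, vmask[0] selects wd4, and edges with no bit set are skipped. The
+// per-edge strengths are derived from l[][0] and the sharpness part of the
+// lut as annotated inline below.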
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded
+ sub sp, sp, #8
+ clz r9, r8
+ rsb r9, r9, #24 // bitdepth_min_8
+ ldrd r6, r7, [r2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr r2, [r2, #8] // vmask[2]
+.endif
+ add r5, r5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr r7, r7, r2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub r4, r3, r4, lsl #2
+.else
+ sub r3, r3, #4
+ lsl r4, r4, #2
+.endif
+ orr r6, r6, r7 // vmask[0] |= vmask[1]
+
+1:
+ tst r6, #0x01
+ strd r6, r7, [sp]
+.ifc \dir, v
+ ldrb r10, [r4], #4
+ ldrb r11, [r3], #4
+.else
+ ldrb r10, [r3]
+ ldrb r11, [r3, #4]
+ add r3, r3, r4
+.endif
+ beq 7f // if (!(vm & bits)) continue;
+
+ orrs r12, r10, r11
+ vdup.16 d31, r9 // bitdepth_min_8
+ beq 7f // if (!(l[0][0] | l[offset][0])) continue;
+ cmp r11, #0 // Check for nonzero values in l[0][0]
+ ldrb r6, [r5], #8 // sharp[0]
+ it eq
+ moveq r11, r10 // if (!l[0][0]) L = l[offset][0]
+ ldrb r12, [r5] // sharp[1]
+ lsr r6, r11, r6 // L >> sharp[0]
+ sub r5, r5, #8
+ cmp r12, r6
+ lsr r10, r11, #4 // H
+ add r11, r11, #2 // L + 2
+ it lt
+ movlt r6, r12 // imin(L >> sharp[0], sharp[1])
+ add r11, r11, r11 // 2*(L + 2)
+ cmp r6, #1
+ lsl r10, r10, r9 // H << bitdepth_min_8
+ it lt
+ movlt r6, #1 // imax(imin(), 1) = limit = I
+ vdup.16 d12, r10 // H << bitdepth_min_8
+ add r11, r11, r6 // 2*(L + 2) + limit = E
+ lsl r6, r6, r9 // I << bitdepth_min_8
+ lsl r11, r11, r9 // E << bitdepth_min_8
+ vdup.16 d11, r6 // I << bitdepth_min_8
+ vdup.16 d10, r11 // E << bitdepth_min_8
+
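+        // Summary of the values set up above (informal C-like sketch
+        // mirroring the inline comments; bd8 = bitdepth_min_8):
+        //   L = l[0][0] ? l[0][0] : l[offset][0]
+        //   I = imax(imin(L >> sharp[0], sharp[1]), 1)  -> d11 = I << bd8
+        //   E = 2 * (L + 2) + I                         -> d10 = E << bd8
+        //   H = L >> 4                                  -> d12 = H << bd8
+        //   d31 holds bd8 itself.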
+.ifc \type, y
+ tst r2, #0x01
+ beq 2f
+ // wd16
+ bl lpf_\dir\()_16_4_neon
+ b 8f
+2:
+.endif
+ tst r7, #0x01
+ beq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_4_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_4_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_4_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment r0.
+ // If the whole function is skipped, increment it here instead.
+ add r0, r0, r1, lsl #2
+.else
+7:
+.endif
+8:
+ ldrd r6, r7, [sp]
+.ifc \type, y
+ lsr r2, r2, #1 // vmask[2] >>= 1
+.endif
+.ifc \dir, v
+ add r0, r0, #8
+.else
+ // For dir h, r0 is returned incremented
+.endif
+ lsrs r6, r6, #1 // vmask[0] >>= 1
+ lsr r7, r7, #1 // vmask[1] >>= 1
+ bne 1b
+
+ add sp, sp, #8
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -30,7 +30,7 @@
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
-// const int16_t fh[7], const intptr_t w,
+// const int16_t fh[8], intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
@@ -38,10 +38,10 @@
ldrd r4, r5, [sp, #52]
ldrd r6, r7, [sp, #60]
mov r8, r5
- vld1.16 {q0}, [r4]
+ vld1.16 {q0}, [r4, :128]
movw r9, #(1 << 14) - (1 << 2)
- vdup.16 q14, r9
- vmov.s16 q15, #2048
+ vdup.16 q14, r9
+ vmov.s16 q15, #2048
// Calculate mid_stride
add r10, r5, #7
bic r10, r10, #7
@@ -108,8 +108,8 @@
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
- vdup.8 q1, d4[0]
- vdup.8 q8, d18[0]
+ vdup.8 q1, d4[0]
+ vdup.8 q8, d18[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
@@ -127,7 +127,7 @@
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
- sub r9, r5, #14
+ sub r9, r5, #14
ldrb r11, [r2, r9]
ldrb r9, [lr, r9]
// Fill q12/q13 with the right padding pixel
@@ -144,7 +144,6 @@
b 6f
4: // Loop horizontally
-.macro filter_8
// This is tuned as some sort of compromise between Cortex A7, A8,
// A9 and A53.
vmul.s16 q3, q1, d0[0]
@@ -187,8 +186,6 @@
vshr.s16 q10, q10, #3
vadd.s16 q3, q3, q15
vadd.s16 q10, q10, q15
-.endm
- filter_8
vst1.16 {q3}, [r0, :128]!
vst1.16 {q10}, [r12, :128]!
@@ -206,50 +203,43 @@
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
+ vext.8 d20, d2, d3, #2
+ vext.8 d21, d2, d3, #4
+ vext.8 d22, d2, d3, #6
+ vext.8 d23, d3, d4, #2
+ vext.8 d8, d3, d4, #4
vmul.s16 d6, d2, d0[0]
- vext.8 q10, q1, q2, #2
- vext.8 q11, q1, q2, #4
vmla.s16 d6, d20, d0[1]
- vmla.s16 d6, d22, d0[2]
- vext.8 q10, q1, q2, #6
- vext.8 q11, q1, q2, #8
- vmla.s16 d6, d20, d0[3]
- vmla.s16 d6, d22, d1[0]
- vext.8 q10, q1, q2, #10
- vext.8 q11, q1, q2, #12
- vmla.s16 d6, d20, d1[1]
- vmla.s16 d6, d22, d1[2]
+ vmla.s16 d6, d21, d0[2]
+ vmla.s16 d6, d22, d0[3]
+ vmla.s16 d6, d3, d1[0]
+ vmla.s16 d6, d23, d1[1]
+ vmla.s16 d6, d8, d1[2]
- vmul.s16 d20, d16, d0[0]
- vext.8 q11, q8, q9, #2
- vext.8 q4, q8, q9, #4
- vmla.s16 d20, d22, d0[1]
- vmla.s16 d20, d8, d0[2]
- vext.8 q11, q8, q9, #6
- vext.8 q4, q8, q9, #8
- vmla.s16 d20, d22, d0[3]
- vmla.s16 d20, d8, d1[0]
- vext.8 q11, q8, q9, #10
- vext.8 q4, q8, q9, #12
- vmla.s16 d20, d22, d1[1]
- vmla.s16 d20, d8, d1[2]
+ vext.8 d20, d16, d17, #2
+ vext.8 d21, d16, d17, #4
+ vext.8 d22, d16, d17, #6
+ vext.8 d23, d17, d18, #2
+ vext.8 d8, d17, d18, #4
+ vmul.s16 d7, d16, d0[0]
+ vmla.s16 d7, d20, d0[1]
+ vmla.s16 d7, d21, d0[2]
+ vmla.s16 d7, d22, d0[3]
+ vmla.s16 d7, d17, d1[0]
+ vmla.s16 d7, d23, d1[1]
+ vmla.s16 d7, d8, d1[2]
- vext.8 q11, q1, q2, #6
- vshl.s16 d22, d22, #7
- vsub.s16 d22, d22, d28
- vqadd.s16 d6, d6, d22
- vext.8 q11, q8, q9, #6
- vshl.s16 d22, d22, #7
- vsub.s16 d22, d22, d28
- vqadd.s16 d20, d20, d22
- vshr.s16 d6, d6, #3
- vshr.s16 d20, d20, #3
- vadd.s16 d6, d6, d30
- vadd.s16 d20, d20, d30
+ vext.8 d22, d2, d3, #6
+ vext.8 d23, d16, d17, #6
+ vshl.s16 q11, q11, #7
+ vsub.s16 q11, q11, q14
+ vqadd.s16 q3, q3, q11
+ vshr.s16 q3, q3, #3
+ vadd.s16 q3, q3, q15
.endm
filter_4
vst1.16 {d6}, [r0, :64]!
- vst1.16 {d20}, [r12, :64]!
+ vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q1, q1, q2, #8
@@ -323,7 +313,7 @@
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d6}, [r0, :64]!
- vst1.16 {d20}, [r12, :64]!
+ vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q1, q1, q2, #8
vext.8 q8, q8, q9, #8
@@ -338,11 +328,11 @@
vdup.16 d25, d16[3]
vpadd.s16 d6, d6, d6
vtrn.16 d24, d25
- vshl.s16 d24, d24, #7
- vsub.s16 d24, d24, d28
- vqadd.s16 d6, d6, d24
- vshr.s16 d6, d6, #3
- vadd.s16 d6, d6, d30
+ vshl.s16 d24, d24, #7
+ vsub.s16 d24, d24, d28
+ vqadd.s16 d6, d6, d24
+ vshr.s16 d6, d6, #3
+ vadd.s16 d6, d6, d30
vst1.s16 {d6[0]}, [r0, :16]!
vst1.s16 {d6[1]}, [r12, :16]!
subs r5, r5, #1
@@ -363,13 +353,12 @@
0:
vpop {q4}
pop {r4-r11,pc}
-.purgem filter_8
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
-// const int16_t fv[7], enum LrEdgeFlags edges,
+// const int16_t fv[8], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
@@ -376,11 +365,7 @@
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
mov lr, r4
- vmov.s16 q1, #0
- mov r12, #128
- vld1.16 {q0}, [r5]
- vmov.s16 d2[3], r12
- vadd.s16 q0, q0, q1
+ vld1.16 {q0}, [r5, :128]
// Calculate the number of rows to move back when looping vertically
mov r12, r4
@@ -422,22 +407,22 @@
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
- vmull.s16 q2, d16, d0[0]
- vmlal.s16 q2, d18, d0[1]
- vmlal.s16 q2, d20, d0[2]
- vmlal.s16 q2, d22, d0[3]
- vmlal.s16 q2, d24, d1[0]
- vmlal.s16 q2, d26, d1[1]
- vmlal.s16 q2, d28, d1[2]
- vmull.s16 q3, d17, d0[0]
- vmlal.s16 q3, d19, d0[1]
- vmlal.s16 q3, d21, d0[2]
- vmlal.s16 q3, d23, d0[3]
- vmlal.s16 q3, d25, d1[0]
- vmlal.s16 q3, d27, d1[1]
- vmlal.s16 q3, d29, d1[2]
- vqrshrun.s32 d4, q2, #11
- vqrshrun.s32 d5, q3, #11
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d20, d0[2]
+ vmlal.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d24, d1[0]
+ vmlal.s16 q2, d26, d1[1]
+ vmlal.s16 q2, d28, d1[2]
+ vmull.s16 q3, d17, d0[0]
+ vmlal.s16 q3, d19, d0[1]
+ vmlal.s16 q3, d21, d0[2]
+ vmlal.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d25, d1[0]
+ vmlal.s16 q3, d27, d1[1]
+ vmlal.s16 q3, d29, d1[2]
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
vqmovun.s16 d4, q2
vst1.8 {d4}, [r0], r1
.if \compare
@@ -473,7 +458,7 @@
52: // 2 rows in total, q11 already loaded, load q12 with content data
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
- vmov q15, q14
+ vmov q15, q14
b 8f
53:
// 3 rows in total, q11 already loaded, load q12 and q13 with content
@@ -615,8 +600,8 @@
asr r1, r1, #1
22:
subs r4, r4, #1
- vld1.16 {d0[]}, [r2]!
- vst1.16 {d0[0]}, [r0], r1
+ vld1.16 {d0[]}, [r2, :16]!
+ vst1.16 {d0[0]}, [r0, :16], r1
bgt 22b
0:
pop {r4,pc}
@@ -644,8 +629,8 @@
ble 0f
b 42b
41:
- vld1.32 {d0[]}, [r2]
- vst1.32 {d0[0]}, [r0]
+ vld1.32 {d0[]}, [r2, :32]
+ vst1.32 {d0[0]}, [r0, :32]
0:
pop {r4,pc}
@@ -687,6 +672,8 @@
#define SUM_STRIDE (384+16)
+#include "looprestoration_tmpl.S"
+
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
@@ -785,7 +772,7 @@
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
- sub lr, r5, #(2 + 16 - 2 + 1)
+ sub lr, r5, #(2 + 16 - 2 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
@@ -1058,7 +1045,7 @@
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
- sub lr, r5, #(2 + 16 - 3 + 1)
+ sub lr, r5, #(2 + 16 - 3 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
@@ -1100,7 +1087,7 @@
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
vaddw_u16_n q12, q13, d22, d23, \w
- vadd_i32_n q12, q13, q8, q9, \w
+ vadd_i32_n q12, q13, q8, q9, \w
vext.8 q8, q5, q6, #2
vext.8 q9, q5, q6, #4
vext.8 q10, q5, q6, #6
@@ -1152,7 +1139,7 @@
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in q0/q4
- sub lr, r5, #1
+ sub lr, r5, #1
// lr = pixels valid - 2
adr r11, L(box5_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
@@ -1249,862 +1236,4 @@
.purgem add5
endfunc
-// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box3_v_neon, export=1
- push {r4-r9,lr}
- ldr r4, [sp, #28]
- add r12, r3, #2 // Number of output rows to move back
- mov lr, r3 // Number of input rows to move back
- add r2, r2, #2 // Actual summed width
- mov r7, #(4*SUM_STRIDE) // sumsq stride
- mov r8, #(2*SUM_STRIDE) // sum stride
- sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
- sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
-
- tst r4, #4 // LR_HAVE_TOP
- beq 0f
- // If have top, read from row -2.
- sub r5, r0, #(4*SUM_STRIDE)
- sub r6, r1, #(2*SUM_STRIDE)
- add lr, lr, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add r5, r0, #(4*SUM_STRIDE)
- add r6, r1, #(2*SUM_STRIDE)
-1:
-
- tst r4, #8 // LR_HAVE_BOTTOM
- beq 1f
- // LR_HAVE_BOTTOM
- add r3, r3, #2 // Sum all h+2 lines with the main loop
- add lr, lr, #2
-1:
- mov r9, r3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into q8-q13 and q0-q2 taking top
- // padding into consideration.
- tst r4, #4 // LR_HAVE_TOP
- vld1.32 {q8, q9}, [r5, :128], r7
- vld1.16 {q0}, [r6, :128], r8
- beq 2f
- // LR_HAVE_TOP
- vld1.32 {q10, q11}, [r5, :128], r7
- vld1.16 {q1}, [r6, :128], r8
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q2}, [r6, :128], r8
- b 3f
-2: // !LR_HAVE_TOP
- vmov q10, q8
- vmov q11, q9
- vmov q1, q0
- vmov q12, q8
- vmov q13, q9
- vmov q2, q0
-
-3:
- subs r3, r3, #1
-.macro add3
- vadd.i32 q8, q8, q10
- vadd.i32 q9, q9, q11
- vadd.i16 q0, q0, q1
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i16 q0, q0, q2
- vst1.32 {q8, q9}, [r0, :128], r7
- vst1.16 {q0}, [r1, :128], r8
-.endm
- add3
- vmov q8, q10
- vmov q9, q11
- vmov q0, q1
- vmov q10, q12
- vmov q11, q13
- vmov q1, q2
- ble 4f
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q2}, [r6, :128], r8
- b 3b
-
-4:
- tst r4, #8 // LR_HAVE_BOTTOM
- bne 5f
- // !LR_HAVE_BOTTOM
- // Produce two more rows, extending the already loaded rows.
- add3
- vmov q8, q10
- vmov q9, q11
- vmov q0, q1
- add3
-
-5: // End of one vertical slice.
- subs r2, r2, #8
- ble 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- mls r5, r7, lr, r5
- mls r6, r8, lr, r6
- // Output pointers
- mls r0, r7, r12, r0
- mls r1, r8, r12, r1
- add r0, r0, #32
- add r1, r1, #16
- add r5, r5, #32
- add r6, r6, #16
- mov r3, r9
- b 1b
-
-0:
- pop {r4-r9,pc}
-.purgem add3
-endfunc
-
-// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box5_v_neon, export=1
- push {r4-r9,lr}
- vpush {q5-q7}
- ldr r4, [sp, #76]
- add r12, r3, #2 // Number of output rows to move back
- mov lr, r3 // Number of input rows to move back
- add r2, r2, #8 // Actual summed width
- mov r7, #(4*SUM_STRIDE) // sumsq stride
- mov r8, #(2*SUM_STRIDE) // sum stride
- sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
- sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
-
- tst r4, #4 // LR_HAVE_TOP
- beq 0f
- // If have top, read from row -2.
- sub r5, r0, #(4*SUM_STRIDE)
- sub r6, r1, #(2*SUM_STRIDE)
- add lr, lr, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add r5, r0, #(4*SUM_STRIDE)
- add r6, r1, #(2*SUM_STRIDE)
-1:
-
- tst r4, #8 // LR_HAVE_BOTTOM
- beq 0f
- // LR_HAVE_BOTTOM
- add r3, r3, #2 // Handle h+2 lines with the main loop
- add lr, lr, #2
- b 1f
-0:
- // !LR_HAVE_BOTTOM
- sub r3, r3, #1 // Handle h-1 lines with the main loop
-1:
- mov r9, r3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into q6-q15 and q0-q3,q5 taking top
- // padding into consideration.
- tst r4, #4 // LR_HAVE_TOP
- vld1.32 {q6, q7}, [r5, :128], r7
- vld1.16 {q0}, [r6, :128], r8
- beq 2f
- // LR_HAVE_TOP
- vld1.32 {q10, q11}, [r5, :128], r7
- vld1.16 {q2}, [r6, :128], r8
- vmov q8, q6
- vmov q9, q7
- vmov q1, q0
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q3}, [r6, :128], r8
- b 3f
-2: // !LR_HAVE_TOP
- vmov q8, q6
- vmov q9, q7
- vmov q1, q0
- vmov q10, q6
- vmov q11, q7
- vmov q2, q0
- vmov q12, q6
- vmov q13, q7
- vmov q3, q0
-
-3:
- cmp r3, #0
- beq 4f
- vld1.32 {q14, q15}, [r5, :128], r7
- vld1.16 {q5}, [r6, :128], r8
-
-3:
- // Start of vertical loop
- subs r3, r3, #2
-.macro add5
- vadd.i32 q6, q6, q8
- vadd.i32 q7, q7, q9
- vadd.i16 q0, q0, q1
- vadd.i32 q6, q6, q10
- vadd.i32 q7, q7, q11
- vadd.i16 q0, q0, q2
- vadd.i32 q6, q6, q12
- vadd.i32 q7, q7, q13
- vadd.i16 q0, q0, q3
- vadd.i32 q6, q6, q14
- vadd.i32 q7, q7, q15
- vadd.i16 q0, q0, q5
- vst1.32 {q6, q7}, [r0, :128], r7
- vst1.16 {q0}, [r1, :128], r8
-.endm
- add5
-.macro shift2
- vmov q6, q10
- vmov q7, q11
- vmov q0, q2
- vmov q8, q12
- vmov q9, q13
- vmov q1, q3
- vmov q10, q14
- vmov q11, q15
- vmov q2, q5
-.endm
- shift2
- add r0, r0, r7
- add r1, r1, r8
- ble 5f
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q3}, [r6, :128], r8
- vld1.32 {q14, q15}, [r5, :128], r7
- vld1.16 {q5}, [r6, :128], r8
- b 3b
-
-4:
- // h == 1, !LR_HAVE_BOTTOM.
- // Pad the last row with the only content row, and add.
- vmov q14, q12
- vmov q15, q13
- vmov q5, q3
- add5
- shift2
- add r0, r0, r7
- add r1, r1, r8
- add5
- b 6f
-
-5:
- tst r4, #8 // LR_HAVE_BOTTOM
- bne 6f
- // !LR_HAVE_BOTTOM
- cmp r3, #0
- bne 5f
- // The intended three edge rows left; output the one at h-2 and
- // the past edge one at h.
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q3}, [r6, :128], r8
- // Pad the past-edge row from the last content row.
- vmov q14, q12
- vmov q15, q13
- vmov q5, q3
- add5
- shift2
- add r0, r0, r7
- add r1, r1, r8
- // The last two rows are already padded properly here.
- add5
- b 6f
-
-5:
- // r3 == -1, two rows left, output one.
- // Pad the last two rows from the mid one.
- vmov q12, q10
- vmov q13, q11
- vmov q3, q2
- vmov q14, q10
- vmov q15, q11
- vmov q5, q2
- add5
- add r0, r0, r7
- add r1, r1, r8
- b 6f
-
-6: // End of one vertical slice.
- subs r2, r2, #8
- ble 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- mls r5, r7, lr, r5
- mls r6, r8, lr, r6
- // Output pointers
- mls r0, r7, r12, r0
- mls r1, r8, r12, r1
- add r0, r0, #32
- add r1, r1, #16
- add r5, r5, #32
- add r6, r6, #16
- mov r3, r9
- b 1b
-
-0:
- vpop {q5-q7}
- pop {r4-r9,pc}
-.purgem add5
-endfunc
-
-// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-function sgr_calc_ab1_neon, export=1
- push {r4-r5,lr}
- vpush {q4-q7}
- ldr r4, [sp, #76]
- add r3, r3, #2 // h += 2
- vmov.i32 q15, #9 // n
- movw r5, #455
- mov lr, #SUM_STRIDE
- b sgr_calc_ab_neon
-endfunc
-
-function sgr_calc_ab2_neon, export=1
- push {r4-r5,lr}
- vpush {q4-q7}
- ldr r4, [sp, #76]
- add r3, r3, #3 // h += 3
- asr r3, r3, #1 // h /= 2
- vmov.i32 q15, #25 // n
- mov r5, #164
- mov lr, #(2*SUM_STRIDE)
-endfunc
-
-function sgr_calc_ab_neon
- movrel r12, X(sgr_x_by_x)
- vld1.8 {q8, q9}, [r12, :128]!
- vmov.i8 q11, #5
- vmov.i8 d10, #55 // idx of last 5
- vld1.8 {q10}, [r12, :128]
- vmov.i8 d11, #72 // idx of last 4
- vmov.i8 d12, #101 // idx of last 3
- vmov.i8 d13, #169 // idx of last 2
- vmov.i8 d14, #254 // idx of last 1
- vmov.i8 d15, #32 // elements consumed in first vtbl
- add r2, r2, #2 // w += 2
- add r12, r2, #7
- bic r12, r12, #7 // aligned w
- sub r12, lr, r12 // increment between rows
- vmov.i16 q13, #256
- vdup.32 q12, r4
- vdup.32 q14, r5 // one_by_x
- sub r0, r0, #(4*(SUM_STRIDE))
- sub r1, r1, #(2*(SUM_STRIDE))
- mov r4, r2 // backup of w
- vsub.i8 q8, q8, q11
- vsub.i8 q9, q9, q11
- vsub.i8 q10, q10, q11
-1:
- subs r2, r2, #8
- vld1.32 {q0, q1}, [r0, :128] // a
- vld1.16 {q2}, [r1, :128] // b
- vmul.i32 q0, q0, q15 // a * n
- vmul.i32 q1, q1, q15 // a * n
- vmull.u16 q3, d4, d4 // b * b
- vmull.u16 q4, d5, d5 // b * b
- vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
- vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
- vmul.i32 q0, q0, q12 // p * s
- vmul.i32 q1, q1, q12 // p * s
- vqshrn.u32 d0, q0, #16
- vqshrn.u32 d1, q1, #16
- vqrshrn.u16 d0, q0, #4 // imin(z, 255)
-
- vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
- vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
- vtbl.8 d1, {q8, q9}, d0
- vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
- vsub.i8 d9, d0, d15 // indices for vtbx
- vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
- vadd.i8 d2, d2, d3
- vtbx.8 d1, {q10}, d9
- vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
- vadd.i8 d6, d6, d7
- vadd.i8 d8, d8, d22
- vadd.i8 d2, d2, d6
- vadd.i8 d1, d1, d8
- vadd.i8 d1, d1, d2
- vmovl.u8 q0, d1 // x
-
- vmull.u16 q1, d0, d4 // x * BB[i]
- vmull.u16 q2, d1, d5 // x * BB[i]
- vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
- vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
- vrshr.s32 q1, q1, #12 // AA[i]
- vrshr.s32 q2, q2, #12 // AA[i]
- vsub.i16 q0, q13, q0 // 256 - x
-
- vst1.32 {q1, q2}, [r0, :128]!
- vst1.16 {q0}, [r1, :128]!
- bgt 1b
-
- subs r3, r3, #1
- ble 0f
- add r0, r0, r12, lsl #2
- add r1, r1, r12, lsl #1
- mov r2, r4
- b 1b
-0:
- vpop {q4-q7}
- pop {r4-r5,pc}
-endfunc
-
-#define FILTER_OUT_STRIDE 384
-
-// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter1_8bpc_neon, export=1
- push {r4-r11,lr}
- vpush {q4-q7}
- ldrd r4, r5, [sp, #100]
- ldr r6, [sp, #108]
- sub r7, r3, #(4*SUM_STRIDE)
- add r8, r3, #(4*SUM_STRIDE)
- sub r9, r4, #(2*SUM_STRIDE)
- add r10, r4, #(2*SUM_STRIDE)
- mov r11, #SUM_STRIDE
- mov r12, #FILTER_OUT_STRIDE
- add lr, r5, #3
- bic lr, lr, #3 // Aligned width
- sub r2, r2, lr
- sub r12, r12, lr
- sub r11, r11, lr
- sub r11, r11, #4 // We read 4 extra elements from both a and b
- mov lr, r5
- vmov.i16 q14, #3
- vmov.i32 q15, #3
-1:
- vld1.16 {q0}, [r9]!
- vld1.16 {q1}, [r4]!
- vld1.16 {q2}, [r10]!
- vld1.32 {q8, q9}, [r7]!
- vld1.32 {q10, q11}, [r3]!
- vld1.32 {q12, q13}, [r8]!
-
-2:
- subs r5, r5, #4
- vext.8 d6, d0, d1, #2 // -stride
- vext.8 d7, d2, d3, #2 // 0
- vext.8 d8, d4, d5, #2 // +stride
- vext.8 d9, d0, d1, #4 // +1-stride
- vext.8 d10, d2, d3, #4 // +1
- vext.8 d11, d4, d5, #4 // +1+stride
- vadd.i16 d2, d2, d6 // -1, -stride
- vadd.i16 d7, d7, d8 // 0, +stride
- vadd.i16 d0, d0, d9 // -1-stride, +1-stride
- vadd.i16 d2, d2, d7
- vadd.i16 d4, d4, d11 // -1+stride, +1+stride
- vadd.i16 d2, d2, d10 // +1
- vadd.i16 d0, d0, d4
-
- vext.8 q3, q8, q9, #4 // -stride
- vshl.i16 d2, d2, #2
- vext.8 q4, q8, q9, #8 // +1-stride
- vext.8 q5, q10, q11, #4 // 0
- vext.8 q6, q10, q11, #8 // +1
- vmla.i16 d2, d0, d28 // * 3 -> a
- vadd.i32 q3, q3, q10 // -stride, -1
- vadd.i32 q8, q8, q4 // -1-stride, +1-stride
- vadd.i32 q5, q5, q6 // 0, +1
- vadd.i32 q8, q8, q12 // -1+stride
- vadd.i32 q3, q3, q5
- vext.8 q7, q12, q13, #4 // +stride
- vext.8 q10, q12, q13, #8 // +1+stride
- vld1.32 {d24[0]}, [r1]! // src
- vadd.i32 q3, q3, q7 // +stride
- vadd.i32 q8, q8, q10 // +1+stride
- vshl.i32 q3, q3, #2
- vmla.i32 q3, q8, q15 // * 3 -> b
- vmovl.u8 q12, d24 // src
- vmov d0, d1
- vmlal.u16 q3, d2, d24 // b + a * src
- vmov d2, d3
- vrshrn.i32 d6, q3, #9
- vmov d4, d5
- vst1.16 {d6}, [r0]!
-
- ble 3f
- vmov q8, q9
- vmov q10, q11
- vmov q12, q13
- vld1.16 {d1}, [r9]!
- vld1.16 {d3}, [r4]!
- vld1.16 {d5}, [r10]!
- vld1.32 {q9}, [r7]!
- vld1.32 {q11}, [r3]!
- vld1.32 {q13}, [r8]!
- b 2b
-
-3:
- subs r6, r6, #1
- ble 0f
- mov r5, lr
- add r0, r0, r12, lsl #1
- add r1, r1, r2
- add r3, r3, r11, lsl #2
- add r7, r7, r11, lsl #2
- add r8, r8, r11, lsl #2
- add r4, r4, r11, lsl #1
- add r9, r9, r11, lsl #1
- add r10, r10, r11, lsl #1
- b 1b
-0:
- vpop {q4-q7}
- pop {r4-r11,pc}
-endfunc
-
-// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter2_8bpc_neon, export=1
- push {r4-r11,lr}
- vpush {q4-q7}
- ldrd r4, r5, [sp, #100]
- ldr r6, [sp, #108]
- add r7, r3, #(4*(SUM_STRIDE))
- sub r3, r3, #(4*(SUM_STRIDE))
- add r8, r4, #(2*(SUM_STRIDE))
- sub r4, r4, #(2*(SUM_STRIDE))
- mov r9, #(2*SUM_STRIDE)
- mov r10, #FILTER_OUT_STRIDE
- add r11, r5, #7
- bic r11, r11, #7 // Aligned width
- sub r2, r2, r11
- sub r10, r10, r11
- sub r9, r9, r11
- sub r9, r9, #4 // We read 4 extra elements from a
- sub r12, r9, #4 // We read 8 extra elements from b
- mov lr, r5
-
-1:
- vld1.16 {q0, q1}, [r4]!
- vld1.16 {q2, q3}, [r8]!
- vld1.32 {q8, q9}, [r3]!
- vld1.32 {q11, q12}, [r7]!
- vld1.32 {q10}, [r3]!
- vld1.32 {q13}, [r7]!
-
-2:
- vmov.i16 q14, #5
- vmov.i16 q15, #6
- subs r5, r5, #8
- vext.8 q4, q0, q1, #4 // +1-stride
- vext.8 q5, q2, q3, #4 // +1+stride
- vext.8 q6, q0, q1, #2 // -stride
- vext.8 q7, q2, q3, #2 // +stride
- vadd.i16 q0, q0, q4 // -1-stride, +1-stride
- vadd.i16 q5, q2, q5 // -1+stride, +1+stride
- vadd.i16 q2, q6, q7 // -stride, +stride
- vadd.i16 q0, q0, q5
-
- vext.8 q4, q8, q9, #8 // +1-stride
- vext.8 q5, q9, q10, #8
- vext.8 q6, q11, q12, #8 // +1+stride
- vext.8 q7, q12, q13, #8
- vmul.i16 q0, q0, q14 // * 5
- vmla.i16 q0, q2, q15 // * 6
- vadd.i32 q4, q4, q8 // -1-stride, +1-stride
- vadd.i32 q5, q5, q9
- vadd.i32 q6, q6, q11 // -1+stride, +1+stride
- vadd.i32 q7, q7, q12
- vadd.i32 q4, q4, q6
- vadd.i32 q5, q5, q7
- vext.8 q6, q8, q9, #4 // -stride
- vext.8 q7, q9, q10, #4
- vext.8 q8, q11, q12, #4 // +stride
- vext.8 q11, q12, q13, #4
-
- vld1.8 {d4}, [r1]!
-
- vmov.i32 q14, #5
- vmov.i32 q15, #6
-
- vadd.i32 q6, q6, q8 // -stride, +stride
- vadd.i32 q7, q7, q11
- vmul.i32 q4, q4, q14 // * 5
- vmla.i32 q4, q6, q15 // * 6
- vmul.i32 q5, q5, q14 // * 5
- vmla.i32 q5, q7, q15 // * 6
-
- vmovl.u8 q2, d4
- vmlal.u16 q4, d0, d4 // b + a * src
- vmlal.u16 q5, d1, d5 // b + a * src
- vmov q0, q1
- vrshrn.i32 d8, q4, #9
- vrshrn.i32 d9, q5, #9
- vmov q2, q3
- vst1.16 {q4}, [r0]!
-
- ble 3f
- vmov q8, q10
- vmov q11, q13
- vld1.16 {q1}, [r4]!
- vld1.16 {q3}, [r8]!
- vld1.32 {q9, q10}, [r3]!
- vld1.32 {q12, q13}, [r7]!
- b 2b
-
-3:
- subs r6, r6, #1
- ble 0f
- mov r5, lr
- add r0, r0, r10, lsl #1
- add r1, r1, r2
- add r3, r3, r9, lsl #2
- add r7, r7, r9, lsl #2
- add r4, r4, r12, lsl #1
- add r8, r8, r12, lsl #1
-
- vld1.32 {q8, q9}, [r3]!
- vld1.16 {q0, q1}, [r4]!
- vld1.32 {q10}, [r3]!
-
- vmov.i16 q12, #5
- vmov.i16 q13, #6
-
-4:
- subs r5, r5, #8
- vext.8 q3, q0, q1, #4 // +1
- vext.8 q2, q0, q1, #2 // 0
- vadd.i16 q0, q0, q3 // -1, +1
-
- vext.8 q4, q8, q9, #4 // 0
- vext.8 q5, q9, q10, #4
- vext.8 q6, q8, q9, #8 // +1
- vext.8 q7, q9, q10, #8
- vmul.i16 q2, q2, q13 // * 6
- vmla.i16 q2, q0, q12 // * 5 -> a
- vld1.8 {d22}, [r1]!
- vadd.i32 q8, q8, q6 // -1, +1
- vadd.i32 q9, q9, q7
- vmovl.u8 q11, d22
- vmul.i32 q4, q4, q15 // * 6
- vmla.i32 q4, q8, q14 // * 5 -> b
- vmul.i32 q5, q5, q15 // * 6
- vmla.i32 q5, q9, q14 // * 5 -> b
-
- vmlal.u16 q4, d4, d22 // b + a * src
- vmlal.u16 q5, d5, d23
- vmov q0, q1
- vrshrn.i32 d8, q4, #8
- vrshrn.i32 d9, q5, #8
- vmov q8, q10
- vst1.16 {q4}, [r0]!
-
- ble 5f
- vld1.16 {q1}, [r4]!
- vld1.32 {q9, q10}, [r3]!
- b 4b
-
-5:
- subs r6, r6, #1
- ble 0f
- mov r5, lr
- sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
- sub r4, r4, r11, lsl #1
- add r0, r0, r10, lsl #1
- add r1, r1, r2
- sub r3, r3, #16
- sub r4, r4, #16
- b 1b
-0:
- vpop {q4-q7}
- pop {r4-r11,pc}
-endfunc
-
-// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int w, const int h,
-// const int wt);
-function sgr_weighted1_8bpc_neon, export=1
- push {r4-r9,lr}
- ldrd r4, r5, [sp, #28]
- ldrd r6, r7, [sp, #36]
- ldr r8, [sp, #44]
- vdup.16 d31, r7
- cmp r6, #2
- add r9, r0, r1
- add r12, r2, r3
- add lr, r4, #2*FILTER_OUT_STRIDE
- mov r7, #(4*FILTER_OUT_STRIDE)
- lsl r1, r1, #1
- lsl r3, r3, #1
- add r8, r5, #7
- bic r8, r8, #7 // Aligned width
- sub r1, r1, r8
- sub r3, r3, r8
- sub r7, r7, r8, lsl #1
- mov r8, r5
- blt 2f
-1:
- vld1.8 {d0}, [r2]!
- vld1.8 {d16}, [r12]!
- vld1.16 {q1}, [r4]!
- vld1.16 {q9}, [lr]!
- subs r5, r5, #8
- vshll.u8 q0, d0, #4 // u
- vshll.u8 q8, d16, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vsub.i16 q9, q9, q8 // t1 - u
- vshll.u16 q2, d0, #7 // u << 7
- vshll.u16 q3, d1, #7 // u << 7
- vshll.u16 q10, d16, #7 // u << 7
- vshll.u16 q11, d17, #7 // u << 7
- vmlal.s16 q2, d2, d31 // v
- vmlal.s16 q3, d3, d31 // v
- vmlal.s16 q10, d18, d31 // v
- vmlal.s16 q11, d19, d31 // v
- vrshrn.i32 d4, q2, #11
- vrshrn.i32 d5, q3, #11
- vrshrn.i32 d20, q10, #11
- vrshrn.i32 d21, q11, #11
- vqmovun.s16 d4, q2
- vqmovun.s16 d20, q10
- vst1.8 {d4}, [r0]!
- vst1.8 {d20}, [r9]!
- bgt 1b
-
- sub r6, r6, #2
- cmp r6, #1
- blt 0f
- mov r5, r8
- add r0, r0, r1
- add r9, r9, r1
- add r2, r2, r3
- add r12, r12, r3
- add r4, r4, r7
- add lr, lr, r7
- beq 2f
- b 1b
-
-2:
- vld1.8 {d0}, [r2]!
- vld1.16 {q1}, [r4]!
- subs r5, r5, #8
- vshll.u8 q0, d0, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vshll.u16 q2, d0, #7 // u << 7
- vshll.u16 q3, d1, #7 // u << 7
- vmlal.s16 q2, d2, d31 // v
- vmlal.s16 q3, d3, d31 // v
- vrshrn.i32 d4, q2, #11
- vrshrn.i32 d5, q3, #11
- vqmovun.s16 d2, q2
- vst1.8 {d2}, [r0]!
- bgt 2b
-0:
- pop {r4-r9,pc}
-endfunc
-
-// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int16_t *t2,
-// const int w, const int h,
-// const int16_t wt[2]);
-function sgr_weighted2_8bpc_neon, export=1
- push {r4-r11,lr}
- ldrd r4, r5, [sp, #36]
- ldrd r6, r7, [sp, #44]
- ldr r8, [sp, #52]
- cmp r7, #2
- add r10, r0, r1
- add r11, r2, r3
- add r12, r4, #2*FILTER_OUT_STRIDE
- add lr, r5, #2*FILTER_OUT_STRIDE
- vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
- mov r8, #4*FILTER_OUT_STRIDE
- lsl r1, r1, #1
- lsl r3, r3, #1
- add r9, r6, #7
- bic r9, r9, #7 // Aligned width
- sub r1, r1, r9
- sub r3, r3, r9
- sub r8, r8, r9, lsl #1
- mov r9, r6
- blt 2f
-1:
- vld1.8 {d0}, [r2]!
- vld1.8 {d16}, [r11]!
- vld1.16 {q1}, [r4]!
- vld1.16 {q9}, [r12]!
- vld1.16 {q2}, [r5]!
- vld1.16 {q10}, [lr]!
- subs r6, r6, #8
- vshll.u8 q0, d0, #4 // u
- vshll.u8 q8, d16, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vsub.i16 q2, q2, q0 // t2 - u
- vsub.i16 q9, q9, q8 // t1 - u
- vsub.i16 q10, q10, q8 // t2 - u
- vshll.u16 q3, d0, #7 // u << 7
- vshll.u16 q0, d1, #7 // u << 7
- vshll.u16 q11, d16, #7 // u << 7
- vshll.u16 q8, d17, #7 // u << 7
- vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
- vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
- vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
- vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
- vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
- vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
- vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
- vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
- vrshrn.i32 d6, q3, #11
- vrshrn.i32 d7, q0, #11
- vrshrn.i32 d22, q11, #11
- vrshrn.i32 d23, q8, #11
- vqmovun.s16 d6, q3
- vqmovun.s16 d22, q11
- vst1.8 {d6}, [r0]!
- vst1.8 {d22}, [r10]!
- bgt 1b
-
- subs r7, r7, #2
- cmp r7, #1
- blt 0f
- mov r6, r9
- add r0, r0, r1
- add r10, r10, r1
- add r2, r2, r3
- add r11, r11, r3
- add r4, r4, r8
- add r12, r12, r8
- add r5, r5, r8
- add lr, lr, r8
- beq 2f
- b 1b
-
-2:
- vld1.8 {d0}, [r2]!
- vld1.16 {q1}, [r4]!
- vld1.16 {q2}, [r5]!
- subs r6, r6, #8
- vshll.u8 q0, d0, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vsub.i16 q2, q2, q0 // t2 - u
- vshll.u16 q3, d0, #7 // u << 7
- vshll.u16 q0, d1, #7 // u << 7
- vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
- vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
- vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
- vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
- vrshrn.i32 d6, q3, #11
- vrshrn.i32 d7, q0, #11
- vqmovun.s16 d6, q3
- vst1.8 {d6}, [r0]!
- bgt 1b
-0:
- pop {r4-r11,pc}
-endfunc
+sgr_funcs 8
--- /dev/null
+++ b/src/arm/32/looprestoration16.S
@@ -1,0 +1,1270 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ ldr r8, [sp, #116] // bitdepth_max
+ vld1.16 {q0}, [r4, :128]
+ clz r8, r8
+ vmov.i32 q14, #1
+ sub r9, r8, #38 // -(bitdepth + 6)
+ sub r8, r8, #25 // -round_bits_h
+ neg r9, r9 // bitdepth + 6
+ vdup.32 q1, r9
+ vdup.32 q13, r8 // -round_bits_h
+ vmov.i16 q15, #8192
+ vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
+ mov r8, r5
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7
+ lsl r10, r10, #1
+
+ // Clear the last unused element of q0, to allow filtering a single
+ // pixel with one plain vmul+vpadd.
+ mov r12, #0
+ vmov.16 d1[3], r12
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10
+ lsl r10, r10, #1
+ add lr, r2, r3
+ lsl r3, r3, #1
+
+ // Subtract the width from mid_stride
+ sub r10, r10, r5, lsl #1
+
+        // For w >= 8, we read ((w+5)&~7)+8 pixels; for w < 8 we read 16 pixels.
+ cmp r5, #8
+ add r11, r5, #13
+ bic r11, r11, #7
+ bge 1f
+ mov r11, #16
+1:
+ sub r3, r3, r11, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #6
+ sub lr, lr, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #6
+
+
+1: // Loop vertically
+ vld1.16 {q2, q3}, [r2]!
+ vld1.16 {q4, q5}, [lr]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d3}, [r1]!
+ // Move r2/lr back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vld1.16 {d13}, [r1]!
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+ // and shift q2/q3 to have 3x the first pixel at the front.
+ vdup.16 q1, d4[0]
+ vdup.16 q6, d8[0]
+ // Move r2 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14
+ lsl r9, r9, #1
+ ldrh r11, [r2, r9]
+ ldrh r9, [lr, r9]
+ // Fill q11/q12 with the right padding pixel
+ vdup.16 q11, r11
+ vdup.16 q12, r9
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+ cmp r5, #7
+ bge 5f // If w >= 7, we can filter 4 pixels
+ b 6f
+
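+        // Informal sketch of one output of the horizontal filter below,
+        // pieced together from the register comments above:
+        //   sum = (1 << (bitdepth + 6)) + fh[0]*px[x] + ... + fh[6]*px[x+6]
+        //   mid = iclip((sum + (1 << (round_bits_h - 1))) >> round_bits_h,
+        //               0, 0x7fff) - 8192
+        // q14 holds 1 << (bitdepth + 6), q13 holds -round_bits_h for the
+        // rounding vrshl, and q15 holds the 8192 offset subtracted at the end.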
+4: // Loop horizontally
+ vext.8 q8, q2, q3, #2
+ vext.8 q9, q2, q3, #4
+ vext.8 q10, q2, q3, #6
+ vmull.s16 q6, d4, d0[0]
+ vmlal.s16 q6, d16, d0[1]
+ vmlal.s16 q6, d18, d0[2]
+ vmlal.s16 q6, d20, d0[3]
+ vmull.s16 q7, d5, d0[0]
+ vmlal.s16 q7, d17, d0[1]
+ vmlal.s16 q7, d19, d0[2]
+ vmlal.s16 q7, d21, d0[3]
+ vext.8 q8, q2, q3, #8
+ vext.8 q9, q2, q3, #10
+ vext.8 q10, q2, q3, #12
+ vmlal.s16 q6, d16, d1[0]
+ vmlal.s16 q6, d18, d1[1]
+ vmlal.s16 q6, d20, d1[2]
+ vmlal.s16 q7, d17, d1[0]
+ vmlal.s16 q7, d19, d1[1]
+ vmlal.s16 q7, d21, d1[2]
+ vext.8 q2, q4, q5, #2
+ vext.8 q10, q4, q5, #6
+ vmull.s16 q8, d8, d0[0]
+ vmlal.s16 q8, d4, d0[1]
+ vmlal.s16 q8, d20, d0[3]
+ vmull.s16 q9, d9, d0[0]
+ vmlal.s16 q9, d5, d0[1]
+ vmlal.s16 q9, d21, d0[3]
+ vext.8 q2, q4, q5, #4
+ vext.8 q10, q4, q5, #8
+ vmlal.s16 q8, d4, d0[2]
+ vmlal.s16 q8, d20, d1[0]
+ vmlal.s16 q9, d5, d0[2]
+ vmlal.s16 q9, d21, d1[0]
+ vext.8 q2, q4, q5, #10
+ vext.8 q10, q4, q5, #12
+ vmlal.s16 q8, d4, d1[1]
+ vmlal.s16 q8, d20, d1[2]
+ vmlal.s16 q9, d5, d1[1]
+ vmlal.s16 q9, d21, d1[2]
+
+ vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q14
+ vadd.i32 q8, q8, q14
+ vadd.i32 q9, q9, q14
+ vrshl.s32 q6, q6, q13
+ vrshl.s32 q7, q7, q13
+ vrshl.s32 q8, q8, q13
+ vrshl.s32 q9, q9, q13
+ vqmovun.s32 d12, q6
+ vqmovun.s32 d13, q7
+ vqmovun.s32 d14, q8
+ vqmovun.s32 d15, q9
+ vmin.u16 q6, q6, q10
+ vmin.u16 q7, q7, q10
+ vsub.i16 q6, q6, q15
+ vsub.i16 q7, q7, q15
+ vst1.16 {q6}, [r0, :128]!
+ vst1.16 {q7}, [r12, :128]!
+
+ subs r5, r5, #8
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q2, q3
+ vmov q4, q5
+ vld1.16 {q3}, [r2]!
+ vld1.16 {q5}, [lr]!
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Filter 4 pixels, 7 <= w < 11
+.macro filter_4
+ vext.8 d18, d4, d5, #6
+ vext.8 d16, d4, d5, #2
+ vext.8 d17, d4, d5, #4
+ vext.8 d19, d5, d6, #2
+ vext.8 d20, d5, d6, #4
+ vmull.s16 q6, d4, d0[0]
+ vmlal.s16 q6, d16, d0[1]
+ vmlal.s16 q6, d17, d0[2]
+ vmlal.s16 q6, d18, d0[3]
+ vmlal.s16 q6, d5, d1[0]
+ vmlal.s16 q6, d19, d1[1]
+ vmlal.s16 q6, d20, d1[2]
+
+ vext.8 d18, d8, d9, #6
+ vext.8 d16, d8, d9, #2
+ vext.8 d17, d8, d9, #4
+ vext.8 d19, d9, d10, #2
+ vext.8 d20, d9, d10, #4
+ vmull.s16 q7, d8, d0[0]
+ vmlal.s16 q7, d16, d0[1]
+ vmlal.s16 q7, d17, d0[2]
+ vmlal.s16 q7, d18, d0[3]
+ vmlal.s16 q7, d9, d1[0]
+ vmlal.s16 q7, d19, d1[1]
+ vmlal.s16 q7, d20, d1[2]
+
+ vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q14
+ vrshl.s32 q6, q6, q13
+ vrshl.s32 q7, q7, q13
+ vqmovun.s32 d12, q6
+ vqmovun.s32 d13, q7
+ vmin.u16 q6, q6, q10
+ vsub.i16 q6, q6, q15
+.endm
+ filter_4
+ vst1.16 {d12}, [r0, :64]!
+ vst1.16 {d13}, [r12, :64]!
+
+ subs r5, r5, #4 // 3 <= w < 7
+ vext.8 q2, q2, q3, #8
+ vext.8 q3, q3, q3, #8
+ vext.8 q4, q4, q5, #8
+ vext.8 q5, q5, q5, #8
+
+6: // Pad the right edge and filter the last few pixels.
+ // w < 7, w+3 pixels valid in q2-q3
+ cmp r5, #5
+ blt 7f
+ bgt 8f
+ // w == 5, 8 pixels valid in q2, q3 invalid
+ vmov q3, q11
+ vmov q5, q12
+ b 88f
+
+7: // 1 <= w < 5, 4-7 pixels valid in q2
+ sub r9, r5, #1
+ // r9 = (pixels valid - 4)
+ adr r11, L(variable_shift_tbl)
+ ldr r9, [r11, r9, lsl #2]
+ add r11, r11, r9
+ vmov q3, q11
+ vmov q5, q12
+ bx r11
+
+ .align 2
+L(variable_shift_tbl):
+ .word 44f - L(variable_shift_tbl) + CONFIG_THUMB
+ .word 55f - L(variable_shift_tbl) + CONFIG_THUMB
+ .word 66f - L(variable_shift_tbl) + CONFIG_THUMB
+ .word 77f - L(variable_shift_tbl) + CONFIG_THUMB
+
+44: // 4 pixels valid in q2/q4, fill the high half with padding.
+ vmov d5, d6
+ vmov d9, d10
+ b 88f
+ // Shift q2 right, shifting out invalid pixels,
+ // shift q2 left to the original offset, shifting in padding pixels.
+55: // 5 pixels valid
+ vext.8 q2, q2, q2, #10
+ vext.8 q2, q2, q3, #6
+ vext.8 q4, q4, q4, #10
+ vext.8 q4, q4, q5, #6
+ b 88f
+66: // 6 pixels valid
+ vext.8 q2, q2, q2, #12
+ vext.8 q2, q2, q3, #4
+ vext.8 q4, q4, q4, #12
+ vext.8 q4, q4, q5, #4
+ b 88f
+77: // 7 pixels valid
+ vext.8 q2, q2, q2, #14
+ vext.8 q2, q2, q3, #2
+ vext.8 q4, q4, q4, #14
+ vext.8 q4, q4, q5, #2
+ b 88f
+
+8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
+ vext.8 q3, q3, q3, #2
+ vext.8 q3, q3, q11, #14
+ vext.8 q5, q5, q5, #2
+ vext.8 q5, q5, q12, #14
+
+88:
+ // w < 7, q2-q3 padded properly
+ cmp r5, #4
+ blt 888f
+
+ // w >= 4, filter 4 pixels
+ filter_4
+ vst1.16 {d12}, [r0, :64]!
+ vst1.16 {d13}, [r12, :64]!
+ subs r5, r5, #4 // 0 <= w < 4
+ vext.8 q2, q2, q3, #8
+ vext.8 q4, q4, q5, #8
+ beq 9f
+888: // 1 <= w < 4, filter 1 pixel at a time
+ vmull.s16 q6, d4, d0
+ vmull.s16 q7, d5, d1
+ vmull.s16 q8, d8, d0
+ vmull.s16 q9, d9, d1
+ vadd.i32 q6, q7
+ vadd.i32 q8, q9
+ vpadd.i32 d12, d12, d13
+ vpadd.i32 d13, d16, d17
+ vpadd.i32 d12, d12, d13
+ vadd.i32 d12, d12, d28
+ vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1
+ vrshl.s32 d12, d12, d26
+ vqmovun.s32 d12, q6
+ vmin.u16 d12, d12, d20
+ vsub.i16 d12, d12, d30
+ vst1.16 {d12[0]}, [r0, :16]!
+ vst1.16 {d12[1]}, [r12, :16]!
+ subs r5, r5, #1
+ vext.8 q2, q2, q3, #2
+ vext.8 q4, q4, q5, #2
+ bgt 888b
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.purgem filter_4
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q5}
+ ldrd r4, r5, [sp, #52]
+ ldrd r6, r7, [sp, #60]
+ ldr lr, [sp, #68] // bitdepth_max
+ vld1.16 {q0}, [r5, :128]
+ vdup.16 q5, lr
+ clz lr, lr
+ sub lr, lr, #11 // round_bits_v
+ vdup.32 q4, lr
+ mov lr, r4
+ vneg.s32 q4, q4 // -round_bits_v
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8
+ vld1.16 {q11}, [r2, :128], r7
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d20, d0[2]
+ vmlal.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d24, d1[0]
+ vmlal.s16 q2, d26, d1[1]
+ vmlal.s16 q2, d28, d1[2]
+ vmull.s16 q3, d17, d0[0]
+ vmlal.s16 q3, d19, d0[1]
+ vmlal.s16 q3, d21, d0[2]
+ vmlal.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d25, d1[0]
+ vmlal.s16 q3, d27, d1[1]
+ vmlal.s16 q3, d29, d1[2]
+ vrshl.s32 q2, q2, q4 // round_bits_v
+ vrshl.s32 q3, q3, q4
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q5 // bitdepth_max
+ vst1.16 {q2}, [r0], r1
+.if \compare
+ cmp r4, #4
+.else
+ ble 9f
+.endif
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
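+        // Informal sketch of the filter macro above (from its register use):
+        // each call produces 8 output pixels of one row,
+        //   sum = fv[0]*mid[0] + ... + fv[6]*mid[6]  (7 vertically adjacent
+        //                                             mid rows, q8-q14)
+        //   dst = iclip((sum + (1 << (round_bits_v - 1))) >> round_bits_v,
+        //               0, bitdepth_max)
+        // with q4 = -round_bits_v for the rounding vrshl and q5 = bitdepth_max.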
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+        vmov            q14, q13
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-q15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0
+ mls r2, r7, r12, r2
+ add r0, r0, #16
+ add r2, r2, #16
+ mov r4, lr
+ b 1b
+
+0:
+ vpop {q4-q5}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const pixel *src, int w, int h);
+function copy_narrow_16bpc_neon, export=1
+ push {r4,lr}
+ ldr r4, [sp, #8]
+ adr r12, L(copy_narrow_tbl)
+ ldr r3, [r12, r3, lsl #2]
+ add r12, r12, r3
+ bx r12
+
+ .align 2
+L(copy_narrow_tbl):
+ .word 0
+ .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
+ .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
+ .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
+ .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
+ .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
+ .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
+ .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
+
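+        // The handlers below copy w = 1..7 pixels per row (the table above
+        // dispatches on w). Widths 1, 2 and 4 batch several rows per vector
+        // load; widths 3, 5, 6 and 7 copy each row with partial/scalar
+        // accesses. (Informal summary of the code that follows.)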
+10:
+ add r3, r0, r1
+ lsl r1, r1, #1
+18:
+ subs r4, r4, #8
+ blt 110f
+ vld1.16 {q0}, [r2, :128]!
+ vst1.16 {d0[0]}, [r0, :16], r1
+ vst1.16 {d0[1]}, [r3, :16], r1
+ vst1.16 {d0[2]}, [r0, :16], r1
+ vst1.16 {d0[3]}, [r3, :16], r1
+ vst1.16 {d1[0]}, [r0, :16], r1
+ vst1.16 {d1[1]}, [r3, :16], r1
+ vst1.16 {d1[2]}, [r0, :16], r1
+ vst1.16 {d1[3]}, [r3, :16], r1
+ ble 0f
+ b 18b
+110:
+ add r4, r4, #8
+ asr r1, r1, #1
+11:
+ subs r4, r4, #1
+ vld1.16 {d0[]}, [r2]!
+ vst1.16 {d0[0]}, [r0], r1
+ bgt 11b
+0:
+ pop {r4,pc}
+
+20:
+ add r3, r0, r1
+ lsl r1, r1, #1
+24:
+ subs r4, r4, #4
+ blt 210f
+ vld1.32 {q0}, [r2, :128]!
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[1]}, [r3, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r3, :32], r1
+ ble 0f
+ b 24b
+210:
+ add r4, r4, #4
+ asr r1, r1, #1
+22:
+ subs r4, r4, #1
+ vld1.32 {d0[]}, [r2, :32]!
+ vst1.32 {d0[0]}, [r0, :32], r1
+ bgt 22b
+0:
+ pop {r4,pc}
+
+30:
+ ldr r3, [r2]
+ ldrh r12, [r2, #4]
+ add r2, r2, #6
+ subs r4, r4, #1
+ str r3, [r0]
+ strh r12, [r0, #4]
+ add r0, r0, r1
+ bgt 30b
+ pop {r4,pc}
+
+40:
+ add r3, r0, r1
+ lsl r1, r1, #1
+42:
+ subs r4, r4, #2
+ blt 41f
+ vld1.16 {q0}, [r2, :128]!
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r3, :64], r1
+ ble 0f
+ b 42b
+41:
+ vld1.16 {d0}, [r2, :64]
+ vst1.16 {d0}, [r0, :64]
+0:
+ pop {r4,pc}
+
+50:
+ vld1.16 {d0}, [r2]
+ ldrh r12, [r2, #8]
+ add r2, r2, #10
+ subs r4, r4, #1
+ vst1.16 {d0}, [r0]
+ strh r12, [r0, #8]
+ add r0, r0, r1
+ bgt 50b
+ pop {r4,pc}
+
+60:
+ vld1.16 {d0}, [r2]
+ ldr r12, [r2, #8]
+ add r2, r2, #12
+ subs r4, r4, #1
+ vst1.16 {d0}, [r0]
+ str r12, [r0, #8]
+ add r0, r0, r1
+ bgt 60b
+ pop {r4,pc}
+
+70:
+ vld1.16 {d0}, [r2]
+ ldr r12, [r2, #8]
+ ldrh lr, [r2, #12]
+ add r2, r2, #14
+ subs r4, r4, #1
+ vst1.16 {d0}, [r0]
+ str r12, [r0, #8]
+ strh lr, [r0, #12]
+ add r0, r0, r1
+ bgt 70b
+ pop {r4,pc}
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 0f
+ // !LR_HAVE_RIGHT
+ add lr, r5, #3
+ bic lr, lr, #3
+ b 1f
+0:
+ add lr, r5, #7
+ bic lr, lr, #7
+1:
+ sub r9, r9, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, r5, #14
+ bic lr, lr, #7
+ sub r4, r4, lr, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #4
+ sub r12, r12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #4
+
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]!
+ // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vld1.16 {d13}, [r2]!
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+        // and shift q0 to have 2x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+ cmp r5, #6
+ bge 5f // If w >= 6, we can filter 4 pixels
+ b 6f
+
+4: // Loop horizontally
+.macro add3 w
+.if \w > 4
+ vext.8 q8, q0, q1, #2
+ vext.8 q10, q4, q5, #2
+ vext.8 q9, q0, q1, #4
+ vext.8 q11, q4, q5, #4
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+.else
+ vext.8 d16, d0, d1, #2
+ vext.8 d20, d8, d9, #2
+ vext.8 d18, d0, d1, #4
+ vext.8 d22, d8, d9, #4
+ vadd.i16 d4, d0, d16
+ vadd.i16 d6, d8, d20
+ vadd.i16 d4, d4, d18
+ vadd.i16 d6, d6, d22
+.endif
+
+ vmull.u16 q6, d0, d0
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+.if \w > 4
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+.endif
+.endm
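+        // Informal sketch of add3 (from the code above): for each lane x of
+        // the two input rows it forms 3-tap horizontal box sums,
+        //   sum[x]   = s[x] + s[x+1] + s[x+2]        (q2 / q3)
+        //   sumsq[x] = s[x]^2 + s[x+1]^2 + s[x+2]^2  (q6-q7 / q12-q13)
+        // with \w selecting the 8- or 4-pixel wide variant.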
+ add3 8
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ subs r5, r5, #8
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Produce 4 pixels, 6 <= w < 10
+ add3 4
+ vst1.16 {d4}, [r1, :64]!
+ vst1.16 {d6}, [r11, :64]!
+ vst1.32 {q6}, [r0, :128]!
+ vst1.32 {q12}, [r10, :128]!
+
+ subs r5, r5, #4 // 2 <= w < 6
+ vext.8 q0, q0, q1, #8
+ vext.8 q4, q4, q5, #8
+
+6: // Pad the right edge and produce the last few pixels.
+ // 2 <= w < 6, 2-5 pixels valid in q0
+ sub lr, r5, #2
+ // lr = (pixels valid - 2)
+ adr r11, L(box3_variable_shift_tbl)
+ ldr lr, [r11, lr, lsl #2]
+ add r11, r11, lr
+ bx r11
+
+ .align 2
+L(box3_variable_shift_tbl):
+ .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+ .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+ .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+ .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+
+ // Shift q0 right, shifting out invalid pixels,
+ // shift q0 left to the original offset, shifting in padding pixels.
+22: // 2 pixels valid
+ vext.8 q0, q0, q0, #4
+ vext.8 q4, q4, q4, #4
+ vext.8 q0, q0, q14, #12
+ vext.8 q4, q4, q15, #12
+ b 88f
+33: // 3 pixels valid
+ vext.8 q0, q0, q0, #6
+ vext.8 q4, q4, q4, #6
+ vext.8 q0, q0, q14, #10
+ vext.8 q4, q4, q15, #10
+ b 88f
+44: // 4 pixels valid
+ vmov d1, d28
+ vmov d9, d30
+ b 88f
+55: // 5 pixels valid
+ vext.8 q0, q0, q0, #10
+ vext.8 q4, q4, q4, #10
+ vext.8 q0, q0, q14, #6
+ vext.8 q4, q4, q15, #6
+
+88:
+ // Restore r11 after using it for a temporary value above
+ add r11, r1, #(2*SUM_STRIDE)
+
+ add3 4
+ subs r5, r5, #4
+ vst1.16 {d4}, [r1, :64]!
+ vst1.16 {d6}, [r11, :64]!
+ vst1.32 {q6}, [r0, :128]!
+ vst1.32 {q12}, [r10, :128]!
+ ble 9f
+ vext.8 q0, q0, q0, #8
+ vext.8 q4, q4, q4, #8
+ // Only one output pixel is still needed, but do a normal 4-pixel
+ // addition anyway
+ add3 4
+ vst1.16 {d4}, [r1, :64]!
+ vst1.16 {d6}, [r11, :64]!
+ vst1.32 {q6}, [r0, :128]!
+ vst1.32 {q12}, [r10, :128]!
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
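+//
+// Rough scalar model of this horizontal pass (an illustrative C sketch only;
+// the code below additionally handles left/right edge padding and processes
+// two rows per iteration, and the exact window offset relative to x depends
+// on the edge setup):
+//
+//   for (int x = 0; x < w; x++) {
+//       int s = 0, ss = 0;
+//       for (int i = 0; i < 5; i++) {
+//           const int p = src[x + i];
+//           s  += p;
+//           ss += p * p;
+//       }
+//       sum[x]   = s;   // int16_t sums of 5 pixels
+//       sumsq[x] = ss;  // int32_t sums of 5 squared pixels
+//   }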
+function sgr_box5_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ // With LR_HAVE_RIGHT, align to 8; without it, align to 4.
+ // Subtract the number of pixels read per row from the input stride.
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 0f
+ // !LR_HAVE_RIGHT
+ add lr, r5, #3
+ bic lr, lr, #3
+ add r8, r5, #13
+ b 1f
+0:
+ add lr, r5, #7
+ bic lr, lr, #7
+ add r8, r5, #15
+1:
+ sub r9, r9, lr, lsl #1
+ bic r8, r8, #7
+ sub r4, r4, r8, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge (for LR_HAVE_LEFT with left == NULL)
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #6
+ sub r12, r12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // In both of these cases we don't read the left 3 pixels from the src
+ // pointer, but adjust the stride as if we had done that.
+ add r4, r4, #6
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]!
+ // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vld1.16 {d13}, [r2]!
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 3x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3/r12 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load the pixel to pad with here,
+ // since it's easy to find from this position.
+ sub lr, r5, #(2 + 16 - 3 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge, we need to quit early here.
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+ cmp r5, #7
+ bge 5f // If w >= 7, we can produce 4 pixels
+ b 6f
+
+4: // Loop horizontally
+.macro add5 w
+.if \w > 4
+ vext.8 q8, q0, q1, #2
+ vext.8 q10, q4, q5, #2
+ vext.8 q9, q0, q1, #4
+ vext.8 q11, q4, q5, #4
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+.else
+ vext.8 d16, d0, d1, #2
+ vext.8 d20, d8, d9, #2
+ vext.8 d18, d0, d1, #4
+ vext.8 d22, d8, d9, #4
+ vadd.i16 d4, d0, d16
+ vadd.i16 d6, d8, d20
+ vadd.i16 d4, d4, d18
+ vadd.i16 d6, d6, d22
+.endif
+
+ vmull.u16 q6, d0, d0
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+.if \w > 4
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+.endif
+
+.if \w > 4
+ vext.8 q8, q0, q1, #6
+ vext.8 q10, q4, q5, #6
+ vext.8 q9, q0, q1, #8
+ vext.8 q11, q4, q5, #8
+ vadd.i16 q2, q2, q8
+ vadd.i16 q3, q3, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+.else
+ vext.8 d16, d0, d1, #6
+ // d18 would be equal to d1; using d1 instead
+ vext.8 d20, d8, d9, #6
+ // d22 would be equal to d9; using d9 instead
+ vadd.i16 d4, d4, d16
+ vadd.i16 d6, d6, d20
+ vadd.i16 d4, d4, d1
+ vadd.i16 d6, d6, d9
+.endif
+
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d1, d1
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d9, d9
+.if \w > 4
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+.endif
+.endm
+ add5 8
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ subs r5, r5, #8
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Produce 4 pixels, 7 <= w < 11
+ add5 4
+ vst1.16 {d4}, [r1, :64]!
+ vst1.16 {d6}, [r11, :64]!
+ vst1.32 {q6}, [r0, :128]!
+ vst1.32 {q12}, [r10, :128]!
+
+ subs r5, r5, #4 // 3 <= w < 7
+ vext.8 q0, q0, q1, #8
+ vext.8 q4, q4, q5, #8
+
+6: // Pad the right edge and produce the last few pixels.
+ // w < 7, w+1 pixels valid in q0/q4
+ sub lr, r5, #1
+ // lr = pixels valid - 2
+ adr r11, L(box5_variable_shift_tbl)
+ ldr lr, [r11, lr, lsl #2]
+ vmov q1, q14
+ vmov q5, q15
+ add r11, r11, lr
+ bx r11
+
+ .align 2
+L(box5_variable_shift_tbl):
+ .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+ .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+ .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+ .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+ .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+ .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+
+ // Shift q0 right, shifting out invalid pixels, then
+ // shift q0 left to the original offset, shifting in padding pixels.
+22: // 2 pixels valid
+ vext.8 q0, q0, q0, #4
+ vext.8 q4, q4, q4, #4
+ vext.8 q0, q0, q14, #12
+ vext.8 q4, q4, q15, #12
+ b 88f
+33: // 3 pixels valid
+ vext.8 q0, q0, q0, #6
+ vext.8 q4, q4, q4, #6
+ vext.8 q0, q0, q14, #10
+ vext.8 q4, q4, q15, #10
+ b 88f
+44: // 4 pixels valid
+ vmov d1, d28
+ vmov d9, d30
+ b 88f
+55: // 5 pixels valid
+ vext.8 q0, q0, q0, #10
+ vext.8 q4, q4, q4, #10
+ vext.8 q0, q0, q14, #6
+ vext.8 q4, q4, q15, #6
+ b 88f
+66: // 6 pixels valid
+ vext.8 q0, q0, q0, #12
+ vext.8 q4, q4, q4, #12
+ vext.8 q0, q0, q14, #4
+ vext.8 q4, q4, q15, #4
+ b 88f
+77: // 7 pixels valid
+ vext.8 q0, q0, q0, #14
+ vext.8 q4, q4, q4, #14
+ vext.8 q0, q0, q14, #2
+ vext.8 q4, q4, q15, #2
+
+88:
+ // Restore r11 after using it for a temporary value above
+ add r11, r1, #(2*SUM_STRIDE)
+
+ add5 4
+ subs r5, r5, #4
+ vst1.16 {d4}, [r1, :64]!
+ vst1.16 {d6}, [r11, :64]!
+ vst1.32 {q6}, [r0, :128]!
+ vst1.32 {q12}, [r10, :128]!
+ ble 9f
+ vext.8 q0, q0, q1, #8
+ vext.8 q4, q4, q5, #8
+ add5 4
+ vst1.16 {d4}, [r1, :64]!
+ vst1.16 {d6}, [r11, :64]!
+ vst1.32 {q6}, [r0, :128]!
+ vst1.32 {q12}, [r10, :128]!
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.purgem add5
+endfunc
+
+sgr_funcs 16
--- /dev/null
+++ b/src/arm/32/looprestoration_common.S
@@ -1,0 +1,453 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
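+//
+// Conceptually this sums three vertically adjacent rows of the horizontal
+// sums (illustrative C sketch; out_sum/out_sumsq and the *_above/_mid/_below
+// row names are hypothetical, while the code below works in place with a
+// sliding window of rows and handles the LR_HAVE_TOP/LR_HAVE_BOTTOM flags):
+//
+//   for (int x = 0; x < w; x++) {
+//       out_sum[x]   = sum_above[x]   + sum_mid[x]   + sum_below[x];
+//       out_sumsq[x] = sumsq_above[x] + sumsq_mid[x] + sumsq_below[x];
+//   }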
+function sgr_box3_v_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #2 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Sum all h+2 lines with the main loop
+ add lr, lr, #2
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q8-q13 and q0-q2 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q8, q9}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q1}, [r6, :128], r8
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q10, q8
+ vmov q11, q9
+ vmov q1, q0
+ vmov q12, q8
+ vmov q13, q9
+ vmov q2, q0
+
+3:
+ subs r3, r3, #1
+.macro add3
+ vadd.i32 q8, q8, q10
+ vadd.i32 q9, q9, q11
+ vadd.i16 q0, q0, q1
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i16 q0, q0, q2
+ vst1.32 {q8, q9}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ vmov q10, q12
+ vmov q11, q13
+ vmov q1, q2
+ ble 4f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3b
+
+4:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ add3
+
+5: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ pop {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
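+//
+// Same idea as the 3-row version above, but each output row adds five
+// vertically adjacent rows of sum[]/sumsq[] (illustrative sketch with a
+// hypothetical sum_rows[] window; the loop below also produces rows
+// pairwise and pads at the top/bottom edges):
+//
+//   out_sum[x] = sum_rows[0][x] + sum_rows[1][x] + sum_rows[2][x] +
+//                sum_rows[3][x] + sum_rows[4][x];   // likewise for sumsq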
+function sgr_box5_v_neon, export=1
+ push {r4-r9,lr}
+ vpush {q5-q7}
+ ldr r4, [sp, #76]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #8 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If we have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 0f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Handle h+2 lines with the main loop
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub r3, r3, #1 // Handle h-1 lines with the main loop
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q6-q15 and q0-q3,q5 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q6, q7}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vmov q10, q6
+ vmov q11, q7
+ vmov q2, q0
+ vmov q12, q6
+ vmov q13, q7
+ vmov q3, q0
+
+3:
+ cmp r3, #0
+ beq 4f
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+
+3:
+ // Start of vertical loop
+ subs r3, r3, #2
+.macro add5
+ vadd.i32 q6, q6, q8
+ vadd.i32 q7, q7, q9
+ vadd.i16 q0, q0, q1
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i16 q0, q0, q2
+ vadd.i32 q6, q6, q12
+ vadd.i32 q7, q7, q13
+ vadd.i16 q0, q0, q3
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q15
+ vadd.i16 q0, q0, q5
+ vst1.32 {q6, q7}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add5
+.macro shift2
+ vmov q6, q10
+ vmov q7, q11
+ vmov q0, q2
+ vmov q8, q12
+ vmov q9, q13
+ vmov q1, q3
+ vmov q10, q14
+ vmov q11, q15
+ vmov q2, q5
+.endm
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ ble 5f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ add5
+ b 6f
+
+5:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 6f
+ // !LR_HAVE_BOTTOM
+ cmp r3, #0
+ bne 5f
+ // Three edge rows remain (h-2, h-1, h); output the one at h-2 and
+ // the past-edge one at h.
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ // Pad the past-edge row from the last content row.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // r3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ vmov q12, q10
+ vmov q13, q11
+ vmov q3, q2
+ vmov q14, q10
+ vmov q15, q11
+ vmov q5, q2
+ add5
+ add r0, r0, r7
+ add r1, r1, r8
+ b 6f
+
+6: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ vpop {q5-q7}
+ pop {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
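+//
+// Rough scalar model of one element (illustrative sketch only; aa/bb/p/z/x
+// are hypothetical locals, rounded_shift() is a hypothetical helper meaning
+// "shift right with rounding", n is 9 for the box3 variant and 25 for box5,
+// and one_by_x is 455 resp. 164; the exact rounding of z differs slightly
+// in the code below):
+//
+//   aa = rounded_shift(a[i], 2*bitdepth_min_8);
+//   bb = rounded_shift(b[i],   bitdepth_min_8);
+//   p  = imax(aa*n - bb*bb, 0);
+//   z  = imin(rounded_shift(p*strength, 20), 255);
+//   x  = sgr_x_by_x[z];
+//   a[i] = rounded_shift(x*b[i]*one_by_x, 12);
+//   b[i] = 256 - x;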
+function sgr_calc_ab1_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84]
+ add r3, r3, #2 // h += 2
+ clz r6, r5
+ vmov.i32 q15, #9 // n
+ movw r5, #455
+ mov lr, #SUM_STRIDE
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84]
+ add r3, r3, #3 // h += 3
+ clz r6, r5
+ asr r3, r3, #1 // h /= 2
+ vmov.i32 q15, #25 // n
+ mov r5, #164
+ mov lr, #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+ movrel r12, X(sgr_x_by_x)
+ sub r6, r6, #24 // -bitdepth_min_8
+ vld1.8 {q8, q9}, [r12, :128]!
+ add r7, r6, r6 // -2*bitdepth_min_8
+ vmov.i8 q11, #5
+ vmov.i8 d10, #55 // idx of last 5
+ vld1.8 {q10}, [r12, :128]
+ vmov.i8 d11, #72 // idx of last 4
+ vmov.i8 d12, #101 // idx of last 3
+ vmov.i8 d13, #169 // idx of last 2
+ vmov.i8 d14, #254 // idx of last 1
+ vmov.i8 d15, #32 // elements consumed in first vtbl
+ add r2, r2, #2 // w += 2
+ add r12, r2, #7
+ bic r12, r12, #7 // aligned w
+ sub r12, lr, r12 // increment between rows
+ vdup.32 q12, r4
+ sub r0, r0, #(4*(SUM_STRIDE))
+ sub r1, r1, #(2*(SUM_STRIDE))
+ mov r4, r2 // backup of w
+ vsub.i8 q8, q8, q11
+ vsub.i8 q9, q9, q11
+ vsub.i8 q10, q10, q11
+1:
+ vld1.32 {q0, q1}, [r0, :128] // a
+ vld1.16 {q2}, [r1, :128] // b
+ vdup.32 q13, r7 // -2*bitdepth_min_8
+ vdup.16 q14, r6 // -bitdepth_min_8
+ subs r2, r2, #8
+ vrshl.s32 q0, q0, q13
+ vrshl.s32 q1, q1, q13
+ vrshl.s16 q4, q2, q14
+ vmul.i32 q0, q0, q15 // a * n
+ vmul.i32 q1, q1, q15 // a * n
+ vmull.u16 q3, d8, d8 // b * b
+ vmull.u16 q4, d9, d9 // b * b
+ vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
+ vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
+ vmul.i32 q0, q0, q12 // p * s
+ vmul.i32 q1, q1, q12 // p * s
+ vqshrn.u32 d0, q0, #16
+ vqshrn.u32 d1, q1, #16
+ vqrshrn.u16 d0, q0, #4 // imin(z, 255)
+
+ vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
+ vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
+ vtbl.8 d1, {q8, q9}, d0
+ vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
+ vsub.i8 d9, d0, d15 // indices for vtbx
+ vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
+ vadd.i8 d2, d2, d3
+ vtbx.8 d1, {q10}, d9
+ vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
+ vadd.i8 d6, d6, d7
+ vadd.i8 d8, d8, d22
+ vadd.i8 d2, d2, d6
+ vadd.i8 d1, d1, d8
+ vadd.i8 d1, d1, d2
+ vmovl.u8 q0, d1 // x
+
+ vmov.i16 q13, #256
+ vdup.32 q14, r5 // one_by_x
+
+ vmull.u16 q1, d0, d4 // x * BB[i]
+ vmull.u16 q2, d1, d5 // x * BB[i]
+ vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
+ vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
+ vrshr.s32 q1, q1, #12 // AA[i]
+ vrshr.s32 q2, q2, #12 // AA[i]
+ vsub.i16 q0, q13, q0 // 256 - x
+
+ vst1.32 {q1, q2}, [r0, :128]!
+ vst1.16 {q0}, [r1, :128]!
+ bgt 1b
+
+ subs r3, r3, #1
+ ble 0f
+ add r0, r0, r12, lsl #2
+ add r1, r1, r12, lsl #1
+ mov r2, r4
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r7,pc}
+endfunc
--- /dev/null
+++ b/src/arm/32/looprestoration_tmpl.S
@@ -1,0 +1,600 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
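+//
+// Rough scalar model of one output sample (illustrative sketch; wb/wa are
+// hypothetical locals holding 3x3 weighted sums of the int16_t b[] plane
+// and the int32_t a[] plane, with weight 4 on the centre cross and 3 on
+// the four corners, and bs standing for the coefficient row stride,
+// SUM_STRIDE):
+//
+//   wb = 4*(b[0] + b[-1] + b[1] + b[-bs] + b[bs]) +
+//        3*(b[-1-bs] + b[1-bs] + b[-1+bs] + b[1+bs]);
+//   wa = /* the same weighting applied to the int32_t a[] plane */;
+//   tmp[x] = (wb*src[x] + wa + (1 << 8)) >> 9;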
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ sub r7, r3, #(4*SUM_STRIDE)
+ add r8, r3, #(4*SUM_STRIDE)
+ sub r9, r4, #(2*SUM_STRIDE)
+ add r10, r4, #(2*SUM_STRIDE)
+ mov r11, #SUM_STRIDE
+ mov r12, #FILTER_OUT_STRIDE
+ add lr, r5, #3
+ bic lr, lr, #3 // Aligned width
+.if \bpc == 8
+ sub r2, r2, lr
+.else
+ sub r2, r2, lr, lsl #1
+.endif
+ sub r12, r12, lr
+ sub r11, r11, lr
+ sub r11, r11, #4 // We read 4 extra elements from both a and b
+ mov lr, r5
+ vmov.i16 q14, #3
+ vmov.i32 q15, #3
+1:
+ vld1.16 {q0}, [r9, :128]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r10, :128]!
+ vld1.32 {q8, q9}, [r7, :128]!
+ vld1.32 {q10, q11}, [r3, :128]!
+ vld1.32 {q12, q13}, [r8, :128]!
+
+2:
+ subs r5, r5, #4
+ vext.8 d6, d0, d1, #2 // -stride
+ vext.8 d7, d2, d3, #2 // 0
+ vext.8 d8, d4, d5, #2 // +stride
+ vext.8 d9, d0, d1, #4 // +1-stride
+ vext.8 d10, d2, d3, #4 // +1
+ vext.8 d11, d4, d5, #4 // +1+stride
+ vadd.i16 d2, d2, d6 // -1, -stride
+ vadd.i16 d7, d7, d8 // 0, +stride
+ vadd.i16 d0, d0, d9 // -1-stride, +1-stride
+ vadd.i16 d2, d2, d7
+ vadd.i16 d4, d4, d11 // -1+stride, +1+stride
+ vadd.i16 d2, d2, d10 // +1
+ vadd.i16 d0, d0, d4
+
+ vext.8 q3, q8, q9, #4 // -stride
+ vshl.i16 d2, d2, #2
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q10, q11, #4 // 0
+ vext.8 q6, q10, q11, #8 // +1
+ vmla.i16 d2, d0, d28 // * 3 -> a
+ vadd.i32 q3, q3, q10 // -stride, -1
+ vadd.i32 q8, q8, q4 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q6 // 0, +1
+ vadd.i32 q8, q8, q12 // -1+stride
+ vadd.i32 q3, q3, q5
+ vext.8 q7, q12, q13, #4 // +stride
+ vext.8 q10, q12, q13, #8 // +1+stride
+.if \bpc == 8
+ vld1.32 {d24[0]}, [r1, :32]! // src
+.else
+ vld1.16 {d24}, [r1, :64]! // src
+.endif
+ vadd.i32 q3, q3, q7 // +stride
+ vadd.i32 q8, q8, q10 // +1+stride
+ vshl.i32 q3, q3, #2
+ vmla.i32 q3, q8, q15 // * 3 -> b
+.if \bpc == 8
+ vmovl.u8 q12, d24 // src
+.endif
+ vmov d0, d1
+ vmlal.u16 q3, d2, d24 // b + a * src
+ vmov d2, d3
+ vrshrn.i32 d6, q3, #9
+ vmov d4, d5
+ vst1.16 {d6}, [r0]!
+
+ ble 3f
+ vmov q8, q9
+ vmov q10, q11
+ vmov q12, q13
+ vld1.16 {d1}, [r9, :64]!
+ vld1.16 {d3}, [r4, :64]!
+ vld1.16 {d5}, [r10, :64]!
+ vld1.32 {q9}, [r7, :128]!
+ vld1.32 {q11}, [r3, :128]!
+ vld1.32 {q13}, [r8, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r12, lsl #1
+ add r1, r1, r2
+ add r3, r3, r11, lsl #2
+ add r7, r7, r11, lsl #2
+ add r8, r8, r11, lsl #2
+ add r4, r4, r11, lsl #1
+ add r9, r9, r11, lsl #1
+ add r10, r10, r11, lsl #1
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
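+//
+// Rough scalar model (illustrative sketch; wb/wa and bs as in the sketch
+// for filter1, but with the 5/6 weighting used here):
+//
+//   // Rows with coefficient rows both above and below:
+//   wb = 5*(b[-1-bs] + b[1-bs] + b[-1+bs] + b[1+bs]) + 6*(b[-bs] + b[bs]);
+//   tmp[x] = (wb*src[x] + wa + (1 << 8)) >> 9;
+//   // In-between rows, using a single coefficient row:
+//   wb = 5*(b[-1] + b[1]) + 6*b[0];
+//   tmp[x] = (wb*src[x] + wa + (1 << 7)) >> 8;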
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ add r7, r3, #(4*(SUM_STRIDE))
+ sub r3, r3, #(4*(SUM_STRIDE))
+ add r8, r4, #(2*(SUM_STRIDE))
+ sub r4, r4, #(2*(SUM_STRIDE))
+ mov r9, #(2*SUM_STRIDE)
+ mov r10, #FILTER_OUT_STRIDE
+ add r11, r5, #7
+ bic r11, r11, #7 // Aligned width
+.if \bpc == 8
+ sub r2, r2, r11
+.else
+ sub r2, r2, r11, lsl #1
+.endif
+ sub r10, r10, r11
+ sub r9, r9, r11
+ sub r9, r9, #4 // We read 4 extra elements from a
+ sub r12, r9, #4 // We read 8 extra elements from b
+ mov lr, r5
+
+1:
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.16 {q2, q3}, [r8, :128]!
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.32 {q11, q12}, [r7, :128]!
+ vld1.32 {q10}, [r3, :128]!
+ vld1.32 {q13}, [r7, :128]!
+
+2:
+ vmov.i16 q14, #5
+ vmov.i16 q15, #6
+ subs r5, r5, #8
+ vext.8 q4, q0, q1, #4 // +1-stride
+ vext.8 q5, q2, q3, #4 // +1+stride
+ vext.8 q6, q0, q1, #2 // -stride
+ vext.8 q7, q2, q3, #2 // +stride
+ vadd.i16 q0, q0, q4 // -1-stride, +1-stride
+ vadd.i16 q5, q2, q5 // -1+stride, +1+stride
+ vadd.i16 q2, q6, q7 // -stride, +stride
+ vadd.i16 q0, q0, q5
+
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q9, q10, #8
+ vext.8 q6, q11, q12, #8 // +1+stride
+ vext.8 q7, q12, q13, #8
+ vmul.i16 q0, q0, q14 // * 5
+ vmla.i16 q0, q2, q15 // * 6
+ vadd.i32 q4, q4, q8 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q9
+ vadd.i32 q6, q6, q11 // -1+stride, +1+stride
+ vadd.i32 q7, q7, q12
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q7
+ vext.8 q6, q8, q9, #4 // -stride
+ vext.8 q7, q9, q10, #4
+ vext.8 q8, q11, q12, #4 // +stride
+ vext.8 q11, q12, q13, #4
+
+.if \bpc == 8
+ vld1.8 {d4}, [r1, :64]!
+.else
+ vld1.8 {q2}, [r1, :128]!
+.endif
+
+ vmov.i32 q14, #5
+ vmov.i32 q15, #6
+
+ vadd.i32 q6, q6, q8 // -stride, +stride
+ vadd.i32 q7, q7, q11
+ vmul.i32 q4, q4, q14 // * 5
+ vmla.i32 q4, q6, q15 // * 6
+ vmul.i32 q5, q5, q14 // * 5
+ vmla.i32 q5, q7, q15 // * 6
+
+.if \bpc == 8
+ vmovl.u8 q2, d4
+.endif
+ vmlal.u16 q4, d0, d4 // b + a * src
+ vmlal.u16 q5, d1, d5 // b + a * src
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #9
+ vrshrn.i32 d9, q5, #9
+ vmov q2, q3
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 3f
+ vmov q8, q10
+ vmov q11, q13
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q3}, [r8, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ vld1.32 {q12, q13}, [r7, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ add r3, r3, r9, lsl #2
+ add r7, r7, r9, lsl #2
+ add r4, r4, r12, lsl #1
+ add r8, r8, r12, lsl #1
+
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.32 {q10}, [r3, :128]!
+
+ vmov.i16 q12, #5
+ vmov.i16 q13, #6
+
+4:
+ subs r5, r5, #8
+ vext.8 q3, q0, q1, #4 // +1
+ vext.8 q2, q0, q1, #2 // 0
+ vadd.i16 q0, q0, q3 // -1, +1
+
+ vext.8 q4, q8, q9, #4 // 0
+ vext.8 q5, q9, q10, #4
+ vext.8 q6, q8, q9, #8 // +1
+ vext.8 q7, q9, q10, #8
+ vmul.i16 q2, q2, q13 // * 6
+ vmla.i16 q2, q0, q12 // * 5 -> a
+.if \bpc == 8
+ vld1.8 {d22}, [r1, :64]!
+.else
+ vld1.16 {q11}, [r1, :128]!
+.endif
+ vadd.i32 q8, q8, q6 // -1, +1
+ vadd.i32 q9, q9, q7
+.if \bpc == 8
+ vmovl.u8 q11, d22
+.endif
+ vmul.i32 q4, q4, q15 // * 6
+ vmla.i32 q4, q8, q14 // * 5 -> b
+ vmul.i32 q5, q5, q15 // * 6
+ vmla.i32 q5, q9, q14 // * 5 -> b
+
+ vmlal.u16 q4, d4, d22 // b + a * src
+ vmlal.u16 q5, d5, d23
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #8
+ vrshrn.i32 d9, q5, #8
+ vmov q8, q10
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 5f
+ vld1.16 {q1}, [r4, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ b 4b
+
+5:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
+ sub r4, r4, r11, lsl #1
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ sub r3, r3, #16
+ sub r4, r4, #16
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt, const int bitdepth_max);
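+//
+// Rough scalar model of one output pixel (illustrative sketch;
+// clip_to_pixel_range() is a hypothetical helper meaning "clamp to
+// [0, bitdepth_max]", or to [0, 255] in the 8 bpc build):
+//
+//   const int u = src[x] << 4;
+//   const int v = (u << 7) + wt*(t1[x] - u);
+//   dst[x] = clip_to_pixel_range((v + (1 << 10)) >> 11);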
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28]
+ ldrd r6, r7, [sp, #36]
+.if \bpc == 16
+ ldr r8, [sp, #44]
+.endif
+ vdup.16 d31, r7
+ cmp r6, #2
+.if \bpc == 16
+ vdup.16 q14, r8
+.endif
+ add r9, r0, r1
+ add r12, r2, r3
+ add lr, r4, #2*FILTER_OUT_STRIDE
+ mov r7, #(4*FILTER_OUT_STRIDE)
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r8, r5, #7
+ bic r8, r8, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r8
+ sub r3, r3, r8
+.else
+ sub r1, r1, r8, lsl #1
+ sub r3, r3, r8, lsl #1
+.endif
+ sub r7, r7, r8, lsl #1
+ mov r8, r5
+ blt 2f
+1:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r12, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r12, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [lr, :128]!
+ subs r5, r5, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vshll.u16 q10, d16, #7 // u << 7
+ vshll.u16 q11, d17, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+ vmlal.s16 q10, d18, d31 // v
+ vmlal.s16 q11, d19, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vrshrn.i32 d20, q10, #11
+ vrshrn.i32 d21, q11, #11
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d20, q10
+ vst1.8 {d4}, [r0]!
+ vst1.8 {d20}, [r9]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vqrshrun.s32 d20, q10, #11
+ vqrshrun.s32 d21, q11, #11
+ vmin.u16 q2, q2, q14
+ vmin.u16 q10, q10, q14
+ vst1.16 {q2}, [r0]!
+ vst1.16 {q10}, [r9]!
+.endif
+ bgt 1b
+
+ sub r6, r6, #2
+ cmp r6, #1
+ blt 0f
+ mov r5, r8
+ add r0, r0, r1
+ add r9, r9, r1
+ add r2, r2, r3
+ add r12, r12, r3
+ add r4, r4, r7
+ add lr, lr, r7
+ beq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ subs r5, r5, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vqmovun.s16 d2, q2
+ vst1.8 {d2}, [r0]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vmin.u16 q2, q2, q14
+ vst1.16 {q2}, [r0]!
+.endif
+ bgt 2b
+0:
+ pop {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
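+//
+// As in the weighted1 sketch above, but blending two filter outputs
+// (illustrative sketch; clip_to_pixel_range() again hypothetical):
+//
+//   const int u = src[x] << 4;
+//   const int v = (u << 7) + wt[0]*(t1[x] - u) + wt[1]*(t2[x] - u);
+//   dst[x] = clip_to_pixel_range((v + (1 << 10)) >> 11);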
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.if \bpc == 8
+ ldr r8, [sp, #52]
+.else
+ ldrd r8, r9, [sp, #52]
+.endif
+ cmp r7, #2
+ add r10, r0, r1
+ add r11, r2, r3
+ add r12, r4, #2*FILTER_OUT_STRIDE
+ add lr, r5, #2*FILTER_OUT_STRIDE
+ vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
+.if \bpc == 16
+ vdup.16 q14, r9
+.endif
+ mov r8, #4*FILTER_OUT_STRIDE
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r9, r6, #7
+ bic r9, r9, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r9
+ sub r3, r3, r9
+.else
+ sub r1, r1, r9, lsl #1
+ sub r3, r3, r9, lsl #1
+.endif
+ sub r8, r8, r9, lsl #1
+ mov r9, r6
+ blt 2f
+1:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r11, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r11, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [r12, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ vld1.16 {q10}, [lr, :128]!
+ subs r6, r6, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vsub.i16 q10, q10, q8 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vshll.u16 q11, d16, #7 // u << 7
+ vshll.u16 q8, d17, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vrshrn.i32 d22, q11, #11
+ vrshrn.i32 d23, q8, #11
+ vqmovun.s16 d6, q3
+ vqmovun.s16 d22, q11
+ vst1.8 {d6}, [r0]!
+ vst1.8 {d22}, [r10]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vqrshrun.s32 d22, q11, #11
+ vqrshrun.s32 d23, q8, #11
+ vmin.u16 q3, q3, q14
+ vmin.u16 q11, q11, q14
+ vst1.16 {q3}, [r0]!
+ vst1.16 {q11}, [r10]!
+.endif
+ bgt 1b
+
+ subs r7, r7, #2
+ cmp r7, #1
+ blt 0f
+ mov r6, r9
+ add r0, r0, r1
+ add r10, r10, r1
+ add r2, r2, r3
+ add r11, r11, r3
+ add r4, r4, r8
+ add r12, r12, r8
+ add r5, r5, r8
+ add lr, lr, r8
+ beq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ subs r6, r6, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vqmovun.s16 d6, q3
+ vst1.8 {d6}, [r0]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vmin.u16 q3, q3, q14
+ vst1.16 {q3}, [r0]!
+.endif
+ bgt 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -1403,12 +1403,12 @@
vld1.8 {d24}, [\sr2], \s_strd
vmovl.u8 q8, d16
vmovl.u8 q12, d24
- vext.8 q9, q8, q8, #2
- vext.8 q10, q8, q8, #4
- vext.8 q11, q8, q8, #6
- vext.8 q13, q12, q12, #2
- vext.8 q14, q12, q12, #4
- vext.8 q15, q12, q12, #6
+ vext.8 d18, d16, d17, #2
+ vext.8 d20, d16, d17, #4
+ vext.8 d22, d16, d17, #6
+ vext.8 d26, d24, d25, #2
+ vext.8 d28, d24, d25, #4
+ vext.8 d30, d24, d25, #6
subs \h, \h, #2
vmul.s16 d4, d16, d0[0]
vmla.s16 d4, d18, d0[1]
@@ -1431,7 +1431,7 @@
pop {r4-r11,pc}
80: // 8xN h
- vld1.8 {d0}, [\mx]
+ vld1.8 {d0}, [\mx, :64]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1482,7 +1482,7 @@
// one temporary for vext in the loop. That's slower on A7 and A53,
// (but surprisingly, marginally faster on A8 and A73).
vpush {q4-q6}
- vld1.8 {d0}, [\mx]
+ vld1.8 {d0}, [\mx, :64]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1629,7 +1629,7 @@
28: // 2x8, 2x16 v
vpush {q4-q7}
- vld1.8 {d0}, [\my]
+ vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
@@ -1709,7 +1709,7 @@
480: // 4x8, 4x16 v
vpush {q4}
- vld1.8 {d0}, [\my]
+ vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
@@ -1782,7 +1782,7 @@
640:
1280:
vpush {q4}
- vld1.8 {d0}, [\my]
+ vld1.8 {d0}, [\my, :64]
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
vmovl.s8 q0, d0
@@ -1951,11 +1951,10 @@
bl L(\type\()_8tap_filter_2)
vext.8 d18, d17, d26, #4
- vmov d19, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
- vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d26, d2[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
@@ -1964,11 +1963,11 @@
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
ble 0f
vmov d16, d18
- vmov d17, d19
+ vmov d17, d26
b 2b
280: // 2x8, 2x16, 2x32 hv
- vld1.8 {d2}, [\my]
+ vld1.8 {d2}, [\my, :64]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
@@ -2001,7 +2000,6 @@
28:
bl L(\type\()_8tap_filter_2)
vext.8 d22, d21, d26, #4
- vmov d23, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
@@ -2009,7 +2007,7 @@
vmlal.s16 q2, d20, d3[0]
vmlal.s16 q2, d21, d3[1]
vmlal.s16 q2, d22, d3[2]
- vmlal.s16 q2, d23, d3[3]
+ vmlal.s16 q2, d26, d3[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
@@ -2022,7 +2020,7 @@
vmov d18, d20
vmov d19, d21
vmov d20, d22
- vmov d21, d23
+ vmov d21, d26
b 28b
0:
@@ -2108,7 +2106,7 @@
b 4b
480: // 4x8, 4x16, 4x32 hv
- vld1.8 {d2}, [\my]
+ vld1.8 {d2}, [\my, :64]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
@@ -2211,7 +2209,7 @@
bgt 880f
vpush {q4-q7}
add \my, \my, #2
- vld1.8 {d0}, [\mx]
+ vld1.8 {d0}, [\mx, :64]
vld1.32 {d2[]}, [\my]
sub \src, \src, #3
sub \src, \src, \s_strd
@@ -2301,8 +2299,8 @@
640:
1280:
vpush {q4-q7}
- vld1.8 {d0}, [\mx]
- vld1.8 {d2}, [\my]
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
sub \src, \src, #3
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
--- a/src/arm/32/mc16.S
+++ b/src/arm/32/mc16.S
@@ -272,3 +272,2463 @@
bidir_fn avg, r6
bidir_fn w_avg, r7
bidir_fn mask, r7
+
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that r9 is set to (clz(w)-24).
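+//
+// Functionally this is a plain 16 bpc block copy, roughly (sketch only;
+// the width-specific entry points below only differ in how many registers
+// each row uses, and PXSTRIDE() converts the byte stride to a pixel stride):
+//
+//   for (int y = 0; y < h; y++) {
+//       memcpy(dst, src, w*sizeof(uint16_t));
+//       dst += PXSTRIDE(dst_stride);
+//       src += PXSTRIDE(src_stride);
+//   }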
+function put_neon
+ adr r10, L(put_tbl)
+ ldr r9, [r10, r9, lsl #2]
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(put_tbl):
+ .word 1280f - L(put_tbl) + CONFIG_THUMB
+ .word 640f - L(put_tbl) + CONFIG_THUMB
+ .word 320f - L(put_tbl) + CONFIG_THUMB
+ .word 16f - L(put_tbl) + CONFIG_THUMB
+ .word 80f - L(put_tbl) + CONFIG_THUMB
+ .word 4f - L(put_tbl) + CONFIG_THUMB
+ .word 2f - L(put_tbl) + CONFIG_THUMB
+
+2:
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+ bgt 2b
+ pop {r4-r11,pc}
+4:
+ vld1.16 {d0}, [r2], r3
+ vld1.16 {d1}, [r2], r3
+ subs r5, r5, #2
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ bgt 4b
+ pop {r4-r11,pc}
+80:
+ add r8, r0, r1
+ lsl r1, r1, #1
+ add r9, r2, r3
+ lsl r3, r3, #1
+8:
+ vld1.16 {q0}, [r2], r3
+ vld1.16 {q1}, [r9], r3
+ subs r5, r5, #2
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r8, :128], r1
+ bgt 8b
+ pop {r4-r11,pc}
+16:
+ vld1.16 {q0, q1}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q0, q1}, [r0, :128], r1
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ sub r1, r1, #32
+ sub r3, r3, #32
+32:
+ vld1.16 {q0, q1}, [r2]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q2, q3}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r1, r1, #96
+ sub r3, r3, #96
+64:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r1, r1, #224
+ sub r3, r3, #224
+128:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2]!
+ vst1.16 {q14, q15}, [r0, :128]!
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
+// r8 to w*2.
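+//
+// Functionally, roughly (sketch only; tmp is the packed output whose row
+// stride is the w*2 bytes that r8 describes):
+//
+//   for (int y = 0; y < h; y++, src += PXSTRIDE(src_stride))
+//       for (int x = 0; x < w; x++)
+//           tmp[y*w + x] = (src[x] << intermediate_bits) - PREP_BIAS;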
+function prep_neon
+ adr r10, L(prep_tbl)
+ ldr r9, [r10, r9, lsl #2]
+ vdup.16 q15, r7 // intermediate_bits
+ vmov.i16 q14, #PREP_BIAS
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(prep_tbl):
+ .word 1280f - L(prep_tbl) + CONFIG_THUMB
+ .word 640f - L(prep_tbl) + CONFIG_THUMB
+ .word 320f - L(prep_tbl) + CONFIG_THUMB
+ .word 16f - L(prep_tbl) + CONFIG_THUMB
+ .word 80f - L(prep_tbl) + CONFIG_THUMB
+ .word 40f - L(prep_tbl) + CONFIG_THUMB
+
+40:
+ add r9, r1, r2
+ lsl r2, r2, #1
+4:
+ vld1.16 {d0}, [r1], r2
+ vld1.16 {d1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15
+ vsub.i16 q0, q0, q14
+ vst1.16 {q0}, [r0, :128]!
+ bgt 4b
+ pop {r4-r11,pc}
+80:
+ add r9, r1, r2
+ lsl r2, r2, #1
+8:
+ vld1.16 {q0}, [r1], r2
+ vld1.16 {q1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15
+ vshl.s16 q1, q1, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 8b
+ pop {r4-r11,pc}
+16:
+ vld1.16 {q0, q1}, [r1], r2
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ subs r4, r4, #2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ sub r2, r2, #32
+32:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r2, r2, #96
+64:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r2, r2, #224
+128:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1]!
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q0, q1}, [r1]!
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vld1.16 {q2, q3}, [r1]!
+ vsub.i16 q11, q11, q14
+ vshl.s16 q0, q0, q15
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ vld1.\wd {\d0[]}, [\s0], \strd
+ vld1.\wd {\d1[]}, [\s1], \strd
+.ifnb \d2
+ vld1.\wd {\d2[]}, [\s0], \strd
+ vld1.\wd {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.\wd {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.\wd {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.\wd {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ vld1.16 {\d0}, [\s0], \strd
+ vld1.16 {\d1}, [\s1], \strd
+.ifnb \d2
+ vld1.16 {\d2}, [\s0], \strd
+ vld1.16 {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.16 {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.16 {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.16 {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5
+ vld1.16 {\d0, \d1}, [\s0], \strd
+.ifnb \d2
+ vld1.16 {\d2, \d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.16 {\d4, \d5}, [\s0], \strd
+.endif
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5
+ load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+ vext.8 \r0, \r0, \r1, #4
+ vext.8 \r1, \r1, \r2, #4
+.ifnb \r3
+ vext.8 \r2, \r2, \r3, #4
+ vext.8 \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmin_u16 c, r0, r1, r2, r3
+ vmin.u16 \r0, \r0, \c
+.ifnb \r1
+ vmin.u16 \r1, \r1, \c
+.endif
+.ifnb \r2
+ vmin.u16 \r2, \r2, \c
+ vmin.u16 \r3, \r3, \c
+.endif
+.endm
+.macro vsub_i16 c, r0, r1, r2, r3
+ vsub.i16 \r0, \r0, \c
+.ifnb \r1
+ vsub.i16 \r1, \r1, \c
+.endif
+.ifnb \r2
+ vsub.i16 \r2, \r2, \c
+ vsub.i16 \r3, \r3, \c
+.endif
+.endm
+.macro vmull_vmlal_4 d, s0, s1, s2, s3
+ vmull.s16 \d, \s0, d0[0]
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+.endm
+.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ vmull.s16 \d, \s0, d0[0]
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+ vmlal.s16 \d, \s4, d1[0]
+ vmlal.s16 \d, \s5, d1[1]
+ vmlal.s16 \d, \s6, d1[2]
+ vmlal.s16 \d, \s7, d1[3]
+.endm
+.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3
+ vqrshrun.s32 \d0, \q0, #\shift
+.ifnb \q1
+ vqrshrun.s32 \d1, \q1, #\shift
+.endif
+.ifnb \q2
+ vqrshrun.s32 \d2, \q2, #\shift
+ vqrshrun.s32 \d3, \q3, #\shift
+.endif
+.endm
+.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3
+ vmovn.i32 \d0, \q0
+.ifnb \q1
+ vmovn.i32 \d1, \q1
+.endif
+.ifnb \q2
+ vmovn.i32 \d2, \q2
+ vmovn.i32 \d3, \q3
+.endif
+.endm
+.macro vrshl_s32 shift, r0, r1, r2, r3
+ vrshl.s32 \r0, \r0, \shift
+ vrshl.s32 \r1, \r1, \shift
+.ifnb \r2
+ vrshl.s32 \r2, \r2, \shift
+ vrshl.s32 \r3, \r3, \shift
+.endif
+.endm
+.macro vst1_32 strd, r0, r1
+ vst1.32 {\r0[0]}, [r0, :32], \strd
+ vst1.32 {\r0[1]}, [r9, :32], \strd
+.ifnb \r1
+ vst1.32 {\r1[0]}, [r0, :32], \strd
+ vst1.32 {\r1[1]}, [r9, :32], \strd
+.endif
+.endm
+.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+ vst1.16 {\r0}, [r0, \align], \strd
+ vst1.16 {\r1}, [r9, \align], \strd
+.ifnb \r2
+ vst1.16 {\r2}, [r0, \align], \strd
+ vst1.16 {\r3}, [r9, \align], \strd
+.endif
+.ifnb \r4
+ vst1.16 {\r4}, [r0, \align], \strd
+ vst1.16 {\r5}, [r9, \align], \strd
+ vst1.16 {\r6}, [r0, \align], \strd
+ vst1.16 {\r7}, [r9, \align], \strd
+.endif
+.endm
+.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3
+.ifc \type, put
+ vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ vmin_u16 q15, \q0, \q1
+.else
+ vrshl_s32 q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits)
+ vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ vsub_i16 q15, \q0, \q1 // PREP_BIAS
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1_reg \strd, :64, \d0, \d1, \d2, \d3
+.endm
+.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1_reg \strd, :128, \q0, \q1
+.endm
+.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1.16 {\q0, \q1}, [r0, :128], \strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ movw r9, \type_h
+ movw r10, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, r11
+ mul \my, \my, r11
+ add \mx, \mx, r9 // mx, 8tap_h, 4tap_h
+ add \my, \my, r10 // my, 8tap_v, 4tap_v
+
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+
+ vdup.16 q15, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ tst \mx, #(0x7f << 14)
+ sub r9, r9, #24
+ add lr, \bdmax, #6 // 6 + intermediate_bits
+ rsb r12, \bdmax, #6 // 6 - intermediate_bits
+ movrel r11, X(mc_subpel_filters), -8
+ bne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ bne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx r10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ it gt
+ movgt \mx, r10
+ tst \my, #(0x7f << 14)
+ add \mx, r11, \mx, lsl #3
+ bne L(\type\()_8tap_hv)
+
+ adr r10, L(\type\()_8tap_h_tbl)
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vdup.16 q13, \bdmax // intermediate_bits
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q13, q13 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_h_tbl):
+ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+2:
+ vld1.16 {q2}, [\src], \s_strd
+ vld1.16 {q3}, [\sr2], \s_strd
+ vext.8 d5, d4, d5, #2
+ vext.8 d7, d6, d7, #2
+ subs \h, \h, #2
+ vtrn.32 d4, d6
+ vtrn.32 d5, d7
+ vmull.s16 q1, d4, d0[0]
+ vmlal.s16 q1, d5, d0[1]
+ vmlal.s16 q1, d6, d0[2]
+ vmlal.s16 q1, d7, d0[3]
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vqmovun.s32 d2, q1
+ vrshl.s16 d2, d2, d26 // -intermediate_bits
+ vmin.u16 d2, d2, d30
+ vst1.32 {d2[0]}, [\dst, :32], \d_strd
+ vst1.32 {d2[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q11}, [\sr2], \s_strd
+ vext.8 d18, d16, d17, #2
+ vext.8 d19, d16, d17, #4
+ vext.8 d20, d16, d17, #6
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d21, d22, d23, #6
+ subs \h, \h, #2
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmlal.s16 q2, d20, d0[3]
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d4, q2
+ vmovn.s32 d5, q3
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ vpush {q4-q5}
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ sub \s_strd, \s_strd, \w, lsl #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+81:
+ vld1.16 {q8, q9}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ mov \mx, \w
+
+8:
+ vmull.s16 q1, d16, d0[0]
+ vmull.s16 q2, d17, d0[0]
+ vmull.s16 q3, d20, d0[0]
+ vmull.s16 q4, d21, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q8, q9, #(2*\i)
+ vext.8 q5, q10, q11, #(2*\i)
+.if \i < 4
+ vmlal.s16 q1, d24, d0[\i]
+ vmlal.s16 q2, d25, d0[\i]
+ vmlal.s16 q3, d10, d0[\i]
+ vmlal.s16 q4, d11, d0[\i]
+.else
+ vmlal.s16 q1, d24, d1[\i-4]
+ vmlal.s16 q2, d25, d1[\i-4]
+ vmlal.s16 q3, d10, d1[\i-4]
+ vmlal.s16 q4, d11, d1[\i-4]
+.endif
+.endr
+ subs \mx, \mx, #8
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q4, q4, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d2, q1
+ vqmovun.s32 d3, q2
+ vqmovun.s32 d4, q3
+ vqmovun.s32 d5, q4
+ vrshl.s16 q1, q1, q13 // -intermediate_bits
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q1, q1, q15
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d2, q1
+ vmovn.s32 d3, q2
+ vmovn.s32 d4, q3
+ vmovn.s32 d5, q4
+ vsub.i16 q1, q1, q13 // PREP_BIAS
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {q1}, [\dst, :128]!
+ vst1.16 {q2}, [\ds2, :128]!
+ ble 9f
+
+ vmov q8, q9
+ vmov q10, q11
+ vld1.16 {q9}, [\src]!
+ vld1.16 {q11}, [\sr2]!
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 81b
+ vpop {q4-q5}
+ pop {r4-r11,pc}
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+ add \my, r11, \my, lsl #3
+
+.ifc \type, prep
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ vmov.i16 q15, #PREP_BIAS
+.endif
+ adr r10, L(\type\()_8tap_v_tbl)
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_8tap_v_tbl):
+ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ bgt 28f
+
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ // 2x2 v
+ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_32 d1, d2, d3, d4, d5
+ bgt 24f
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vqrshrun_s32 6, q8, d16
+ vmin_u16 d30, d16
+ vst1_32 \d_strd, d16
+ pop {r4-r11,pc}
+
+24: // 2x4 v
+ load_32 \sr2, \src, \s_strd, d6, d7
+ interleave_1_32 d5, d6, d7
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d3, d4, d5, d6
+ vqrshrun_s32 6, q8, d16, q9, d17
+ vmin_u16 q15, q8
+ vst1_32 \d_strd, d16, d17
+ pop {r4-r11,pc}
+
+28: // 2x8, 2x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16
+ interleave_1_32 d2, d3, d4, d5, d6
+ interleave_1_32 d6, d7, d16
+216:
+ subs \h, \h, #8
+ load_32 \sr2, \src, \s_strd, d17, d18, d19, d20
+ load_32 \sr2, \src, \s_strd, d21, d22, d23, d24
+ interleave_1_32 d16, d17, d18, d19, d20
+ interleave_1_32 d20, d21, d22, d23, d24
+ vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
+ vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19
+ vmull_vmlal_8 q2, d6, d7, d16, d17, d18, d19, d20, d21
+ vmull_vmlal_8 q3, d16, d17, d18, d19, d20, d21, d22, d23
+ vqrshrun_s32 6, q13, d26, q1, d27, q2, d2, q3, d3
+ vmin_u16 q15, q13, q1
+ vst1_32 \d_strd, d26, d27
+ vst1_32 \d_strd, d2, d3
+ ble 0f
+ vmov q1, q9
+ vmov q2, q10
+ vmov q3, q11
+ vmov d16, d24
+ b 216b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40:
+ bgt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d2, d3, d4, d5
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+ ble 0f
+ load_reg \sr2, \src, \s_strd, d6, d7
+ vmull_vmlal_4 q8, d3, d4, d5, d6
+ vmull_vmlal_4 q9, d4, d5, d6, d7
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+0:
+ pop {r4-r11,pc}
+
+480: // 4x8, 4x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22
+
+48:
+ subs \h, \h, #4
+ load_reg \sr2, \src, \s_strd, d23, d24, d25, d26
+ vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25
+ vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26
+ shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
+ ble 0f
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov d22, d26
+ b 48b
+0:
+ pop {r4-r11,pc}
+
+80:
+ bgt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9
+ vmull_vmlal_4 q10, d2, d4, d6, d16
+ vmull_vmlal_4 q11, d3, d5, d7, d17
+ vmull_vmlal_4 q12, d4, d6, d16, d18
+ vmull_vmlal_4 q13, d5, d7, d17, d19
+ shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23
+ ble 0f
+ load_reg \sr2, \src, \s_strd, q10, q11
+ vmull_vmlal_4 q1, d6, d16, d18, d20
+ vmull_vmlal_4 q2, d7, d17, d19, d21
+ vmull_vmlal_4 q12, d16, d18, d20, d22
+ vmull_vmlal_4 q13, d17, d19, d21, d23
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5
+0:
+ pop {r4-r11,pc}
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\my, :64]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11
+
+88:
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q12, q13
+ vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24
+ vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25
+ vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26
+ vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q1, q2
+ vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2
+ vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3
+ vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4
+ vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5
+ shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9
+ ble 9f
+ vmov q5, q9
+ vmov q6, q10
+ vmov q7, q11
+ vmov q8, q12
+ vmov q9, q13
+ vmov q10, q1
+ vmov q11, q2
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+160:
+ bgt 1680b
+
+ // 16x2, 16x4 v
+ vpush {q6-q7}
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+
+ load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11
+16:
+ load_16s16 \src, \src, \s_strd, q12, q13
+ subs \h, \h, #1
+ vmull_vmlal_4 q1, d12, d16, d20, d24
+ vmull_vmlal_4 q2, d13, d17, d21, d25
+ vmull_vmlal_4 q3, d14, d18, d22, d26
+ vmull_vmlal_4 q6, d15, d19, d23, d27
+ shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5
+ ble 0f
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov q11, q13
+ b 16b
+0:
+ vpop {q6-q7}
+ pop {r4-r11,pc}
+
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+4:
+ add \my, r11, \my, lsl #3
+
+ adr r10, L(\type\()_8tap_hv_tbl)
+ neg r12, r12 // -(6-intermediate_bits)
+ ldr r9, [r10, r9, lsl #2]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.ifc \type, put
+ neg r8, lr // -(6+intermediate_bits)
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vdup.32 q13, r8 // -(6+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_hv_tbl):
+ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 280f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d18, d17, d24, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vmin.u16 d4, d4, d30
+ subs \h, \h, #2
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d24
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d18, d17, d24, #4
+ vmov d19, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d20, d19, d24, #4
+ vmov d21, d24
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d22, d21, d24, #4
+ vmull.s16 q3, d16, d2[0]
+ vmlal.s16 q3, d17, d2[1]
+ vmlal.s16 q3, d18, d2[2]
+ vmlal.s16 q3, d19, d2[3]
+ vmlal.s16 q3, d20, d3[0]
+ vmlal.s16 q3, d21, d3[1]
+ vmlal.s16 q3, d22, d3[2]
+ vmlal.s16 q3, d24, d3[3]
+
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d6, q3
+ vmin.u16 d6, d6, d30
+ subs \h, \h, #2
+ vst1.32 {d6[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov q8, q9
+ vmov q9, q10
+ vmov d20, d22
+ vmov d21, d24
+ b 28b
+0:
+ pop {r4-r11,pc}
+
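+// Reads one row each from \sr2 and \src, filters 2 pixels per row with
+// the 4-tap horizontal filter in d0 and returns the four intermediate
+// values (2 rows x 2 pixels), narrowed to 16 bit, in d24.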
+L(\type\()_8tap_filter_2):
+ vld1.16 {q11}, [\sr2], \s_strd
+ vld1.16 {q12}, [\src], \s_strd
+ vext.8 d23, d22, d23, #2
+ vext.8 d25, d24, d25, #2
+ vtrn.32 q11, q12
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d23, d0[1]
+ vmlal.s16 q3, d24, d0[2]
+ vmlal.s16 q3, d25, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ bx lr
+.endif
+
+40:
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 480f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ // 4x2, 4x4 hv
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d17, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d17, d2[0]
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q2, d19, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+ vmull.s16 q3, d18, d2[0]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q3, d24, d2[2]
+ vmlal.s16 q3, d25, d2[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d17, d19
+ vmov q9, q12
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+480: // 4x8, 4x16, 4x32 hv
+ vpush {d13-d15}
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d13, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q7, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d13, d2[0]
+ vmlal.s16 q2, d14, d2[1]
+ vmlal.s16 q2, d15, d2[2]
+ vmlal.s16 q2, d16, d2[3]
+ vmlal.s16 q2, d17, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q2, d19, d3[2]
+ vmlal.s16 q2, d24, d3[3]
+ vmull.s16 q3, d14, d2[0]
+ vmlal.s16 q3, d15, d2[1]
+ vmlal.s16 q3, d16, d2[2]
+ vmlal.s16 q3, d17, d2[3]
+ vmlal.s16 q3, d18, d3[0]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q3, d24, d3[2]
+ vmlal.s16 q3, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d13, d15
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q12
+ b 48b
+0:
+ vpop {d13-d15}
+ pop {r4-r11,pc}
+
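+// Reads one row each from \sr2 and \src, filters 4 pixels per row with
+// the 4-tap horizontal filter in d0 and returns the two intermediate
+// rows, narrowed to 16 bit, in d24 and d25.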
+L(\type\()_8tap_filter_4):
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d20, d21, #2
+ vext.8 d25, d20, d21, #4
+ vext.8 d21, d20, d21, #6
+ vmull.s16 q3, d20, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q10
+ bx lr
+
+80:
+160:
+320:
+ bgt 880f
+ add \my, \my, #2
+ vld1.8 {d0}, [\mx, :64]
+ vld1.32 {d2[]}, [\my]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d16, q2
+ vmovn.i32 d17, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+8:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d16, d2[0]
+ vmull.s16 q3, d17, d2[0]
+ vmull.s16 q13, d18, d2[0]
+ vmull.s16 q14, d19, d2[0]
+.ifc \type, put
+ vdup.32 q8, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q13, d20, d2[1]
+ vmlal.s16 q14, d21, d2[1]
+ vmlal.s16 q2, d20, d2[2]
+ vmlal.s16 q3, d21, d2[2]
+ vmlal.s16 q13, d22, d2[2]
+ vmlal.s16 q14, d23, d2[2]
+ vmlal.s16 q2, d22, d2[3]
+ vmlal.s16 q3, d23, d2[3]
+ vmlal.s16 q13, d24, d2[3]
+ vmlal.s16 q14, d25, d2[3]
+.ifc \type, put
+ vdup.16 q9, \bdmax // bitdepth_max
+ vrshl.s32 q2, q2, q8 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q8 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q8 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q8 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q9, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q9 // PREP_BIAS
+ vsub.i16 q3, q3, q9 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 8b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+0:
+ pop {r4-r11,pc}
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d8, q2
+ vmovn.i32 d9, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q5, q11
+ vmov q6, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q7, q11
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+88:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d8, d2[0]
+ vmull.s16 q3, d9, d2[0]
+ vmull.s16 q13, d10, d2[0]
+ vmull.s16 q14, d11, d2[0]
+.ifc \type, put
+ vdup.32 q4, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d10, d2[1]
+ vmlal.s16 q3, d11, d2[1]
+ vmlal.s16 q13, d12, d2[1]
+ vmlal.s16 q14, d13, d2[1]
+ vmlal.s16 q2, d12, d2[2]
+ vmlal.s16 q3, d13, d2[2]
+ vmlal.s16 q13, d14, d2[2]
+ vmlal.s16 q14, d15, d2[2]
+ vmlal.s16 q2, d14, d2[3]
+ vmlal.s16 q3, d15, d2[3]
+ vmlal.s16 q13, d16, d2[3]
+ vmlal.s16 q14, d17, d2[3]
+ vmlal.s16 q2, d16, d3[0]
+ vmlal.s16 q3, d17, d3[0]
+ vmlal.s16 q13, d18, d3[0]
+ vmlal.s16 q14, d19, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q13, d20, d3[1]
+ vmlal.s16 q14, d21, d3[1]
+ vmlal.s16 q2, d20, d3[2]
+ vmlal.s16 q3, d21, d3[2]
+ vmlal.s16 q13, d22, d3[2]
+ vmlal.s16 q14, d23, d3[2]
+ vmlal.s16 q2, d22, d3[3]
+ vmlal.s16 q3, d23, d3[3]
+ vmlal.s16 q13, d24, d3[3]
+ vmlal.s16 q14, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q4 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q4 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q4 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q4 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q5, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q5 // PREP_BIAS
+ vsub.i16 q3, q3, q5 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q4, q6
+ vmov q5, q7
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
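+// Reads one row each from \sr2 and \src, filters 8 pixels per row with
+// the 8-tap horizontal filter in q0 and returns the two intermediate
+// rows, narrowed to 16 bit, in q11 and q12.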
+L(\type\()_8tap_filter_8):
+ vld1.16 {q13, q14}, [\sr2], \s_strd
+ vmull.s16 q2, d26, d0[0]
+ vmull.s16 q3, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d24, d0[\i]
+ vmlal.s16 q3, d25, d0[\i]
+.else
+ vmlal.s16 q2, d24, d1[\i - 4]
+ vmlal.s16 q3, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q12, r12 // -(6-intermediate_bits)
+ vld1.16 {q13, q14}, [\src], \s_strd
+ vrshl.s32 q2, q2, q12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q12 // -(6-intermediate_bits)
+ vmovn.i32 d4, q2
+ vmovn.i32 d5, q3
+
+ vmull.s16 q3, d26, d0[0]
+ vmull.s16 q11, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q3, d24, d0[\i]
+ vmlal.s16 q11, d25, d0[\i]
+.else
+ vmlal.s16 q3, d24, d1[\i - 4]
+ vmlal.s16 q11, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q13, r12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6-intermediate_bits)
+ vrshl.s32 q11, q11, q13 // -(6-intermediate_bits)
+
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q11
+ vmov q11, q2
+ bx lr
+endfunc
+
+function \type\()_bilin_16bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ vdup.16 q1, \mx
+ vdup.16 q3, \my
+ rsb r9, \mx, #16
+ rsb r10, \my, #16
+ vdup.16 q0, r9
+ vdup.16 q2, r10
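+ // q0/q2 hold 16-mx and 16-my, q1/q3 hold mx and my (bilinear weights)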
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+ clz \bdmax, \bdmax // bitdepth_max
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
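+ // (intermediate_bits is 4 for 10 bpc and 2 for 12 bpc)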
+ cmp \mx, #0
+ sub r9, r9, #24
+ rsb r11, \bdmax, #4 // 4 - intermediate_bits
+ add r12, \bdmax, #4 // 4 + intermediate_bits
+ bne L(\type\()_bilin_h)
+ cmp \my, #0
+ bne L(\type\()_bilin_v)
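+ // mx == 0 && my == 0: no subpel filtering, use the plain \type path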
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cmp \my, #0
+ bne L(\type\()_bilin_hv)
+
+ adr r10, L(\type\()_bilin_h_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.16 q14, \bdmax // intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q14, q14 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_h_tbl):
+ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ vld1.16 {d16}, [\src], \s_strd
+ vld1.16 {d18}, [\sr2], \s_strd
+ vext.8 d17, d16, d16, #2
+ vext.8 d19, d18, d18, #2
+ vtrn.32 d16, d18
+ vtrn.32 d17, d19
+ subs \h, \h, #2
+ vmul.i16 d16, d16, d0
+ vmla.i16 d16, d17, d2
+ vrshl.u16 d16, d16, d30
+ vrshl.u16 d16, d16, d28
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q10}, [\sr2], \s_strd
+ vext.8 q9, q8, q8, #2
+ vext.8 q11, q10, q10, #2
+ vmov d17, d20
+ vmov d19, d22
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vrshl.u16 q8, q8, q15
+.ifc \type, put
+ vrshl.u16 q8, q8, q14
+.else
+ vsub.i16 q8, q8, q14
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ vld1.16 {d16, d17, d18}, [\src], \s_strd
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vext.8 q9, q8, q9, #2
+ vext.8 q11, q10, q11, #2
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q11, q1
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q10, q10, q15
+.ifc \type, put
+ vrshl.u16 q8, q8, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q8, q8, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q10}, [\ds2, :128], \d_strd
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ vpush {q4-q7}
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, lsl #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+161:
+ vld1.16 {q4}, [\src]!
+ vld1.16 {q9}, [\sr2]!
+ mov \mx, \w
+
+16:
+ vld1.16 {q5, q6}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ vext.8 q7, q4, q5, #2
+ vext.8 q8, q5, q6, #2
+ vext.8 q12, q9, q10, #2
+ vext.8 q13, q10, q11, #2
+ vmul.i16 q4, q4, q0
+ vmla.i16 q4, q7, q1
+ vmul.i16 q5, q5, q0
+ vmla.i16 q5, q8, q1
+ vmul.i16 q9, q9, q0
+ vmla.i16 q9, q12, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q4, q4, q15
+ vrshl.u16 q5, q5, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ subs \mx, \mx, #16
+.ifc \type, put
+ vrshl.u16 q4, q4, q14
+ vrshl.u16 q5, q5, q14
+ vrshl.u16 q9, q9, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q4, q4, q14
+ vsub.i16 q5, q5, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q4, q5}, [\dst, :128]!
+ vst1.16 {q9, q10}, [\ds2, :128]!
+ ble 9f
+
+ vmov q4, q6
+ vmov q9, q11
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr r10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+ vdup.16 q15, r11 // 4 - intermediate_bits
+.endif
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vmov.i16 q14, #PREP_BIAS
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_bilin_v_tbl):
+ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ vld1.32 {d16[]}, [\src], \s_strd
+ bgt 24f
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vmul.i16 d16, d16, d4
+ vmla.i16 d16, d17, d6
+ vrshr.u16 d16, d16, #4
+ vst1.32 {d16[0]}, [\dst, :32]
+ vst1.32 {d16[1]}, [\ds2, :32]
+ pop {r4-r11,pc}
+24: // 2x4, 2x8, ... v
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vld1.32 {d19[]}, [\sr2], \s_strd
+ vld1.32 {d20[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vext.8 d18, d18, d19, #4
+ vext.8 d19, d19, d20, #4
+ vswp d17, d18
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ subs \h, \h, #4
+ vrshr.u16 q8, q8, #4
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ vst1.32 {d17[0]}, [\dst, :32], \d_strd
+ vst1.32 {d17[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d20
+ b 24b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {d16}, [\src], \s_strd
+4:
+ vld1.16 {d17}, [\sr2], \s_strd
+ vld1.16 {d19}, [\src], \s_strd
+ vmov d18, d17
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vsub.i16 q8, q8, q14
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {q8}, [\src], \s_strd
+8:
+ vld1.16 {q9}, [\sr2], \s_strd
+ vld1.16 {q10}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q10, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q9}, [\ds2, :128], \d_strd
+ ble 0f
+ vmov q8, q10
+ b 8b
+0:
+ pop {r4-r11,pc}
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q8, q9}, [\src], \s_strd
+2:
+ vld1.16 {q10, q11}, [\sr2], \s_strd
+ vld1.16 {q12, q13}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q10, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q11, q3
+ vmul.i16 q10, q10, q2
+ vmla.i16 q10, q12, q3
+ vmul.i16 q11, q11, q2
+ vmla.i16 q11, q13, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+ vrshr.u16 q10, q10, #4
+ vrshr.u16 q11, q11, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ vrshl.u16 q11, q11, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vsub.i16 q11, q11, q14
+.endif
+ vst1.16 {q8, q9}, [\dst, :128], \d_strd
+ vst1.16 {q10, q11}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q12
+ vmov q9, q13
+ b 2b
+9:
+ subs \w, \w, #16
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #32
+ add \dst, \dst, #32
+ b 1b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+ adr r10, L(\type\()_bilin_hv_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.32 q14, r12 // 4 + intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s32 q14, q14 // -(4+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_hv_tbl):
+ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vmul.i16 d16, d20, d0
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+ vext.8 d16, d16, d16, #4
+
+2:
+ vld1.16 {d20}, [\sr2], \s_strd
+ vld1.16 {d22}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vext.8 d23, d22, d22, #2
+ vtrn.32 d20, d22
+ vtrn.32 d21, d23
+ vmul.i16 d18, d20, d0
+ vmla.i16 d18, d21, d2
+ vrshl.u16 d18, d18, d30
+
+ vext.8 d16, d16, d18, #4
+
+ vmull.u16 q8, d16, d4
+ vmlal.u16 q8, d18, d6
+ vrshl.u32 q8, q8, q14
+ vmovn.i32 d16, q8
+ subs \h, \h, #2
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18
+ b 2b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q10}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vmul.i16 d16, d20, d0
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+
+4:
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vext.8 d23, d22, d23, #2
+ vswp d21, d22
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vrshl.u16 q9, q9, q15
+
+ vmull.u16 q10, d16, d4
+ vmlal.u16 q10, d18, d6
+ vmull.u16 q11, d18, d4
+ vmlal.u16 q11, d19, d6
+.ifc \type, put
+ vrshl.u32 q10, q10, q14
+ vrshl.u32 q11, q11, q14
+ vmovn.i32 d20, q10
+ vmovn.i32 d21, q11
+.else
+ vrshrn.i32 d20, q10, #4
+ vrshrn.i32 d21, q11, #4
+ vsub.i16 q10, q10, q14
+.endif
+ subs \h, \h, #2
+ vst1.16 {d20}, [\dst, :64], \d_strd
+ vst1.16 {d21}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20, d21, d22}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vmul.i16 q8, q10, q0
+ vmla.i16 q8, q11, q1
+ vrshl.u16 q8, q8, q15
+
+2:
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vld1.16 {d24, d25, d26}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vext.8 q13, q12, q13, #2
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vmul.i16 q10, q12, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+
+ vmull.u16 q11, d16, d4
+ vmlal.u16 q11, d18, d6
+ vmull.u16 q12, d17, d4
+ vmlal.u16 q12, d19, d6
+ vmull.u16 q8, d18, d4
+ vmlal.u16 q8, d20, d6
+ vmull.u16 q9, d19, d4
+ vmlal.u16 q9, d21, d6
+.ifc \type, put
+ vrshl.u32 q11, q11, q14
+ vrshl.u32 q12, q12, q14
+ vrshl.u32 q8, q8, q14
+ vrshl.u32 q9, q9, q14
+ vmovn.i32 d22, q11
+ vmovn.i32 d23, q12
+ vmovn.i32 d16, q8
+ vmovn.i32 d17, q9
+.else
+ vrshrn.i32 d22, q11, #4
+ vrshrn.i32 d23, q12, #4
+ vrshrn.i32 d16, q8, #4
+ vrshrn.i32 d17, q9, #4
+ vsub.i16 q11, q11, q14
+ vsub.i16 q8, q8, q14
+.endif
+ subs \h, \h, #2
+ vst1.16 {q11}, [\dst, :128], \d_strd
+ vst1.16 {q8}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10
+ b 2b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
+
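+// The warp filter index is the accumulated position (r5/r6) scaled down
+// by 1 << 10; r11 points into the warp filter table with a 64-entry
+// (64*8 byte) bias already applied, and each filter entry is 8 int8 taps.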
+.macro load_filter_ptr src
+ asr r12, \src, #10
+ add r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+ vld1.8 {\dst}, [r12, :64]
+ add \src, \src, \inc
+.endm
+
+.macro load_filter_row dst, src, inc
+ load_filter_ptr \src
+ load_filter_coef \dst, \src, \inc
+.endm
+
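+// Filters one row of 8 pixels, each with its own warp filter selected
+// from the horizontal position in r5 (stepped by abcd[0] per pixel);
+// leaves the 8 results, shifted by -(7 - intermediate_bits), in q4/q5
+// and advances r5 by abcd[1] for the next row.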
+function warp_filter_horz_neon
+ load_filter_ptr r5 // filter 0
+ vld1.16 {q6,q7}, [r2], r3
+
+ load_filter_coef d0, r5, r7 // filter 0
+ load_filter_row d2, r5, r7 // filter 1
+ vmovl.s8 q0, d0 // filter 0
+ vext.8 q3, q6, q7, #2*1 // filter 1 pixels
+ vmovl.s8 q1, d2 // filter 1
+
+ vmull.s16 q4, d12, d0 // filter 0 output (0-3)
+ vmull.s16 q5, d13, d1 // filter 0 output (4-7)
+
+ load_filter_ptr r5 // filter 2
+
+ vmull.s16 q2, d6, d2 // filter 1 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 1 output (4-7)
+
+ load_filter_coef d0, r5, r7 // filter 2
+
+ vpadd.i32 d8, d8, d9 // half pixel 0 (2x32)
+ vpadd.i32 d9, d10, d11 // half pixel 0 (2x32)
+
+ load_filter_ptr r5 // filter 3
+
+ vpadd.i32 d4, d4, d5 // half pixel 1 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 1 (2x32)
+
+ vmovl.s8 q0, d0 // filter 2
+ vext.8 q3, q6, q7, #2*2 // filter 2 pixels
+
+ vpadd.i32 d8, d8, d9 // pixel 0 (2x32)
+ vpadd.i32 d9, d4, d5 // pixel 1 (2x32)
+
+ load_filter_coef d2, r5, r7 // filter 3
+
+ vmull.s16 q2, d6, d0 // filter 2 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 2 output (4-7)
+
+ load_filter_ptr r5 // filter 4
+
+ vpadd.i32 d8, d8, d9 // pixel 0,1
+
+ vpadd.i32 d9, d4, d5 // half pixel 2 (2x32)
+ vpadd.i32 d10, d6, d7 // half pixel 2 (2x32)
+
+ vmovl.s8 q1, d2 // filter 3
+ vext.8 q3, q6, q7, #2*3 // filter 3 pixels
+
+ load_filter_coef d0, r5, r7 // filter 4
+
+ vpadd.i32 d9, d9, d10 // pixel 2 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 3 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 3 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 4
+ load_filter_ptr r5 // filter 5
+
+ vpadd.i32 d10, d4, d5 // half pixel 3 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 3 (2x32)
+
+ vext.8 q3, q6, q7, #2*4 // filter 4 pixels
+ load_filter_coef d2, r5, r7 // filter 5
+
+ vpadd.i32 d10, d10, d11 // pixel 3 (2x32)
+
+ vpadd.i32 d9, d9, d10 // pixel 2,3
+
+ vmull.s16 q2, d6, d0 // filter 4 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 4 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 5
+ load_filter_ptr r5 // filter 6
+
+ vpadd.i32 d10, d4, d5 // half pixel 4 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 4 (2x32)
+
+ vext.8 q3, q6, q7, #2*5 // filter 5 pixels
+ load_filter_coef d0, r5, r7 // filter 6
+
+ vpadd.i32 d10, d10, d11 // pixel 4 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 5 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 5 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 6
+ load_filter_ptr r5 // filter 7
+
+ vpadd.i32 d4, d4, d5 // half pixel 5 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 5 (2x32)
+
+ vext.8 q3, q6, q7, #2*6 // filter 6 pixels
+ load_filter_coef d2, r5, r7 // filter 7
+
+ vpadd.i32 d11, d4, d5 // pixel 5 (2x32)
+
+ vmull.s16 q2, d6, d0 // filter 6 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 6 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 7
+
+ vpadd.i32 d10, d10, d11 // pixel 4,5
+
+ vpadd.i32 d4, d4, d5 // half pixel 6 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 6 (2x32)
+
+ vext.8 q3, q6, q7, #2*7 // filter 7 pixels
+
+ vpadd.i32 d11, d4, d5 // pixel 6 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 7 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 7 output (4-7)
+
+ vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits)
+
+ vpadd.i32 d4, d4, d5 // half pixel 7 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 7 (2x32)
+
+ sub r5, r5, r7, lsl #3
+
+ vpadd.i32 d4, d4, d5 // pixel 7 (2x32)
+
+ add r5, r5, r8
+
+ vpadd.i32 d11, d11, d4 // pixel 6,7
+
+ vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits)
+ vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits)
+
+ bx lr
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my,
+// const int bitdepth_max)
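+//
+// The macro below also generates warp_affine_8x8t_16bpc_neon (\t set to
+// "t"), which writes 16-bit intermediates with PREP_BIAS subtracted
+// instead of final pixels.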
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ sub sp, sp, #8
+
+ clz r7, r7
+ // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+ sub r8, r7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+ sub r7, r7, #25 // -(7 - intermediate_bits)
+.ifb \t
+ neg r8, r8 // -(7 + intermediate_bits)
+.endif
+ str r7, [sp] // spill -(7 - intermediate_bits) on stack
+.ifb \t
+ str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack
+.endif
+
+ ldrd r8, r9, [r4]
+ sxth r7, r8
+ asr r8, r8, #16
+ asr r4, r9, #16
+ sxth r9, r9
+ mov r10, #8
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ sub r2, r2, #6
+ movrel r11, X(mc_warp_filter), 64*8
+.ifnb \t
+ lsl r1, r1, #1
+.endif
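+ // the +512 rounds the accumulated position before the >> 10 in
+ // load_filter_ptr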
+ add r5, r5, #512
+ add r6, r6, #512
+
+ bl warp_filter_horz_neon
+ vmovn.i32 d16, q4
+ vmovn.i32 d17, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d18, q4
+ vmovn.i32 d19, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d20, q4
+ vmovn.i32 d21, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d22, q4
+ vmovn.i32 d23, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d24, q4
+ vmovn.i32 d25, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d26, q4
+ vmovn.i32 d27, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d28, q4
+ vmovn.i32 d29, q5
+
+1:
+ bl warp_filter_horz_neon
+ vmovn.i32 d30, q4
+ vmovn.i32 d31, q5
+
+ load_filter_row d8, r6, r9
+ load_filter_row d9, r6, r9
+ load_filter_row d10, r6, r9
+ load_filter_row d11, r6, r9
+ load_filter_row d12, r6, r9
+ load_filter_row d13, r6, r9
+ load_filter_row d14, r6, r9
+ load_filter_row d15, r6, r9
+ transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
+ vmovl.s8 q1, d8
+ vmovl.s8 q2, d9
+ vmovl.s8 q3, d10
+ vmovl.s8 q4, d11
+ vmovl.s8 q5, d12
+ vmovl.s8 q6, d13
+
+ sub r6, r6, r9, lsl #3
+
+ // This ordering of vmull/vmlal is highly beneficial for
+ // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+ vmull.s16 q0, d16, d2
+ vmlal.s16 q0, d18, d4
+ vmlal.s16 q0, d20, d6
+ vmlal.s16 q0, d22, d8
+ vmlal.s16 q0, d24, d10
+ vmlal.s16 q0, d26, d12
+ vmull.s16 q1, d17, d3
+ vmlal.s16 q1, d19, d5
+ vmlal.s16 q1, d21, d7
+ vmlal.s16 q1, d23, d9
+ vmlal.s16 q1, d25, d11
+ vmlal.s16 q1, d27, d13
+
+ vmovl.s8 q2, d14
+ vmovl.s8 q3, d15
+
+ vmlal.s16 q0, d28, d4
+ vmlal.s16 q0, d30, d6
+ vmlal.s16 q1, d29, d5
+ vmlal.s16 q1, d31, d7
+
+.ifb \t
+ ldr lr, [sp, #4] // -(7 + intermediate_bits)
+ ldr r12, [sp, #120] // bitdepth_max
+ vdup.32 q2, lr // -(7 + intermediate_bits)
+ vdup.16 q3, r12 // bitdepth_max
+.endif
+
+ vmov q8, q9
+ vmov q9, q10
+.ifb \t
+ vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits)
+ vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits)
+.else
+ vrshrn.s32 d0, q0, #7
+ vrshrn.s32 d1, q1, #7
+ vmov.i16 q3, #PREP_BIAS
+.endif
+ vmov q10, q11
+.ifb \t
+ vqmovun.s32 d0, q0
+ vqmovun.s32 d1, q1
+.else
+ vsub.i16 q0, q0, q3 // PREP_BIAS
+.endif
+ vmov q11, q12
+ vmov q12, q13
+.ifb \t
+ vmin.u16 q0, q0, q3 // bitdepth_max
+.endif
+ vmov q13, q14
+ vmov q14, q15
+ subs r10, r10, #1
+ vst1.16 {q0}, [r0, :128], r1
+
+ add r6, r6, r4
+ bgt 1b
+
+ add sp, sp, #8
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+warp
+warp t
--- a/src/arm/64/cdef_tmpl.S
+++ b/src/arm/64/cdef_tmpl.S
@@ -107,7 +107,7 @@
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
- ldr w8, [sp] // bitdepth_max
+ ldr w8, [sp] // edges
cmp w8, #0xf
b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
--- a/src/arm/64/loopfilter.S
+++ b/src/arm/64/loopfilter.S
@@ -1034,11 +1034,11 @@
ld1r {v6.16b}, [x5] // sharp[1]
sub x5, x5, #8
bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.4s, v1.4s, v2.4s // L != 0
mul v1.4s, v1.4s, v4.4s // L
.ifc \type, y
dup v15.4s, w2 // vmask[2]
.endif
- cmtst v2.4s, v1.4s, v2.4s // L != 0
dup v14.4s, w7 // vmask[1]
mov x16, v2.d[0]
mov x17, v2.d[1]
--- a/src/arm/64/loopfilter16.S
+++ b/src/arm/64/loopfilter16.S
@@ -785,7 +785,7 @@
orr w6, w6, w7 // vmask[0] |= vmask[1]
1:
- tst w6, #0x0f
+ tst w6, #0x03
.ifc \dir, v
ld1 {v0.8b}, [x4], #8
ld1 {v1.8b}, [x3], #8
@@ -808,11 +808,11 @@
ld1r {v6.8b}, [x5] // sharp[1]
sub x5, x5, #8
bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.2s, v1.2s, v2.2s // L != 0
mul v1.2s, v1.2s, v4.2s // L
.ifc \type, y
dup v15.2s, w2 // vmask[2]
.endif
- cmtst v2.2s, v1.2s, v2.2s // L != 0
dup v14.2s, w7 // vmask[1]
mov x16, v2.d[0]
cmp x16, #0
@@ -847,7 +847,7 @@
ushl v10.8h, v10.8h, v31.8h
.ifc \type, y
- tst w2, #0x0f
+ tst w2, #0x03
b.eq 2f
// wd16
bl lpf_\dir\()_16_8_neon
@@ -854,7 +854,7 @@
b 8f
2:
.endif
- tst w7, #0x0f
+ tst w7, #0x03
b.eq 3f
.ifc \type, y
// wd8
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -30,7 +30,7 @@
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
-// const int16_t fh[7], const intptr_t w,
+// const int16_t fh[8], intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
mov w8, w5
@@ -308,13 +308,11 @@
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
-// const int16_t fv[7], enum LrEdgeFlags edges,
+// const int16_t fv[8], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
- movi v1.8h, #128
- add v1.8h, v1.8h, v0.8h
// Calculate the number of rows to move back when looping vertically
mov w11, w4
@@ -359,7 +357,7 @@
smull v2.4s, v16.4h, v0.h[0]
smlal v2.4s, v17.4h, v0.h[1]
smlal v2.4s, v18.4h, v0.h[2]
- smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v19.4h, v0.h[3]
smlal v2.4s, v20.4h, v0.h[4]
smlal v2.4s, v21.4h, v0.h[5]
smlal v2.4s, v22.4h, v0.h[6]
@@ -366,7 +364,7 @@
smull2 v3.4s, v16.8h, v0.h[0]
smlal2 v3.4s, v17.8h, v0.h[1]
smlal2 v3.4s, v18.8h, v0.h[2]
- smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v0.h[3]
smlal2 v3.4s, v20.8h, v0.h[4]
smlal2 v3.4s, v21.8h, v0.h[5]
smlal2 v3.4s, v22.8h, v0.h[6]
--- a/src/arm/64/looprestoration16.S
+++ b/src/arm/64/looprestoration16.S
@@ -126,7 +126,7 @@
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
- // If we'll need to pad the right edge, load that byte to pad with
+ // If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w9, w5, #14
ldr h27, [x2, w9, sxtw #1]
@@ -143,12 +143,6 @@
b 6f
4: // Loop horizontally
-.macro ushll_sz d0, d1, src, shift, wd
- ushll \d0\().4s, \src\().4h, \shift
-.ifc \wd, .8h
- ushll2 \d1\().4s, \src\().8h, \shift
-.endif
-.endm
.macro add_sz d0, d1, s0, s1, c, wd
add \d0\().4s, \s0\().4s, \c\().4s
.ifc \wd, .8h
@@ -178,8 +172,7 @@
ext v19.16b, v2.16b, v3.16b, #8
ext v20.16b, v2.16b, v3.16b, #10
ext v21.16b, v2.16b, v3.16b, #12
- ushll_sz v6, v7, v18, #7, \wd
- smlal v6.4s, v2.4h, v0.h[0]
+ smull v6.4s, v2.4h, v0.h[0]
smlal v6.4s, v16.4h, v0.h[1]
smlal v6.4s, v17.4h, v0.h[2]
smlal v6.4s, v18.4h, v0.h[3]
@@ -187,7 +180,7 @@
smlal v6.4s, v20.4h, v0.h[5]
smlal v6.4s, v21.4h, v0.h[6]
.ifc \wd, .8h
- smlal2 v7.4s, v2.8h, v0.h[0]
+ smull2 v7.4s, v2.8h, v0.h[0]
smlal2 v7.4s, v16.8h, v0.h[1]
smlal2 v7.4s, v17.8h, v0.h[2]
smlal2 v7.4s, v18.8h, v0.h[3]
@@ -201,8 +194,7 @@
ext v22.16b, v4.16b, v5.16b, #8
ext v23.16b, v4.16b, v5.16b, #10
ext v24.16b, v4.16b, v5.16b, #12
- ushll_sz v16, v17, v21, #7, \wd
- smlal v16.4s, v4.4h, v0.h[0]
+ smull v16.4s, v4.4h, v0.h[0]
smlal v16.4s, v19.4h, v0.h[1]
smlal v16.4s, v20.4h, v0.h[2]
smlal v16.4s, v21.4h, v0.h[3]
@@ -210,7 +202,7 @@
smlal v16.4s, v23.4h, v0.h[5]
smlal v16.4s, v24.4h, v0.h[6]
.ifc \wd, .8h
- smlal2 v17.4s, v4.8h, v0.h[0]
+ smull2 v17.4s, v4.8h, v0.h[0]
smlal2 v17.4s, v19.8h, v0.h[1]
smlal2 v17.4s, v20.8h, v0.h[2]
smlal2 v17.4s, v21.8h, v0.h[3]
@@ -329,14 +321,10 @@
add v16.4s, v16.4s, v17.4s
addv s6, v6.4s
addv s7, v16.4s
- dup v16.4h, v2.h[3]
- ins v16.h[1], v4.h[3]
ins v6.s[1], v7.s[0]
mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
- ushll v16.4s, v16.4h, #7
- add v6.4s, v6.4s, v30.4s
- add v6.4s, v6.4s, v16.4s
- srshl v6.4s, v6.4s, v29.4s
+ add v6.2s, v6.2s, v30.2s
+ srshl v6.2s, v6.2s, v29.2s
sqxtun v6.4h, v6.4s
umin v6.4h, v6.4h, v24.4h
sub v6.4h, v6.4h, v31.4h
@@ -371,9 +359,7 @@
ld1 {v0.8h}, [x5]
dup v31.8h, w8
clz w8, w8
- movi v1.8h, #128
sub w8, w8, #11 // round_bits_v
- add v1.8h, v1.8h, v0.8h
dup v30.4s, w8
mov w8, w4
neg v30.4s, v30.4s // -round_bits_v
@@ -421,7 +407,7 @@
smull v2.4s, v16.4h, v0.h[0]
smlal v2.4s, v17.4h, v0.h[1]
smlal v2.4s, v18.4h, v0.h[2]
- smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v19.4h, v0.h[3]
smlal v2.4s, v20.4h, v0.h[4]
smlal v2.4s, v21.4h, v0.h[5]
smlal v2.4s, v22.4h, v0.h[6]
@@ -428,7 +414,7 @@
smull2 v3.4s, v16.8h, v0.h[0]
smlal2 v3.4s, v17.8h, v0.h[1]
smlal2 v3.4s, v18.8h, v0.h[2]
- smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v0.h[3]
smlal2 v3.4s, v20.8h, v0.h[4]
smlal2 v3.4s, v21.8h, v0.h[5]
smlal2 v3.4s, v22.8h, v0.h[6]
@@ -770,16 +756,9 @@
ext v16.16b, v18.16b, v16.16b, #12
2:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
-
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
- // If we'll need to pad the right edge, load that byte to pad with
+ // If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w5, #(2 + 16 - 2 + 1)
ldr h30, [x3, w13, sxtw #1]
@@ -796,41 +775,33 @@
b 6f
4: // Loop horizontally
-.macro ext_n dst1, dst2, src1, src2, src3, n, w
- ext \dst1, \src1, \src2, \n
+.macro add3 w, wd
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
+
+ add v6\wd, v0\wd, v26\wd
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7\wd, v16\wd, v28\wd
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6\wd, v6\wd, v27\wd
.if \w > 4
- ext \dst2, \src2, \src3, \n
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
.endif
-.endm
-.macro add_n dst1, dst2, src1, src2, src3, src4, w
- add \dst1, \src1, \src3
+ add v7\wd, v7\wd, v29\wd
.if \w > 4
- add \dst2, \src2, \src4
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
.endif
.endm
-
-.macro add3 w, wd
- ext v24.16b, v0.16b, v1.16b, #2
- ext v25.16b, v0.16b, v1.16b, #4
- ext v26.16b, v16.16b, v17.16b, #2
- ext v27.16b, v16.16b, v17.16b, #4
- add v6\wd, v0\wd, v24\wd
- add v7\wd, v16\wd, v26\wd
- add v6\wd, v6\wd, v25\wd
- add v7\wd, v7\wd, v27\wd
-
- ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w
- ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w
-
- add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w
- add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w
-
- ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
- ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
-
- add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w
- add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w
-.endm
add3 8, .8h
st1 {v6.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
@@ -844,12 +815,6 @@
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
- mov v2.16b, v4.16b
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- mov v18.16b, v20.16b
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
@@ -907,11 +872,6 @@
.hword L(box3_variable_shift_tbl) - 55b
88:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
-
add3 4, .4h
subs w5, w5, #4
st1 {v6.4h}, [x1], #8
@@ -921,10 +881,6 @@
b.le 9f
ext v0.16b, v0.16b, v0.16b, #8
ext v16.16b, v16.16b, v16.16b, #8
- mov v2.16b, v3.16b
- mov v3.16b, v4.16b
- mov v18.16b, v19.16b
- mov v19.16b, v20.16b
// Only one needed pixel left, but do a normal 4 pixel
// addition anyway
add3 4, .4h
@@ -1026,7 +982,7 @@
// and shift v0/v1 to have 3x the first pixel at the front.
dup v2.8h, v0.h[0]
dup v18.8h, v16.h[0]
- // Move x3 back to account for the last 6 bytes we loaded before,
+ // Move x3 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x3, x3, #6
sub x12, x12, #6
@@ -1036,16 +992,9 @@
ext v16.16b, v18.16b, v16.16b, #10
2:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
-
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
- // If we'll need to pad the right edge, load that byte to pad with
+ // If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w5, #(2 + 16 - 3 + 1)
ldr h30, [x3, w13, sxtw #1]
@@ -1063,43 +1012,53 @@
4: // Loop horizontally
.macro add5 w, wd
- ext v24.16b, v0.16b, v1.16b, #2
- ext v25.16b, v0.16b, v1.16b, #4
- ext v26.16b, v0.16b, v1.16b, #6
- ext v27.16b, v0.16b, v1.16b, #8
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
- add v6\wd, v0\wd, v24\wd
- add v25\wd, v25\wd, v26\wd
+ add v6\wd, v0\wd, v26\wd
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7\wd, v16\wd, v28\wd
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
add v6\wd, v6\wd, v27\wd
+.if \w > 4
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+.endif
+ add v7\wd, v7\wd, v29\wd
+.if \w > 4
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+.endif
- ext v26.16b, v16.16b, v17.16b, #2
- ext v27.16b, v16.16b, v17.16b, #4
+ ext v26.16b, v0.16b, v1.16b, #6
ext v28.16b, v16.16b, v17.16b, #6
+ ext v27.16b, v0.16b, v1.16b, #8
ext v29.16b, v16.16b, v17.16b, #8
- add v7\wd, v16\wd, v26\wd
- add v27\wd, v27\wd, v28\wd
+ add v6\wd, v6\wd, v26\wd
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7\wd, v7\wd, v28\wd
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6\wd, v6\wd, v27\wd
+.if \w > 4
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+.endif
add v7\wd, v7\wd, v29\wd
- add v6\wd, v6\wd, v25\wd
- add v7\wd, v7\wd, v27\wd
-
- ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w
- ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w
- ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w
-
- add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w
- add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w
- add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w
- add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w
-
- ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
- ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
- ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w
-
- add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w
- add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w
- add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w
- add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w
+.if \w > 4
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+.endif
.endm
add5 8, .8h
st1 {v6.8h}, [x1], #16
@@ -1114,12 +1073,6 @@
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
- mov v2.16b, v4.16b
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- mov v18.16b, v20.16b
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
@@ -1193,13 +1146,6 @@
.hword L(box5_variable_shift_tbl) - 77b
88:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
-
add5 4, .4h
subs w5, w5, #4
st1 {v6.4h}, [x1], #8
@@ -1209,10 +1155,6 @@
b.le 9f
ext v0.16b, v0.16b, v1.16b, #8
ext v16.16b, v16.16b, v17.16b, #8
- mov v2.16b, v3.16b
- mov v3.16b, v4.16b
- mov v18.16b, v19.16b
- mov v19.16b, v20.16b
add5 4, .4h
st1 {v6.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
--- a/src/arm/64/looprestoration_tmpl.S
+++ b/src/arm/64/looprestoration_tmpl.S
@@ -454,7 +454,7 @@
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
-// const int16_t wt[2]);
+// const int16_t wt[2], const int bitdepth_max);
function sgr_weighted2_\bpc\()bpc_neon, export=1
.if \bpc == 8
ldr x8, [sp]
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1906,11 +1906,10 @@
bl L(\type\()_8tap_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
- mov v19.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
- smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v28.4h, v1.h[3]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -1919,7 +1918,7 @@
st1 {v2.h}[1], [\ds2], \d_strd
b.le 0f
mov v16.8b, v18.8b
- mov v17.8b, v19.8b
+ mov v17.8b, v28.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -1956,7 +1955,6 @@
28:
bl L(\type\()_8tap_filter_2)
ext v22.8b, v21.8b, v28.8b, #4
- mov v23.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -1964,7 +1962,7 @@
smlal v2.4s, v20.4h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
- smlal v2.4s, v23.4h, v1.h[7]
+ smlal v2.4s, v28.4h, v1.h[7]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -1977,7 +1975,7 @@
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
- mov v21.8b, v23.8b
+ mov v21.8b, v28.8b
b 28b
0:
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -1004,11 +1004,11 @@
b.gt 2b
ret
4:
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
+ ld1 {v0.4h}, [x2], x3
+ ld1 {v1.4h}, [x2], x3
subs w5, w5, #2
- st1 {v0.8b}, [x0], x1
- st1 {v1.8b}, [x0], x1
+ st1 {v0.4h}, [x0], x1
+ st1 {v1.4h}, [x0], x1
b.gt 4b
ret
80:
@@ -1017,11 +1017,11 @@
add x9, x2, x3
lsl x3, x3, #1
8:
- ld1 {v0.16b}, [x2], x3
- ld1 {v1.16b}, [x9], x3
+ ld1 {v0.8h}, [x2], x3
+ ld1 {v1.8h}, [x9], x3
subs w5, w5, #2
- st1 {v0.16b}, [x0], x1
- st1 {v1.16b}, [x8], x1
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
b.gt 8b
ret
16:
@@ -2039,7 +2039,6 @@
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
- sxtl v1.4s, v1.4h
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
@@ -2049,19 +2048,23 @@
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
bl L(\type\()_8tap_filter_2)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
- trn1 v16.2d, v16.2d, v24.2d
- mov v17.16b, v24.16b
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
2:
bl L(\type\()_8tap_filter_2)
- ext v18.16b, v17.16b, v24.16b, #8
- mov v19.16b, v24.16b
- mul v2.4s, v16.4s, v1.s[0]
- mla v2.4s, v17.4s, v1.s[1]
- mla v2.4s, v18.4s, v1.s[2]
- mla v2.4s, v19.4s, v1.s[3]
+ ext v18.8b, v17.8b, v24.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
@@ -2070,8 +2073,8 @@
st1 {v2.s}[0], [\dst], \d_strd
st1 {v2.s}[1], [\ds2], \d_strd
b.le 0f
- mov v16.16b, v18.16b
- mov v17.16b, v19.16b
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -2085,8 +2088,6 @@
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
- sxtl2 v2.4s, v1.8h
- sxtl v1.4s, v1.4h
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
@@ -2095,29 +2096,33 @@
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
bl L(\type\()_8tap_filter_2)
- trn1 v16.2d, v16.2d, v24.2d
- mov v17.16b, v24.16b
+ xtn v16.4h, v16.4s
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
bl L(\type\()_8tap_filter_2)
- ext v18.16b, v17.16b, v24.16b, #8
- mov v19.16b, v24.16b
+ ext v18.8b, v17.8b, v24.8b, #4
+ mov v19.8b, v24.8b
bl L(\type\()_8tap_filter_2)
- ext v20.16b, v19.16b, v24.16b, #8
- mov v21.16b, v24.16b
+ ext v20.8b, v19.8b, v24.8b, #4
+ mov v21.8b, v24.8b
28:
bl L(\type\()_8tap_filter_2)
- ext v22.16b, v21.16b, v24.16b, #8
- mov v23.16b, v24.16b
- mul v3.4s, v16.4s, v1.s[0]
- mla v3.4s, v17.4s, v1.s[1]
- mla v3.4s, v18.4s, v1.s[2]
- mla v3.4s, v19.4s, v1.s[3]
- mla v3.4s, v20.4s, v2.s[0]
- mla v3.4s, v21.4s, v2.s[1]
- mla v3.4s, v22.4s, v2.s[2]
- mla v3.4s, v23.4s, v2.s[3]
+ ext v22.8b, v21.8b, v24.8b, #4
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2126,12 +2131,12 @@
st1 {v3.s}[0], [\dst], \d_strd
st1 {v3.s}[1], [\ds2], \d_strd
b.le 0f
- mov v16.16b, v18.16b
- mov v17.16b, v19.16b
- mov v18.16b, v20.16b
- mov v19.16b, v21.16b
- mov v20.16b, v22.16b
- mov v21.16b, v23.16b
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
b 28b
0:
@@ -2151,6 +2156,7 @@
smlal v24.4s, v27.4h, v0.h[2]
smlal v24.4s, v28.4h, v0.h[3]
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s
ret
.endif
--- a/src/arm/cdef_init_tmpl.c
+++ b/src/arm/cdef_init_tmpl.c
@@ -27,7 +27,6 @@
#include "src/cpu.h"
#include "src/cdef.h"
-#if BITDEPTH == 8 || ARCH_AARCH64
decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
@@ -72,7 +71,6 @@
DEFINE_FILTER(8, 8, 16)
DEFINE_FILTER(4, 8, 8)
DEFINE_FILTER(4, 4, 8)
-#endif
COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
@@ -80,10 +78,8 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 || ARCH_AARCH64
c->dir = BF(dav1d_cdef_find_dir, neon);
c->fb[0] = cdef_filter_8x8_neon;
c->fb[1] = cdef_filter_4x8_neon;
c->fb[2] = cdef_filter_4x4_neon;
-#endif
}
--- a/src/arm/loopfilter_init_tmpl.c
+++ b/src/arm/loopfilter_init_tmpl.c
@@ -38,10 +38,8 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 || ARCH_AARCH64
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
-#endif
}
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -29,7 +29,6 @@
#include "src/looprestoration.h"
#include "src/tables.h"
-#if BITDEPTH == 8 || ARCH_AARCH64
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
// int16_t sum = 0;
@@ -46,12 +45,11 @@
// 1 << (bitdepth + 6 - round_bits_h).
void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
- const int16_t fh[7], const intptr_t w,
+ const int16_t fh[8], intptr_t w,
int h, enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
-// fv[3] += 128;
// int32_t sum = 0;
// for (int i = 0; i < 7; i++)
// sum += mid[idx] * fv[i];
@@ -59,7 +57,7 @@
// This function assumes that the width is a multiple of 8.
void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
const int16_t *mid, int w, int h,
- const int16_t fv[7], enum LrEdgeFlags edges,
+ const int16_t fv[8], enum LrEdgeFlags edges,
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
const pixel *src, int w, int h);
@@ -67,9 +65,9 @@
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
- const int w, const int h, const int16_t fh[7],
- const int16_t fv[7], const enum LrEdgeFlags edges
- HIGHBD_DECL_SUFFIX)
+ const int w, const int h,
+ const int16_t filter[2][8],
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, mid, 68 * 384,);
int mid_stride = (w + 7) & ~7;
@@ -76,20 +74,21 @@
// Horizontal filter
BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
- fh, w, h, edges HIGHBD_TAIL_SUFFIX);
+ filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
if (edges & LR_HAVE_TOP)
BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
- fh, w, 2, edges HIGHBD_TAIL_SUFFIX);
+ filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
if (edges & LR_HAVE_BOTTOM)
BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
lpf + 6 * PXSTRIDE(lpf_stride),
- lpf_stride, fh, w, 2, edges
+ lpf_stride, filter[0], w, 2, edges
HIGHBD_TAIL_SUFFIX);
// Vertical filter
if (w >= 8)
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
- w & ~7, h, fv, edges,
+ w & ~7, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
@@ -98,7 +97,7 @@
ALIGN_STK_16(pixel, tmp, 64 * 8,);
BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
&mid[2*mid_stride + (w & ~7)],
- w & 7, h, fv, edges,
+ w & 7, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
@@ -283,7 +282,6 @@
}
}
}
-#endif // BITDEPTH == 8
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -290,9 +288,7 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 || ARCH_AARCH64
- c->wiener = wiener_filter_neon;
+ c->wiener[0] = c->wiener[1] = wiener_filter_neon;
if (bpc <= 10)
c->selfguided = sgr_filter_neon;
-#endif
}
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -77,7 +77,6 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 || ARCH_AARCH64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
@@ -99,7 +98,6 @@
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
-#endif
c->avg = BF(dav1d_avg, neon);
c->w_avg = BF(dav1d_w_avg, neon);
@@ -111,8 +109,10 @@
c->w_mask[0] = BF(dav1d_w_mask_444, neon);
c->w_mask[1] = BF(dav1d_w_mask_422, neon);
c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+#endif
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
c->emu_edge = BF(dav1d_emu_edge, neon);
#endif
}
--- a/src/cdf.c
+++ b/src/cdf.c
@@ -29,10 +29,7 @@
#include <string.h>
-#include "src/thread.h"
-#include "common/intops.h"
-
-#include "src/cdf.h"
+#include "src/internal.h"
#include "src/tables.h"
#define CDF1(x) (32768-(x))
@@ -4098,11 +4095,11 @@
}
}
-int dav1d_cdf_thread_alloc(CdfThreadContext *const cdf,
+int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf,
struct thread_data *const t)
{
- cdf->ref = dav1d_ref_create(sizeof(CdfContext) +
- (t != NULL) * sizeof(atomic_uint));
+ cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool,
+ sizeof(CdfContext) + sizeof(atomic_uint));
if (!cdf->ref) return DAV1D_ERR(ENOMEM);
cdf->data.cdf = cdf->ref->data;
if (t) {
--- a/src/cdf.h
+++ b/src/cdf.h
@@ -140,7 +140,8 @@
} CdfThreadContext;
void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
-int dav1d_cdf_thread_alloc(CdfThreadContext *cdf, struct thread_data *t);
+int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
+ struct thread_data *t);
void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
--- a/src/data.c
+++ b/src/data.c
@@ -43,6 +43,7 @@
uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
validate_input_or_ret(buf != NULL, NULL);
+ if (sz > SIZE_MAX / 2) return NULL;
buf->ref = dav1d_ref_create(sz);
if (!buf->ref) return NULL;
buf->data = buf->ref->const_data;
--- a/src/decode.c
+++ b/src/decode.c
@@ -38,7 +38,6 @@
#include "dav1d/data.h"
#include "common/intops.h"
-#include "common/mem.h"
#include "src/ctx.h"
#include "src/decode.h"
@@ -2681,7 +2680,7 @@
sizeof(*f->tile_thread.titsati_index_rows) *
(f->frame_hdr->tiling.rows + 1)))
{
- for (int tile_row = 0, tile_idx = 0;
+ for (int tile_row = 0, task_idx = 0;
tile_row < f->frame_hdr->tiling.rows; tile_row++)
{
for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
@@ -2688,10 +2687,10 @@
sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
{
for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
- tile_col++, tile_idx++)
+ tile_col++, task_idx++)
{
- f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][0] = sby;
- f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][1] =
+ f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0] = sby;
+ f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1] =
tile_row * f->frame_hdr->tiling.cols + tile_col;
}
}
@@ -3105,7 +3104,7 @@
4 * (t->by + f->sb_step),
PLANE_TYPE_BLOCK))
{
- return 1;
+ goto error;
}
dav1d_refmvs_load_tmvs(&f->rf, tile_row,
0, f->bw >> 1, t->by >> 1, by_end);
@@ -3401,7 +3400,7 @@
dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
}
if (f->frame_hdr->refresh_context) {
- res = dav1d_cdf_thread_alloc(&f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
+ res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
if (res < 0) goto error;
}
@@ -3466,8 +3465,8 @@
// ref_mvs
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
- f->mvs_ref = dav1d_ref_create(f->sb128h * 16 * (f->b4_stride >> 1) *
- sizeof(*f->mvs));
+ f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
+ sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
if (!f->mvs_ref) {
res = DAV1D_ERR(ENOMEM);
goto error;
@@ -3530,7 +3529,8 @@
// We're updating an existing map, but need somewhere to
// put the new values. Allocate them here (the data
// actually gets set elsewhere)
- f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
+ f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool,
+ sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h);
if (!f->cur_segmap_ref) {
dav1d_ref_dec(&f->prev_segmap_ref);
res = DAV1D_ERR(ENOMEM);
@@ -3545,13 +3545,14 @@
f->cur_segmap = f->prev_segmap_ref->data;
} else {
// We need to make a new map. Allocate one here and zero it out.
- f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
+ const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h;
+ f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, segmap_size);
if (!f->cur_segmap_ref) {
res = DAV1D_ERR(ENOMEM);
goto error;
}
f->cur_segmap = f->cur_segmap_ref->data;
- memset(f->cur_segmap_ref->data, 0, f->b4_stride * 32 * f->sb128h);
+ memset(f->cur_segmap, 0, segmap_size);
}
} else {
f->cur_segmap = NULL;
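For reference, the renamed task_idx above enumerates (tile row, superblock row, tile column) triples in that nesting order. A minimal standalone sketch of the same enumeration, with hypothetical tile sizes:

    #include <stdio.h>

    int main(void) {
        const int rows = 2, cols = 3;                    /* hypothetical tiling */
        const int row_start_sb[] = { 0, 4, 8 };          /* superblock row boundaries */
        int task_idx = 0;
        for (int tile_row = 0; tile_row < rows; tile_row++)
            for (int sby = row_start_sb[tile_row]; sby < row_start_sb[tile_row + 1]; sby++)
                for (int tile_col = 0; tile_col < cols; tile_col++, task_idx++)
                    /* what task_idx_to_sby_and_tile_idx[task_idx][0] and [1] record */
                    printf("task %2d -> sby %d, tile %d\n",
                           task_idx, sby, tile_row * cols + tile_col);
        return 0;
    }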
--- a/src/internal.h
+++ b/src/internal.h
@@ -82,8 +82,10 @@
int n_tile_data_alloc;
int n_tile_data;
int n_tiles;
+ Dav1dMemPool *seq_hdr_pool;
Dav1dRef *seq_hdr_ref;
Dav1dSequenceHeader *seq_hdr;
+ Dav1dMemPool *frame_hdr_pool;
Dav1dRef *frame_hdr_ref;
Dav1dFrameHeader *frame_hdr;
@@ -107,6 +109,8 @@
} frame_thread;
// reference/entropy state
+ Dav1dMemPool *segmap_pool;
+ Dav1dMemPool *refmvs_pool;
struct {
Dav1dThreadPicture p;
Dav1dRef *segmap;
@@ -113,6 +117,7 @@
Dav1dRef *refmvs;
unsigned refpoc[7];
} refs[8];
+ Dav1dMemPool *cdf_pool;
CdfThreadContext cdf[8];
Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
@@ -135,6 +140,8 @@
int drain;
Dav1dLogger logger;
+
+ Dav1dMemPool *picture_pool;
};
struct Dav1dFrameContext {
--- a/src/lf_mask.h
+++ b/src/lf_mask.h
@@ -41,8 +41,8 @@
typedef struct Av1RestorationUnit {
enum Dav1dRestorationType type;
- int16_t filter_h[3];
- int16_t filter_v[3];
+ int8_t filter_h[3];
+ int8_t filter_v[3];
uint8_t sgr_idx;
int16_t sgr_weights[2];
} Av1RestorationUnit;
--- a/src/lib.c
+++ b/src/lib.c
@@ -35,7 +35,9 @@
#include <dlfcn.h>
#endif
-#include "common/mem.h"
+#include "dav1d/dav1d.h"
+#include "dav1d/data.h"
+
#include "common/validate.h"
#include "src/cpu.h"
@@ -126,6 +128,19 @@
c->all_layers = s->all_layers;
c->frame_size_limit = s->frame_size_limit;
+ if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
+ dav1d_mem_pool_init(&c->frame_hdr_pool) ||
+ dav1d_mem_pool_init(&c->segmap_pool) ||
+ dav1d_mem_pool_init(&c->refmvs_pool) ||
+ dav1d_mem_pool_init(&c->cdf_pool))
+ {
+ goto error;
+ }
+ if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc) {
+ if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
+ c->allocator.cookie = c->picture_pool;
+ }
+
/* On 32-bit systems extremely large frame sizes can cause overflows in
* dav1d_decode_frame() malloc size calculations. Prevent that from occurring
* by enforcing a maximum frame size limit, chosen to roughly correspond to
@@ -568,6 +583,13 @@
dav1d_ref_dec(&c->mastering_display_ref);
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
+
+ dav1d_mem_pool_end(c->seq_hdr_pool);
+ dav1d_mem_pool_end(c->frame_hdr_pool);
+ dav1d_mem_pool_end(c->segmap_pool);
+ dav1d_mem_pool_end(c->refmvs_pool);
+ dav1d_mem_pool_end(c->cdf_pool);
+ dav1d_mem_pool_end(c->picture_pool);
dav1d_freep_aligned(c_out);
}
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -54,9 +54,8 @@
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
- int w, int h, const int16_t filterh[7], \
- const int16_t filterv[7], enum LrEdgeFlags edges \
- HIGHBD_DECL_SUFFIX)
+ int w, int h, const int16_t filter[2][8], \
+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_wiener_filter_fn(*wienerfilter_fn);
#define decl_selfguided_filter_fn(name) \
@@ -68,7 +67,7 @@
typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef struct Dav1dLoopRestorationDSPContext {
- wienerfilter_fn wiener;
+ wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
selfguided_fn selfguided;
} Dav1dLoopRestorationDSPContext;
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -135,7 +135,7 @@
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
- const int16_t filterh[7], const int16_t filterv[7],
+ const int16_t filter[2][8],
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
@@ -156,10 +156,13 @@
const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
for (int j = 0; j < h + 6; j++) {
for (int i = 0; i < w; i++) {
- int sum = (tmp_ptr[i + 3] << 7) + (1 << (bitdepth + 6));
+ int sum = (1 << (bitdepth + 6));
+#if BITDEPTH == 8
+ sum += tmp_ptr[i + 3] * 128;
+#endif
for (int k = 0; k < 7; k++) {
- sum += tmp_ptr[i + k] * filterh[k];
+ sum += tmp_ptr[i + k] * filter[0][k];
}
hor_ptr[i] =
@@ -174,10 +177,10 @@
const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
- int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
+ int sum = -round_offset;
for (int k = 0; k < 7; k++) {
- sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
+ sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];
}
p[j * PXSTRIDE(p_stride) + i] =
@@ -208,16 +211,19 @@
// i: Pixel summed and stored (between loops)
// c: Pixel summed not stored
// x: Pixel not summed not stored
-static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
+static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src,
+ const int w, const int h)
+{
// We skip the first row, as it is never used
src += REST_UNIT_STRIDE;
- dst += REST_UNIT_STRIDE;
// We skip the first and last columns, as they are never used
for (int x = 1; x < w - 1; x++) {
- coef *ds = dst + x;
+ coef *sum_v = sum + x;
+ int32_t *sumsq_v = sumsq + x;
const pixel *s = src + x;
- int a = s[0], b = s[REST_UNIT_STRIDE];
+ int a = s[0], a2 = a * a;
+ int b = s[REST_UNIT_STRIDE], b2 = b * b;
// We skip the first 2 rows, as they are skipped in the next loop and
// we don't need the last 2 rows as they are skipped in the next loop
@@ -224,28 +230,39 @@
for (int y = 2; y < h - 2; y++) {
s += REST_UNIT_STRIDE;
const int c = s[REST_UNIT_STRIDE];
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c;
+ const int c2 = c * c;
+ sum_v += REST_UNIT_STRIDE;
+ sumsq_v += REST_UNIT_STRIDE;
+ *sum_v = a + b + c;
+ *sumsq_v = a2 + b2 + c2;
a = b;
+ a2 = b2;
b = c;
+ b2 = c2;
}
}
- // We skip the first 2 rows as they are never read
- dst += REST_UNIT_STRIDE;
+ // We skip the first row as it is never read
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
// We skip the last 2 rows as they are never read
for (int y = 2; y < h - 2; y++) {
- int a = dst[1], b = dst[2];
+ int a = sum[1], a2 = sumsq[1];
+ int b = sum[2], b2 = sumsq[2];
// We don't store the first column as it is never read and
// we don't store the last 2 columns as they are never read
for (int x = 2; x < w - 2; x++) {
- const int c = dst[x + 1];
- dst[x] = a + b + c;
+ const int c = sum[x + 1], c2 = sumsq[x + 1];
+ sum[x] = a + b + c;
+ sumsq[x] = a2 + b2 + c2;
a = b;
+ a2 = b2;
b = c;
+ b2 = c2;
}
- dst += REST_UNIT_STRIDE;
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
}
}
@@ -271,142 +288,63 @@
// i: Pixel summed and stored (between loops)
// c: Pixel summed not stored
// x: Pixel not summed not stored
-static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
- // We skip the first row, as it is never used
- dst += REST_UNIT_STRIDE;
-
+static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
+ const int w, const int h)
+{
for (int x = 0; x < w; x++) {
- coef *ds = dst + x;
+ coef *sum_v = sum + x;
+ int32_t *sumsq_v = sumsq + x;
const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
- int a = s[-3 * REST_UNIT_STRIDE];
- int b = s[-2 * REST_UNIT_STRIDE];
- int c = s[-1 * REST_UNIT_STRIDE];
- int d = s[0];
+ int a = s[-3 * REST_UNIT_STRIDE], a2 = a * a;
+ int b = s[-2 * REST_UNIT_STRIDE], b2 = b * b;
+ int c = s[-1 * REST_UNIT_STRIDE], c2 = c * c;
+ int d = s[0], d2 = d * d;
// We skip the first 2 rows, as they are skipped in the next loop and
// we don't need the last 2 rows as they are skipped in the next loop
for (int y = 2; y < h - 2; y++) {
s += REST_UNIT_STRIDE;
- const int e = *s;
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c + d + e;
+ const int e = *s, e2 = e * e;
+ sum_v += REST_UNIT_STRIDE;
+ sumsq_v += REST_UNIT_STRIDE;
+ *sum_v = a + b + c + d + e;
+ *sumsq_v = a2 + b2 + c2 + d2 + e2;
a = b;
b = c;
c = d;
d = e;
+ a2 = b2;
+ b2 = c2;
+ c2 = d2;
+ d2 = e2;
}
}
- // We skip the first 2 rows as they are never read
- dst += REST_UNIT_STRIDE;
- for (int y = 2; y < h - 2; y++) {
- int a = dst[0];
- int b = dst[1];
- int c = dst[2];
- int d = dst[3];
-
- for (int x = 2; x < w - 2; x++) {
- const int e = dst[x + 2];
- dst[x] = a + b + c + d + e;
- a = b;
- b = c;
- c = d;
- d = e;
- }
- dst += REST_UNIT_STRIDE;
- }
-}
-
-// See boxsum3 function comments for details on row and column skipping
-static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
- // We skip the first row, as it is never used
- src += REST_UNIT_STRIDE;
- dst += REST_UNIT_STRIDE;
-
- // We skip the first and last columns, as they are never used
- for (int x = 1; x < w - 1; x++) {
- int32_t *ds = dst + x;
- const pixel *s = src + x;
- int a = s[0] * s[0];
- int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-
- // We skip the first row, as it is skipped in the next loop and
- // we don't need the last row as it is skipped in the next loop
- for (int y = 2; y < h - 2; y++) {
- s += REST_UNIT_STRIDE;
- const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c;
- a = b;
- b = c;
- }
- }
-
// We skip the first row as it is never read
- dst += REST_UNIT_STRIDE;
- // We skip the last row as it is never read
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
for (int y = 2; y < h - 2; y++) {
- int a = dst[1], b = dst[2];
+ int a = sum[0], a2 = sumsq[0];
+ int b = sum[1], b2 = sumsq[1];
+ int c = sum[2], c2 = sumsq[2];
+ int d = sum[3], d2 = sumsq[3];
- // We don't store the first column as it is never read and
- // we don't store the last 2 columns as they are never read
for (int x = 2; x < w - 2; x++) {
- const int c = dst[x + 1];
- dst[x] = a + b + c;
+ const int e = sum[x + 2], e2 = sumsq[x + 2];
+ sum[x] = a + b + c + d + e;
+ sumsq[x] = a2 + b2 + c2 + d2 + e2;
a = b;
b = c;
- }
- dst += REST_UNIT_STRIDE;
- }
-}
-
-// See boxsum5 function comments for details on row and column skipping
-static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
- const int h)
-{
- // We skip the first row, as it is never used
- dst += REST_UNIT_STRIDE;
-
- for (int x = 0; x < w; x++) {
- int32_t *ds = dst + x;
- const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
- int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
- int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
- int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
- int d = s[0] * s[0];
-
- // We skip the first 2 rows, as they are skipped in the next loop and
- // we don't need the last 2 row as it is skipped in the next loop
- for (int y = 2; y < h - 2; y++) {
- s += REST_UNIT_STRIDE;
- const int e = s[0] * s[0];
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c + d + e;
- a = b;
- b = c;
c = d;
d = e;
+ a2 = b2;
+ b2 = c2;
+ c2 = d2;
+ d2 = e2;
}
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
}
-
- // We skip the first 2 rows as they are never read
- dst += REST_UNIT_STRIDE;
- for (int y = 2; y < h - 2; y++) {
- int a = dst[0];
- int b = dst[1];
- int c = dst[2];
- int d = dst[3];
-
- for (int x = 2; x < w - 2; x++) {
- const int e = dst[x + 2];
- dst[x] = a + b + c + d + e;
- a = b;
- b = c;
- c = d;
- d = e;
- }
- dst += REST_UNIT_STRIDE;
- }
}
static void selfguided_filter(coef *dst, const pixel *src,
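A scalar model of the fused box sums above: a single sliding-window pass now produces both the 3-tap sum and the 3-tap sum of squares, instead of the separate boxsum3/boxsum3sqr passes that were removed. A minimal sketch over hypothetical 1-D data:

    #include <assert.h>

    int main(void) {
        const int src[6] = { 3, 1, 4, 1, 5, 9 };         /* hypothetical column of pixels */
        int a = src[0], a2 = a * a;
        int b = src[1], b2 = b * b;
        for (int y = 2; y < 6; y++) {
            const int c = src[y], c2 = c * c;
            const int sum = a + b + c;                   /* 3-tap box sum */
            const int sumsq = a2 + b2 + c2;              /* 3-tap sum of squares */
            assert(sum == src[y - 2] + src[y - 1] + src[y]);
            assert(sumsq == src[y - 2] * src[y - 2] +
                            src[y - 1] * src[y - 1] +
                            src[y] * src[y]);
            a = b; a2 = b2;                              /* slide the window down */
            b = c; b2 = c2;
        }
        return 0;
    }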
@@ -418,21 +356,18 @@
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
- int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
- int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
+ int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+ int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3;
// By inverting A and B after the boxsums, B can be of size coef instead
// of int32_t
- coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
- coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
+ coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+ coef *B = sum + 2 * REST_UNIT_STRIDE + 3;
const int step = (n == 25) + 1;
- if (n == 25) {
- boxsum5(B_, src, w + 6, h + 6);
- boxsum5sqr(A_, src, w + 6, h + 6);
- } else {
- boxsum3(B_, src, w + 6, h + 6);
- boxsum3sqr(A_, src, w + 6, h + 6);
- }
+ if (n == 25)
+ boxsum5(sumsq, sum, src, w + 6, h + 6);
+ else
+ boxsum3(sumsq, sum, src, w + 6, h + 6);
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
int32_t *AA = A - REST_UNIT_STRIDE;
@@ -574,7 +509,7 @@
}
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
- c->wiener = wiener_c;
+ c->wiener[0] = c->wiener[1] = wiener_c;
c->selfguided = selfguided_c;
#if HAVE_ASM
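The changed rounding in wiener_c is easier to follow with numbers: the reference formulation gives the centre pixel an implicit weight of 128 via the << 7 term, while the rewritten 8 bpc path adds that term explicitly (for higher bitdepths the 128 is folded into filter[0][3] in lr_apply_tmpl.c below). A quick equivalence check with hypothetical values:

    #include <assert.h>

    int main(void) {
        const int taps[7] = { 1, -5, 17, -26, 17, -5, 1 };  /* hypothetical Wiener taps */
        const int px[7]   = { 10, 20, 30, 40, 50, 60, 70 }; /* hypothetical pixels */
        const int bitdepth = 8;

        /* Reference formulation: implicit 128 weight on the centre pixel. */
        int ref = (px[3] << 7) + (1 << (bitdepth + 6));
        for (int k = 0; k < 7; k++) ref += px[k] * taps[k];

        /* Rewritten 8 bpc formulation: the 128 is added as a separate term. */
        int new8 = (1 << (bitdepth + 6)) + px[3] * 128;
        for (int k = 0; k < 7; k++) new8 += px[k] * taps[k];

        assert(ref == new8);
        return 0;
    }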
--- a/src/lr_apply_tmpl.c
+++ b/src/lr_apply_tmpl.c
@@ -164,28 +164,36 @@
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
- // FIXME [8] might be easier for SIMD
- int16_t filterh[7], filterv[7];
+ ALIGN_STK_16(int16_t, filter, 2, [8]);
+ wienerfilter_fn wiener_fn = NULL;
if (lr->type == DAV1D_RESTORATION_WIENER) {
- filterh[0] = filterh[6] = lr->filter_h[0];
- filterh[1] = filterh[5] = lr->filter_h[1];
- filterh[2] = filterh[4] = lr->filter_h[2];
- filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
+ filter[0][0] = filter[0][6] = lr->filter_h[0];
+ filter[0][1] = filter[0][5] = lr->filter_h[1];
+ filter[0][2] = filter[0][4] = lr->filter_h[2];
+ filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
+#if BITDEPTH != 8
+ /* For 8-bit SIMD it's beneficial to handle the +128 separately
+ * in order to avoid overflows. */
+ filter[0][3] += 128;
+#endif
- filterv[0] = filterv[6] = lr->filter_v[0];
- filterv[1] = filterv[5] = lr->filter_v[1];
- filterv[2] = filterv[4] = lr->filter_v[2];
- filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
+ filter[1][0] = filter[1][6] = lr->filter_v[0];
+ filter[1][1] = filter[1][5] = lr->filter_v[1];
+ filter[1][2] = filter[1][4] = lr->filter_v[2];
+ filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
+
+ wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
+ } else {
+ assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
}
while (y + stripe_h <= row_h) {
// Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
- if (lr->type == DAV1D_RESTORATION_WIENER) {
- dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
- filterh, filterv, edges HIGHBD_CALL_SUFFIX);
+ if (wiener_fn) {
+ wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+ filter, edges HIGHBD_CALL_SUFFIX);
} else {
- assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
}
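To recap the packed representation used above: the three stored half-taps expand into a symmetric 7-tap kernel padded to 8 entries (the [8] arrays are friendlier for SIMD loads), and the 5-tap wiener[1] variant is chosen whenever both outermost taps are zero. A minimal sketch of that expansion and selection, mirroring the code above (the helper name is hypothetical):

    #include <stdint.h>

    /* Returns the dsp->lr.wiener[] index: 1 selects the 5-tap kernel. */
    static int wiener_variant(const int8_t filter_h[3], const int8_t filter_v[3],
                              int16_t f[2][8])
    {
        f[0][0] = f[0][6] = filter_h[0];
        f[0][1] = f[0][5] = filter_h[1];
        f[0][2] = f[0][4] = filter_h[2];
        f[0][3] = -(f[0][0] + f[0][1] + f[0][2]) * 2;  /* +128 bias added per bitdepth */
        f[0][7] = 0;                                   /* padding tap */

        f[1][0] = f[1][6] = filter_v[0];
        f[1][1] = f[1][5] = filter_v[1];
        f[1][2] = f[1][4] = filter_v[2];
        f[1][3] = 128 - (f[1][0] + f[1][1] + f[1][2]) * 2;
        f[1][7] = 0;

        /* With both outermost taps zero the kernel degenerates to 5 taps. */
        return !(f[0][0] | f[1][0]);
    }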
--- /dev/null
+++ b/src/mem.c
@@ -1,0 +1,119 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "src/internal.h"
+
+static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
+ pthread_mutex_destroy(&pool->lock);
+ free(pool);
+}
+
+void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
+ pthread_mutex_lock(&pool->lock);
+ const int ref_cnt = --pool->ref_cnt;
+ if (!pool->end) {
+ buf->next = pool->buf;
+ pool->buf = buf;
+ pthread_mutex_unlock(&pool->lock);
+ assert(ref_cnt > 0);
+ } else {
+ pthread_mutex_unlock(&pool->lock);
+ dav1d_free_aligned(buf->data);
+ if (!ref_cnt) mem_pool_destroy(pool);
+ }
+}
+
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t size) {
+ assert(!(size & (sizeof(void*) - 1)));
+ pthread_mutex_lock(&pool->lock);
+ Dav1dMemPoolBuffer *buf = pool->buf;
+ pool->ref_cnt++;
+ uint8_t *data;
+ if (buf) {
+ pool->buf = buf->next;
+ pthread_mutex_unlock(&pool->lock);
+ data = buf->data;
+ if ((uintptr_t)buf - (uintptr_t)data != size) {
+ /* Reallocate if the size has changed */
+ dav1d_free_aligned(data);
+ goto alloc;
+ }
+ } else {
+ pthread_mutex_unlock(&pool->lock);
+alloc:
+ data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64);
+ if (!data) {
+ pthread_mutex_lock(&pool->lock);
+ const int ref_cnt = --pool->ref_cnt;
+ pthread_mutex_unlock(&pool->lock);
+ if (!ref_cnt) mem_pool_destroy(pool);
+ return NULL;
+ }
+ buf = (Dav1dMemPoolBuffer*)(data + size);
+ buf->data = data;
+ }
+
+ return buf;
+}
+
+COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) {
+ Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool));
+ if (pool) {
+ if (!pthread_mutex_init(&pool->lock, NULL)) {
+ pool->buf = NULL;
+ pool->ref_cnt = 1;
+ pool->end = 0;
+ *ppool = pool;
+ return 0;
+ }
+ free(pool);
+ }
+ *ppool = NULL;
+ return DAV1D_ERR(ENOMEM);
+}
+
+COLD void dav1d_mem_pool_end(Dav1dMemPool *const pool) {
+ if (pool) {
+ pthread_mutex_lock(&pool->lock);
+ Dav1dMemPoolBuffer *buf = pool->buf;
+ const int ref_cnt = --pool->ref_cnt;
+ pool->buf = NULL;
+ pool->end = 1;
+ pthread_mutex_unlock(&pool->lock);
+
+ while (buf) {
+ void *const data = buf->data;
+ buf = buf->next;
+ dav1d_free_aligned(data);
+ }
+ if (!ref_cnt) mem_pool_destroy(pool);
+ }
+}
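A minimal usage sketch of the pool API defined above (hypothetical caller, compiled within the dav1d tree, error handling trimmed): buffers returned with dav1d_mem_pool_push() are cached for the next pop of the same size, and the pool object is only destroyed once dav1d_mem_pool_end() has run and no popped buffer remains in flight.

    #include "src/mem.h"

    static int pool_example(void) {
        Dav1dMemPool *pool;
        if (dav1d_mem_pool_init(&pool)) return -1;       /* pool starts with one reference */
        Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(pool, 4096);
        if (buf) dav1d_mem_pool_push(pool, buf);         /* cached for the next pop() */
        dav1d_mem_pool_end(pool);                        /* frees cached buffers and drops
                                                          * the pool once nothing is in flight */
        return 0;
    }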
--- /dev/null
+++ b/src/mem.h
@@ -1,0 +1,103 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MEM_H
+#define DAV1D_SRC_MEM_H
+
+#include <stdlib.h>
+
+#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
+#include <malloc.h>
+#endif
+
+#include "common/attributes.h"
+
+#include "src/thread.h"
+
+typedef struct Dav1dMemPoolBuffer {
+ void *data;
+ struct Dav1dMemPoolBuffer *next;
+} Dav1dMemPoolBuffer;
+
+typedef struct Dav1dMemPool {
+ pthread_mutex_t lock;
+ Dav1dMemPoolBuffer *buf;
+ int ref_cnt;
+ int end;
+} Dav1dMemPool;
+
+void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
+int dav1d_mem_pool_init(Dav1dMemPool **pool);
+void dav1d_mem_pool_end(Dav1dMemPool *pool);
+
+/*
+ * Allocate align-byte aligned memory. The return value can be released
+ * by calling the dav1d_free_aligned() function.
+ */
+static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
+ assert(!(align & (align - 1)));
+#ifdef HAVE_POSIX_MEMALIGN
+ void *ptr;
+ if (posix_memalign(&ptr, align, sz)) return NULL;
+ return ptr;
+#elif defined(HAVE_ALIGNED_MALLOC)
+ return _aligned_malloc(sz, align);
+#elif defined(HAVE_MEMALIGN)
+ return memalign(align, sz);
+#else
+#error Missing aligned alloc implementation
+#endif
+}
+
+static inline void dav1d_free_aligned(void* ptr) {
+#ifdef HAVE_POSIX_MEMALIGN
+ free(ptr);
+#elif defined(HAVE_ALIGNED_MALLOC)
+ _aligned_free(ptr);
+#elif defined(HAVE_MEMALIGN)
+ free(ptr);
+#endif
+}
+
+static inline void dav1d_freep_aligned(void* ptr) {
+ void **mem = (void **) ptr;
+ if (*mem) {
+ dav1d_free_aligned(*mem);
+ *mem = NULL;
+ }
+}
+
+static inline void freep(void *ptr) {
+ void **mem = (void **) ptr;
+ if (*mem) {
+ free(*mem);
+ *mem = NULL;
+ }
+}
+
+#endif /* DAV1D_SRC_MEM_H */
--- a/src/meson.build
+++ b/src/meson.build
@@ -38,6 +38,7 @@
'itx_1d.c',
'lf_mask.c',
'log.c',
+ 'mem.c',
'msac.c',
'obu.c',
'picture.c',
@@ -82,7 +83,7 @@
)
# ASM specific sources
-libdav1d_nasm_objs = []
+libdav1d_asm_objs = []
# Arch-specific flags
arch_flags = []
if is_asm_enabled
@@ -102,7 +103,7 @@
)
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu() == 'arm64')
- libdav1d_sources += files(
+ libdav1d_sources_asm = files(
# itx.S is used for both 8 and 16 bpc.
'arm/64/itx.S',
'arm/64/looprestoration_common.S',
@@ -110,7 +111,7 @@
)
if dav1d_bitdepths.contains('8')
- libdav1d_sources += files(
+ libdav1d_sources_asm += files(
'arm/64/cdef.S',
'arm/64/ipred.S',
'arm/64/loopfilter.S',
@@ -120,7 +121,7 @@
endif
if dav1d_bitdepths.contains('16')
- libdav1d_sources += files(
+ libdav1d_sources_asm += files(
'arm/64/cdef16.S',
'arm/64/ipred16.S',
'arm/64/itx16.S',
@@ -130,12 +131,13 @@
)
endif
elif host_machine.cpu_family().startswith('arm')
- libdav1d_sources += files(
+ libdav1d_sources_asm = files(
+ 'arm/32/looprestoration_common.S',
'arm/32/msac.S',
)
if dav1d_bitdepths.contains('8')
- libdav1d_sources += files(
+ libdav1d_sources_asm += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/itx.S',
@@ -146,11 +148,20 @@
endif
if dav1d_bitdepths.contains('16')
- libdav1d_sources += files(
+ libdav1d_sources_asm += files(
+ 'arm/32/cdef16.S',
+ 'arm/32/loopfilter16.S',
+ 'arm/32/looprestoration16.S',
'arm/32/mc16.S',
)
endif
endif
+
+ if use_gaspp
+ libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm)
+ else
+ libdav1d_sources += libdav1d_sources_asm
+ endif
elif host_machine.cpu_family().startswith('x86')
libdav1d_sources += files(
@@ -190,7 +201,7 @@
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
- 'x86/looprestoration_ssse3.asm',
+ 'x86/looprestoration_sse.asm',
'x86/mc_sse.asm',
)
endif
@@ -201,7 +212,7 @@
endif
# Compile the ASM sources with NASM
- libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
+ libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
elif host_machine.cpu() == 'ppc64le'
arch_flags = ['-maltivec', '-mvsx']
libdav1d_sources += files(
@@ -223,17 +234,6 @@
#
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
- rc_version_array = meson.project_version().split('.')
- winmod = import('windows')
- rc_data = configuration_data()
- rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
- rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
- rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
- rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
- rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
- rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
- rc_data.set('COPYRIGHT_YEARS', '2019')
-
rc_file = configure_file(
input : 'dav1d.rc.in',
output : 'dav1d.rc',
@@ -302,7 +302,7 @@
libdav1d = library('dav1d',
libdav1d_sources,
- libdav1d_nasm_objs,
+ libdav1d_asm_objs,
libdav1d_rc_obj,
objects : [
--- a/src/msac.c
+++ b/src/msac.c
@@ -101,17 +101,17 @@
}
int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
- const int n, const unsigned k)
+ const int n, unsigned k)
{
- int i = 0;
- int a = 0;
- int b = k;
- while ((2 << b) < n) {
- if (!dav1d_msac_decode_bool_equi(s)) break;
- b = k + i++;
- a = (1 << b);
+ assert(n >> k == 8);
+
+ unsigned a = 0;
+ if (dav1d_msac_decode_bool_equi(s)) {
+ if (dav1d_msac_decode_bool_equi(s))
+ k += dav1d_msac_decode_bool_equi(s) + 1;
+ a = 1 << k;
}
- const unsigned v = dav1d_msac_decode_bools(s, b) + a;
+ const unsigned v = dav1d_msac_decode_bools(s, k) + a;
return ref * 2 <= n ? inv_recenter(ref, v) :
n - 1 - inv_recenter(n - 1 - ref, v);
}
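Given the new assert that n >> k == 8, the removed while loop could read at most three "continue" bits, which is exactly what the unrolled version reads. A standalone model of the prefix decode, where read_bit is a hypothetical stand-in for dav1d_msac_decode_bool_equi:

    /* Returns the offset 'a'; *out_k receives the number of raw bits that
     * follow. With n == 8 << k the value groups have sizes
     * 1 << k, 1 << k, 1 << (k + 1) and 1 << (k + 2), summing to n. */
    typedef int (*read_bit_fn)(void *ctx);

    static unsigned subexp_prefix(read_bit_fn read_bit, void *ctx, unsigned k,
                                  unsigned *out_k)
    {
        unsigned a = 0;
        if (read_bit(ctx)) {                /* first "continue" bit */
            if (read_bit(ctx))              /* second "continue" bit */
                k += read_bit(ctx) + 1;     /* third picks between k+1 and k+2 */
            a = 1u << k;
        }
        *out_k = k;
        return a;
    }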
--- a/src/obu.c
+++ b/src/obu.c
@@ -57,7 +57,7 @@
hdr->profile = dav1d_get_bits(gb, 3);
if (hdr->profile > 2) goto error;
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-profile: off=%ld\n",
+ printf("SEQHDR: post-profile: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
@@ -65,7 +65,7 @@
hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1);
if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-stillpicture_flags: off=%ld\n",
+ printf("SEQHDR: post-stillpicture_flags: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
@@ -104,7 +104,7 @@
hdr->decoder_model_info_present = 0;
}
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-timinginfo: off=%ld\n",
+ printf("SEQHDR: post-timinginfo: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
@@ -114,6 +114,8 @@
struct Dav1dSequenceHeaderOperatingPoint *const op =
&hdr->operating_points[i];
op->idc = dav1d_get_bits(gb, 12);
+ if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
+ goto error;
op->major_level = 2 + dav1d_get_bits(gb, 3);
op->minor_level = dav1d_get_bits(gb, 2);
op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
@@ -138,7 +140,7 @@
c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
c->operating_point_idc = hdr->operating_points[op_idx].idc;
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-operating-points: off=%ld\n",
+ printf("SEQHDR: post-operating-points: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
}
@@ -148,7 +150,7 @@
hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1;
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-size: off=%ld\n",
+ printf("SEQHDR: post-size: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
hdr->frame_id_numbers_present =
@@ -158,7 +160,7 @@
hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
}
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-frame-id-numbers-present: off=%ld\n",
+ printf("SEQHDR: post-frame-id-numbers-present: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
@@ -192,7 +194,7 @@
}
hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-screentools: off=%ld\n",
+ printf("SEQHDR: post-screentools: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
hdr->force_integer_mv = hdr->screen_content_tools ?
@@ -204,7 +206,7 @@
hdr->cdef = dav1d_get_bits(gb, 1);
hdr->restoration = dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-featurebits: off=%ld\n",
+ printf("SEQHDR: post-featurebits: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
@@ -264,13 +266,13 @@
}
hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-colorinfo: off=%ld\n",
+ printf("SEQHDR: post-colorinfo: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
hdr->film_grain_present = dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
- printf("SEQHDR: post-filmgrain: off=%ld\n",
+ printf("SEQHDR: post-filmgrain: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
@@ -367,7 +369,7 @@
hdr->show_existing_frame =
!seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
- printf("HDR: post-show_existing_frame: off=%ld\n",
+ printf("HDR: post-show_existing_frame: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
if (hdr->show_existing_frame) {
@@ -374,8 +376,11 @@
hdr->existing_frame_idx = dav1d_get_bits(gb, 3);
if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
- if (seqhdr->frame_id_numbers_present)
+ if (seqhdr->frame_id_numbers_present) {
hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+ Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr;
+ if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL);
+ }
return 0;
}
@@ -391,7 +396,7 @@
hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ||
seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
- printf("HDR: post-frametype_bits: off=%ld\n",
+ printf("HDR: post-frametype_bits: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->disable_cdf_update = dav1d_get_bits(gb, 1);
@@ -412,7 +417,7 @@
hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 :
hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
- printf("HDR: post-frame_size_override_flag: off=%ld\n",
+ printf("HDR: post-frame_size_override_flag: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->frame_offset = seqhdr->order_hint ?
@@ -550,8 +555,12 @@
for (int i = 0; i < 7; i++) {
if (!hdr->frame_ref_short_signaling)
hdr->refidx[i] = dav1d_get_bits(gb, 3);
- if (seqhdr->frame_id_numbers_present)
- dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
+ if (seqhdr->frame_id_numbers_present) {
+ const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
+ const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1);
+ Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
+ if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
+ }
}
const int use_ref = !hdr->error_resilient_mode &&
hdr->frame_size_override;
@@ -565,7 +574,7 @@
hdr->frame_type & 1 && dav1d_get_bits(gb, 1);
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-frametype-specific-bits: off=%ld\n",
+ printf("HDR: post-frametype-specific-bits: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -572,7 +581,7 @@
hdr->refresh_context = !seqhdr->reduced_still_picture_header &&
!hdr->disable_cdf_update && !dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
- printf("HDR: post-refresh_context: off=%ld\n",
+ printf("HDR: post-refresh_context: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -646,7 +655,7 @@
hdr->tiling.n_bytes = hdr->tiling.update = 0;
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-tiling: off=%ld\n",
+ printf("HDR: post-tiling: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -669,7 +678,7 @@
}
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-quant: off=%ld\n",
+ printf("HDR: post-quant: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->quant.qm = dav1d_get_bits(gb, 1);
@@ -681,7 +690,7 @@
hdr->quant.qm_u;
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-qm: off=%ld\n",
+ printf("HDR: post-qm: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -766,7 +775,7 @@
hdr->segmentation.seg_data.d[i].ref = -1;
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-segmentation: off=%ld\n",
+ printf("HDR: post-segmentation: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -778,7 +787,7 @@
hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0;
hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0;
#if DEBUG_FRAME_HDR
- printf("HDR: post-delta_q_lf_flags: off=%ld\n",
+ printf("HDR: post-delta_q_lf_flags: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -838,7 +847,7 @@
}
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-lpf: off=%ld\n",
+ printf("HDR: post-lpf: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -857,7 +866,7 @@
hdr->cdef.uv_strength[0] = 0;
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-cdef: off=%ld\n",
+ printf("HDR: post-cdef: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -899,7 +908,7 @@
hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-restoration: off=%ld\n",
+ printf("HDR: post-restoration: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -906,12 +915,12 @@
hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY :
dav1d_get_bits(gb, 1) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
#if DEBUG_FRAME_HDR
- printf("HDR: post-txfmmode: off=%ld\n",
+ printf("HDR: post-txfmmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0;
#if DEBUG_FRAME_HDR
- printf("HDR: post-refmode: off=%ld\n",
+ printf("HDR: post-refmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->skip_mode_allowed = 0;
@@ -972,18 +981,18 @@
}
hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0;
#if DEBUG_FRAME_HDR
- printf("HDR: post-extskip: off=%ld\n",
+ printf("HDR: post-extskip: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 &&
seqhdr->warped_motion && dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
- printf("HDR: post-warpmotionbit: off=%ld\n",
+ printf("HDR: post-warpmotionbit: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->reduced_txtp_set = dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
- printf("HDR: post-reducedtxtpset: off=%ld\n",
+ printf("HDR: post-reducedtxtpset: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -1037,7 +1046,7 @@
}
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-gmv: off=%ld\n",
+ printf("HDR: post-gmv: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -1121,7 +1130,7 @@
memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data));
}
#if DEBUG_FRAME_HDR
- printf("HDR: post-filmgrain: off=%ld\n",
+ printf("HDR: post-filmgrain: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
@@ -1227,7 +1236,8 @@
switch (type) {
case DAV1D_OBU_SEQ_HDR: {
- Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader));
+ Dav1dRef *ref = dav1d_ref_create_using_pool(c->seq_hdr_pool,
+ sizeof(Dav1dSequenceHeader));
if (!ref) return DAV1D_ERR(ENOMEM);
Dav1dSequenceHeader *seq_hdr = ref->data;
memset(seq_hdr, 0, sizeof(*seq_hdr));
@@ -1273,7 +1283,8 @@
if (global) break;
if (!c->seq_hdr) goto error;
if (!c->frame_hdr_ref) {
- c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
+ c->frame_hdr_ref = dav1d_ref_create_using_pool(c->frame_hdr_pool,
+ sizeof(Dav1dFrameHeader));
if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
}
#ifndef NDEBUG
@@ -1366,6 +1377,10 @@
break;
}
case DAV1D_OBU_METADATA: {
+#define DEBUG_OBU_METADATA 0
+#if DEBUG_OBU_METADATA
+ const uint8_t *const init_ptr = gb.ptr;
+#endif
// obu metadata type field
const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
@@ -1378,7 +1393,17 @@
Dav1dContentLightLevel *const content_light = ref->data;
content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("CLLOBU: max-content-light-level: %d [off=%td]\n",
+ content_light->max_content_light_level,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n",
+ content_light->max_frame_average_light_level,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
// Skip the trailing bit, align to the next byte boundary and check for overrun.
dav1d_get_bits(&gb, 1);
@@ -1401,13 +1426,37 @@
for (int i = 0; i < 3; i++) {
mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i,
+ mastering_display->primaries[i][0],
+ mastering_display->primaries[i][1],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
}
mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: white-point-x: %d [off=%td]\n",
+ mastering_display->white_point[0],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
mastering_display->white_point[1] = dav1d_get_bits(&gb, 16);
-
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: white-point-y: %d [off=%td]\n",
+ mastering_display->white_point[1],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
mastering_display->max_luminance = dav1d_get_bits(&gb, 32);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: max-luminance: %d [off=%td]\n",
+ mastering_display->max_luminance,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
mastering_display->min_luminance = dav1d_get_bits(&gb, 32);
-
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: min-luminance: %d [off=%td]\n",
+ mastering_display->min_luminance,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
// Skip the trailing bit, align to the next byte boundary and check for overrun.
dav1d_get_bits(&gb, 1);
dav1d_bytealign_get_bits(&gb);
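The ref_frame_id reconstruction added earlier in this file works modulo 1 << frame_id_n_bits; a worked instance with hypothetical numbers:

    #include <assert.h>

    int main(void) {
        const int frame_id_n_bits = 8;                   /* hypothetical */
        const int frame_id = 5;                          /* current frame */
        const int delta_ref_frame_id_minus_1 = 9;        /* parsed delta */
        const int ref_frame_id =
            (frame_id + (1 << frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) &
            ((1 << frame_id_n_bits) - 1);
        assert(ref_frame_id == 251);                     /* 5 - 10, wrapped mod 256 */
        return 0;
    }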
--- a/src/picture.c
+++ b/src/picture.c
@@ -36,7 +36,6 @@
#include <string.h>
#include "common/intops.h"
-#include "common/mem.h"
#include "common/validate.h"
#include "src/internal.h"
@@ -47,7 +46,7 @@
#include "src/thread_task.h"
int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
- assert(cookie == NULL);
+ assert(sizeof(Dav1dMemPoolBuffer) <= DAV1D_PICTURE_ALIGNMENT);
const int hbd = p->p.bpc > 8;
const int aligned_w = (p->p.w + 127) & ~127;
const int aligned_h = (p->p.h + 127) & ~127;
@@ -69,27 +68,24 @@
p->stride[1] = uv_stride;
const size_t y_sz = y_stride * aligned_h;
const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
- const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
- uint8_t *const data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
- if (!data) return DAV1D_ERR(ENOMEM);
+ const size_t pic_size = y_sz + 2 * uv_sz;
+ Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(cookie, pic_size +
+ DAV1D_PICTURE_ALIGNMENT -
+ sizeof(Dav1dMemPoolBuffer));
+ if (!buf) return DAV1D_ERR(ENOMEM);
+ p->allocator_data = buf;
+
+ uint8_t *const data = buf->data;
p->data[0] = data;
p->data[1] = has_chroma ? data + y_sz : NULL;
p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL;
-#ifndef NDEBUG /* safety check */
- p->allocator_data = data;
-#endif
-
return 0;
}
void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
- assert(cookie == NULL);
-#ifndef NDEBUG /* safety check */
- assert(p->allocator_data == p->data[0]);
-#endif
- dav1d_free_aligned(p->data[0]);
+ dav1d_mem_pool_push(cookie, p->allocator_data);
}
struct pic_ctx_context {
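The new assert in dav1d_default_picture_alloc() guards a layout invariant: dav1d_mem_pool_pop() places the Dav1dMemPoolBuffer header at data + size, so with the size requested above the header has to fit inside the trailing DAV1D_PICTURE_ALIGNMENT slack. A small standalone check of that arithmetic, using stand-ins for the real definitions:

    #include <assert.h>
    #include <stddef.h>

    #define PICTURE_ALIGNMENT 64                  /* stand-in for DAV1D_PICTURE_ALIGNMENT */
    typedef struct Buf { void *data; struct Buf *next; } Buf; /* stand-in for Dav1dMemPoolBuffer */

    int main(void) {
        const size_t pic_size = 1920 * 1088 * 3 / 2;     /* hypothetical 8 bpc 4:2:0 frame */
        const size_t size = pic_size + PICTURE_ALIGNMENT - sizeof(Buf);
        /* pop() allocates size + sizeof(Buf) bytes and puts the header at
         * data + size, i.e. entirely inside the trailing alignment slack. */
        assert(sizeof(Buf) <= PICTURE_ALIGNMENT);
        assert(size + sizeof(Buf) == pic_size + PICTURE_ALIGNMENT);
        return 0;
    }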
--- a/src/picture.h
+++ b/src/picture.h
@@ -52,6 +52,11 @@
atomic_uint *progress;
} Dav1dThreadPicture;
+typedef struct Dav1dPictureBuffer {
+ void *data;
+ struct Dav1dPictureBuffer *next;
+} Dav1dPictureBuffer;
+
/*
* Allocate a picture with custom border size.
*/
--- a/src/ppc/looprestoration_init_tmpl.c
+++ b/src/ppc/looprestoration_init_tmpl.c
@@ -49,7 +49,7 @@
static void wiener_filter_h_vsx(int32_t *hor_ptr,
uint8_t *tmp_ptr,
- const int16_t filterh[7],
+ const int16_t filterh[8],
const int w, const int h)
{
static const i32x4 zerov = vec_splats(0);
@@ -149,14 +149,10 @@
} while (0)
#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
- i32x4 v_1 = (i32x4) vec_ld( 0, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
- i32x4 v_2 = (i32x4) vec_ld(16, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
- i32x4 v_3 = (i32x4) vec_ld(32, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
- i32x4 v_4 = (i32x4) vec_ld(48, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
- i32x4 sum1 = -round_offset_vec; \
- i32x4 sum2 = -round_offset_vec; \
- i32x4 sum3 = -round_offset_vec; \
- i32x4 sum4 = -round_offset_vec; \
+ i32x4 sum1 = round_vec; \
+ i32x4 sum2 = round_vec; \
+ i32x4 sum3 = round_vec; \
+ i32x4 sum4 = round_vec; \
APPLY_FILTER_V(0, filterv0); \
APPLY_FILTER_V(1, filterv1); \
APPLY_FILTER_V(2, filterv2); \
@@ -164,31 +160,25 @@
APPLY_FILTER_V(4, filterv4); \
APPLY_FILTER_V(5, filterv5); \
APPLY_FILTER_V(6, filterv6); \
- sum1 = (v_1 << seven_vec) + sum1 + rounding_off_vec; \
- sum2 = (v_2 << seven_vec) + sum2 + rounding_off_vec; \
- sum3 = (v_3 << seven_vec) + sum3 + rounding_off_vec; \
- sum4 = (v_4 << seven_vec) + sum4 + rounding_off_vec; \
sum1 = sum1 >> round_bits_vec; \
sum2 = sum2 >> round_bits_vec; \
sum3 = sum3 >> round_bits_vec; \
sum4 = sum4 >> round_bits_vec; \
- i16x8 sum_short_packed_1 = (i16x8) vec_pack( sum1, sum2 ); \
- i16x8 sum_short_packed_2 = (i16x8) vec_pack( sum3, sum4 ); \
+ i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \
+ i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \
sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
- sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2 ); \
+ sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \
} while (0)
static inline void wiener_filter_v_vsx(uint8_t *p,
const ptrdiff_t p_stride,
const int32_t *hor,
- const int16_t filterv[7],
+ const int16_t filterv[8],
const int w, const int h)
{
static const i32x4 round_bits_vec = vec_splats(11);
- static const i32x4 rounding_off_vec = vec_splats(1 << 10);
- static const i32x4 round_offset_vec = vec_splats(1 << 18);
- static const i32x4 seven_vec = vec_splats(7);
+ static const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18));
i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
@@ -319,8 +309,7 @@
const uint8_t *lpf,
const ptrdiff_t lpf_stride,
const int w, const int h,
- const int16_t filterh[7],
- const int16_t filterv[7],
+ const int16_t filter[2][8],
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
@@ -329,8 +318,8 @@
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
- wiener_filter_h_vsx(hor, tmp, filterh, w, h);
- wiener_filter_v_vsx(p, p_stride, hor, filterv, w, h);
+ wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
+ wiener_filter_v_vsx(p, p_stride, hor, filter[1], w, h);
}
#endif
@@ -343,7 +332,7 @@
if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
#if BITDEPTH == 8
- c->wiener = wiener_filter_vsx;
+ c->wiener[0] = c->wiener[1] = wiener_filter_vsx;
#endif
}
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -36,7 +36,6 @@
#include "common/bitdepth.h"
#include "common/dump.h"
#include "common/intops.h"
-#include "common/mem.h"
#include "src/cdef_apply.h"
#include "src/ctx.h"
--- a/src/ref.c
+++ b/src/ref.c
@@ -27,8 +27,6 @@
#include "config.h"
-#include "common/mem.h"
-
#include "src/ref.h"
static void default_free_callback(const uint8_t *const data, void *const user_data) {
@@ -36,19 +34,43 @@
dav1d_free_aligned(user_data);
}
-Dav1dRef *dav1d_ref_create(const size_t size) {
- void *data = dav1d_alloc_aligned(size, 32);
+Dav1dRef *dav1d_ref_create(size_t size) {
+ size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+ uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64);
if (!data) return NULL;
- Dav1dRef *const res = dav1d_ref_wrap(data, default_free_callback, data);
- if (res)
- res->data = data;
- else
- dav1d_free_aligned(data);
+ Dav1dRef *const res = (Dav1dRef*)(data + size);
+ res->const_data = res->user_data = res->data = data;
+ atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 0;
+ res->free_callback = default_free_callback;
return res;
}
+static void pool_free_callback(const uint8_t *const data, void *const user_data) {
+ dav1d_mem_pool_push((Dav1dMemPool*)data, user_data);
+}
+
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) {
+ size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+ Dav1dMemPoolBuffer *const buf =
+ dav1d_mem_pool_pop(pool, size + sizeof(Dav1dRef));
+ if (!buf) return NULL;
+
+ Dav1dRef *const res = &((Dav1dRef*)buf)[-1];
+ res->data = buf->data;
+ res->const_data = pool;
+ atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 0;
+ res->free_callback = pool_free_callback;
+ res->user_data = buf;
+
+ return res;
+}
+
Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
void (*free_callback)(const uint8_t *data, void *user_data),
void *const user_data)
@@ -59,6 +81,7 @@
res->data = NULL;
res->const_data = ptr;
atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 1;
res->free_callback = free_callback;
res->user_data = user_data;
@@ -76,8 +99,9 @@
if (!ref) return;
if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
+ const int free_ref = ref->free_ref;
ref->free_callback(ref->const_data, ref->user_data);
- free(ref);
+ if (free_ref) free(ref);
}
*pref = NULL;
}
--- a/src/ref.h
+++ b/src/ref.h
@@ -30,6 +30,9 @@
#include "dav1d/dav1d.h"
+#include "src/mem.h"
+#include "src/thread.h"
+
#include <stdatomic.h>
#include <stddef.h>
@@ -37,11 +40,13 @@
void *data;
const void *const_data;
atomic_int ref_cnt;
+ int free_ref;
void (*free_callback)(const uint8_t *data, void *user_data);
void *user_data;
};
Dav1dRef *dav1d_ref_create(size_t size);
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr,
void (*free_callback)(const uint8_t *data, void *user_data),
void *user_data);
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -29,21 +29,25 @@
%if ARCH_X86_64
SECTION_RODATA 32
+
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
pb_right_ext_mask: times 32 db 0xff
times 32 db 0
-pb_14x0_1_2: times 14 db 0
- db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
- db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_15: times 16 db 15
-pw_16: times 2 dw 16
-pw_256: times 2 dw 256
-pw_2048: times 2 dw 2048
-pw_16380: times 2 dw 16380
-pw_0_128: dw 0, 128
-pw_5_6: dw 5, 6
-pd_6: dd 6
-pd_1024: dd 1024
+
+pb_3: times 4 db 3
+pb_m5: times 4 db -5
+pw_16: times 2 dw 16
+pw_256: times 2 dw 256
+pw_2056: times 2 dw 2056
+pw_m16380: times 2 dw -16380
+pw_5_6: dw 5, 6
+pd_1024: dd 1024
pd_0xf0080029: dd 0xf0080029
pd_0xf00801c7: dd 0xf00801c7
@@ -51,279 +55,662 @@
SECTION .text
-INIT_YMM avx2
-cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge
- mov edged, edgem
- vpbroadcastb m15, [fhq+0]
- movifnidn wd, wm
- vpbroadcastb m14, [fhq+2]
- mov hd, hm
- vpbroadcastb m13, [fhq+4]
- vpbroadcastw m12, [fhq+6]
- vpbroadcastd m11, [pw_2048]
- vpbroadcastd m10, [pw_16380]
- lea r11, [pb_right_ext_mask]
+%macro REPX 2-*
+ %xdefine %%f(x) %1
+%rep %0 - 1
+ %rotate 1
+ %%f(%1)
+%endrep
+%endmacro
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
- ; if (edge & has_right) align_w_to_32
- ; else w -= 32, and use that as limit in x loop
- test edgeb, 2 ; has_right
- jnz .align
- mov xlimq, -3
- jmp .loop
-.align:
- add wd, 31
- and wd, ~31
- xor xlimd, xlimd
-
- ; main y loop for vertical filter
-.loop:
- mov srcptrq, srcq
- mov dstptrq, dstq
- lea xq, [wq+xlimq]
-
- ; load left edge pixels
- test edgeb, 1 ; have_left
- jz .emu_left
- test leftq, leftq ; left == NULL for the edge-extended bottom/top
- jz .load_left_combined
- movd xm0, [leftq]
- add leftq, 4
- pinsrd xm0, [srcq], 1
- pslldq xm0, 9
- jmp .left_load_done
-.load_left_combined:
- movq xm0, [srcq-3]
- pslldq xm0, 10
- jmp .left_load_done
-.emu_left:
- movd xm0, [srcq]
- pshufb xm0, [pb_14x0_1_2]
-
- ; load right edge pixels
-.left_load_done:
- cmp xd, 32
- jg .main_load
- test xd, xd
- jg .load_and_splat
- je .splat_right
-
- ; for very small images (w=[1-2]), edge-extend the original cache,
- ; ugly, but only runs in very odd cases
- add wd, wd
- pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
- shr wd, 1
-
- ; main x loop, mostly this starts in .main_load
-.splat_right:
- ; no need to load new pixels, just extend them from the (possibly previously
- ; extended) previous load into m0
- pshufb xm1, xm0, [pb_15]
- jmp .main_loop
-.load_and_splat:
- ; load new pixels and extend edge for right-most
- movu m1, [srcptrq+3]
- sub r11, xq
- movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
- add r11, xq
- vpbroadcastb m3, [srcptrq+2+xq]
- pand m1, m2
- pandn m3, m2, m3
- por m1, m3
- jmp .main_loop
-.main_load:
- ; load subsequent line
- movu m1, [srcptrq+3]
+INIT_YMM avx2
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastb m11, [fltq+ 0] ; x0 x0
+ vbroadcasti128 m7, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m8, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m9, [wiener_shufD]
+ add lpfq, wq
+ vpbroadcastd m10, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ neg wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ mov [rsp+8*1], lpf_strideq
+ add r7, lpf_strideq
+ mov [rsp+8*0], r7 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
.main_loop:
- vinserti128 m0, xm1, 1
-
- palignr m2, m1, m0, 10
- palignr m3, m1, m0, 11
- palignr m4, m1, m0, 12
- palignr m5, m1, m0, 13
- palignr m6, m1, m0, 14
- palignr m7, m1, m0, 15
-
- punpcklbw m0, m2, m1
- punpckhbw m2, m1
- punpcklbw m8, m3, m7
- punpckhbw m3, m7
- punpcklbw m7, m4, m6
- punpckhbw m4, m6
- pxor m9, m9
- punpcklbw m6, m5, m9
- punpckhbw m5, m9
-
- pmaddubsw m0, m15
- pmaddubsw m2, m15
- pmaddubsw m8, m14
- pmaddubsw m3, m14
- pmaddubsw m7, m13
- pmaddubsw m4, m13
- paddw m0, m8
- paddw m2, m3
- psllw m8, m6, 7
- psllw m3, m5, 7
- psubw m8, m10
- psubw m3, m10
- pmullw m6, m12
- pmullw m5, m12
- paddw m0, m7
- paddw m2, m4
- paddw m0, m6
- paddw m2, m5
- ; for a signed overflow to happen we need filter and pixels as follow:
- ; filter => -5,-23,-17,90,-17,-23,-5
- ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
- ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
- ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
- ; 32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
- ; => signed 16-bit overflow occurs
- paddsw m0, m8 ; paddsw clips this range to [-8000;+7FFF]
- paddsw m2, m3
- psraw m0, 3 ; shift changes the range to [-1000;+FFF]
- psraw m2, 3
- paddw m0, m11 ; adding back 800 (removed in m8) changes the
- paddw m2, m11 ; range to [-800;+17FF] as defined in the spec
- mova [dstptrq], xm0 ; (note that adding another 800 would give us
- mova [dstptrq+16], xm2; the same range as in the C code => [0;1FFF])
- vextracti128 [dstptrq+32], m0, 1
- vextracti128 [dstptrq+48], m2, 1
- vextracti128 xm0, m1, 1
- add srcptrq, 32
- add dstptrq, 64
- sub xq, 32
- cmp xd, 32
- jg .main_load
- test xd, xd
- jg .load_and_splat
- cmp xd, xlimd
- jg .splat_right
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp+8*0]
+ call .hv_bottom
+ add lpfq, [rsp+8*1]
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov [rsp+8*1], lpf_strideq
+ lea r7, [r7+lpf_strideq*2]
+ mov [rsp+8*0], r7
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
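+ ; clamp the pshufb indices so bytes past the last valid column replicate the rightmost pixel into m4/m5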
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_3]
+ vpbroadcastd m1, [pb_m5]
+ vpbroadcastb m2, xm2
+ movu m3, [pb_0to31]
+ psubb m0, m2
+ psubb m1, m2
+ pminub m0, m3
+ pminub m1, m3
+ pshufb m4, m0
+ pshufb m5, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, [wiener_l_shuf]
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ movu m4, [lpfq+r10-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jnz .h_main
+ pshufb m4, [wiener_l_shuf]
+ jmp .h_main
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ vpbroadcastd m2, [pw_2056]
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m2
+ paddw m1, m2
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, [wiener_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t4+r10*2]
+ paddw m2, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ paddsw m0, m4
+ vpbroadcastd m4, [pw_2056]
+ paddsw m1, m5
+ mova m5, [t5+r10*2]
+ paddw m5, [t1+r10*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m4, m0, [t6+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t4+r10*2+32]
+ paddw m2, [t2+r10*2+32]
+ mova m3, [t3+r10*2+32]
+ mova m5, [t5+r10*2+32]
+ paddw m5, [t1+r10*2+32]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
+ paddw m4, m1, [t6+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, dst_strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10*2+ 0]
+ paddw m2, [t2+r10*2+ 0]
+ mova m4, [t3+r10*2+ 0]
+ mova m6, [t1+r10*2+ 0]
+ paddw m8, m6, [t6+r10*2+ 0]
+ paddw m6, [t5+r10*2+ 0]
+ mova m3, [t4+r10*2+32]
+ paddw m3, [t2+r10*2+32]
+ mova m5, [t3+r10*2+32]
+ mova m7, [t1+r10*2+32]
+ paddw m9, m7, [t6+r10*2+32]
+ paddw m7, [t5+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m4, m8, m6
+ pmaddwd m4, m14
+ punpckhwd m6, m8, m6
+ pmaddwd m6, m14
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m9, m7
+ pmaddwd m5, m14
+ punpckhwd m7, m9, m7
+ pmaddwd m7, m14
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ REPX {psrad x, 11}, m0, m2, m1, m3
+ packssdw m0, m2
+ packssdw m1, m3
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, dst_strideq
+ ret
- add srcq, strideq
- add dstq, 384*2
- dec hd
- jg .loop
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ vbroadcasti128 m6, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m7, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m8, [wiener_shufD]
+ add lpfq, wq
+ vpbroadcastd m9, [pw_m16380]
+ vpbroadcastd m10, [pw_2056]
+ lea t1, [rsp+wq*2+16]
+ mova m11, [wiener_l_shuf]
+ vpbroadcastd m14, [fltq+16] ; __ y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ neg wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ mov [rsp+8*1], lpf_strideq
+ add r7, lpf_strideq
+ mov [rsp+8*0], r7 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp+8*0]
+ call .hv_bottom
+ add lpfq, [rsp+8*1]
+ call .hv_bottom
+.end:
RET
+.no_top:
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov [rsp+8*1], lpf_strideq
+ lea r7, [r7+lpf_strideq*2]
+ mov [rsp+8*0], r7
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, dst_strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, m11
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ movu m4, [lpfq+r10-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jnz .h_main
+ pshufb m4, m11
+ jmp .h_main
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, m11
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t3+r10*2]
+ paddw m2, [t1+r10*2]
+ mova m3, [t2+r10*2]
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ paddw m4, m0, [t4+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+r10*2+32]
+ paddw m2, [t1+r10*2+32]
+ mova m3, [t2+r10*2+32]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
+ paddw m4, m1, [t4+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, dst_strideq
+ ret
+.v:
+ mov r10, wq
+ psrld m13, m14, 16 ; y1 __
+.v_loop:
+ mova m6, [t1+r10*2+ 0]
+ paddw m2, m6, [t3+r10*2+ 0]
+ mova m4, [t2+r10*2+ 0]
+ mova m7, [t1+r10*2+32]
+ paddw m3, m7, [t3+r10*2+32]
+ mova m5, [t2+r10*2+32]
+ paddw m6, [t4+r10*2+ 0]
+ paddw m7, [t4+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m7, m6
+ pmaddwd m4, m5, m14
+ punpckhwd m7, m6
+ pmaddwd m6, m7, m14
+ pmaddwd m5, m13
+ pmaddwd m7, m13
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ REPX {psrad x, 11}, m0, m2, m1, m3
+ packssdw m0, m2
+ packssdw m1, m3
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
-cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge
- movifnidn fvq, fvmp
- mov edged, edgem
- movifnidn hd, hm
- vpbroadcastd m10, [fvq]
- vpbroadcastd m11, [fvq+4]
- vpbroadcastd m0, [pw_0_128]
- vpbroadcastd m12, [pd_1024]
-
- DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
- rorx ylimd, edged, 2
- paddw m11, m0
- and ylimd, 2 ; have_bottom
- sub ylimd, 3
-
- ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
- mova m3, [midq] ; middle line
-
- ; load top pixels
- test edgeb, 4 ; have_top
- jz .emu_top
- mova m0, [midq-384*4]
- mova m2, [midq-384*2]
- mova m1, m0
- jmp .load_bottom_pixels
-.emu_top:
- mova m0, m3
- mova m1, m3
- mova m2, m3
-
- ; load bottom pixels
-.load_bottom_pixels:
- mov yd, hd
- mov mptrq, midq
- mov dstptrq, dstq
- add yd, ylimd
- jg .load_threelines
-
- ; the remainder here is somewhat messy but only runs in very weird
- ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
- ; so performance is not terribly important here...
- je .load_twolines
- cmp yd, -1
- je .load_oneline
- ; h == 1 case
- mova m5, m3
- mova m4, m3
- mova m6, m3
- jmp .loop
-.load_oneline:
- ; h == 2 case
- mova m4, [midq+384*2]
- mova m5, m4
- mova m6, m4
- jmp .loop
-.load_twolines:
- ; h == 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- mova m6, m5
- jmp .loop
-.load_threelines:
- ; h > 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- ; third line loaded in main loop below
-
- ; main y loop for vertical filter
-.loop_load:
- ; load one line into m6. if that pixel is no longer available, do
- ; nothing, since m6 still has the data from the previous line in it. We
- ; try to structure the loop so that the common case is evaluated fastest
- mova m6, [mptrq+384*6]
-.loop:
- paddw m0, m6
- paddw m7, m1, m5
- paddw m8, m2, m4
- punpcklwd m9, m0, m7
- punpckhwd m0, m7
- punpcklwd m7, m8, m3
- punpckhwd m8, m3
- pmaddwd m9, m10
- pmaddwd m0, m10
- pmaddwd m7, m11
- pmaddwd m8, m11
- add mptrq, 384*2
- paddd m7, m9
- paddd m0, m8
- paddd m7, m12
- paddd m0, m12
- psrad m7, 11
- psrad m0, 11
- packssdw m7, m0
- vextracti128 xm0, m7, 1
- packuswb xm7, xm0
- mova [dstptrq], xm7
- ; shift pixels one position
- mova m0, m1
- mova m1, m2
- mova m2, m3
- mova m3, m4
- mova m4, m5
- mova m5, m6
- add dstptrq, strideq
- dec yd
- jg .loop_load
- ; for the bottom pixels, continue using m6 (as extended edge)
- cmp yd, ylimd
- jg .loop
- add midq, 32
- add dstq, 16
- sub wd, 16
- jg .loop_x
- RET
-
-INIT_YMM avx2
cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
mov xlimd, edgem
movifnidn wd, wm
--- a/src/x86/looprestoration_init_tmpl.c
+++ b/src/x86/looprestoration_init_tmpl.c
@@ -31,52 +31,19 @@
#include "common/intops.h"
#include "src/tables.h"
-// Future potential optimizations:
-// - special chroma versions which don't filter [0]/[6];
-// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
-// to bottom) instead of scanline-ordered should be faster since then the
-// if (have_left) and similar conditions run only once instead of per line;
-// - filter_v_avx2 currently runs 16 pixels per iteration, it should be possible
-// to run 32 (like filter_h_avx2), and then all vpermqs can go;
-// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2,
-// since then the have_left condition can be inlined;
-// - consider having the wrapper (wiener_filter_${ext}) also in hand-written
-// assembly, so the setup overhead is minimized.
-
#define WIENER_FILTER(ext) \
-\
-void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \
- const pixel *src, ptrdiff_t stride, \
- const int16_t fh[7], const intptr_t w, \
- int h, enum LrEdgeFlags edges); \
-void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \
- const int16_t *mid, int w, int h, \
- const int16_t fv[7], enum LrEdgeFlags edges); \
-\
-static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
- const pixel (*const left)[4], \
- const pixel *lpf, const ptrdiff_t lpf_stride, \
- const int w, const int h, const int16_t fh[7], \
- const int16_t fv[7], const enum LrEdgeFlags edges) \
-{ \
- ALIGN_STK_32(int16_t, mid, 68 * 384,); \
-\
- /* horizontal filter */ \
- dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \
- fh, w, h, edges); \
- if (edges & LR_HAVE_TOP) \
- dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \
- fh, w, 2, edges); \
- if (edges & LR_HAVE_BOTTOM) \
- dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \
- lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, \
- fh, w, 2, edges); \
-\
- dav1d_wiener_filter_v_##ext(dst, dst_stride, &mid[2*384], w, h, fv, edges); \
-}
+void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+ const pixel (*left)[4], const pixel *lpf, \
+ ptrdiff_t lpf_stride, int w, int h, \
+ const int16_t filter[2][8], \
+ enum LrEdgeFlags edges); \
+void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+ const pixel (*left)[4], const pixel *lpf, \
+ ptrdiff_t lpf_stride, int w, int h, \
+ const int16_t filter[2][8], \
+ enum LrEdgeFlags edges);
#define SGR_FILTER(ext) \
-\
void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
@@ -199,15 +166,13 @@
} \
}
-#define DEF_LR_FILTERS(ext) \
-WIENER_FILTER(ext) \
-SGR_FILTER(ext)
-
#if BITDEPTH == 8
WIENER_FILTER(sse2)
-DEF_LR_FILTERS(ssse3)
+WIENER_FILTER(ssse3)
+SGR_FILTER(ssse3)
# if ARCH_X86_64
-DEF_LR_FILTERS(avx2)
+WIENER_FILTER(avx2)
+SGR_FILTER(avx2)
# endif
#endif
@@ -216,18 +181,21 @@
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
- c->wiener = wiener_filter_sse2;
+ c->wiener[0] = dav1d_wiener_filter7_sse2;
+ c->wiener[1] = dav1d_wiener_filter5_sse2;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
- c->wiener = wiener_filter_ssse3;
+ c->wiener[0] = dav1d_wiener_filter7_ssse3;
+ c->wiener[1] = dav1d_wiener_filter5_ssse3;
c->selfguided = sgr_filter_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
- c->wiener = wiener_filter_avx2;
+ c->wiener[0] = dav1d_wiener_filter7_avx2;
+ c->wiener[1] = dav1d_wiener_filter5_avx2;
c->selfguided = sgr_filter_avx2;
#endif
}
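
The wiener entry point is now a two-element array: wiener[0] is the 7-tap
kernel and wiener[1] the 5-tap one, both using the combined filter[2][8]
argument. A hedged sketch of how a caller could dispatch between them; the
actual selection lives in the shared lr_apply code outside this diff, and both
the Dav1dLoopRestorationDSPContext type name and the "zero outer taps means
5-tap" rule are assumptions here:

    /* Sketch only; conceptually belongs next to the bitdepth-templated
     * init code above. */
    #include "src/looprestoration.h"  /* assumed to declare the DSP context */

    static void wiener_dispatch(const Dav1dLoopRestorationDSPContext *const dsp,
                                pixel *const dst, const ptrdiff_t dst_stride,
                                const pixel (*const left)[4],
                                const pixel *const lpf, const ptrdiff_t lpf_stride,
                                const int w, const int h,
                                const int16_t filter[2][8],
                                const enum LrEdgeFlags edges)
    {
        /* filter[0] = horizontal taps, filter[1] = vertical taps (assumed);
         * with zero outermost taps the cheaper 5-tap kernel is sufficient. */
        const int use_5tap = !(filter[0][0] | filter[1][0]);
        dsp->wiener[use_5tap](dst, dst_stride, left, lpf, lpf_stride,
                              w, h, filter, edges);
    }

Folding both kernels behind one signature is what lets the init code assign
them directly and drop the old wiener_filter_##ext C wrapper together with its
68*384-element intermediate buffer.
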
--- /dev/null
+++ b/src/x86/looprestoration_sse.asm
@@ -1,0 +1,2448 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
+
+pb_right_ext_mask: times 24 db 0xff
+ times 8 db 0
+pb_0: times 16 db 0
+pb_3: times 16 db 3
+pb_15: times 16 db 15
+pb_0_1: times 8 db 0, 1
+pb_14_15: times 8 db 14, 15
+pw_1: times 8 dw 1
+pw_16: times 8 dw 16
+pw_128: times 8 dw 128
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+pw_2056: times 8 dw 2056
+pw_m16380: times 8 dw -16380
+pw_5_6: times 4 dw 5, 6
+pd_1024: times 4 dd 1024
+%if ARCH_X86_32
+pd_512: times 4 dd 512
+pd_2048: times 4 dd 2048
+%endif
+pd_0xF0080029: times 4 dd 0xF0080029
+pd_0xF00801C7: times 4 dd 0xF00801C7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
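+ ; x86-32 only: PIC helpers so RODATA can be addressed via PIC_sym() relative to a base register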
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+ %assign pic_reg_stk_off 4
+ %xdefine PIC_reg %1
+ %if %2 == 1
+ mov [esp], %1
+ %endif
+ LEA PIC_reg, PIC_base_offset
+ %if %3 == 1
+ XCHG_PIC_REG
+ %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+ mov [esp+pic_reg_stk_off], PIC_reg
+ %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+ mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym) (sym)
+%endif
+
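+; WIENER: shared wiener_filter7/wiener_filter5 implementation, instantiated for both sse2 and ssse3 via INIT_XMM below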
+%macro WIENER 0
+%if ARCH_X86_64
+DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h, x
+ %define base 0
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ movq m14, [fltq]
+ add lpfq, wq
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ add dstq, wq
+ movq m7, [fltq+16]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m8, [wiener_shufA]
+ pshufd m12, m14, q2222 ; x0 x0
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+%else
+ mova m10, [pw_m16380]
+ punpcklwd m14, m14
+ pshufd m11, m14, q0000 ; x0
+ pshufd m12, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+DECLARE_REG_TMP 4, 0, _, 5
+%if cpuflag(ssse3)
+ %define m10 [base+wiener_shufC]
+ %define m11 [base+wiener_shufD]
+ %define stk_off 96
+%else
+ %define m10 [base+pw_m16380]
+ %define m11 [stk+96]
+ %define stk_off 112
+%endif
+cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+ %define edgeb byte edged
+ %define edged [stk+ 8]
+ %define dstmp [stk+12]
+ %define hd dword [stk+16]
+ %define wq [stk+20]
+ %define dst_strideq [stk+24]
+ %define leftmp [stk+28]
+ %define t2 [stk+32]
+ %define t4 [stk+36]
+ %define t5 [stk+40]
+ %define t6 [stk+44]
+ %define m8 [base+wiener_shufA]
+ %define m9 [base+wiener_shufB]
+ %define m12 [stk+48]
+ %define m13 [stk+64]
+ %define m14 [stk+80]
+ %define m15 [base+pw_2056]
+ mov r1, r7m ; flt
+ mov r0, r0m ; dst
+ mov r5, r5m ; w
+ mov lpfq, lpfm
+ mov r2, r8m ; edge
+ mov r4, r6m ; h
+ movq m3, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r5
+ mov r1, r1m ; dst_stride
+ add lpfq, r5
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r5*2+stk_off]
+ mov hd, r4
+ neg r5
+ mov lpf_strideq, lpf_stridem
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r5
+ mov dst_strideq, r1
+ mov leftmp, r2
+%if cpuflag(ssse3)
+ pshufb m3, [base+wiener_init]
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q3333
+ punpcklqdq m3, m3
+%else
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m11, m0
+%endif
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+%endif
+ pshufd m6, m7, q0000 ; y0 y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea t3, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov [rsp+gprsize*1], lpf_strideq
+ add t3, lpf_strideq
+ mov [rsp+gprsize*0], t3 ; below
+ mov t4, t1
+ add t1, 384*2
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp+gprsize*0]
+ call .hv_bottom
+ add lpfq, [rsp+gprsize*1]
+ call .hv_bottom
+.v1:
+ call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+ RET
+.no_top:
+ lea t3, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov [rsp+gprsize*1], lpf_strideq
+ lea t3, [t3+lpf_strideq*2]
+ mov [rsp+gprsize*0], t3
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+.v2:
+ call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+ jmp .v1
+.extend_right:
+ movd m2, [lpfq-4]
+%if ARCH_X86_64
+ push r0
+ lea r0, [pb_right_ext_mask+21]
+ movu m0, [r0+xq+0]
+ movu m1, [r0+xq+8]
+ pop r0
+%else
+ movu m0, [r6+xq+0]
+ movu m1, [r6+xq+8]
+%endif
+%if cpuflag(ssse3)
+ pshufb m2, [base+pb_3]
+%else
+ punpcklbw m2, m2
+ pshuflw m2, m2, q3333
+ punpcklqdq m2, m2
+%endif
+ pand m4, m0
+ pand m5, m1
+ pandn m0, m2
+ pandn m1, m2
+ por m4, m0
+ por m5, m1
+ ret
+.h:
+ %define stk esp+4 ; offset due to call
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+%macro %%h7 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m8
+ pmaddubsw m0, m12
+ pshufb m1, m5, m8
+ pmaddubsw m1, m12
+ pshufb m2, m4, m9
+ pmaddubsw m2, m13
+ pshufb m3, m5, m9
+ pmaddubsw m3, m13
+ paddw m0, m2
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ paddw m1, m3
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m0, m2
+ mova m2, [base+pw_m16380]
+ paddw m1, m3
+ paddw m4, m2
+ paddw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 1
+ pslldq m1, m4, 1
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ psrldq m1, m4, 2
+ pslldq m2, m4, 2
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m12
+ paddw m0, m1
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m10
+ paddsw m0, m2
+ psrldq m1, m5, 1
+ pslldq m2, m5, 1
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m11
+ psrldq m2, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m2, m3
+ punpckhbw m4, m3
+ paddw m2, m4
+ pmullw m2, m12
+ paddw m1, m2
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m10
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h7
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ %%h7
+%if ARCH_X86_64
+ mova m2, [t4+xq*2]
+ paddw m2, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m2, [r2+xq*2]
+ mov r2, t2
+ paddw m2, [r2+xq*2]
+ mov r2, t5
+%endif
+ mova m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m5, [t5+xq*2]
+%else
+ mova m5, [r2+xq*2]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ paddw m4, m0, [t6+xq*2]
+%else
+ paddw m4, m0, [r2+xq*2]
+ mov r2, t4
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m0, m3
+ mova m3, [t3+xq*2+16]
+ paddd m4, m2
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+ mova m5, [t5+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t5
+ mova m5, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2+16]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
+%if ARCH_X86_64
+ paddw m4, m1, [t6+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, dst_strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, r1
+%endif
+ ret
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+ mov xq, wq
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+xq*2]
+ paddw m1, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m1, [r2+xq*2]
+ mov r2, t2
+ paddw m1, [r2+xq*2]
+ mov r2, t6
+%endif
+ mova m2, [t3+xq*2]
+ mova m4, [t1+xq*2]
+%if ARCH_X86_64
+ paddw m3, m4, [t6+xq*2]
+ paddw m4, [t5+xq*2]
+%else
+ paddw m3, m4, [r2+xq*2]
+ mov r2, t5
+ paddw m4, [r2+xq*2]
+ mov r2, t4
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m6
+ punpckhwd m3, m4
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ mova m3, [t3+xq*2+16]
+ mova m5, [t1+xq*2+16]
+%if ARCH_X86_64
+ paddw m4, m5, [t6+xq*2+16]
+ paddw m5, [t5+xq*2+16]
+%else
+ paddw m4, m5, [r2+xq*2+16]
+ mov r2, t5
+ paddw m5, [r2+xq*2+16]
+ movifnidn dstq, dstmp
+%endif
+ psrad m0, 11
+ psrad m1, 11
+ packssdw m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ add dstq, dst_strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+%endif
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ ret
+%endif
+
+%if ARCH_X86_64
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h, x
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ movq m14, [fltq]
+ add lpfq, wq
+ mova m8, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ add dstq, wq
+ movq m7, [fltq+16]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+ mova m12, [wiener_l_shuf]
+%else
+ punpcklwd m14, m14
+ pshufd m11, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+%if cpuflag(ssse3)
+ %define stk_off 80
+%else
+ %define m11 [stk+80]
+ %define stk_off 96
+%endif
+cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+ %define m12 [base+wiener_l_shuf]
+ %define m14 [stk+48]
+ mov r1, r7m ; flt
+ mov r0, r0m ; dst
+ mov r5, r5m ; w
+ mov lpfq, lpfm
+ mov r2, r8m ; edge
+ mov r4, r6m ; h
+ movq m2, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r5
+ mov r1, r1m ; dst_stride
+ add lpfq, r5
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r5*2+stk_off]
+ mov hd, r4
+ neg r5
+ mov lpf_strideq, lpf_stridem
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r5
+ mov dst_strideq, r1
+ mov leftmp, r2
+%if cpuflag(ssse3)
+ pshufb m2, [base+wiener_init]
+ pshufd m1, m2, q3333
+ punpcklqdq m2, m2
+%else
+ punpcklwd m2, m2
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m11, m0
+%endif
+ mova m13, m1
+ mova m14, m2
+%endif
+ pshufd m6, m7, q0000 ; __ y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea xq, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov t3, t1
+ add t1, 384*2
+ mov [rsp+gprsize*1], lpf_strideq
+ add xq, lpf_strideq
+ mov [rsp+gprsize*0], xq ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp+gprsize*0]
+ call .hv_bottom
+ add lpfq, [rsp+gprsize*1]
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea t3, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov [rsp+gprsize*1], lpf_strideq
+ lea t3, [t3+lpf_strideq*2]
+ mov [rsp+gprsize*0], t3
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+ add dstq, dst_strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+.v1:
+ call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+ jmp .end
+.h:
+ %define stk esp+4
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.h_have_right:
+%macro %%h5 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m9
+ pmaddubsw m0, m13
+ pshufb m1, m5, m9
+ pmaddubsw m1, m13
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m8
+ paddw m5, m8
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 2
+ pslldq m1, m4, 2
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m8
+ paddsw m0, m2
+ psrldq m1, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m11
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m8
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+ paddw m2, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ mova m3, [t2+xq*2]
+ paddw m4, m0, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m3, [r2+xq*2]
+ mov r2, t4
+ paddw m4, m0, [r2+xq*2]
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+xq*2+16]
+ paddw m2, [t1+xq*2+16]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, m1, [t4+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, dst_strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ movifnidn dstmp, dstq
+ ret
+%if cpuflag(ssse3)
+.v:
+ mov xq, wq
+.v_loop:
+ mova m3, [t1+xq*2]
+ paddw m1, m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m2, [t2+xq*2]
+ paddw m3, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m2, [r2+xq*2]
+ mov r2, t4
+ paddw m3, [r2+xq*2]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3
+ pmaddwd m2, m6
+ punpckhwd m3, m3
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+ mova m4, [t1+xq*2+16]
+ paddw m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, [t4+xq*2+16]
+%else
+ paddw m4, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ psrad m0, 11
+ psrad m1, 11
+ packssdw m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ ret
+%endif
+%endmacro
+
+INIT_XMM sse2
+WIENER
+
+INIT_XMM ssse3
+WIENER
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; self-guided ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
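+; MULLD: 32-bit multiply %1 *= %2, where each dword of %2 repeats one 16-bit value (pmulld requires SSE4.1)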
+%macro MULLD 2
+ pmulhuw m5, %1, %2
+ pmullw %1, %2
+ pslld m5, 16
+ paddd %1, m5
+%endmacro
+
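+; GATHERDD: emulated per-lane lookup of sgr_x_by_x entries for the dword indices in %2 (no SIMD gather before AVX2)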
+%macro GATHERDD 2
+ mova m5, m7
+ movd r6d, %2
+ %if ARCH_X86_64
+ movd %1, [r5+r6]
+ pextrw r6d, %2, 2
+ pinsrw m5, [r5+r6+2], 3
+ pextrw r6d, %2, 4
+ pinsrw %1, [r5+r6+2], 5
+ pextrw r6d, %2, 6
+ pinsrw m5, [r5+r6+2], 7
+ %else
+ movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
+ pextrw r6d, %2, 2
+ pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
+ pextrw r6d, %2, 4
+ pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
+ pextrw r6d, %2, 6
+ pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
+ %endif
+ por %1, m5
+%endmacro
+
+%if ARCH_X86_64
+cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ mov xlimd, edgem
+ movifnidn xd, xm
+ mov hd, hm
+ mov edged, xlimd
+ and xlimd, 2 ; have_right
+ add xd, xlimd
+ xor xlimd, 2 ; 2*!have_right
+%else
+cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ %define wq r0m
+ %define xlimd r1m
+ %define hd hmp
+ %define edgeb byte edgem
+
+ mov r6, edgem
+ and r6, 2 ; have_right
+ add xd, r6
+ xor r6, 2 ; 2*!have_right
+ mov xlimd, r6
+ SETUP_PIC r6, 0
+%endif
+
+ jnz .no_right
+ add xd, 7
+ and xd, ~7
+.no_right:
+ pxor m1, m1
+ lea srcq, [srcq+xq]
+ lea sumq, [sumq+xq*2-2]
+ lea sumsqq, [sumsqq+xq*4-4]
+ neg xq
+ mov wq, xq
+%if ARCH_X86_64
+ lea r10, [pb_right_ext_mask+24]
+%endif
+.loop_y:
+ mov xq, wq
+
+ ; load left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ test leftq, leftq
+ jz .load_left_from_main
+ movd m0, [leftq]
+ pslldq m0, 12
+ add leftq, 4
+ jmp .expand_x
+.no_left:
+ movd m0, [srcq+xq]
+ pshufb m0, [PIC_sym(pb_0)]
+ jmp .expand_x
+.load_left_from_main:
+ movd m0, [srcq+xq-2]
+ pslldq m0, 14
+.expand_x:
+ punpckhbw xm0, xm1
+
+ ; when we reach this, m0 contains left two px in highest words
+ cmp xd, -8
+ jle .loop_x
+.partial_load_and_extend:
+ movd m3, [srcq-4]
+ pshufb m3, [PIC_sym(pb_3)]
+ movq m2, [srcq+xq]
+ punpcklbw m2, m1
+ punpcklbw m3, m1
+%if ARCH_X86_64
+ movu m4, [r10+xq*2]
+%else
+ movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
+%endif
+ pand m2, m4
+ pandn m4, m3
+ por m2, m4
+ jmp .loop_x_noload
+.right_extend:
+ pshufb m2, m0, [PIC_sym(pb_14_15)]
+ jmp .loop_x_noload
+
+.loop_x:
+ movq m2, [srcq+xq]
+ punpcklbw m2, m1
+.loop_x_noload:
+ palignr m3, m2, m0, 12
+ palignr m4, m2, m0, 14
+
+ punpcklwd m5, m3, m2
+ punpckhwd m6, m3, m2
+ paddw m3, m4
+ punpcklwd m7, m4, m1
+ punpckhwd m4, m1
+ pmaddwd m5, m5
+ pmaddwd m6, m6
+ pmaddwd m7, m7
+ pmaddwd m4, m4
+ paddd m5, m7
+ paddd m6, m4
+ paddw m3, m2
+ movu [sumq+xq*2], m3
+ movu [sumsqq+xq*4+ 0], m5
+ movu [sumsqq+xq*4+16], m6
+
+ mova m0, m2
+ add xq, 8
+
+ ; if x <= -8 we can reload more pixels
+ ; else if x < 0 we reload and extend (this implies have_right=0)
+ ; else if x < xlimd we extend from previous load (this implies have_right=0)
+ ; else we are done
+
+ cmp xd, -8
+ jle .loop_x
+ test xd, xd
+ jl .partial_load_and_extend
+ cmp xd, xlimd
+ jl .right_extend
+
+ add sumsqq, (384+16)*4
+ add sumq, (384+16)*2
+ add srcq, strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+ movifnidn edged, edgem
+%else
+cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ %define sumsq_baseq dword [esp+0]
+ %define sum_baseq dword [esp+4]
+ %define ylimd dword [esp+8]
+ %define m8 [esp+12]
+ mov edged, r4m
+ mov hd, r3m
+%endif
+ mov xq, -2
+%if ARCH_X86_64
+ mov ylimd, edged
+ and ylimd, 8 ; have_bottom
+ shr ylimd, 2
+ sub ylimd, 2 ; -2 if have_bottom=0, else 0
+ mov sumsq_baseq, sumsqq
+ mov sum_baseq, sumq
+.loop_x:
+ mov sumsqq, sumsq_baseq
+ mov sumq, sum_baseq
+ lea yd, [hq+ylimq+2]
+%else
+ mov yd, edged
+ and yd, 8 ; have_bottom
+ shr yd, 2
+ sub yd, 2 ; -2 if have_bottom=0, else 0
+ mov sumsq_baseq, sumsqq
+ mov sum_baseq, sumq
+ mov ylimd, yd
+.loop_x:
+ mov sumsqd, sumsq_baseq
+ mov sumd, sum_baseq
+ lea yd, [hq+2]
+ add yd, ylimd
+%endif
+ lea sumsqq, [sumsqq+xq*4+4-(384+16)*4]
+ lea sumq, [sumq+xq*2+2-(384+16)*2]
+ test edgeb, 4 ; have_top
+ jnz .load_top
+ movu m0, [sumsqq+(384+16)*4*1]
+ movu m1, [sumsqq+(384+16)*4*1+16]
+ mova m2, m0
+ mova m3, m1
+ mova m4, m0
+ mova m5, m1
+ movu m6, [sumq+(384+16)*2*1]
+ mova m7, m6
+ mova m8, m6
+ jmp .loop_y_noload
+.load_top:
+ movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left]
+ movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right]
+ movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left]
+ movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right]
+ movu m6, [sumq-(384+16)*2*1] ; l2
+ movu m7, [sumq-(384+16)*2*0] ; l1
+.loop_y:
+%if ARCH_X86_64
+ movu m8, [sumq+(384+16)*2*1] ; l0
+%else
+ movu m4, [sumq+(384+16)*2*1] ; l0
+ mova m8, m4
+%endif
+ movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left]
+ movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right]
+.loop_y_noload:
+ paddd m0, m2
+ paddd m1, m3
+ paddw m6, m7
+ paddd m0, m4
+ paddd m1, m5
+ paddw m6, m8
+ movu [sumsqq+ 0], m0
+ movu [sumsqq+16], m1
+ movu [sumq], m6
+
+ ; shift position down by one
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+ mova m6, m7
+ mova m7, m8
+ add sumsqq, (384+16)*4
+ add sumq, (384+16)*2
+ dec yd
+ jg .loop_y
+ cmp yd, ylimd
+ jg .loop_y_noload
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ RET
+
+cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+ add hd, 2
+%if ARCH_X86_64
+ LEA r5, sgr_x_by_x-0xF03
+%else
+ SETUP_PIC r5, 0
+%endif
+ movd m6, sd
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ pxor m7, m7
+ DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+ mova m8, [pd_0xF00801C7]
+ mova m9, [pw_256]
+ psrld m10, m9, 13 ; pd_2048
+ mova m11, [pb_unpcklwdw]
+%else
+ %define m8 [PIC_sym(pd_0xF00801C7)]
+ %define m9 [PIC_sym(pw_256)]
+ %define m10 [PIC_sym(pd_2048)]
+ %define m11 [PIC_sym(pb_unpcklwdw)]
+%endif
+.loop_y:
+ mov xq, -2
+.loop_x:
+ movq m0, [bq+xq*2]
+ movq m1, [bq+xq*2+(384+16)*2]
+ punpcklwd m0, m7
+ punpcklwd m1, m7
+ movu m2, [aq+xq*4]
+ movu m3, [aq+xq*4+(384+16)*4]
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m2, m4 ; aa * 9
+ paddd m3, m5
+ pmaddwd m4, m0, m0
+ pmaddwd m5, m1, m1
+ pmaddwd m0, m8
+ pmaddwd m1, m8
+ psubd m2, m4 ; p = aa * 9 - bb * bb
+ psubd m3, m5
+ MULLD m2, m6
+ MULLD m3, m6
+ paddusw m2, m8
+ paddusw m3, m8
+ psrld m2, 20 ; z
+ psrld m3, 20
+ GATHERDD m4, m2 ; xx
+ GATHERDD m2, m3
+ psrld m4, 24
+ psrld m2, 24
+ packssdw m3, m4, m2
+ pshufb m4, m11
+ MULLD m0, m4
+ pshufb m2, m11
+ MULLD m1, m2
+ psubw m5, m9, m3
+ paddd m0, m10
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ movq [bq+xq*2], m5
+ psrldq m5, 8
+ movq [bq+xq*2+(384+16)*2], m5
+ movu [aq+xq*4], m0
+ movu [aq+xq*4+(384+16)*4], m1
+ add xd, 4
+ cmp xd, wd
+ jl .loop_x
+ add aq, (384+16)*4*2
+ add bq, (384+16)*2*2
+ sub hd, 2
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ mova m15, [pw_16]
+ mov tmp_baseq, tq
+ mov src_baseq, srcq
+ mov a_baseq, aq
+ mov b_baseq, bq
+ xor xd, xd
+%else
+cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ %define tmp_baseq [esp+8]
+ %define src_baseq [esp+12]
+ %define a_baseq [esp+16]
+ %define b_baseq [esp+20]
+ %define wd [esp+24]
+ %define hd [esp+28]
+ mov tmp_baseq, tq
+ mov src_baseq, srcq
+ mov a_baseq, aq
+ mov b_baseq, bq
+ mov wd, xd
+ mov hd, yd
+ xor xd, xd
+ SETUP_PIC yd, 1, 1
+ jmp .loop_start
+%endif
+
+.loop_x:
+ mov tq, tmp_baseq
+ mov srcq, src_baseq
+ mov aq, a_baseq
+ mov bq, b_baseq
+%if ARCH_X86_32
+.loop_start:
+ movu m0, [bq+xq*2-(384+16)*2-2]
+ movu m2, [bq+xq*2-(384+16)*2+2]
+ mova m1, [bq+xq*2-(384+16)*2] ; b:top
+ paddw m0, m2 ; b:tl+tr
+ movu m2, [bq+xq*2-2]
+ movu m3, [bq+xq*2+2]
+ paddw m1, [bq+xq*2] ; b:top+ctr
+ paddw m2, m3 ; b:l+r
+ mova [esp+0x80], m0
+ mova [esp+0x70], m1
+ mova [esp+0x60], m2
+%endif
+ movu m0, [aq+xq*4-(384+16)*4-4]
+ movu m2, [aq+xq*4-(384+16)*4+4]
+ mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half]
+ paddd m0, m2 ; a:tl+tr [first half]
+ movu m2, [aq+xq*4-(384+16)*4-4+16]
+ movu m4, [aq+xq*4-(384+16)*4+4+16]
+ mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half]
+ paddd m2, m4 ; a:tl+tr [second half]
+ movu m4, [aq+xq*4-4]
+ movu m5, [aq+xq*4+4]
+ paddd m1, [aq+xq*4] ; a:top+ctr [first half]
+ paddd m4, m5 ; a:l+r [first half]
+ movu m5, [aq+xq*4+16-4]
+ movu m6, [aq+xq*4+16+4]
+ paddd m3, [aq+xq*4+16] ; a:top+ctr [second half]
+ paddd m5, m6 ; a:l+r [second half]
+%if ARCH_X86_64
+ movu m6, [bq+xq*2-(384+16)*2-2]
+ movu m8, [bq+xq*2-(384+16)*2+2]
+ mova m7, [bq+xq*2-(384+16)*2] ; b:top
+ paddw m6, m8 ; b:tl+tr
+ movu m8, [bq+xq*2-2]
+ movu m9, [bq+xq*2+2]
+ paddw m7, [bq+xq*2] ; b:top+ctr
+ paddw m8, m9 ; b:l+r
+%endif
+
+ lea tq, [tq+xq*2]
+ lea srcq, [srcq+xq*1]
+ lea aq, [aq+xq*4+(384+16)*4]
+ lea bq, [bq+xq*2+(384+16)*2]
+ mov yd, hd
+.loop_y:
+%if ARCH_X86_64
+ movu m9, [bq-2]
+ movu m10, [bq+2]
+ paddw m7, [bq] ; b:top+ctr+bottom
+ paddw m9, m10 ; b:bl+br
+ paddw m10, m7, m8 ; b:top+ctr+bottom+l+r
+ paddw m6, m9 ; b:tl+tr+bl+br
+ psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom
+ paddw m10, m6
+ psllw m10, 2
+ psubw m10, m6 ; aa
+ pxor m14, m14
+ movq m12, [srcq]
+ punpcklbw m12, m14
+ punpcklwd m6, m10, m15
+ punpckhwd m10, m15
+ punpcklwd m13, m12, m15
+ punpckhwd m12, m15
+ pmaddwd m6, m13 ; aa*src[x]+256 [first half]
+ pmaddwd m10, m12 ; aa*src[x]+256 [second half]
+%else
+ paddd m1, [aq] ; a:top+ctr+bottom [first half]
+ paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
+ mova [esp+0x50], m1
+ mova [esp+0x40], m3
+ mova [esp+0x30], m4
+ movu m6, [aq-4]
+ movu m7, [aq+4]
+ paddd m1, m4 ; a:top+ctr+bottom+l+r [first half]
+ paddd m3, m5 ; a:top+ctr+bottom+l+r [second half]
+ paddd m6, m7 ; a:bl+br [first half]
+ movu m7, [aq+16-4]
+ movu m4, [aq+16+4]
+ paddd m7, m4 ; a:bl+br [second half]
+ paddd m0, m6 ; a:tl+tr+bl+br [first half]
+ paddd m2, m7 ; a:tl+tr+bl+br [second half]
+ paddd m1, m0
+ paddd m3, m2
+ pslld m1, 2
+ pslld m3, 2
+ psubd m1, m0 ; bb [first half]
+ psubd m3, m2 ; bb [second half]
+%endif
+
+%if ARCH_X86_64
+ movu m11, [aq-4]
+ movu m12, [aq+4]
+ paddd m1, [aq] ; a:top+ctr+bottom [first half]
+ paddd m11, m12 ; a:bl+br [first half]
+ movu m12, [aq+16-4]
+ movu m13, [aq+16+4]
+ paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
+ paddd m12, m13 ; a:bl+br [second half]
+ paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half]
+ paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half]
+ paddd m0, m11 ; a:tl+tr+bl+br [first half]
+ paddd m2, m12 ; a:tl+tr+bl+br [second half]
+ paddd m13, m0
+ paddd m14, m2
+ pslld m13, 2
+ pslld m14, 2
+ psubd m13, m0 ; bb [first half]
+ psubd m14, m2 ; bb [second half]
+ psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
+ psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
+%else
+ mova m4, [esp+0x80]
+ mova [esp+0x80], m5
+ mova m5, [esp+0x70]
+ mova [esp+0x70], m6
+ mova m6, [esp+0x60]
+ mova [esp+0x60], m7
+ mova [esp+0x20], m1
+ movu m7, [bq-2]
+ movu m1, [bq+2]
+ paddw m5, [bq] ; b:top+ctr+bottom
+ paddw m7, m1
+ paddw m1, m5, m6 ; b:top+ctr+bottom+l+r
+ paddw m4, m7 ; b:tl+tr+bl+br
+ psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom
+ paddw m1, m4
+ psllw m1, 2
+ psubw m1, m4 ; aa
+ movq m0, [srcq]
+ XCHG_PIC_REG
+ punpcklbw m0, [PIC_sym(pb_0)]
+ punpcklwd m4, m1, [PIC_sym(pw_16)]
+ punpckhwd m1, [PIC_sym(pw_16)]
+ punpcklwd m2, m0, [PIC_sym(pw_16)]
+ punpckhwd m0, [PIC_sym(pw_16)]
+ XCHG_PIC_REG
+ pmaddwd m4, m2 ; aa*src[x]+256 [first half]
+ pmaddwd m1, m0 ; aa*src[x]+256 [second half]
+%endif
+
+%if ARCH_X86_64
+ paddd m6, m13
+ paddd m10, m14
+ psrad m6, 9
+ psrad m10, 9
+ packssdw m6, m10
+ mova [tq], m6
+%else
+ paddd m4, [esp+0x20]
+ paddd m1, m3
+ psrad m4, 9
+ psrad m1, 9
+ packssdw m4, m1
+ mova [tq], m4
+%endif
+
+ ; shift to next row
+%if ARCH_X86_64
+ mova m0, m4
+ mova m2, m5
+ mova m4, m11
+ mova m5, m12
+ mova m6, m8
+ mova m8, m9
+%else
+ mova m1, [esp+0x50]
+ mova m3, [esp+0x40]
+ mova m0, [esp+0x30]
+ mova m2, [esp+0x80]
+ mova m4, [esp+0x70]
+ mova [esp+0x70], m5
+ mova m5, [esp+0x60]
+ mova [esp+0x80], m6
+ mova [esp+0x60], m7
+ psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
+ psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
+%endif
+
+ add srcq, strideq
+ add aq, (384+16)*4
+ add bq, (384+16)*2
+ add tq, 384*2
+ dec yd
+ jg .loop_y
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ RET
+
+cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
+ movifnidn hd, hm
+%if ARCH_X86_32
+ SETUP_PIC r6, 0
+%endif
+ movd m0, wtm
+ pshufb m0, [PIC_sym(pb_0_1)]
+ psllw m0, 4
+ pxor m7, m7
+ DEFINE_ARGS dst, stride, t, w, h, idx
+.loop_y:
+ xor idxd, idxd
+.loop_x:
+ mova m1, [tq+idxq*2+ 0]
+ mova m4, [tq+idxq*2+16]
+ mova m5, [dstq+idxq]
+ punpcklbw m2, m5, m7
+ punpckhbw m5, m7
+ psllw m3, m2, 4
+ psllw m6, m5, 4
+ psubw m1, m3
+ psubw m4, m6
+ pmulhrsw m1, m0
+ pmulhrsw m4, m0
+ paddw m1, m2
+ paddw m4, m5
+ packuswb m1, m4
+ mova [dstq+idxq], m1
+ add idxd, 16
+ cmp idxd, wd
+ jl .loop_x
+ add dstq, strideq
+ add tq, 384 * 2
+ dec hd
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ mova m10, [pb_0]
+ mova m11, [pb_0_1]
+%else
+cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ %define edgeb byte edgem
+ %define wd xd
+ %define wq wd
+ %define wm r5m
+ %define strideq r4m
+ SUB esp, 8
+ SETUP_PIC sumsqd, 1, 1
+
+ %define m10 [PIC_sym(pb_0)]
+ %define m11 [PIC_sym(pb_0_1)]
+%endif
+
+ test edgeb, 2 ; have_right
+ jz .no_right
+ xor xlimd, xlimd
+ add wd, 2
+ add wd, 15
+ and wd, ~15
+ jmp .right_done
+.no_right:
+ mov xlimd, 3
+ dec wd
+.right_done:
+ pxor m1, m1
+ lea srcq, [srcq+wq+1]
+ lea sumq, [sumq+wq*2-2]
+ lea sumsqq, [sumsqq+wq*4-4]
+ neg wq
+%if ARCH_X86_64
+ lea r10, [pb_right_ext_mask+24]
+%else
+ mov wm, xd
+ %define wq wm
+%endif
+
+.loop_y:
+ mov xq, wq
+ ; load left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ test leftq, leftq
+ jz .load_left_from_main
+ movd m0, [leftq]
+ movd m2, [srcq+xq-1]
+ pslldq m2, 4
+ por m0, m2
+ pslldq m0, 11
+ add leftq, 4
+ jmp .expand_x
+.no_left:
+ movd m0, [srcq+xq-1]
+ XCHG_PIC_REG
+ pshufb m0, m10
+ XCHG_PIC_REG
+ jmp .expand_x
+.load_left_from_main:
+ movd m0, [srcq+xq-4]
+ pslldq m0, 12
+.expand_x:
+ punpckhbw m0, m1
+
+ ; when we reach this, m0 contains left two px in highest words
+ cmp xd, -8
+ jle .loop_x
+ test xd, xd
+ jge .right_extend
+.partial_load_and_extend:
+ XCHG_PIC_REG
+ movd m3, [srcq-1]
+ movq m2, [srcq+xq]
+ pshufb m3, m10
+ punpcklbw m3, m1
+ punpcklbw m2, m1
+%if ARCH_X86_64
+ movu m4, [r10+xq*2]
+%else
+ movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
+ XCHG_PIC_REG
+%endif
+ pand m2, m4
+ pandn m4, m3
+ por m2, m4
+ jmp .loop_x_noload
+.right_extend:
+ psrldq m2, m0, 14
+ XCHG_PIC_REG
+ pshufb m2, m11
+ XCHG_PIC_REG
+ jmp .loop_x_noload
+
+.loop_x:
+ movq m2, [srcq+xq]
+ punpcklbw m2, m1
+.loop_x_noload:
+ palignr m3, m2, m0, 8
+ palignr m4, m2, m0, 10
+ palignr m5, m2, m0, 12
+ palignr m6, m2, m0, 14
+
+%if ARCH_X86_64
+ paddw m0, m3, m2
+ punpcklwd m7, m3, m2
+ punpckhwd m3, m2
+ paddw m0, m4
+ punpcklwd m8, m4, m5
+ punpckhwd m4, m5
+ paddw m0, m5
+ punpcklwd m9, m6, m1
+ punpckhwd m5, m6, m1
+ paddw m0, m6
+ pmaddwd m7, m7
+ pmaddwd m3, m3
+ pmaddwd m8, m8
+ pmaddwd m4, m4
+ pmaddwd m9, m9
+ pmaddwd m5, m5
+ paddd m7, m8
+ paddd m3, m4
+ paddd m7, m9
+ paddd m3, m5
+ movu [sumq+xq*2], m0
+ movu [sumsqq+xq*4+ 0], m7
+ movu [sumsqq+xq*4+16], m3
+%else
+ paddw m0, m3, m2
+ paddw m0, m4
+ paddw m0, m5
+ paddw m0, m6
+ movu [sumq+xq*2], m0
+ punpcklwd m7, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m0, m4, m5
+ punpckhwd m4, m5
+ punpckhwd m5, m6, m1
+ pmaddwd m7, m7
+ pmaddwd m3, m3
+ pmaddwd m0, m0
+ pmaddwd m4, m4
+ pmaddwd m5, m5
+ paddd m7, m0
+ paddd m3, m4
+ paddd m3, m5
+ punpcklwd m0, m6, m1
+ pmaddwd m0, m0
+ paddd m7, m0
+ movu [sumsqq+xq*4+ 0], m7
+ movu [sumsqq+xq*4+16], m3
+%endif
+
+ mova m0, m2
+ add xq, 8
+
+ ; if x <= -8 we can reload more pixels
+ ; else if x < 0 we reload and extend (this implies have_right=0)
+ ; else if x < xlimd we extend from previous load (this implies have_right=0)
+ ; else we are done
+
+ cmp xd, -8
+ jle .loop_x
+ test xd, xd
+ jl .partial_load_and_extend
+ cmp xd, xlimd
+ jl .right_extend
+
+ add srcq, strideq
+ add sumsqq, (384+16)*4
+ add sumq, (384+16)*2
+ dec hd
+ jg .loop_y
+%if ARCH_X86_32
+ ADD esp, 8
+%endif
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
+ mov ylimd, edged
+%else
+cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ %define wm [esp+0]
+ %define hm [esp+4]
+ %define edgem [esp+8]
+ mov wm, xd
+ mov hm, yd
+ mov edgem, ylimd
+%endif
+
+ and ylimd, 8 ; have_bottom
+ shr ylimd, 2
+ sub ylimd, 3 ; -3 if have_bottom=0, else -1
+ mov xq, -2
+%if ARCH_X86_64
+.loop_x:
+ lea yd, [hd+ylimd+2]
+ lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+ lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2]
+ test edgeb, 4 ; have_top
+ jnz .load_top
+ movu m0, [sumsq_ptrq+(384+16)*4*1]
+ movu m1, [sumsq_ptrq+(384+16)*4*1+16]
+ mova m2, m0
+ mova m3, m1
+ mova m4, m0
+ mova m5, m1
+ mova m6, m0
+ mova m7, m1
+ movu m10, [sum_ptrq+(384+16)*2*1]
+ mova m11, m10
+ mova m12, m10
+ mova m13, m10
+ jmp .loop_y_second_load
+.load_top:
+ movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
+ movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
+ movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
+ movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
+ mova m2, m0
+ mova m3, m1
+ movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4
+ movu m12, [sum_ptrq-(384+16)*2*0] ; l2
+ mova m11, m10
+.loop_y:
+ movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
+ movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
+ movu m13, [sum_ptrq+(384+16)*2*1] ; l1
+.loop_y_second_load:
+ test yd, yd
+ jle .emulate_second_load
+ movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
+ movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
+ movu m14, [sum_ptrq+(384+16)*2*2] ; l0
+.loop_y_noload:
+ paddd m0, m2
+ paddd m1, m3
+ paddw m10, m11
+ paddd m0, m4
+ paddd m1, m5
+ paddw m10, m12
+ paddd m0, m6
+ paddd m1, m7
+ paddw m10, m13
+ paddd m0, m8
+ paddd m1, m9
+ paddw m10, m14
+ movu [sumsq_ptrq+ 0], m0
+ movu [sumsq_ptrq+16], m1
+ movu [sum_ptrq], m10
+
+ ; shift position down by one
+ mova m0, m4
+ mova m1, m5
+ mova m2, m6
+ mova m3, m7
+ mova m4, m8
+ mova m5, m9
+ mova m10, m12
+ mova m11, m13
+ mova m12, m14
+ add sumsq_ptrq, (384+16)*4*2
+ add sum_ptrq, (384+16)*2*2
+ sub yd, 2
+ jge .loop_y
+ ; l1 = l0
+ mova m6, m8
+ mova m7, m9
+ mova m13, m14
+ cmp yd, ylimd
+ jg .loop_y_noload
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ RET
+.emulate_second_load:
+ mova m8, m6
+ mova m9, m7
+ mova m14, m13
+ jmp .loop_y_noload
+%else
+.sumsq_loop_x:
+ lea yd, [ylimd+2]
+ add yd, hm
+ lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+ test byte edgem, 4 ; have_top
+ jnz .sumsq_load_top
+ movu m0, [sumsq_ptrq+(384+16)*4*1]
+ movu m1, [sumsq_ptrq+(384+16)*4*1+16]
+ mova m4, m0
+ mova m5, m1
+ mova m6, m0
+ mova m7, m1
+ mova [esp+0x1c], m0
+ mova [esp+0x0c], m1
+ jmp .sumsq_loop_y_second_load
+.sumsq_load_top:
+ movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
+ movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
+ movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
+ movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
+ mova [esp+0x1c], m0
+ mova [esp+0x0c], m1
+.sumsq_loop_y:
+ movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
+ movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
+.sumsq_loop_y_second_load:
+ test yd, yd
+ jle .sumsq_emulate_second_load
+ movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
+ movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
+.sumsq_loop_y_noload:
+ paddd m0, [esp+0x1c]
+ paddd m1, [esp+0x0c]
+ paddd m0, m4
+ paddd m1, m5
+ paddd m0, m6
+ paddd m1, m7
+ paddd m0, m2
+ paddd m1, m3
+ movu [sumsq_ptrq+ 0], m0
+ movu [sumsq_ptrq+16], m1
+
+ ; shift position down by one
+ mova m0, m4
+ mova m1, m5
+ mova m4, m2
+ mova m5, m3
+ mova [esp+0x1c], m6
+ mova [esp+0x0c], m7
+ add sumsq_ptrq, (384+16)*4*2
+ sub yd, 2
+ jge .sumsq_loop_y
+ ; l1 = l0
+ mova m6, m2
+ mova m7, m3
+ cmp yd, ylimd
+ jg .sumsq_loop_y_noload
+ add xd, 8
+ cmp xd, wm
+ jl .sumsq_loop_x
+
+ mov xd, -2
+.sum_loop_x:
+ lea yd, [ylimd+2]
+ add yd, hm
+ lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+ test byte edgem, 4 ; have_top
+ jnz .sum_load_top
+ movu m0, [sum_ptrq+(384+16)*2*1]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp .sum_loop_y_second_load
+.sum_load_top:
+ movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4
+ movu m2, [sum_ptrq-(384+16)*2*0] ; l2
+ mova m1, m0
+.sum_loop_y:
+ movu m3, [sum_ptrq+(384+16)*2*1] ; l1
+.sum_loop_y_second_load:
+ test yd, yd
+ jle .sum_emulate_second_load
+ movu m4, [sum_ptrq+(384+16)*2*2] ; l0
+.sum_loop_y_noload:
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movu [sum_ptrq], m0
+
+ ; shift position down by one
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ add sum_ptrq, (384+16)*2*2
+ sub yd, 2
+ jge .sum_loop_y
+ ; l1 = l0
+ mova m3, m4
+ cmp yd, ylimd
+ jg .sum_loop_y_noload
+ add xd, 8
+ cmp xd, wm
+ jl .sum_loop_x
+ RET
+.sumsq_emulate_second_load:
+ mova m2, m6
+ mova m3, m7
+ jmp .sumsq_loop_y_noload
+.sum_emulate_second_load:
+ mova m4, m3
+ jmp .sum_loop_y_noload
+%endif
+
+cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+ add hd, 2
+%if ARCH_X86_64
+ LEA r5, sgr_x_by_x-0xF03
+%else
+ SETUP_PIC r5, 0
+%endif
+ movd m6, sd
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ pxor m7, m7
+ DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+ mova m8, [pd_0xF0080029]
+ mova m9, [pw_256]
+ psrld m10, m9, 15 ; pd_512
+%else
+ %define m8 [PIC_sym(pd_0xF0080029)]
+ %define m9 [PIC_sym(pw_256)]
+ %define m10 [PIC_sym(pd_512)]
+%endif
+.loop_y:
+ mov xq, -2
+.loop_x:
+ movq m0, [bq+xq*2+0]
+ movq m1, [bq+xq*2+8]
+ punpcklwd m0, m7
+ punpcklwd m1, m7
+ movu m2, [aq+xq*4+ 0]
+ movu m3, [aq+xq*4+16]
+ pslld m4, m2, 3 ; aa * 8
+ pslld m5, m3, 3
+ paddd m2, m4 ; aa * 9
+ paddd m3, m5
+ paddd m4, m4 ; aa * 16
+ paddd m5, m5
+ paddd m2, m4 ; aa * 25
+ paddd m3, m5
+ pmaddwd m4, m0, m0
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p = aa * 25 - bb * bb
+ psubd m3, m5
+ MULLD m2, m6
+ MULLD m3, m6
+ paddusw m2, m8
+ paddusw m3, m8
+ psrld m2, 20 ; z
+ psrld m3, 20
+ GATHERDD m4, m2 ; xx
+ GATHERDD m2, m3
+ psrld m4, 24
+ psrld m2, 24
+ packssdw m3, m4, m2
+ pmullw m4, m8
+ pmullw m2, m8
+ psubw m5, m9, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m2
+ paddd m0, m10
+ paddd m1, m10
+ psrld m0, 10
+ psrld m1, 10
+ movu [bq+xq*2], m5
+ movu [aq+xq*4+ 0], m0
+ movu [aq+xq*4+16], m1
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ add aq, (384+16)*4*2
+ add bq, (384+16)*2*2
+ sub hd, 2
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
+ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ mov tmp_baseq, tq
+ mov src_baseq, srcq
+ mov a_baseq, aq
+ mov b_baseq, bq
+ mova m9, [pw_5_6]
+ mova m12, [pw_256]
+ psrlw m10, m12, 8 ; pw_1
+ psrlw m11, m12, 1 ; pw_128
+ pxor m13, m13
+%else
+cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
+ %define tmp_baseq r0m
+ %define src_baseq r1m
+ %define a_baseq r3m
+ %define b_baseq r4m
+ %define wd r5m
+ %define hd r6m
+
+ SUB esp, 8
+ SETUP_PIC yd
+
+ %define m8 m5
+ %define m9 [PIC_sym(pw_5_6)]
+ %define m10 [PIC_sym(pw_1)]
+ %define m11 [PIC_sym(pw_128)]
+ %define m12 [PIC_sym(pw_256)]
+ %define m13 m0
+%endif
+ xor xd, xd
+.loop_x:
+ mov tq, tmp_baseq
+ mov srcq, src_baseq
+ mov aq, a_baseq
+ mov bq, b_baseq
+ movu m0, [aq+xq*4-(384+16)*4-4]
+ mova m1, [aq+xq*4-(384+16)*4]
+ movu m2, [aq+xq*4-(384+16)*4+4]
+ movu m3, [aq+xq*4-(384+16)*4-4+16]
+ mova m4, [aq+xq*4-(384+16)*4+16]
+ movu m5, [aq+xq*4-(384+16)*4+4+16]
+ paddd m0, m2
+ paddd m3, m5
+ paddd m0, m1
+ paddd m3, m4
+ pslld m2, m0, 2
+ pslld m5, m3, 2
+ paddd m2, m0
+ paddd m5, m3
+ paddd m0, m2, m1 ; prev_odd_b [first half]
+ paddd m1, m5, m4 ; prev_odd_b [second half]
+ movu m3, [bq+xq*2-(384+16)*2-2]
+ mova m4, [bq+xq*2-(384+16)*2]
+ movu m5, [bq+xq*2-(384+16)*2+2]
+ paddw m3, m5
+ punpcklwd m5, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m5, m9
+ pmaddwd m3, m9
+ mova m2, m5
+ packssdw m2, m3 ; prev_odd_a
+ lea tq, [tq+xq*2]
+ lea srcq, [srcq+xq*1]
+ lea aq, [aq+xq*4+(384+16)*4]
+ lea bq, [bq+xq*2+(384+16)*2]
+%if ARCH_X86_32
+ mov [esp], PIC_reg
+%endif
+ mov yd, hd
+ XCHG_PIC_REG
+.loop_y:
+ movu m3, [aq-4]
+ mova m4, [aq]
+ movu m5, [aq+4]
+ paddd m3, m5
+ paddd m3, m4
+ pslld m5, m3, 2
+ paddd m5, m3
+ paddd m5, m4 ; cur_odd_b [first half]
+ movu m3, [aq+16-4]
+ mova m6, [aq+16]
+ movu m7, [aq+16+4]
+ paddd m3, m7
+ paddd m3, m6
+ pslld m7, m3, 2
+ paddd m7, m3
+ paddd m4, m7, m6 ; cur_odd_b [second half]
+ movu m3, [bq-2]
+ mova m6, [bq]
+ movu m7, [bq+2]
+ paddw m3, m7
+ punpcklwd m7, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m7, m9
+ pmaddwd m3, m9
+ packssdw m6, m7, m3 ; cur_odd_a
+
+ paddd m0, m5 ; cur_even_b [first half]
+ paddd m1, m4 ; cur_even_b [second half]
+ paddw m2, m6 ; cur_even_a
+
+ movq m3, [srcq]
+%if ARCH_X86_64
+ punpcklbw m3, m13
+%else
+ mova [td], m5
+ pxor m7, m7
+ punpcklbw m3, m7
+%endif
+ punpcklwd m7, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m8, m2, m12
+ punpckhwd m2, m12
+ pmaddwd m7, m8
+ pmaddwd m3, m2
+ paddd m7, m0
+ paddd m3, m1
+ psrad m7, 9
+ psrad m3, 9
+
+%if ARCH_X86_32
+ pxor m13, m13
+%endif
+ movq m8, [srcq+strideq]
+ punpcklbw m8, m13
+ punpcklwd m0, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m1, m6, m11
+ punpckhwd m2, m6, m11
+ pmaddwd m0, m1
+ pmaddwd m8, m2
+%if ARCH_X86_64
+ paddd m0, m5
+%else
+ paddd m0, [td]
+%endif
+ paddd m8, m4
+ psrad m0, 8
+ psrad m8, 8
+
+ packssdw m7, m3
+ packssdw m0, m8
+%if ARCH_X86_32
+ mova m5, [td]
+%endif
+ mova [tq+384*2*0], m7
+ mova [tq+384*2*1], m0
+
+ mova m0, m5
+ mova m1, m4
+ mova m2, m6
+ add aq, (384+16)*4*2
+ add bq, (384+16)*2*2
+ add tq, 384*2*2
+ lea srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+ sub yd, 2
+%else
+ sub dword [esp+4], 2
+%endif
+ jg .loop_y
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+%if ARCH_X86_32
+ ADD esp, 8
+%endif
+ RET
+
+%undef t2
+cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
+ movifnidn wd, wm
+ movd m0, wtm
+%if ARCH_X86_64
+ movifnidn hd, hm
+ mova m10, [pd_1024]
+ pxor m11, m11
+%else
+ SETUP_PIC hd, 0
+ %define m10 [PIC_sym(pd_1024)]
+ %define m11 m7
+%endif
+ pshufd m0, m0, 0
+ DEFINE_ARGS dst, stride, t1, t2, w, h, idx
+%if ARCH_X86_32
+ %define hd hmp
+%endif
+
+.loop_y:
+ xor idxd, idxd
+.loop_x:
+ mova m1, [t1q+idxq*2+ 0]
+ mova m2, [t1q+idxq*2+16]
+ mova m3, [t2q+idxq*2+ 0]
+ mova m4, [t2q+idxq*2+16]
+ mova m6, [dstq+idxq]
+%if ARCH_X86_32
+ pxor m11, m11
+%endif
+ punpcklbw m5, m6, m11
+ punpckhbw m6, m11
+ psllw m7, m5, 4
+ psubw m1, m7
+ psubw m3, m7
+ psllw m7, m6, 4
+ psubw m2, m7
+ psubw m4, m7
+ punpcklwd m7, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m2, m4
+ punpckhwd m2, m4
+ pmaddwd m7, m0
+ pmaddwd m1, m0
+ pmaddwd m3, m0
+ pmaddwd m2, m0
+ paddd m7, m10
+ paddd m1, m10
+ paddd m3, m10
+ paddd m2, m10
+ psrad m7, 11
+ psrad m1, 11
+ psrad m3, 11
+ psrad m2, 11
+ packssdw m7, m1
+ packssdw m3, m2
+ paddw m7, m5
+ paddw m3, m6
+ packuswb m7, m3
+ mova [dstq+idxq], m7
+ add idxd, 16
+ cmp idxd, wd
+ jl .loop_x
+ add dstq, strideq
+ add t1q, 384 * 2
+ add t2q, 384 * 2
+ dec hd
+ jg .loop_y
+ RET
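
For reference, the sgr_weighted2 routine that closes the hunk above blends two self-guided filter outputs back into the destination plane. Below is a minimal scalar sketch of that per-pixel arithmetic; it is an illustration only, not dav1d's reference C. Function and parameter names are invented, and the assumption that t1/t2 hold the filtered planes at 16x pixel scale is inferred from the dst << 4 subtraction in the asm.

/*
 * Minimal scalar sketch of the sgr_weighted2 inner loop (illustrative
 * names; not dav1d's reference implementation). t1/t2 are assumed to be
 * the two filtered planes at 16x pixel scale, matching the dst << 4 above.
 */
#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void sgr_weighted2_scalar(uint8_t *dst, ptrdiff_t stride,
                                 const int16_t *t1, const int16_t *t2,
                                 int w, int h, int wt0, int wt1)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int u = dst[x] << 4;          /* psllw m7, m5, 4 */
            const int v = wt0 * (t1[x] - u) +   /* psubw + pmaddwd with wt */
                          wt1 * (t2[x] - u);
            /* paddd pd_1024, psrad 11, then add back dst and clip */
            dst[x] = clip_u8(dst[x] + ((v + 1024) >> 11));
        }
        dst += stride;  /* add dstq, strideq */
        t1  += 384;     /* add t1q, 384 * 2 (bytes) */
        t2  += 384;     /* add t2q, 384 * 2 (bytes) */
    }
}
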
--- a/src/x86/looprestoration_ssse3.asm
+++ /dev/null
@@ -1,1953 +1,0 @@
-; Copyright © 2018, VideoLAN and dav1d authors
-; Copyright © 2018, Two Orioles, LLC
-; Copyright © 2018, VideoLabs
-; All rights reserved.
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions are met:
-;
-; 1. Redistributions of source code must retain the above copyright notice, this
-; list of conditions and the following disclaimer.
-;
-; 2. Redistributions in binary form must reproduce the above copyright notice,
-; this list of conditions and the following disclaimer in the documentation
-; and/or other materials provided with the distribution.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-%include "config.asm"
-%include "ext/x86/x86inc.asm"
-
-SECTION_RODATA 16
-
-pb_right_ext_mask: times 16 db 0xff
- times 16 db 0
-pb_14x0_1_2: times 14 db 0
- db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
- db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
-pb_0: times 16 db 0
-pb_2: times 16 db 2
-pb_3: times 16 db 3
-pb_4: times 16 db 4
-pb_15: times 16 db 15
-pb_0_1: times 8 db 0, 1
-pb_6_7: times 8 db 6, 7
-pb_14_15: times 8 db 14, 15
-pw_1: times 8 dw 1
-pw_16: times 8 dw 16
-pw_128: times 8 dw 128
-pw_255: times 8 dw 255
-pw_256: times 8 dw 256
-pw_2048: times 8 dw 2048
-pw_16380: times 8 dw 16380
-pw_5_6: times 4 dw 5, 6
-pw_0_128: times 4 dw 0, 128
-pd_1024: times 4 dd 1024
-%if ARCH_X86_32
-pd_256: times 4 dd 256
-pd_512: times 4 dd 512
-pd_2048: times 4 dd 2048
-%endif
-pd_0xF0080029: times 4 dd 0xF0080029
-pd_0xF00801C7: times 4 dd 0XF00801C7
-
-cextern sgr_x_by_x
-
-SECTION .text
-
-%if ARCH_X86_32
- %define PIC_base_offset $$
-
- %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
- %assign pic_reg_stk_off 4
- %xdefine PIC_reg %1
- %if %2 == 1
- mov [esp], %1
- %endif
- LEA PIC_reg, PIC_base_offset
- %if %3 == 1
- XCHG_PIC_REG
- %endif
- %endmacro
-
- %macro XCHG_PIC_REG 0
- mov [esp+pic_reg_stk_off], PIC_reg
- %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
- mov PIC_reg, [esp+pic_reg_stk_off]
- %endmacro
-
- %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
-
-%else
- %macro XCHG_PIC_REG 0
- %endmacro
-
- %define PIC_sym(sym) (sym)
-%endif
-
-%macro PALIGNR 4 ; dst, src1, src2, shift
- %if cpuflag(ssse3)
- palignr %1, %2, %3, %4
- %else
- %assign %%i regnumof%+%1 + 1
- %define %%tmp m %+ %%i
- psrldq %1, %3, %4
- pslldq %%tmp, %2, 16-%4
- por %1, %%tmp
- %endif
-%endmacro
-
-%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
- %if cpuflag(ssse3)
- pmaddubsw %1, %2
- %else
- %if %5 == 1
- pxor %3, %3
- %endif
- punpckhbw %4, %1, %3
- punpcklbw %1, %3
- pmaddwd %4, %2
- pmaddwd %1, %2
- packssdw %1, %4
- %endif
-%endmacro
-
-;;;;;;;;;;;;;;;;;;;;;;
-;; wiener ;;
-;;;;;;;;;;;;;;;;;;;;;;
-
-%macro WIENER_H 0
-%if ARCH_X86_64
-cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
-%else
-cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
- mov r5, edgem
- mov [esp+12], r5
- mov wd, wm
- mov hd, hm
- SETUP_PIC hd
- %define m15 m0
- %define m14 m1
- %define m13 m2
- %define m12 m3
-%endif
-
- movq m15, [fhq]
-%if cpuflag(ssse3)
- pshufb m12, m15, [PIC_sym(pb_6_7)]
- pshufb m13, m15, [PIC_sym(pb_4)]
- pshufb m14, m15, [PIC_sym(pb_2)]
- pshufb m15, m15, [PIC_sym(pb_0)]
-%else
- pshuflw m12, m15, q3333
- punpcklbw m15, m15
- pshufhw m13, m15, q0000
- pshuflw m14, m15, q2222
- pshuflw m15, m15, q0000
- punpcklqdq m12, m12
- punpckhqdq m13, m13
- punpcklqdq m14, m14
- punpcklqdq m15, m15
- psraw m13, 8
- psraw m14, 8
- psraw m15, 8
-%endif
-
-%if ARCH_X86_64
- mova m11, [pw_2048]
- mova m10, [pw_16380]
- lea r11, [pb_right_ext_mask]
-
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
-%else
- %define m10 [PIC_sym(pw_16380)]
- %define m11 [PIC_sym(pw_2048)]
- %define m12 [esp+0x14]
- %define m13 [esp+0x24]
- %define m14 [esp+0x34]
- %define m15 [esp+0x44]
- mova m12, m3
- mova m13, m2
- mova m14, m1
- mova m15, m0
-
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge
- %define srcptrq srcq
- %define dstptrq dstq
- %define hd dword [esp+ 0]
- %define edgeb byte [esp+12]
- %define xlimd dword [esp+16]
-%endif
-
- ; if (edge & has_right) align_w_to_16
- ; else w -= 3, and use that as limit in x loop
- test edgeb, 2 ; has_right
- jnz .align
- mov xlimd, -3
- jmp .loop
-.align:
- add wd, 15
- and wd, ~15
-%if ARCH_X86_64
- xor xlimd, xlimd
-%else
- mov xlimd, 0
-%endif
-
- ; main y loop for vertical filter
-.loop:
-%if ARCH_X86_64
- mov srcptrq, srcq
- mov dstptrq, dstq
- lea xd, [wq+xlimq]
-%else
- mov [esp+8], srcq
- mov [esp+4], dstq
- mov xd, xlimd
- add xd, wd
-%endif
-
- ; load left edge pixels
- test edgeb, 1 ; have_left
- jz .emu_left
- test leftq, leftq ; left == NULL for the edge-extended bottom/top
- jz .load_left_combined
- movd m0, [leftq]
- movd m1, [srcq]
- punpckldq m0, m1
- pslldq m0, 9
- add leftq, 4
- jmp .left_load_done
-.load_left_combined:
- movq m0, [srcq-3]
- pslldq m0, 10
- jmp .left_load_done
-.emu_left:
- movd m0, [srcq]
-%if cpuflag(ssse3)
- pshufb m0, [PIC_sym(pb_14x0_1_2)]
-%else
- pslldq m1, m0, 13
- punpcklbw m0, m0
- pshuflw m0, m0, q0000
- punpcklqdq m0, m0
- psrldq m0, 2
- por m0, m1
-%endif
-
- ; load right edge pixels
-.left_load_done:
- cmp xd, 16
- jg .main_load
- test xd, xd
- jg .load_and_splat
- je .splat_right
-
- ; for very small images (w=[1-2]), edge-extend the original cache,
- ; ugly, but only runs in very odd cases
-%if cpuflag(ssse3)
- add wd, wd
- %if ARCH_X86_64
- pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
- %else
- pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
- %endif
- shr wd, 1
-%else
- shl wd, 4
- pcmpeqd m2, m2
- movd m3, wd
- psrldq m2, 2
- punpckhbw m1, m0, m0
- pshufhw m1, m1, q1122
- psllq m1, m3
- pand m0, m2
- pandn m2, m1
- por m0, m2
- shr wd, 4
-%endif
-
- ; main x loop, mostly this starts in .main_load
-.splat_right:
- ; no need to load new pixels, just extend them from the (possibly previously
- ; extended) previous load into m0
-%if cpuflag(ssse3)
- pshufb m1, m0, [PIC_sym(pb_15)]
-%else
- punpckhbw m1, m0, m0
- pshufhw m1, m1, q3333
- punpckhqdq m1, m1
-%endif
- jmp .main_loop
-.load_and_splat:
- ; load new pixels and extend edge for right-most
- movu m1, [srcptrq+3]
-%if ARCH_X86_64
- sub r11, xq
- movu m2, [r11+16]
- add r11, xq
-%else
- sub PIC_reg, xd
- movu m2, [PIC_sym(pb_right_ext_mask)+16]
- add PIC_reg, xd
-%endif
- movd m3, [srcptrq+2+xq]
-%if cpuflag(ssse3)
- pshufb m3, [PIC_sym(pb_0)]
-%else
- punpcklbw m3, m3
- pshuflw m3, m3, q0000
- punpcklqdq m3, m3
-%endif
- pand m1, m2
- pxor m2, [PIC_sym(pb_right_ext_mask)]
- pand m3, m2
- pxor m2, [PIC_sym(pb_right_ext_mask)]
- por m1, m3
- jmp .main_loop
-.main_load:
- ; load subsequent line
- movu m1, [srcptrq+3]
-.main_loop:
-%if ARCH_X86_64
- PALIGNR m2, m1, m0, 10
- PALIGNR m3, m1, m0, 11
- PALIGNR m4, m1, m0, 12
- PALIGNR m5, m1, m0, 13
- PALIGNR m6, m1, m0, 14
- PALIGNR m7, m1, m0, 15
-
- punpcklbw m0, m2, m1
- punpckhbw m2, m1
- punpcklbw m8, m3, m7
- punpckhbw m3, m7
- punpcklbw m7, m4, m6
- punpckhbw m4, m6
- PMADDUBSW m0, m15, m6, m9, 1
- PMADDUBSW m2, m15, m6, m9, 0
- PMADDUBSW m8, m14, m6, m9, 0
- PMADDUBSW m3, m14, m6, m9, 0
- PMADDUBSW m7, m13, m6, m9, 0
- PMADDUBSW m4, m13, m6, m9, 0
- paddw m0, m8
- paddw m2, m3
- %if cpuflag(ssse3)
- pxor m6, m6
- %endif
- punpcklbw m3, m5, m6
- punpckhbw m5, m6
- psllw m8, m3, 7
- psllw m6, m5, 7
- psubw m8, m10
- psubw m6, m10
- pmullw m3, m12
- pmullw m5, m12
- paddw m0, m7
- paddw m2, m4
- paddw m0, m3
- paddw m2, m5
- paddsw m0, m8 ; see the avx2 for an explanation
- paddsw m2, m6 ; of how the clipping works here
- psraw m0, 3
- psraw m2, 3
- paddw m0, m11
- paddw m2, m11
- mova [dstptrq+ 0], m0
- mova [dstptrq+16], m2
-%else
- PALIGNR m2, m1, m0, 10
- punpcklbw m3, m2, m1
- punpckhbw m2, m1
- PMADDUBSW m3, m15, m4, m5, 1
- PMADDUBSW m2, m15, m4, m5, 0
- PALIGNR m4, m1, m0, 11
- PALIGNR m5, m1, m0, 15
- punpcklbw m6, m4, m5
- punpckhbw m4, m5
- PMADDUBSW m6, m14, m5, m7, 1
- PMADDUBSW m4, m14, m5, m7, 0
- paddw m3, m6
- paddw m2, m4
- PALIGNR m4, m1, m0, 12
- PALIGNR m5, m1, m0, 14
- punpcklbw m6, m4, m5
- punpckhbw m4, m5
- PMADDUBSW m6, m13, m5, m7, 1
- PMADDUBSW m4, m13, m5, m7, 0
- paddw m3, m6
- paddw m2, m4
- PALIGNR m6, m1, m0, 13
- %if cpuflag(ssse3)
- pxor m5, m5
- %endif
- punpcklbw m4, m6, m5
- punpckhbw m6, m5
- psllw m5, m4, 7
- psllw m7, m6, 7
- psubw m5, m10
- psubw m7, m10
- pmullw m4, m12
- pmullw m6, m12
- paddw m3, m4
- paddw m2, m6
- paddsw m3, m5
- paddsw m2, m7
- psraw m3, 3
- psraw m2, 3
- paddw m3, m11
- paddw m2, m11
- mova [dstptrq+ 0], m3
- mova [dstptrq+16], m2
-%endif
-
- mova m0, m1
- add srcptrq, 16
- add dstptrq, 32
- sub xd, 16
- cmp xd, 16
- jg .main_load
- test xd, xd
- jg .load_and_splat
- cmp xd, xlimd
- jg .splat_right
-
-%if ARCH_X86_32
- mov srcq, [esp+8]
- mov dstq, [esp+4]
-%endif
- add srcq, strideq
- add dstq, 384*2
- dec hd
- jg .loop
- RET
-%endmacro
-
-%macro WIENER_V 0
-%if ARCH_X86_64
-cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
- mov edged, edgem
- movifnidn fvq, fvmp
- movifnidn hd, hm
- movq m15, [fvq]
- pshufd m14, m15, q1111
- pshufd m15, m15, q0000
- paddw m14, [pw_0_128]
- mova m12, [pd_1024]
-
- DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
-
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
- sub ylimd, 3
-%else
-cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
- %define ylimd [esp+12]
-
- mov r5d, edgem
- and r5d, 8
- shr r5d, 2
- sub r5d, 3
- mov ylimd, r5d
- mov fvq, fvmp
- mov edged, edgem
-
- SETUP_PIC edged
-
- movq m0, [fvq]
- pshufd m1, m0, q1111
- pshufd m0, m0, q0000
- paddw m1, [PIC_sym(pw_0_128)]
- mova [esp+0x50], m0
- mova [esp+0x40], m1
-
- DEFINE_ARGS dst, stride, mid, w, h, y, edge
- %define mptrq midq
- %define dstptrq dstq
- %define edgeb byte [esp]
-%endif
-
- ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
- mova m3, [midq] ; middle line
-
- ; load top pixels
- test edgeb, 4 ; have_top
- jz .emu_top
- mova m0, [midq-384*4]
- mova m2, [midq-384*2]
- mova m1, m0
- jmp .load_bottom_pixels
-.emu_top:
- mova m0, m3
- mova m1, m3
- mova m2, m3
-
- ; load bottom pixels
-.load_bottom_pixels:
- mov yd, hd
-%if ARCH_X86_64
- mov mptrq, midq
- mov dstptrq, dstq
- add yd, ylimd
-%else
- mov [esp+8], midq
- mov [esp+4], dstq
- add yd, ylimd
-%endif
- jg .load_threelines
-
- ; the remainder here is somewhat messy but only runs in very weird
- ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
- ; so performance is not terribly important here...
- je .load_twolines
- cmp yd, -1
- je .load_oneline
- ; h == 1 case
- mova m5, m3
- mova m4, m3
- mova m6, m3
- jmp .loop
-.load_oneline:
- ; h == 2 case
- mova m4, [midq+384*2]
- mova m5, m4
- mova m6, m4
- jmp .loop
-.load_twolines:
- ; h == 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- mova m6, m5
- jmp .loop
-.load_threelines:
- ; h > 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- ; third line loaded in main loop below
-
- ; main y loop for vertical filter
-.loop_load:
- ; load one line into m6. if that pixel is no longer available, do
- ; nothing, since m6 still has the data from the previous line in it. We
- ; try to structure the loop so that the common case is evaluated fastest
- mova m6, [mptrq+384*6]
-.loop:
-%if ARCH_X86_64
- paddw m7, m0, m6
- paddw m8, m1, m5
- paddw m9, m2, m4
- punpcklwd m10, m7, m8
- punpckhwd m7, m8
- punpcklwd m11, m9, m3
- punpckhwd m9, m3
- pmaddwd m10, m15
- pmaddwd m7, m15
- pmaddwd m11, m14
- pmaddwd m9, m14
- paddd m10, m12
- paddd m7, m12
- paddd m10, m11
- paddd m7, m9
- psrad m10, 11
- psrad m7, 11
- packssdw m10, m7
- packuswb m10, m10
- movq [dstptrq], m10
-%else
- mova [esp+0x30], m1
- mova [esp+0x20], m2
- mova [esp+0x10], m3
- paddw m0, m6
- paddw m1, m5
- paddw m2, m4
- punpcklwd m7, m2, m3
- punpckhwd m2, m3
- punpcklwd m3, m0, m1
- punpckhwd m0, m1
- mova m1, [esp+0x50]
- pmaddwd m3, m1
- pmaddwd m0, m1
- mova m1, [esp+0x40]
- pmaddwd m7, m1
- pmaddwd m2, m1
- paddd m3, [PIC_sym(pd_1024)]
- paddd m0, [PIC_sym(pd_1024)]
- paddd m3, m7
- paddd m0, m2
- psrad m3, 11
- psrad m0, 11
- packssdw m3, m0
- packuswb m3, m3
- movq [dstq], m3
- mova m1, [esp+0x30]
- mova m2, [esp+0x20]
- mova m3, [esp+0x10]
-%endif
- ; shift pixels one position
- mova m0, m1
- mova m1, m2
- mova m2, m3
- mova m3, m4
- mova m4, m5
- mova m5, m6
- add mptrq, 384*2
- add dstptrq, strideq
- dec yd
- jg .loop_load
- ; for the bottom pixels, continue using m6 (as extended edge)
- cmp yd, ylimd
- jg .loop
-
-%if ARCH_X86_32
- mov midq, [esp+8]
- mov dstq, [esp+4]
-%endif
- add midq, 16
- add dstq, 8
- sub wd, 8
- jg .loop_x
- RET
-%endmacro
-
-INIT_XMM sse2
-WIENER_H
-WIENER_V
-
-INIT_XMM ssse3
-WIENER_H
-WIENER_V
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; self-guided ;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-%macro MULLD 2
- pmulhuw m5, %1, %2
- pmullw %1, %2
- pslld m5, 16
- paddd %1, m5
-%endmacro
-
-%macro GATHERDD 2
- mova m5, m7
- movd r6d, %2
- %if ARCH_X86_64
- movd %1, [r5+r6]
- pextrw r6d, %2, 2
- pinsrw m5, [r5+r6+2], 3
- pextrw r6d, %2, 4
- pinsrw %1, [r5+r6+2], 5
- pextrw r6d, %2, 6
- pinsrw m5, [r5+r6+2], 7
- %else
- movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
- pextrw r6d, %2, 2
- pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
- pextrw r6d, %2, 4
- pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
- pextrw r6d, %2, 6
- pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
- %endif
- por %1, m5
-%endmacro
-
-%if ARCH_X86_64
-cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- mov xlimd, edgem
- movifnidn xd, xm
- mov hd, hm
- mov edged, xlimd
- and xlimd, 2 ; have_right
- add xd, xlimd
- xor xlimd, 2 ; 2*!have_right
-%else
-cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- %define wq r0m
- %define xlimd r1m
- %define hd hmp
- %define edgeb byte edgem
-
- mov r6, edgem
- and r6, 2 ; have_right
- add xd, r6
- xor r6, 2 ; 2*!have_right
- mov xlimd, r6
- SETUP_PIC r6, 0
-%endif
-
- jnz .no_right
- add xd, 7
- and xd, ~7
-.no_right:
- pxor m1, m1
- lea srcq, [srcq+xq]
- lea sumq, [sumq+xq*2-2]
- lea sumsqq, [sumsqq+xq*4-4]
- neg xq
- mov wq, xq
-%if ARCH_X86_64
- lea r10, [pb_right_ext_mask+16]
-%endif
-.loop_y:
- mov xq, wq
-
- ; load left
- test edgeb, 1 ; have_left
- jz .no_left
- test leftq, leftq
- jz .load_left_from_main
- movd m0, [leftq]
- pslldq m0, 12
- add leftq, 4
- jmp .expand_x
-.no_left:
- movd m0, [srcq+xq]
- pshufb m0, [PIC_sym(pb_0)]
- jmp .expand_x
-.load_left_from_main:
- movd m0, [srcq+xq-2]
- pslldq m0, 14
-.expand_x:
- punpckhbw xm0, xm1
-
- ; when we reach this, m0 contains left two px in highest words
- cmp xd, -8
- jle .loop_x
-.partial_load_and_extend:
- movd m3, [srcq-4]
- pshufb m3, [PIC_sym(pb_3)]
- movq m2, [srcq+xq]
- punpcklbw m2, m1
- punpcklbw m3, m1
-%if ARCH_X86_64
- movu m4, [r10+xq*2]
-%else
- movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
-%endif
- pand m2, m4
- pandn m4, m3
- por m2, m4
- jmp .loop_x_noload
-.right_extend:
- pshufb m2, m0, [PIC_sym(pb_14_15)]
- jmp .loop_x_noload
-
-.loop_x:
- movq m2, [srcq+xq]
- punpcklbw m2, m1
-.loop_x_noload:
- palignr m3, m2, m0, 12
- palignr m4, m2, m0, 14
-
- punpcklwd m5, m3, m2
- punpckhwd m6, m3, m2
- paddw m3, m4
- punpcklwd m7, m4, m1
- punpckhwd m4, m1
- pmaddwd m5, m5
- pmaddwd m6, m6
- pmaddwd m7, m7
- pmaddwd m4, m4
- paddd m5, m7
- paddd m6, m4
- paddw m3, m2
- movu [sumq+xq*2], m3
- movu [sumsqq+xq*4+ 0], m5
- movu [sumsqq+xq*4+16], m6
-
- mova m0, m2
- add xq, 8
-
- ; if x <= -8 we can reload more pixels
- ; else if x < 0 we reload and extend (this implies have_right=0)
- ; else if x < xlimd we extend from previous load (this implies have_right=0)
- ; else we are done
-
- cmp xd, -8
- jle .loop_x
- test xd, xd
- jl .partial_load_and_extend
- cmp xd, xlimd
- jl .right_extend
-
- add sumsqq, (384+16)*4
- add sumq, (384+16)*2
- add srcq, strideq
- dec hd
- jg .loop_y
- RET
-
-%if ARCH_X86_64
-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
- movifnidn edged, edgem
-%else
-cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
- %define sumsq_baseq dword [esp+0]
- %define sum_baseq dword [esp+4]
- %define ylimd dword [esp+8]
- %define m8 [esp+12]
- mov edged, r4m
- mov hd, r3m
-%endif
- mov xq, -2
-%if ARCH_X86_64
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
- sub ylimd, 2 ; -2 if have_bottom=0, else 0
- mov sumsq_baseq, sumsqq
- mov sum_baseq, sumq
-.loop_x:
- mov sumsqq, sumsq_baseq
- mov sumq, sum_baseq
- lea yd, [hq+ylimq+2]
-%else
- mov yd, edged
- and yd, 8 ; have_bottom
- shr yd, 2
- sub yd, 2 ; -2 if have_bottom=0, else 0
- mov sumsq_baseq, sumsqq
- mov sum_baseq, sumq
- mov ylimd, yd
-.loop_x:
- mov sumsqd, sumsq_baseq
- mov sumd, sum_baseq
- lea yd, [hq+2]
- add yd, ylimd
-%endif
- lea sumsqq, [sumsqq+xq*4+4-(384+16)*4]
- lea sumq, [sumq+xq*2+2-(384+16)*2]
- test edgeb, 4 ; have_top
- jnz .load_top
- movu m0, [sumsqq+(384+16)*4*1]
- movu m1, [sumsqq+(384+16)*4*1+16]
- mova m2, m0
- mova m3, m1
- mova m4, m0
- mova m5, m1
- movu m6, [sumq+(384+16)*2*1]
- mova m7, m6
- mova m8, m6
- jmp .loop_y_noload
-.load_top:
- movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left]
- movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right]
- movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left]
- movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right]
- movu m6, [sumq-(384+16)*2*1] ; l2
- movu m7, [sumq-(384+16)*2*0] ; l1
-.loop_y:
-%if ARCH_X86_64
- movu m8, [sumq+(384+16)*2*1] ; l0
-%else
- movu m4, [sumq+(384+16)*2*1] ; l0
- mova m8, m4
-%endif
- movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left]
- movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right]
-.loop_y_noload:
- paddd m0, m2
- paddd m1, m3
- paddw m6, m7
- paddd m0, m4
- paddd m1, m5
- paddw m6, m8
- movu [sumsqq+ 0], m0
- movu [sumsqq+16], m1
- movu [sumq], m6
-
- ; shift position down by one
- mova m0, m2
- mova m1, m3
- mova m2, m4
- mova m3, m5
- mova m6, m7
- mova m7, m8
- add sumsqq, (384+16)*4
- add sumq, (384+16)*2
- dec yd
- jg .loop_y
- cmp yd, ylimd
- jg .loop_y_noload
- add xd, 8
- cmp xd, wd
- jl .loop_x
- RET
-
-cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
- add hd, 2
-%if ARCH_X86_64
- LEA r5, sgr_x_by_x-0xF03
-%else
- SETUP_PIC r5, 0
-%endif
- movd m6, sd
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
- pxor m7, m7
- DEFINE_ARGS a, b, w, h, x
-%if ARCH_X86_64
- mova m8, [pd_0xF00801C7]
- mova m9, [pw_256]
- psrld m10, m9, 13 ; pd_2048
- mova m11, [pb_unpcklwdw]
-%else
- %define m8 [PIC_sym(pd_0xF00801C7)]
- %define m9 [PIC_sym(pw_256)]
- %define m10 [PIC_sym(pd_2048)]
- %define m11 [PIC_sym(pb_unpcklwdw)]
-%endif
-.loop_y:
- mov xq, -2
-.loop_x:
- movq m0, [bq+xq*2]
- movq m1, [bq+xq*2+(384+16)*2]
- punpcklwd m0, m7
- punpcklwd m1, m7
- movu m2, [aq+xq*4]
- movu m3, [aq+xq*4+(384+16)*4]
- pslld m4, m2, 3
- pslld m5, m3, 3
- paddd m2, m4 ; aa * 9
- paddd m3, m5
- pmaddwd m4, m0, m0
- pmaddwd m5, m1, m1
- pmaddwd m0, m8
- pmaddwd m1, m8
- psubd m2, m4 ; p = aa * 9 - bb * bb
- psubd m3, m5
- MULLD m2, m6
- MULLD m3, m6
- paddusw m2, m8
- paddusw m3, m8
- psrld m2, 20 ; z
- psrld m3, 20
- GATHERDD m4, m2 ; xx
- GATHERDD m2, m3
- psrld m4, 24
- psrld m2, 24
- packssdw m3, m4, m2
- pshufb m4, m11
- MULLD m0, m4
- pshufb m2, m11
- MULLD m1, m2
- psubw m5, m9, m3
- paddd m0, m10
- paddd m1, m10
- psrld m0, 12
- psrld m1, 12
- movq [bq+xq*2], m5
- psrldq m5, 8
- movq [bq+xq*2+(384+16)*2], m5
- movu [aq+xq*4], m0
- movu [aq+xq*4+(384+16)*4], m1
- add xd, 4
- cmp xd, wd
- jl .loop_x
- add aq, (384+16)*4*2
- add bq, (384+16)*2*2
- sub hd, 2
- jg .loop_y
- RET
-
-%if ARCH_X86_64
-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
- tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
- mova m15, [pw_16]
- mov tmp_baseq, tq
- mov src_baseq, srcq
- mov a_baseq, aq
- mov b_baseq, bq
- xor xd, xd
-%else
-cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
- %define tmp_baseq [esp+8]
- %define src_baseq [esp+12]
- %define a_baseq [esp+16]
- %define b_baseq [esp+20]
- %define wd [esp+24]
- %define hd [esp+28]
- mov tmp_baseq, tq
- mov src_baseq, srcq
- mov a_baseq, aq
- mov b_baseq, bq
- mov wd, xd
- mov hd, yd
- xor xd, xd
- SETUP_PIC yd, 1, 1
- jmp .loop_start
-%endif
-
-.loop_x:
- mov tq, tmp_baseq
- mov srcq, src_baseq
- mov aq, a_baseq
- mov bq, b_baseq
-%if ARCH_X86_32
-.loop_start:
- movu m0, [bq+xq*2-(384+16)*2-2]
- movu m2, [bq+xq*2-(384+16)*2+2]
- mova m1, [bq+xq*2-(384+16)*2] ; b:top
- paddw m0, m2 ; b:tl+tr
- movu m2, [bq+xq*2-2]
- movu m3, [bq+xq*2+2]
- paddw m1, [bq+xq*2] ; b:top+ctr
- paddw m2, m3 ; b:l+r
- mova [esp+0x80], m0
- mova [esp+0x70], m1
- mova [esp+0x60], m2
-%endif
- movu m0, [aq+xq*4-(384+16)*4-4]
- movu m2, [aq+xq*4-(384+16)*4+4]
- mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half]
- paddd m0, m2 ; a:tl+tr [first half]
- movu m2, [aq+xq*4-(384+16)*4-4+16]
- movu m4, [aq+xq*4-(384+16)*4+4+16]
- mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half]
- paddd m2, m4 ; a:tl+tr [second half]
- movu m4, [aq+xq*4-4]
- movu m5, [aq+xq*4+4]
- paddd m1, [aq+xq*4] ; a:top+ctr [first half]
- paddd m4, m5 ; a:l+r [first half]
- movu m5, [aq+xq*4+16-4]
- movu m6, [aq+xq*4+16+4]
- paddd m3, [aq+xq*4+16] ; a:top+ctr [second half]
- paddd m5, m6 ; a:l+r [second half]
-%if ARCH_X86_64
- movu m6, [bq+xq*2-(384+16)*2-2]
- movu m8, [bq+xq*2-(384+16)*2+2]
- mova m7, [bq+xq*2-(384+16)*2] ; b:top
- paddw m6, m8 ; b:tl+tr
- movu m8, [bq+xq*2-2]
- movu m9, [bq+xq*2+2]
- paddw m7, [bq+xq*2] ; b:top+ctr
- paddw m8, m9 ; b:l+r
-%endif
-
- lea tq, [tq+xq*2]
- lea srcq, [srcq+xq*1]
- lea aq, [aq+xq*4+(384+16)*4]
- lea bq, [bq+xq*2+(384+16)*2]
- mov yd, hd
-.loop_y:
-%if ARCH_X86_64
- movu m9, [bq-2]
- movu m10, [bq+2]
- paddw m7, [bq] ; b:top+ctr+bottom
- paddw m9, m10 ; b:bl+br
- paddw m10, m7, m8 ; b:top+ctr+bottom+l+r
- paddw m6, m9 ; b:tl+tr+bl+br
- psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom
- paddw m10, m6
- psllw m10, 2
- psubw m10, m6 ; aa
- pxor m14, m14
- movq m12, [srcq]
- punpcklbw m12, m14
- punpcklwd m6, m10, m15
- punpckhwd m10, m15
- punpcklwd m13, m12, m15
- punpckhwd m12, m15
- pmaddwd m6, m13 ; aa*src[x]+256 [first half]
- pmaddwd m10, m12 ; aa*src[x]+256 [second half]
-%else
- paddd m1, [aq] ; a:top+ctr+bottom [first half]
- paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
- mova [esp+0x50], m1
- mova [esp+0x40], m3
- mova [esp+0x30], m4
- movu m6, [aq-4]
- movu m7, [aq+4]
- paddd m1, m4 ; a:top+ctr+bottom+l+r [first half]
- paddd m3, m5 ; a:top+ctr+bottom+l+r [second half]
- paddd m6, m7 ; a:bl+br [first half]
- movu m7, [aq+16-4]
- movu m4, [aq+16+4]
- paddd m7, m4 ; a:bl+br [second half]
- paddd m0, m6 ; a:tl+tr+bl+br [first half]
- paddd m2, m7 ; a:tl+tr+bl+br [second half]
- paddd m1, m0
- paddd m3, m2
- pslld m1, 2
- pslld m3, 2
- psubd m1, m0 ; bb [first half]
- psubd m3, m2 ; bb [second half]
-%endif
-
-%if ARCH_X86_64
- movu m11, [aq-4]
- movu m12, [aq+4]
- paddd m1, [aq] ; a:top+ctr+bottom [first half]
- paddd m11, m12 ; a:bl+br [first half]
- movu m12, [aq+16-4]
- movu m13, [aq+16+4]
- paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
- paddd m12, m13 ; a:bl+br [second half]
- paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half]
- paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half]
- paddd m0, m11 ; a:tl+tr+bl+br [first half]
- paddd m2, m12 ; a:tl+tr+bl+br [second half]
- paddd m13, m0
- paddd m14, m2
- pslld m13, 2
- pslld m14, 2
- psubd m13, m0 ; bb [first half]
- psubd m14, m2 ; bb [second half]
- psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
- psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
-%else
- mova m4, [esp+0x80]
- mova [esp+0x80], m5
- mova m5, [esp+0x70]
- mova [esp+0x70], m6
- mova m6, [esp+0x60]
- mova [esp+0x60], m7
- mova [esp+0x20], m1
- movu m7, [bq-2]
- movu m1, [bq+2]
- paddw m5, [bq] ; b:top+ctr+bottom
- paddw m7, m1
- paddw m1, m5, m6 ; b:top+ctr+bottom+l+r
- paddw m4, m7 ; b:tl+tr+bl+br
- psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom
- paddw m1, m4
- psllw m1, 2
- psubw m1, m4 ; aa
- movq m0, [srcq]
- XCHG_PIC_REG
- punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16]
- punpcklwd m4, m1, [PIC_sym(pw_16)]
- punpckhwd m1, [PIC_sym(pw_16)]
- punpcklwd m2, m0, [PIC_sym(pw_16)]
- punpckhwd m0, [PIC_sym(pw_16)]
- XCHG_PIC_REG
- pmaddwd m4, m2 ; aa*src[x]+256 [first half]
- pmaddwd m1, m0 ; aa*src[x]+256 [second half]
-%endif
-
-%if ARCH_X86_64
- paddd m6, m13
- paddd m10, m14
- psrad m6, 9
- psrad m10, 9
- packssdw m6, m10
- mova [tq], m6
-%else
- paddd m4, [esp+0x20]
- paddd m1, m3
- psrad m4, 9
- psrad m1, 9
- packssdw m4, m1
- mova [tq], m4
-%endif
-
- ; shift to next row
-%if ARCH_X86_64
- mova m0, m4
- mova m2, m5
- mova m4, m11
- mova m5, m12
- mova m6, m8
- mova m8, m9
-%else
- mova m1, [esp+0x50]
- mova m3, [esp+0x40]
- mova m0, [esp+0x30]
- mova m2, [esp+0x80]
- mova m4, [esp+0x70]
- mova [esp+0x70], m5
- mova m5, [esp+0x60]
- mova [esp+0x80], m6
- mova [esp+0x60], m7
- psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
- psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
-%endif
-
- add srcq, strideq
- add aq, (384+16)*4
- add bq, (384+16)*2
- add tq, 384*2
- dec yd
- jg .loop_y
- add xd, 8
- cmp xd, wd
- jl .loop_x
- RET
-
-cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
- movifnidn hd, hm
-%if ARCH_X86_32
- SETUP_PIC r6, 0
-%endif
- movd m0, wtm
- pshufb m0, [PIC_sym(pb_0_1)]
- psllw m0, 4
- pxor m7, m7
- DEFINE_ARGS dst, stride, t, w, h, idx
-.loop_y:
- xor idxd, idxd
-.loop_x:
- mova m1, [tq+idxq*2+ 0]
- mova m4, [tq+idxq*2+16]
- mova m5, [dstq+idxq]
- punpcklbw m2, m5, m7
- punpckhbw m5, m7
- psllw m3, m2, 4
- psllw m6, m5, 4
- psubw m1, m3
- psubw m4, m6
- pmulhrsw m1, m0
- pmulhrsw m4, m0
- paddw m1, m2
- paddw m4, m5
- packuswb m1, m4
- mova [dstq+idxq], m1
- add idxd, 16
- cmp idxd, wd
- jl .loop_x
- add dstq, strideq
- add tq, 384 * 2
- dec hd
- jg .loop_y
- RET
-
-%if ARCH_X86_64
-cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
- mova m10, [pb_0]
- mova m11, [pb_0_1]
-%else
-cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edgeb byte edgem
- %define wd xd
- %define wq wd
- %define wm r5m
- %define strideq r4m
- SUB esp, 8
- SETUP_PIC sumsqd, 1, 1
-
- %define m10 [PIC_sym(pb_0)]
- %define m11 [PIC_sym(pb_0_1)]
-%endif
-
- test edgeb, 2 ; have_right
- jz .no_right
- xor xlimd, xlimd
- add wd, 2
- add wd, 15
- and wd, ~15
- jmp .right_done
-.no_right:
- mov xlimd, 3
- dec wd
-.right_done:
- pxor m1, m1
- lea srcq, [srcq+wq+1]
- lea sumq, [sumq+wq*2-2]
- lea sumsqq, [sumsqq+wq*4-4]
- neg wq
-%if ARCH_X86_64
- lea r10, [pb_right_ext_mask+16]
-%else
- mov wm, xd
- %define wq wm
-%endif
-
-.loop_y:
- mov xq, wq
- ; load left
- test edgeb, 1 ; have_left
- jz .no_left
- test leftq, leftq
- jz .load_left_from_main
- movd m0, [leftq]
- movd m2, [srcq+xq-1]
- pslldq m2, 4
- por m0, m2
- pslldq m0, 11
- add leftq, 4
- jmp .expand_x
-.no_left:
- movd m0, [srcq+xq-1]
- XCHG_PIC_REG
- pshufb m0, m10
- XCHG_PIC_REG
- jmp .expand_x
-.load_left_from_main:
- movd m0, [srcq+xq-4]
- pslldq m0, 12
-.expand_x:
- punpckhbw m0, m1
-
- ; when we reach this, m0 contains left two px in highest words
- cmp xd, -8
- jle .loop_x
- test xd, xd
- jge .right_extend
-.partial_load_and_extend:
- XCHG_PIC_REG
- movd m3, [srcq-1]
- movq m2, [srcq+xq]
- pshufb m3, m10
- punpcklbw m3, m1
- punpcklbw m2, m1
-%if ARCH_X86_64
- movu m4, [r10+xq*2]
-%else
- movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
- XCHG_PIC_REG
-%endif
- pand m2, m4
- pandn m4, m3
- por m2, m4
- jmp .loop_x_noload
-.right_extend:
- psrldq m2, m0, 14
- XCHG_PIC_REG
- pshufb m2, m11
- XCHG_PIC_REG
- jmp .loop_x_noload
-
-.loop_x:
- movq m2, [srcq+xq]
- punpcklbw m2, m1
-.loop_x_noload:
- palignr m3, m2, m0, 8
- palignr m4, m2, m0, 10
- palignr m5, m2, m0, 12
- palignr m6, m2, m0, 14
-
-%if ARCH_X86_64
- paddw m0, m3, m2
- punpcklwd m7, m3, m2
- punpckhwd m3, m2
- paddw m0, m4
- punpcklwd m8, m4, m5
- punpckhwd m4, m5
- paddw m0, m5
- punpcklwd m9, m6, m1
- punpckhwd m5, m6, m1
- paddw m0, m6
- pmaddwd m7, m7
- pmaddwd m3, m3
- pmaddwd m8, m8
- pmaddwd m4, m4
- pmaddwd m9, m9
- pmaddwd m5, m5
- paddd m7, m8
- paddd m3, m4
- paddd m7, m9
- paddd m3, m5
- movu [sumq+xq*2], m0
- movu [sumsqq+xq*4+ 0], m7
- movu [sumsqq+xq*4+16], m3
-%else
- paddw m0, m3, m2
- paddw m0, m4
- paddw m0, m5
- paddw m0, m6
- movu [sumq+xq*2], m0
- punpcklwd m7, m3, m2
- punpckhwd m3, m2
- punpcklwd m0, m4, m5
- punpckhwd m4, m5
- punpckhwd m5, m6, m1
- pmaddwd m7, m7
- pmaddwd m3, m3
- pmaddwd m0, m0
- pmaddwd m4, m4
- pmaddwd m5, m5
- paddd m7, m0
- paddd m3, m4
- paddd m3, m5
- punpcklwd m0, m6, m1
- pmaddwd m0, m0
- paddd m7, m0
- movu [sumsqq+xq*4+ 0], m7
- movu [sumsqq+xq*4+16], m3
-%endif
-
- mova m0, m2
- add xq, 8
-
- ; if x <= -8 we can reload more pixels
- ; else if x < 0 we reload and extend (this implies have_right=0)
- ; else if x < xlimd we extend from previous load (this implies have_right=0)
- ; else we are done
-
- cmp xd, -8
- jle .loop_x
- test xd, xd
- jl .partial_load_and_extend
- cmp xd, xlimd
- jl .right_extend
-
- add srcq, strideq
- add sumsqq, (384+16)*4
- add sumq, (384+16)*2
- dec hd
- jg .loop_y
-%if ARCH_X86_32
- ADD esp, 8
-%endif
- RET
-
-%if ARCH_X86_64
-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
- movifnidn edged, edgem
- mov ylimd, edged
-%else
-cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- %define wm [esp+0]
- %define hm [esp+4]
- %define edgem [esp+8]
- mov wm, xd
- mov hm, yd
- mov edgem, ylimd
-%endif
-
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
- sub ylimd, 3 ; -3 if have_bottom=0, else -1
- mov xq, -2
-%if ARCH_X86_64
-.loop_x:
- lea yd, [hd+ylimd+2]
- lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
- lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2]
- test edgeb, 4 ; have_top
- jnz .load_top
- movu m0, [sumsq_ptrq+(384+16)*4*1]
- movu m1, [sumsq_ptrq+(384+16)*4*1+16]
- mova m2, m0
- mova m3, m1
- mova m4, m0
- mova m5, m1
- mova m6, m0
- mova m7, m1
- movu m10, [sum_ptrq+(384+16)*2*1]
- mova m11, m10
- mova m12, m10
- mova m13, m10
- jmp .loop_y_second_load
-.load_top:
- movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
- movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
- movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
- movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
- mova m2, m0
- mova m3, m1
- movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4
- movu m12, [sum_ptrq-(384+16)*2*0] ; l2
- mova m11, m10
-.loop_y:
- movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
- movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
- movu m13, [sum_ptrq+(384+16)*2*1] ; l1
-.loop_y_second_load:
- test yd, yd
- jle .emulate_second_load
- movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
- movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
- movu m14, [sum_ptrq+(384+16)*2*2] ; l0
-.loop_y_noload:
- paddd m0, m2
- paddd m1, m3
- paddw m10, m11
- paddd m0, m4
- paddd m1, m5
- paddw m10, m12
- paddd m0, m6
- paddd m1, m7
- paddw m10, m13
- paddd m0, m8
- paddd m1, m9
- paddw m10, m14
- movu [sumsq_ptrq+ 0], m0
- movu [sumsq_ptrq+16], m1
- movu [sum_ptrq], m10
-
- ; shift position down by one
- mova m0, m4
- mova m1, m5
- mova m2, m6
- mova m3, m7
- mova m4, m8
- mova m5, m9
- mova m10, m12
- mova m11, m13
- mova m12, m14
- add sumsq_ptrq, (384+16)*4*2
- add sum_ptrq, (384+16)*2*2
- sub yd, 2
- jge .loop_y
- ; l1 = l0
- mova m6, m8
- mova m7, m9
- mova m13, m14
- cmp yd, ylimd
- jg .loop_y_noload
- add xd, 8
- cmp xd, wd
- jl .loop_x
- RET
-.emulate_second_load:
- mova m8, m6
- mova m9, m7
- mova m14, m13
- jmp .loop_y_noload
-%else
-.sumsq_loop_x:
- lea yd, [ylimd+2]
- add yd, hm
- lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
- test byte edgem, 4 ; have_top
- jnz .sumsq_load_top
- movu m0, [sumsq_ptrq+(384+16)*4*1]
- movu m1, [sumsq_ptrq+(384+16)*4*1+16]
- mova m4, m0
- mova m5, m1
- mova m6, m0
- mova m7, m1
- mova [esp+0x1c], m0
- mova [esp+0x0c], m1
- jmp .sumsq_loop_y_second_load
-.sumsq_load_top:
- movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
- movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
- movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
- movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
- mova [esp+0x1c], m0
- mova [esp+0x0c], m1
-.sumsq_loop_y:
- movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
- movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
-.sumsq_loop_y_second_load:
- test yd, yd
- jle .sumsq_emulate_second_load
- movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
- movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
-.sumsq_loop_y_noload:
- paddd m0, [esp+0x1c]
- paddd m1, [esp+0x0c]
- paddd m0, m4
- paddd m1, m5
- paddd m0, m6
- paddd m1, m7
- paddd m0, m2
- paddd m1, m3
- movu [sumsq_ptrq+ 0], m0
- movu [sumsq_ptrq+16], m1
-
- ; shift position down by one
- mova m0, m4
- mova m1, m5
- mova m4, m2
- mova m5, m3
- mova [esp+0x1c], m6
- mova [esp+0x0c], m7
- add sumsq_ptrq, (384+16)*4*2
- sub yd, 2
- jge .sumsq_loop_y
- ; l1 = l0
- mova m6, m2
- mova m7, m3
- cmp yd, ylimd
- jg .sumsq_loop_y_noload
- add xd, 8
- cmp xd, wm
- jl .sumsq_loop_x
-
- mov xd, -2
-.sum_loop_x:
- lea yd, [ylimd+2]
- add yd, hm
- lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test byte edgem, 4 ; have_top
- jnz .sum_load_top
- movu m0, [sum_ptrq+(384+16)*2*1]
- mova m1, m0
- mova m2, m0
- mova m3, m0
- jmp .sum_loop_y_second_load
-.sum_load_top:
- movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4
- movu m2, [sum_ptrq-(384+16)*2*0] ; l2
- mova m1, m0
-.sum_loop_y:
- movu m3, [sum_ptrq+(384+16)*2*1] ; l1
-.sum_loop_y_second_load:
- test yd, yd
- jle .sum_emulate_second_load
- movu m4, [sum_ptrq+(384+16)*2*2] ; l0
-.sum_loop_y_noload:
- paddw m0, m1
- paddw m0, m2
- paddw m0, m3
- paddw m0, m4
- movu [sum_ptrq], m0
-
- ; shift position down by one
- mova m0, m2
- mova m1, m3
- mova m2, m4
- add sum_ptrq, (384+16)*2*2
- sub yd, 2
- jge .sum_loop_y
- ; l1 = l0
- mova m3, m4
- cmp yd, ylimd
- jg .sum_loop_y_noload
- add xd, 8
- cmp xd, wm
- jl .sum_loop_x
- RET
-.sumsq_emulate_second_load:
- mova m2, m6
- mova m3, m7
- jmp .sumsq_loop_y_noload
-.sum_emulate_second_load:
- mova m4, m3
- jmp .sum_loop_y_noload
-%endif
-
-cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
- add hd, 2
-%if ARCH_X86_64
- LEA r5, sgr_x_by_x-0xF03
-%else
- SETUP_PIC r5, 0
-%endif
- movd m6, sd
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
- pxor m7, m7
- DEFINE_ARGS a, b, w, h, x
-%if ARCH_X86_64
- mova m8, [pd_0xF0080029]
- mova m9, [pw_256]
- psrld m10, m9, 15 ; pd_512
-%else
- %define m8 [PIC_sym(pd_0xF0080029)]
- %define m9 [PIC_sym(pw_256)]
- %define m10 [PIC_sym(pd_512)]
-%endif
-.loop_y:
- mov xq, -2
-.loop_x:
- movq m0, [bq+xq*2+0]
- movq m1, [bq+xq*2+8]
- punpcklwd m0, m7
- punpcklwd m1, m7
- movu m2, [aq+xq*4+ 0]
- movu m3, [aq+xq*4+16]
- pslld m4, m2, 3 ; aa * 8
- pslld m5, m3, 3
- paddd m2, m4 ; aa * 9
- paddd m3, m5
- paddd m4, m4 ; aa * 16
- paddd m5, m5
- paddd m2, m4 ; aa * 25
- paddd m3, m5
- pmaddwd m4, m0, m0
- pmaddwd m5, m1, m1
- psubd m2, m4 ; p = aa * 25 - bb * bb
- psubd m3, m5
- MULLD m2, m6
- MULLD m3, m6
- paddusw m2, m8
- paddusw m3, m8
- psrld m2, 20 ; z
- psrld m3, 20
- GATHERDD m4, m2 ; xx
- GATHERDD m2, m3
- psrld m4, 24
- psrld m2, 24
- packssdw m3, m4, m2
- pmullw m4, m8
- pmullw m2, m8
- psubw m5, m9, m3
- pmaddwd m0, m4
- pmaddwd m1, m2
- paddd m0, m10
- paddd m1, m10
- psrld m0, 10
- psrld m1, 10
- movu [bq+xq*2], m5
- movu [aq+xq*4+ 0], m0
- movu [aq+xq*4+16], m1
- add xd, 8
- cmp xd, wd
- jl .loop_x
- add aq, (384+16)*4*2
- add bq, (384+16)*2*2
- sub hd, 2
- jg .loop_y
- RET
-
-%if ARCH_X86_64
-cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
- tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
- mov tmp_baseq, tq
- mov src_baseq, srcq
- mov a_baseq, aq
- mov b_baseq, bq
- mova m9, [pw_5_6]
- mova m12, [pw_256]
- psrlw m10, m12, 8 ; pw_1
- psrlw m11, m12, 1 ; pw_128
- pxor m13, m13
-%else
-cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
- %define tmp_baseq r0m
- %define src_baseq r1m
- %define a_baseq r3m
- %define b_baseq r4m
- %define wd r5m
- %define hd r6m
-
- SUB esp, 8
- SETUP_PIC yd
-
- %define m8 m5
- %define m9 [PIC_sym(pw_5_6)]
- %define m10 [PIC_sym(pw_1)]
- %define m11 [PIC_sym(pw_128)]
- %define m12 [PIC_sym(pw_256)]
- %define m13 m0
-%endif
- xor xd, xd
-.loop_x:
- mov tq, tmp_baseq
- mov srcq, src_baseq
- mov aq, a_baseq
- mov bq, b_baseq
- movu m0, [aq+xq*4-(384+16)*4-4]
- mova m1, [aq+xq*4-(384+16)*4]
- movu m2, [aq+xq*4-(384+16)*4+4]
- movu m3, [aq+xq*4-(384+16)*4-4+16]
- mova m4, [aq+xq*4-(384+16)*4+16]
- movu m5, [aq+xq*4-(384+16)*4+4+16]
- paddd m0, m2
- paddd m3, m5
- paddd m0, m1
- paddd m3, m4
- pslld m2, m0, 2
- pslld m5, m3, 2
- paddd m2, m0
- paddd m5, m3
- paddd m0, m2, m1 ; prev_odd_b [first half]
- paddd m1, m5, m4 ; prev_odd_b [second half]
- movu m3, [bq+xq*2-(384+16)*2-2]
- mova m4, [bq+xq*2-(384+16)*2]
- movu m5, [bq+xq*2-(384+16)*2+2]
- paddw m3, m5
- punpcklwd m5, m3, m4
- punpckhwd m3, m4
- pmaddwd m5, m9
- pmaddwd m3, m9
- mova m2, m5
- packssdw m2, m3 ; prev_odd_a
- lea tq, [tq+xq*2]
- lea srcq, [srcq+xq*1]
- lea aq, [aq+xq*4+(384+16)*4]
- lea bq, [bq+xq*2+(384+16)*2]
-%if ARCH_X86_32
- mov [esp], PIC_reg
-%endif
- mov yd, hd
- XCHG_PIC_REG
-.loop_y:
- movu m3, [aq-4]
- mova m4, [aq]
- movu m5, [aq+4]
- paddd m3, m5
- paddd m3, m4
- pslld m5, m3, 2
- paddd m5, m3
- paddd m5, m4 ; cur_odd_b [first half]
- movu m3, [aq+16-4]
- mova m6, [aq+16]
- movu m7, [aq+16+4]
- paddd m3, m7
- paddd m3, m6
- pslld m7, m3, 2
- paddd m7, m3
- paddd m4, m7, m6 ; cur_odd_b [second half]
- movu m3, [bq-2]
- mova m6, [bq]
- movu m7, [bq+2]
- paddw m3, m7
- punpcklwd m7, m3, m6
- punpckhwd m3, m6
- pmaddwd m7, m9
- pmaddwd m3, m9
- packssdw m6, m7, m3 ; cur_odd_a
-
- paddd m0, m5 ; cur_even_b [first half]
- paddd m1, m4 ; cur_even_b [second half]
- paddw m2, m6 ; cur_even_a
-
- movq m3, [srcq]
-%if ARCH_X86_64
- punpcklbw m3, m13
-%else
- mova [td], m5
- pxor m7, m7
- punpcklbw m3, m7
-%endif
- punpcklwd m7, m3, m10
- punpckhwd m3, m10
- punpcklwd m8, m2, m12
- punpckhwd m2, m12
- pmaddwd m7, m8
- pmaddwd m3, m2
- paddd m7, m0
- paddd m3, m1
- psrad m7, 9
- psrad m3, 9
-
-%if ARCH_X86_32
- pxor m13, m13
-%endif
- movq m8, [srcq+strideq]
- punpcklbw m8, m13
- punpcklwd m0, m8, m10
- punpckhwd m8, m10
- punpcklwd m1, m6, m11
- punpckhwd m2, m6, m11
- pmaddwd m0, m1
- pmaddwd m8, m2
-%if ARCH_X86_64
- paddd m0, m5
-%else
- paddd m0, [td]
-%endif
- paddd m8, m4
- psrad m0, 8
- psrad m8, 8
-
- packssdw m7, m3
- packssdw m0, m8
-%if ARCH_X86_32
- mova m5, [td]
-%endif
- mova [tq+384*2*0], m7
- mova [tq+384*2*1], m0
-
- mova m0, m5
- mova m1, m4
- mova m2, m6
- add aq, (384+16)*4*2
- add bq, (384+16)*2*2
- add tq, 384*2*2
- lea srcq, [srcq+strideq*2]
-%if ARCH_X86_64
- sub yd, 2
-%else
- sub dword [esp+4], 2
-%endif
- jg .loop_y
- add xd, 8
- cmp xd, wd
- jl .loop_x
-%if ARCH_X86_32
- ADD esp, 8
-%endif
- RET
-
-cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
- movifnidn wd, wm
- movd m0, wtm
-%if ARCH_X86_64
- movifnidn hd, hm
- mova m10, [pd_1024]
- pxor m11, m11
-%else
- SETUP_PIC hd, 0
- %define m10 [PIC_sym(pd_1024)]
- %define m11 m7
-%endif
- pshufd m0, m0, 0
- DEFINE_ARGS dst, stride, t1, t2, w, h, idx
-%if ARCH_X86_32
- %define hd hmp
-%endif
-
-.loop_y:
- xor idxd, idxd
-.loop_x:
- mova m1, [t1q+idxq*2+ 0]
- mova m2, [t1q+idxq*2+16]
- mova m3, [t2q+idxq*2+ 0]
- mova m4, [t2q+idxq*2+16]
- mova m6, [dstq+idxq]
-%if ARCH_X86_32
- pxor m11, m11
-%endif
- punpcklbw m5, m6, m11
- punpckhbw m6, m11
- psllw m7, m5, 4
- psubw m1, m7
- psubw m3, m7
- psllw m7, m6, 4
- psubw m2, m7
- psubw m4, m7
- punpcklwd m7, m1, m3
- punpckhwd m1, m3
- punpcklwd m3, m2, m4
- punpckhwd m2, m4
- pmaddwd m7, m0
- pmaddwd m1, m0
- pmaddwd m3, m0
- pmaddwd m2, m0
- paddd m7, m10
- paddd m1, m10
- paddd m3, m10
- paddd m2, m10
- psrad m7, 11
- psrad m1, 11
- psrad m3, 11
- psrad m2, 11
- packssdw m7, m1
- packssdw m3, m2
- paddw m7, m5
- paddw m3, m6
- packuswb m7, m3
- mova [dstq+idxq], m7
- add idxd, 16
- cmp idxd, wd
- jl .loop_x
- add dstq, strideq
- add t1q, 384 * 2
- add t2q, 384 * 2
- dec hd
- jg .loop_y
- RET
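
The put_bilin changes in the mc_avx2.asm hunks below appear to repack the bilinear weight pair: imul by 255 plus add 16 places (16 - mx) and mx in adjacent bytes, the forward bilin_h_shuf pairs src[x] with src[x + 1] so one pmaddubsw does both multiplies, and pmulhrsw with 2048 performs the (+ 8) >> 4 rounding. As a point of reference, here is a minimal scalar sketch of the horizontal pass described by the comment in that hunk; the names are illustrative, only the formula comes from the diff.

/*
 * Minimal scalar sketch of the 8-bit horizontal bilinear pass from the
 * put_bilin comment below: ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4.
 * Names are illustrative; mx is the 4-bit subpel fraction (1..15).
 */
#include <stddef.h>
#include <stdint.h>

static void put_bilin_h_scalar(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int w, int h, int mx)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            /* In the asm, (16 - mx, mx) is packed into one word so that
             * pmaddubsw computes both products per pixel, and pmulhrsw
             * with pw_2048 implements the (+ 8) >> 4 rounding. */
            dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
        }
        src += src_stride;
        dst += dst_stride;
    }
}
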
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -59,8 +59,8 @@
subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
@@ -76,6 +76,7 @@
pb_64: times 4 db 64
pw_m256: times 2 dw -256
+pw_15: times 2 dw 15
pw_32: times 2 dw 32
pw_34: times 2 dw 34
pw_258: times 2 dw 258
@@ -201,10 +202,9 @@
SECTION .text
INIT_XMM avx2
-DECLARE_REG_TMP 4, 6, 7
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movifnidn mxyd, r6m ; mx
- lea t2, [put_avx2]
+ lea r7, [put_avx2]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -213,35 +213,35 @@
test mxyd, mxyd
jnz .v
.put:
- movzx wd, word [t2+wq*2+table_offset(put,)]
- add wq, t2
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
jmp wq
.put_w2:
- movzx t0d, word [srcq+ssq*0]
- movzx t1d, word [srcq+ssq*1]
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0w
- mov [dstq+dsq*1], t1w
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
- mov t0d, [srcq+ssq*0]
- mov t1d, [srcq+ssq*1]
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0d
- mov [dstq+dsq*1], t1d
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
- mov t0, [srcq+ssq*0]
- mov t1, [srcq+ssq*1]
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0
- mov [dstq+dsq*1], t1
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
@@ -298,17 +298,17 @@
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
+ imul mxyd, 255
vbroadcasti128 m4, [bilin_h_shuf8]
- add mxyd, 16 << 8
+ add mxyd, 16
movd xm5, mxyd
mov mxyd, r7m ; my
vpbroadcastw m5, xm5
test mxyd, mxyd
jnz .hv
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
vpbroadcastd m3, [pw_2048]
- add wq, t2
+ add wq, r7
jmp wq
.h_w2:
movd xm0, [srcq+ssq*0]
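A scalar sketch of the identity in the comment above and of the new coefficient packing, assuming mx is the nonzero 4-bit subpel fraction; the helper name is made up:

    /* (16*s0 + mx*(s1 - s0) + 8) >> 4  ==  ((16 - mx)*s0 + mx*s1 + 8) >> 4.
     * "imul mxyd, 255" + "add mxyd, 16" packs both pmaddubsw coefficients
     * into one word: mx*255 + 16 == ((16 - mx) | (mx << 8)), so with the new
     * ascending bilin_h_shuf order (s0, s1) a single pmaddubsw yields
     * (16 - mx)*s0 + mx*s1, and pmulhrsw by pw_2048 is the final (x + 8) >> 4. */
    static uint8_t put_bilin_h_px(uint8_t s0, uint8_t s1, int mx)
    {
        return (uint8_t)(((16 - mx) * s0 + mx * s1 + 8) >> 4);
    }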
@@ -419,10 +419,10 @@
jg .h_w64
RET
.h_w128:
- mov t1, -32*3
+ mov r6, -32*3
.h_w128_loop:
- movu m0, [srcq+t1+32*3+8*0]
- movu m1, [srcq+t1+32*3+8*1]
+ movu m0, [srcq+r6+32*3+8*0]
+ movu m1, [srcq+r6+32*3+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
@@ -430,8 +430,8 @@
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
- mova [dstq+t1+32*3], m0
- add t1, 32
+ mova [dstq+r6+32*3], m0
+ add r6, 32
jle .h_w128_loop
add srcq, ssq
add dstq, dsq
@@ -439,11 +439,11 @@
jg .h_w128
RET
.v:
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 255
vpbroadcastd m5, [pw_2048]
- add mxyd, 16 << 8
- add wq, t2
+ add mxyd, 16
+ add wq, r7
movd xm4, mxyd
vpbroadcastw m4, xm4
jmp wq
@@ -454,7 +454,7 @@
lea srcq, [srcq+ssq*2]
pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xm1, xm1, q2301 ; 1 0
- punpcklbw xm1, xm0, xm1
+ punpcklbw xm1, xm0
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
packuswb xm1, xm1
@@ -467,11 +467,11 @@
.v_w4:
movd xm0, [srcq+ssq*0]
.v_w4_loop:
- vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd xm2, xm1, xm0, 0x01 ; 0 1
+ vpblendd xm1, xm2, xm0, 0x01 ; 0 1
vpbroadcastd xm0, [srcq+ssq*0]
- vpblendd xm1, xm0, 0x02 ; 1 2
+ vpblendd xm2, xm0, 0x02 ; 1 2
punpcklbw xm1, xm2
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
@@ -485,11 +485,11 @@
.v_w8:
movq xm0, [srcq+ssq*0]
.v_w8_loop:
- movq xm3, [srcq+ssq*1]
+ movq xm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw xm1, xm3, xm0
+ punpcklbw xm1, xm0, xm2
movq xm0, [srcq+ssq*0]
- punpcklbw xm2, xm0, xm3
+ punpcklbw xm2, xm0
pmaddubsw xm1, xm4
pmaddubsw xm2, xm4
pmulhrsw xm1, xm5
@@ -504,11 +504,11 @@
.v_w16:
movu xm0, [srcq+ssq*0]
.v_w16_loop:
- vbroadcasti128 m2, [srcq+ssq*1]
+ vbroadcasti128 m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd m3, m2, m0, 0x0f ; 0 1
+ vpblendd m2, m3, m0, 0x0f ; 0 1
vbroadcasti128 m0, [srcq+ssq*0]
- vpblendd m2, m0, 0xf0 ; 1 2
+ vpblendd m3, m0, 0xf0 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m4
@@ -528,8 +528,8 @@
%%loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m2, m3, m0
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
movu m0, [srcq+ssq*0]
pmaddubsw m1, m4
pmaddubsw m2, m4
@@ -536,15 +536,15 @@
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
- mova [dstq+dsq*0], m1
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m1, m4
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
pmaddubsw m2, m4
- pmulhrsw m1, m5
+ pmaddubsw m3, m4
pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*1], m1
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
@@ -557,8 +557,8 @@
.v_w64_loop:
add srcq, ssq
movu m3, [srcq+32*0]
- punpcklbw m2, m3, m0
- punpckhbw m0, m3, m0
+ punpcklbw m2, m0, m3
+ punpckhbw m0, m3
pmaddubsw m2, m4
pmaddubsw m0, m4
pmulhrsw m2, m5
@@ -567,8 +567,8 @@
mova m0, m3
movu m3, [srcq+32*1]
mova [dstq+32*0], m2
- punpcklbw m2, m3, m1
- punpckhbw m1, m3, m1
+ punpcklbw m2, m1, m3
+ punpckhbw m1, m3
pmaddubsw m2, m4
pmaddubsw m1, m4
pmulhrsw m2, m5
@@ -581,28 +581,29 @@
jg .v_w64_loop
RET
.v_w128:
- mov t0, dstq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
+ lea r6d, [hq+(3<<8)]
+ mov r4, srcq
+ mov r7, dstq
.v_w128_loop:
PUT_BILIN_V_W32
- movzx hd, t2b
- add t0, 32
- add t1, 32
- mov dstq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
jg .v_w128_loop
RET
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
- vpbroadcastd m7, [pw_2048]
+ vpbroadcastd m7, [pw_15]
movd xm6, mxyd
- add wq, t2
+ add wq, r7
+ paddb m5, m5
vpbroadcastw m6, xm6
jmp wq
.hv_w2:
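A scalar model of what the .hv cases below compute per output pixel, and why the pw_15/pavgw setup above plus the psrlw in the width loops can replace the old pw_2048/pmulhrsw rounding. Here b and a are the horizontal intermediates (16x pixel scale) of row y and row y+1; the helper is illustrative, and the shift on the product floors, like pmulhw:

    /* Because the horizontal coefficients are doubled (paddb m5, m5), the
     * intermediates sit at 32x scale, so "pavgw x, 15" == (2*b + 16) >> 1
     * == b + 8, folding in the +8 bias from the comment before the final
     * psrlw by 4.  pmulhw by (my << 11) on the doubled difference is still
     * my*(a - b) >> 4, so the result matches the old pmulhrsw path. */
    static uint8_t put_bilin_hv_px(int a, int b, int my)
    {
        return (uint8_t)((b + ((my * (a - b)) >> 4) + 8) >> 4);
    }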
@@ -618,10 +619,10 @@
shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
mova xm0, xm1
psubw xm1, xm2
- paddw xm1, xm1
pmulhw xm1, xm6
+ pavgw xm2, xm7
paddw xm1, xm2
- pmulhrsw xm1, xm7
+ psrlw xm1, 4
packuswb xm1, xm1
pextrw [dstq+dsq*0], xm1, 0
pextrw [dstq+dsq*1], xm1, 2
@@ -643,10 +644,10 @@
shufps xm2, xm0, xm1, q1032 ; 0 1
mova xm0, xm1
psubw xm1, xm2
- paddw xm1, xm1
pmulhw xm1, xm6
+ pavgw xm2, xm7
paddw xm1, xm2
- pmulhrsw xm1, xm7
+ psrlw xm1, 4
packuswb xm1, xm1
movd [dstq+dsq*0], xm1
pextrd [dstq+dsq*1], xm1, 1
@@ -667,10 +668,10 @@
vperm2i128 m2, m0, m1, 0x21 ; 0 1
mova m0, m1
psubw m1, m2
- paddw m1, m1
pmulhw m1, m6
+ pavgw m2, m7
paddw m1, m2
- pmulhrsw m1, m7
+ psrlw m1, 4
vextracti128 xm2, m1, 1
packuswb xm1, xm2
movq [dstq+dsq*0], xm1
@@ -694,16 +695,16 @@
pshufb m3, m4
pmaddubsw m2, m5
psubw m1, m2, m0
- paddw m1, m1
pmulhw m1, m6
+ pavgw m0, m7
paddw m1, m0
pmaddubsw m0, m3, m5
psubw m3, m0, m2
- paddw m3, m3
pmulhw m3, m6
+ pavgw m2, m7
paddw m3, m2
- pmulhrsw m1, m7
- pmulhrsw m3, m7
+ psrlw m1, 4
+ psrlw m3, 4
packuswb m1, m3
vpermq m1, m1, q3120
mova [dstq+dsq*0], xm1
@@ -712,19 +713,21 @@
sub hd, 2
jg .hv_w16_loop
RET
+.hv_w128:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w32_start
+.hv_w64:
+ lea r6d, [hq+(1<<16)]
+.hv_w32_start:
+ mov r4, srcq
+ mov r7, dstq
.hv_w32:
- xor t2d, t2d
-.hv_w32gt:
- mov t0, dstq
- mov t1, srcq
%if WIN64
movaps r4m, xmm8
%endif
.hv_w32_loop0:
movu m0, [srcq+8*0]
- vinserti128 m0, [srcq+8*2], 1
movu m1, [srcq+8*1]
- vinserti128 m1, [srcq+8*3], 1
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
@@ -731,53 +734,44 @@
pmaddubsw m1, m5
.hv_w32_loop:
add srcq, ssq
- movu xm2, [srcq+8*1]
- vinserti128 m2, [srcq+8*3], 1
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
pshufb m2, m4
+ pshufb m3, m4
pmaddubsw m2, m5
- psubw m3, m2, m1
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m1
- mova m1, m2
- pmulhrsw m8, m3, m7
- movu xm2, [srcq+8*0]
- vinserti128 m2, [srcq+8*2], 1
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m3, m2, m0
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m0
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ pmulhw m8, m6
+ pavgw m0, m7
+ paddw m8, m0
mova m0, m2
- pmulhrsw m3, m7
- packuswb m3, m8
- mova [dstq], m3
+ psubw m2, m3, m1
+ pmulhw m2, m6
+ pavgw m1, m7
+ paddw m2, m1
+ mova m1, m3
+ psrlw m8, 4
+ psrlw m2, 4
+ packuswb m8, m2
+ mova [dstq], m8
add dstq, dsq
dec hd
jg .hv_w32_loop
- movzx hd, t2b
- add t0, 32
- add t1, 32
- mov dstq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<16
jg .hv_w32_loop0
%if WIN64
movaps xmm8, r4m
%endif
RET
-.hv_w64:
- lea t2d, [hq+(1<<8)]
- jmp .hv_w32gt
-.hv_w128:
- lea t2d, [hq+(3<<8)]
- jmp .hv_w32gt
-DECLARE_REG_TMP 3, 5, 6
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
- lea t2, [prep%+SUFFIX]
+ lea r6, [prep%+SUFFIX]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -786,8 +780,8 @@
test mxyd, mxyd
jnz .v
.prep:
- movzx wd, word [t2+wq*2+table_offset(prep,)]
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
@@ -906,16 +900,16 @@
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
+ imul mxyd, 255
vbroadcasti128 m4, [bilin_h_shuf8]
- add mxyd, 16 << 8
+ add mxyd, 16
movd xm5, mxyd
mov mxyd, r6m ; my
vpbroadcastw m5, xm5
test mxyd, mxyd
jnz .hv
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.h_w4:
@@ -1079,10 +1073,10 @@
RET
.v:
WIN64_SPILL_XMM 7
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 255
+ add mxyd, 16
+ add wq, r6
lea stride3q, [strideq*3]
movd xm6, mxyd
vpbroadcastw m6, xm6
@@ -1100,9 +1094,9 @@
vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
vpblendd m1, m3, 0xaa ; 0 1 2 3
vpblendd m2, m3, 0x55 ; 1 2 3 4
- punpcklbw m2, m1
- pmaddubsw m2, m6
- mova [tmpq], m2
+ punpcklbw m1, m2
+ pmaddubsw m1, m6
+ mova [tmpq], m1
add tmpq, 32
sub hd, 4
jg .v_w4_loop
@@ -1116,15 +1110,15 @@
lea srcq, [srcq+strideq*4]
vpblendd m1, m0, 0x03 ; 0 2 2 2
vpbroadcastq m0, [srcq+strideq*0]
- vpblendd m3, m2, 0x33 ; 1 3 1 3
- vpblendd m2, m1, m3, 0x0f ; 1 3 2 2
- vpblendd m1, m3, 0xf0 ; 0 2 1 3
- vpblendd m2, m0, 0xc0 ; 1 3 2 4
- punpcklbw m3, m2, m1
- punpckhbw m2, m1
- pmaddubsw m3, m6
+ vpblendd m2, m3, 0xcc ; 1 3 1 3
+ vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2
+ vpblendd m2, m1, 0x0f ; 0 2 1 3
+ vpblendd m3, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m6
pmaddubsw m2, m6
- mova [tmpq+32*0], m3
+ mova [tmpq+32*0], m1
mova [tmpq+32*1], m2
add tmpq, 32*2
sub hd, 4
@@ -1133,25 +1127,25 @@
.v_w16:
vbroadcasti128 m0, [srcq+strideq*0]
.v_w16_loop:
- vbroadcasti128 m1, [srcq+strideq*2]
- vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
vbroadcasti128 m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2 ; 0l2l 0h2h
+ shufpd m4, m0, m2, 0x0c ; 0 2
vbroadcasti128 m0, [srcq+strideq*0]
- shufpd m2, m2, m3, 0x0c ; 1 3 ; 1l3l 1h3h
- shufpd m1, m1, m0, 0x0c ; 2 4 ; 2l4l 2h4h
- punpcklbw m3, m2, m4
+ shufpd m1, m3, 0x0c ; 1 3
+ shufpd m2, m0, 0x0c ; 2 4
+ punpcklbw m3, m4, m1
punpcklbw m5, m1, m2
+ punpckhbw m4, m1
punpckhbw m1, m2
- punpckhbw m2, m4
pmaddubsw m3, m6
pmaddubsw m5, m6
- pmaddubsw m2, m6
+ pmaddubsw m4, m6
pmaddubsw m1, m6
mova [tmpq+32*0], m3
mova [tmpq+32*1], m5
- mova [tmpq+32*2], m2
+ mova [tmpq+32*2], m4
mova [tmpq+32*3], m1
add tmpq, 32*4
sub hd, 4
@@ -1164,32 +1158,32 @@
vpermq m2, [srcq+strideq*2], q3120
vpermq m3, [srcq+stride3q ], q3120
lea srcq, [srcq+strideq*4]
- punpcklbw m4, m1, m0
- punpckhbw m5, m1, m0
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
vpermq m0, [srcq+strideq*0], q3120
pmaddubsw m4, m6
pmaddubsw m5, m6
mova [tmpq+32*0], m4
mova [tmpq+32*1], m5
- punpcklbw m4, m2, m1
- punpckhbw m5, m2, m1
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ punpcklbw m5, m2, m3
+ punpckhbw m2, m3
pmaddubsw m5, m6
+ pmaddubsw m2, m6
mova [tmpq+32*2], m4
- mova [tmpq+32*3], m5
+ mova [tmpq+32*3], m1
add tmpq, 32*8
- punpcklbw m4, m3, m2
- punpckhbw m5, m3, m2
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m4, m6
- pmaddubsw m5, m6
+ punpcklbw m1, m3, m0
+ punpckhbw m3, m0
pmaddubsw m1, m6
- pmaddubsw m2, m6
- mova [tmpq-32*4], m4
- mova [tmpq-32*3], m5
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m5
+ mova [tmpq-32*3], m2
mova [tmpq-32*2], m1
- mova [tmpq-32*1], m2
+ mova [tmpq-32*1], m3
sub hd, 4
jg .v_w32_loop
RET
@@ -1200,14 +1194,14 @@
vpermq m2, [srcq+strideq*1+32*0], q3120
vpermq m3, [srcq+strideq*1+32*1], q3120
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
- punpckhbw m5, m2, m0
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
pmaddubsw m4, m6
- pmaddubsw m5, m6
+ pmaddubsw m0, m6
mova [tmpq+32*0], m4
- mova [tmpq+32*1], m5
- punpcklbw m4, m3, m1
- punpckhbw m5, m3, m1
+ mova [tmpq+32*1], m0
+ punpcklbw m4, m1, m3
+ punpckhbw m5, m1, m3
vpermq m0, [srcq+strideq*0+32*0], q3120
vpermq m1, [srcq+strideq*0+32*1], q3120
pmaddubsw m4, m6
@@ -1215,52 +1209,52 @@
mova [tmpq+32*2], m4
mova [tmpq+32*3], m5
add tmpq, 32*8
- punpcklbw m4, m0, m2
- punpckhbw m5, m0, m2
- punpcklbw m2, m1, m3
- punpckhbw m3, m1, m3
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m5, m3, m1
+ punpckhbw m3, m1
pmaddubsw m4, m6
- pmaddubsw m5, m6
pmaddubsw m2, m6
+ pmaddubsw m5, m6
pmaddubsw m3, m6
mova [tmpq-32*4], m4
- mova [tmpq-32*3], m5
- mova [tmpq-32*2], m2
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m5
mova [tmpq-32*1], m3
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
+ lea r6d, [hq+(3<<8)]
+ mov r3, srcq
+ mov r5, tmpq
.v_w128_loop0:
vpermq m0, [srcq+strideq*0], q3120
.v_w128_loop:
vpermq m1, [srcq+strideq*1], q3120
lea srcq, [srcq+strideq*2]
- punpcklbw m2, m1, m0
- punpckhbw m3, m1, m0
+ punpcklbw m2, m0, m1
+ punpckhbw m3, m0, m1
vpermq m0, [srcq+strideq*0], q3120
- punpcklbw m4, m0, m1
- punpckhbw m5, m0, m1
pmaddubsw m2, m6
pmaddubsw m3, m6
+ punpcklbw m4, m1, m0
+ punpckhbw m1, m0
pmaddubsw m4, m6
- pmaddubsw m5, m6
+ pmaddubsw m1, m6
mova [tmpq+32*0], m2
mova [tmpq+32*1], m3
mova [tmpq+32*8], m4
- mova [tmpq+32*9], m5
+ mova [tmpq+32*9], m1
add tmpq, 32*16
sub hd, 2
jg .v_w128_loop
- movzx hd, t2b
- add t0, 64
- add t1, 32
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r3, 32
+ add r5, 64
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
jg .v_w128_loop0
RET
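The wide (w >= 32) cases here and above all use the same packed loop counter, now kept in a numbered register instead of the t-registers: the height lives in the low byte and the number of remaining column strips in the upper bits. A loose structural sketch in C (not code from the file):

    static void column_loop_sketch(int h)
    {
        int cnt = h + (3 << 8);        /* lea r6d, [hq+(3<<8)]: 4 column strips */
        do {
            int rows = cnt & 0xff;     /* movzx hd, r6b restores the height */
            do {
                /* ... filter two rows of one 32-pixel strip ... */
            } while ((rows -= 2) > 0);
            /* advance the saved src/tmp column pointers by one strip here */
            cnt -= 1 << 8;             /* sub r6d, 1<<8 */
        } while (cnt > 0);             /* jg .loop0 */
    }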
.hv:
@@ -1268,11 +1262,11 @@
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 7
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
movd xm6, mxyd
vpbroadcastw m6, xm6
- add wq, t2
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
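The prep variant rounds differently from put; a scalar model of its .hv second stage, with a and b as in the put sketch earlier (names illustrative):

    /* (16*s + my*d + 8) >> 4 == s + ((my*d + 8) >> 4) because 16*s is a
     * multiple of 16, and pmulhrsw by (my << 11) computes exactly
     * (my*d + 8) >> 4, so the stored value keeps the 16x "prep" scale. */
    static int16_t prep_bilin_hv_px(int a, int b, int my)
    {
        return (int16_t)(b + ((my * (a - b) + 8) >> 4));
    }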
@@ -1388,10 +1382,19 @@
dec hd
jg .hv_w32_loop
RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r6d, 256
+ jmp .hv_w64_start
.hv_w64:
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 128
+.hv_w64_start:
+%if WIN64
+ PUSH r7
+%endif
+ mov r5, srcq
+ mov r7, tmpq
.hv_w64_loop0:
movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, [srcq+strideq*0+8*1], 1
@@ -1413,57 +1416,22 @@
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
- mova [tmpq+32*0], m3
- add tmpq, 32*8
- mova [tmpq-32*4], m2
+ mova [tmpq+r6*0], m3
+ mova [tmpq+r6*1], m2
+ lea tmpq, [tmpq+r6*2]
sub hd, 2
jg .hv_w64_loop
- movzx hd, t2b
- add t0, 32
- add t1, 16
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r5, 16
+ add r7, 32
+ movzx hd, r3b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r3d, 1<<8
jg .hv_w64_loop0
+%if WIN64
+ POP r7
+%endif
RET
-.hv_w128:
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(7<<8)]
-.hv_w128_loop0:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w128_loop:
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- movu xm2, [srcq+strideq*0+8*0]
- vinserti128 m2, [srcq+strideq*0+8*1], 1
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5
- psubw m3, m1, m0
- pmulhrsw m3, m6
- paddw m3, m0
- pmaddubsw m0, m2, m5
- psubw m2, m0, m1
- pmulhrsw m2, m6
- paddw m2, m1
- mova [tmpq+32*0], m3
- mova [tmpq+32*8], m2
- add tmpq, 32*16
- sub hd, 2
- jg .hv_w128_loop
- movzx hd, t2b
- add t0, 32
- add t1, 16
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .hv_w128_loop0
- RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
@@ -1676,12 +1644,12 @@
movd xm2, [srcq+ssq*0]
pinsrw xm2, [srcq+ssq*1], 2
pinsrw xm2, [srcq+ssq*2], 4
- pinsrw xm2, [srcq+ss3q ], 6 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
- movd xm3, [srcq+ssq*0]
- vpbroadcastd xm1, [srcq+ssq*1]
- vpbroadcastd xm0, [srcq+ssq*2]
add srcq, ss3q
+ pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm3, xm1, 0x02 ; 4 5
vpblendd xm1, xm0, 0x02 ; 5 6
palignr xm4, xm3, xm2, 4 ; 1 2 3 4
@@ -1696,10 +1664,10 @@
mova xm2, xm3
pmaddubsw xm3, xm10 ; a2 b2
paddw xm5, xm3
- vpbroadcastd xm4, [srcq+ssq*0]
- vpblendd xm3, xm0, xm4, 0x02 ; 6 7
- vpbroadcastd xm0, [srcq+ssq*1]
+ vpbroadcastd xm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm4, xm0, 0x02 ; 7 8
punpcklbw xm3, xm4 ; 67 78
pmaddubsw xm4, xm3, xm11 ; a3 b3
@@ -1716,12 +1684,12 @@
movd xm2, [srcq+ssq*0]
pinsrd xm2, [srcq+ssq*1], 1
pinsrd xm2, [srcq+ssq*2], 2
- pinsrd xm2, [srcq+ss3q ], 3 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
- movd xm3, [srcq+ssq*0]
- vpbroadcastd xm1, [srcq+ssq*1]
- vpbroadcastd xm0, [srcq+ssq*2]
add srcq, ss3q
+ pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm3, xm1, 0x02 ; 4 5
vpblendd xm1, xm0, 0x02 ; 5 6
palignr xm4, xm3, xm2, 4 ; 1 2 3 4
@@ -1736,10 +1704,10 @@
mova xm2, xm3
pmaddubsw xm3, xm10 ; a2 b2
paddw xm5, xm3
- vpbroadcastd xm4, [srcq+ssq*0]
- vpblendd xm3, xm0, xm4, 0x02 ; 6 7
- vpbroadcastd xm0, [srcq+ssq*1]
+ vpbroadcastd xm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm4, xm0, 0x02 ; 7 8
punpcklbw xm3, xm4 ; 67 78
pmaddubsw xm4, xm3, xm11 ; a3 b3
@@ -1756,12 +1724,12 @@
movq xm1, [srcq+ssq*0]
vpbroadcastq m4, [srcq+ssq*1]
vpbroadcastq m2, [srcq+ssq*2]
- vpbroadcastq m5, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
- vpbroadcastq m6, [srcq+ssq*1]
- vpbroadcastq m0, [srcq+ssq*2]
add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m0, [srcq+ssq*0]
vpblendd m1, m4, 0x30
vpblendd m4, m2, 0x30
punpcklbw m1, m4 ; 01 12
@@ -1772,6 +1740,8 @@
vpblendd m6, m0, 0x30
punpcklbw m3, m6 ; 45 56
.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddubsw m5, m1, m8 ; a0 b0
mova m1, m2
pmaddubsw m2, m9 ; a1 b1
@@ -1779,10 +1749,8 @@
mova m2, m3
pmaddubsw m3, m10 ; a2 b2
paddw m5, m3
- vpbroadcastq m4, [srcq+ssq*0]
vpblendd m3, m0, m4, 0x30
- vpbroadcastq m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0]
vpblendd m4, m0, 0x30
punpcklbw m3, m4 ; 67 78
pmaddubsw m4, m3, m11 ; a3 b3
@@ -1800,30 +1768,28 @@
.v_w32:
.v_w64:
.v_w128:
- lea r6d, [wq-16]
- mov r4, dstq
- mov r7, srcq
- shl r6d, 4
- mov r6b, hb
+ lea r6d, [wq*8-128]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*2]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+ssq*0]
vbroadcasti128 m5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m0, [srcq+ssq*1]
- vbroadcasti128 m6, [srcq+ssq*0]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m1, [srcq+ssq*0]
- vbroadcasti128 m2, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ vbroadcasti128 m2, [srcq+ssq*2]
+ add srcq, ss3q
vbroadcasti128 m3, [srcq+ssq*0]
- shufpd m4, m4, m0, 0x0c
- shufpd m5, m5, m1, 0x0c
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
punpckhbw m4, m5 ; 34
- shufpd m6, m6, m2, 0x0c
+ shufpd m6, m2, 0x0c
punpcklbw m2, m5, m6 ; 12
punpckhbw m5, m6 ; 45
- shufpd m0, m0, m3, 0x0c
+ shufpd m0, m3, 0x0c
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
.v_w16_loop:
@@ -1861,11 +1827,11 @@
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
- movzx hd, r6b
add r4, 16
add r7, 16
- mov dstq, r4
- mov srcq, r7
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
@@ -1898,12 +1864,12 @@
movq xm2, [srcq+ssq*0]
movhps xm2, [srcq+ssq*1]
movq xm0, [srcq+ssq*2]
- movhps xm0, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m1, [srcq+ssq*2]
add srcq, ss3q
+ movhps xm0, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
vpblendd m2, m3, 0x30
vpblendd m0, m1, 0x30
vpblendd m2, m4, 0xc0
@@ -1920,6 +1886,11 @@
pshufd xm0, xm3, q2121
punpcklwd xm3, xm0 ; 45 56
.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
pmaddwd xm5, xm1, xm10 ; a0 b0
mova xm1, xm2
pmaddwd xm2, xm11 ; a1 b1
@@ -1926,14 +1897,9 @@
paddd xm5, xm2
mova xm2, xm3
pmaddwd xm3, xm12 ; a2 b2
- paddd xm5, xm3
- movq xm4, [srcq+ssq*0]
- movhps xm4, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb xm4, xm6
- pmaddubsw xm4, xm7
phaddw xm4, xm4
pmulhrsw xm4, xm8
+ paddd xm5, xm3
palignr xm3, xm4, xm0, 12
mova xm0, xm4
punpcklwd xm3, xm0 ; 67 78
@@ -1954,13 +1920,13 @@
vpbroadcastq m2, [srcq+ssq*0]
vpbroadcastq m4, [srcq+ssq*1]
vpbroadcastq m0, [srcq+ssq*2]
- vpbroadcastq m5, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
vpblendd m2, m4, 0xcc ; 0 1
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m1, [srcq+ssq*2]
+ vpbroadcastq m4, [srcq+ssq*2]
add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
vpblendd m0, m5, 0xcc ; 2 3
vpblendd m3, m4, 0xcc ; 4 5
pshufb m2, m6
@@ -1981,6 +1947,8 @@
pshufd m0, m3, q2121
punpcklwd m3, m0 ; 45 56
.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddwd m5, m1, m10 ; a0 b0
mova m1, m2
pmaddwd m2, m11 ; a1 b1
@@ -1988,9 +1956,7 @@
mova m2, m3
pmaddwd m3, m12 ; a2 b2
paddd m5, m3
- vpbroadcastq m4, [srcq+ssq*0]
- vpbroadcastq m3, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vpbroadcastq m3, [srcq+ssq*0]
vpblendd m4, m3, 0xcc ; 7 8
pshufb m4, m6
pmaddubsw m4, m7
@@ -2031,25 +1997,23 @@
pshufd m13, m0, q1111
pshufd m14, m0, q2222
pshufd m15, m0, q3333
- lea r6d, [wq-8]
- mov r4, dstq
- mov r7, srcq
- shl r6d, 5
- mov r6b, hb
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
- vbroadcasti128 m8, [subpel_h_shufB]
- vbroadcasti128 m9, [subpel_h_shufC]
movu xm4, [srcq+ssq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- movu xm6, [srcq+ssq*0]
- vbroadcasti128 m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
vpblendd m4, m0, 0xf0 ; 0 3
- vinserti128 m5, [srcq+ssq*0], 1 ; 1 4
- vinserti128 m6, [srcq+ssq*1], 1 ; 2 5
- lea srcq, [srcq+ssq*2]
+ vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
+ vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
+ add srcq, ss3q
vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
pshufb %3, %1, %6
@@ -2130,11 +2094,11 @@
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
- movzx hd, r6b
add r4, 8
add r7, 8
- mov dstq, r4
- mov srcq, r7
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
@@ -2153,48 +2117,6 @@
pmulhrsw m0, m4
%endmacro
-%macro PREP_8TAP_V_W4 5 ; round, weights
- movd xm0, [srcq+strideq*0]
- vpbroadcastd m1, [srcq+strideq*2]
- vpbroadcastd xm2, [srcq+strideq*1]
- vpbroadcastd m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
- vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
- vpbroadcastd m0, [srcq+strideq*0]
- vpbroadcastd m2, [srcq+strideq*1]
- vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
- vpbroadcastd m0, [srcq+strideq*2]
- vbroadcasti128 m5, [deint_shuf4]
- vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
- vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
- vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
- punpcklbw m1, m2, m3 ; 01 12 23 34
- vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
- punpckhbw m2, m3 ; 23 34 45 56
-.v_w4_loop:
- pinsrd xm0, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- vpbroadcastd m3, [srcq+strideq*0]
- vpbroadcastd m4, [srcq+strideq*1]
- vpblendd m3, m4, 0x20 ; _ _ 8 _ 8 9 _ _
- vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 9 _ _
- vpbroadcastd m0, [srcq+strideq*2]
- vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
- pshufb m3, m5 ; 67 78 89 9a
- pmaddubsw m4, m1, m%2
- vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
- pmaddubsw m2, m%3
- paddw m4, m2
- mova m2, m3
- pmaddubsw m3, m%5
- paddw m3, m4
- pmaddubsw m4, m1, m%4
- paddw m3, m4
- pmulhrsw m3, m%1
- mova [tmpq], m3
-%endmacro
-
%if WIN64
DECLARE_REG_TMP 6, 4
%else
@@ -2347,7 +2269,45 @@
jg .v_w16
je .v_w8
.v_w4:
- PREP_8TAP_V_W4 7, 8, 9, 10, 11
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ add srcq, stride3q
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*1]
+ vpbroadcastd m2, [srcq+strideq*2]
+ vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ lea srcq, [srcq+strideq*4]
+ pinsrd xm0, [srcq+strideq*0], 1
+ vpbroadcastd m3, [srcq+strideq*1]
+ vpbroadcastd m4, [srcq+strideq*2]
+ vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _
+ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m5 ; 67 78 89 9a
+ pmaddubsw m4, m1, m8
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m9
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m11
+ paddw m3, m4
+ pmaddubsw m4, m1, m10
+ paddw m3, m4
+ pmulhrsw m3, m7
+ mova [tmpq], m3
add tmpq, 32
sub hd, 4
jg .v_w4_loop
@@ -2406,11 +2366,10 @@
jg .v_w8_loop
RET
.v_w16:
- lea r6d, [wq-16]
- mov r5, tmpq
- mov r7, srcq
- shl r6d, 4
- mov r6b, hb
+ add wd, wd
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+wq*8-256]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+strideq*0]
vbroadcasti128 m5, [srcq+strideq*1]
@@ -2461,15 +2420,15 @@
pmulhrsw m14, m7
pmulhrsw m15, m7
mova [tmpq+wq*0], m14
- mova [tmpq+wq*2], m15
- lea tmpq, [tmpq+wq*4]
+ mova [tmpq+wq*1], m15
+ lea tmpq, [tmpq+wq*2]
sub hd, 2
jg .v_w16_loop
+ add r5, 16
+ add r7, 32
movzx hd, r6b
- add r5, 32
- add r7, 16
- mov tmpq, r5
- mov srcq, r7
+ mov srcq, r5
+ mov tmpq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
@@ -2557,8 +2516,8 @@
vpbroadcastq m2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
paddd m6, m4
- paddd m5, m3
vpbroadcastq m4, [srcq+strideq*0]
+ paddd m5, m3
vpbroadcastq m3, [srcq+strideq*1]
vpblendd m2, m4, 0xcc
vpbroadcastq m4, [srcq+strideq*2]
@@ -2591,18 +2550,17 @@
jg .hv_w4_loop
RET
.hv_w8:
- lea r6d, [wq-8]
- mov r5, tmpq
- mov r7, srcq
- shl r6d, 5
- mov r6b, hb
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
- vbroadcasti128 m8, [subpel_h_shufB]
- vbroadcasti128 m9, [subpel_h_shufC]
movu xm4, [srcq+strideq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
movu xm6, [srcq+strideq*0]
vbroadcasti128 m0, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
@@ -2676,11 +2634,11 @@
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .hv_w8_loop
+ add r5, 8
+ add r7, 16
movzx hd, r6b
- add r5, 16
- add r7, 8
- mov tmpq, r5
- mov srcq, r7
+ mov srcq, r5
+ mov tmpq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -57,8 +57,8 @@
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
pb_8x0_8x8: times 8 db 0
@@ -77,6 +77,7 @@
pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_8: times 8 dw 8
+pw_15: times 8 dw 15
pw_26: times 8 dw 26
pw_34: times 8 dw 34
pw_512: times 8 dw 512
@@ -220,16 +221,18 @@
DECLARE_REG_TMP 7
%define base 0
%endif
-;
+
%macro RESTORE_DSQ_32 1
%if ARCH_X86_32
mov %1, dsm ; restore dsq
%endif
%endmacro
-;
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+
+cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy
movifnidn mxyd, r6m ; mx
LEA t0, put_ssse3
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
tzcnt wd, wm
mov hd, hm
test mxyd, mxyd
@@ -335,20 +338,19 @@
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
+ imul mxyd, 0x00ff00ff
mova m4, [base+bilin_h_shuf8]
mova m0, [base+bilin_h_shuf4]
- add mxyd, 16 << 8
+ add mxyd, 0x00100010
movd m5, mxyd
mov mxyd, r7m ; my
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
+ pshufd m5, m5, q0000
test mxyd, mxyd
jnz .hv
movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
mova m3, [base+pw_2048]
add wq, t0
- RESTORE_DSQ_32 t0
+ movifnidn dsq, dsmp
jmp wq
.h_w2:
pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
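The SSE version packs the same bilinear coefficients through a 32-bit multiply so a single pshufd can broadcast them; a small helper sketch, name made up, assuming 1 <= mx <= 15:

    /* mx*0x00ff00ff + 0x00100010 puts ((16 - mx) | (mx << 8)) in both
     * 16-bit halves of the dword (no carry between halves, since
     * mx*255 + 16 < 0x10000), so "pshufd m5, m5, q0000" replaces the old
     * pshuflw + punpcklqdq pair. */
    static uint32_t pack_bilin_coefs_sse(int mx)
    {
        return (uint32_t)mx * 0x00ff00ffu + 0x00100010u;
    }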
@@ -485,14 +487,13 @@
RET
.v:
movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
+ imul mxyd, 0x00ff00ff
mova m5, [base+pw_2048]
- add mxyd, 16 << 8
+ add mxyd, 0x00100010
add wq, t0
movd m4, mxyd
- pshuflw m4, m4, q0000
- punpcklqdq m4, m4
- RESTORE_DSQ_32 t0
+ pshufd m4, m4, q0000
+ movifnidn dsq, dsmp
jmp wq
.v_w2:
movd m0, [srcq+ssq*0]
@@ -499,9 +500,9 @@
.v_w2_loop:
pinsrw m0, [srcq+ssq*1], 1 ; 0 1
lea srcq, [srcq+ssq*2]
- pshuflw m2, m0, q2301
+ pshuflw m1, m0, q2301
pinsrw m0, [srcq+ssq*0], 0 ; 2 1
- punpcklbw m1, m0, m2
+ punpcklbw m1, m0
pmaddubsw m1, m4
pmulhrsw m1, m5
packuswb m1, m1
@@ -516,11 +517,12 @@
.v_w4:
movd m0, [srcq+ssq*0]
.v_w4_loop:
- movd m1, [srcq+ssq*1]
+ movd m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpckldq m2, m0, m1 ; 0 1
+ mova m1, m0
movd m0, [srcq+ssq*0]
- punpckldq m1, m0 ; 1 2
+ punpckldq m1, m2 ; 0 1
+ punpckldq m2, m0 ; 1 2
punpcklbw m1, m2
pmaddubsw m1, m4
pmulhrsw m1, m5
@@ -536,11 +538,12 @@
.v_w8:
movq m0, [srcq+ssq*0]
.v_w8_loop:
- movq m3, [srcq+ssq*1]
+ movq m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
+ mova m1, m0
movq m0, [srcq+ssq*0]
- punpcklbw m2, m0, m3
+ punpcklbw m1, m2
+ punpcklbw m2, m0
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
@@ -552,66 +555,69 @@
sub hd, 2
jg .v_w8_loop
RET
- ;
%macro PUT_BILIN_V_W16 0
movu m0, [srcq+ssq*0]
%%loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m2, m3, m0
+ mova m1, m0
+ mova m2, m0
movu m0, [srcq+ssq*0]
+ punpcklbw m1, m3
+ punpckhbw m2, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
- mova [dstq+dsq*0], m1
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m1, m4
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
pmaddubsw m2, m4
- pmulhrsw m1, m5
+ pmaddubsw m3, m4
pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*1], m1
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
%endmacro
- ;
.v_w16:
PUT_BILIN_V_W16
RET
+.v_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .v_w16gt
+.v_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .v_w16gt
+.v_w32:
+ lea r6d, [hq+(1<<16)]
.v_w16gt:
- mov r4, dstq
- mov r6, srcq
+ mov r4, srcq
+%if ARCH_X86_64
+ mov r7, dstq
+%endif
.v_w16gt_loop:
-%if ARCH_X86_32
- mov bakm, t0q
- RESTORE_DSQ_32 t0
PUT_BILIN_V_W16
- mov t0q, bakm
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
%else
- PUT_BILIN_V_W16
+ mov dstq, dstmp
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstmp, dstq
%endif
- mov hw, t0w
- add r4, mmsize
- add r6, mmsize
- mov dstq, r4
- mov srcq, r6
- sub t0d, 1<<16
+ sub r6d, 1<<16
jg .v_w16gt
RET
-.v_w32:
- lea t0d, [hq+(1<<16)]
- jmp .v_w16gt
-.v_w64:
- lea t0d, [hq+(3<<16)]
- jmp .v_w16gt
-.v_w128:
- lea t0d, [hq+(7<<16)]
- jmp .v_w16gt
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
@@ -618,32 +624,33 @@
movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
- mova m7, [base+pw_2048]
+ mova m7, [base+pw_15]
movd m6, mxyd
add wq, t0
pshuflw m6, m6, q0000
+ paddb m5, m5
punpcklqdq m6, m6
jmp wq
.hv_w2:
RESTORE_DSQ_32 t0
movd m0, [srcq+ssq*0]
- pshufd m0, m0, q0000 ; src[x - src_stride]
+ punpckldq m0, m0
pshufb m0, m4
pmaddubsw m0, m5
.hv_w2_loop:
- movd m1, [srcq+ssq*1] ; src[x]
+ movd m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- movhps m1, [srcq+ssq*0] ; src[x + src_stride]
- pshufd m1, m1, q3120
+ movd m2, [srcq+ssq*0]
+ punpckldq m1, m2
pshufb m1, m4
pmaddubsw m1, m5 ; 1 _ 2 _
shufps m2, m0, m1, q1032 ; 0 _ 1 _
mova m0, m1
- psubw m1, m2 ; src[x + src_stride] - src[x]
- paddw m1, m1
- pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])
- paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x])
- pmulhrsw m1, m7
+ psubw m1, m2 ; 2 * (src[x + src_stride] - src[x])
+ pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4
+ pavgw m2, m7 ; src[x] + 8
+ paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
+ psrlw m1, 4
packuswb m1, m1
%if ARCH_X86_64
movq r6, m1
@@ -660,8 +667,8 @@
RET
.hv_w4:
mova m4, [base+bilin_h_shuf4]
- RESTORE_DSQ_32 t0
movddup xm0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
pshufb m0, m4
pmaddubsw m0, m5
.hv_w4_loop:
@@ -669,14 +676,14 @@
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0]
pshufb m1, m4
- pmaddubsw m1, m5 ; 1 2
+ pmaddubsw m1, m5 ; 1 2
shufps m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
- paddw m1, m1
pmulhw m1, m6
+ pavgw m2, m7
paddw m1, m2
- pmulhrsw m1, m7
+ psrlw m1, 4
packuswb m1, m1
movd [dstq+dsq*0], m1
psrlq m1, 32
@@ -686,28 +693,28 @@
jg .hv_w4_loop
RET
.hv_w8:
- RESTORE_DSQ_32 t0
- movu m0, [srcq+ssq*0+8*0]
+ movu m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
- movu m2, [srcq+ssq*1+8*0]
+ movu m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m2, m4
pmaddubsw m2, m5
psubw m1, m2, m0
- paddw m1, m1
pmulhw m1, m6
+ pavgw m0, m7
paddw m1, m0
- movu m0, [srcq+ssq*0+8*0]
+ movu m0, [srcq+ssq*0]
pshufb m0, m4
pmaddubsw m0, m5
psubw m3, m0, m2
- paddw m3, m3
pmulhw m3, m6
+ pavgw m2, m7
paddw m3, m2
- pmulhrsw m1, m7
- pmulhrsw m3, m7
+ psrlw m1, 4
+ psrlw m3, 4
packuswb m1, m3
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
@@ -715,27 +722,34 @@
sub hd, 2
jg .hv_w8_loop
RET
+.hv_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .hv_w16_start
+.hv_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w16_start
+.hv_w32:
+ lea r6d, [hq+(1<<16)]
+.hv_w16_start:
+ mov r4, srcq
+%if ARCH_X86_32
+ %define m8 [dstq]
+%else
+ mov r7, dstq
+%endif
.hv_w16:
- xor t0d, t0d
-.hv_w16gt:
- mov r4, dstq
- mov r6, srcq
- %if WIN64
- movaps r4m, xmm8
- %endif
+ movifnidn dsq, dsmp
+%if WIN64
+ movaps r4m, m8
+%endif
.hv_w16_loop0:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
.hv_w16_loop:
-%if ARCH_X86_32
- %define m0tmp [dstq]
-%else
- %define m0tmp m8
-%endif
add srcq, ssq
movu m2, [srcq+8*0]
movu m3, [srcq+8*1]
@@ -743,62 +757,51 @@
pshufb m3, m4
pmaddubsw m2, m5
pmaddubsw m3, m5
- mova m0tmp, m2
+ mova m8, m2
psubw m2, m0
- paddw m2, m2
pmulhw m2, m6
+ pavgw m0, m7
paddw m2, m0
mova m0, m3
psubw m3, m1
- paddw m3, m3
pmulhw m3, m6
+ pavgw m1, m7
paddw m3, m1
mova m1, m0
- mova m0, m0tmp
- pmulhrsw m2, m7
- pmulhrsw m3, m7
+ mova m0, m8
+ psrlw m2, 4
+ psrlw m3, 4
packuswb m2, m3
mova [dstq], m2
add dstq, dsmp
dec hd
jg .hv_w16_loop
- movzx hd, t0w
- add r4, mmsize
- add r6, mmsize
- mov dstq, r4
- mov srcq, r6
- sub t0d, 1<<16
- jg .hv_w16_loop0
- %if WIN64
- movaps xmm8, r4m
- %endif
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w16_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
RET
-.hv_w32:
- lea t0d, [hq+(1<<16)]
- jmp .hv_w16gt
-.hv_w64:
- lea t0d, [hq+(3<<16)]
- jmp .hv_w16gt
-.hv_w128:
- lea t0d, [hq+(7<<16)]
- jmp .hv_w16gt
-%macro PSHUFB_0X1X 1-2 ; dst[, src]
- %if cpuflag(ssse3)
- pshufb %1, %2
- %else
- punpcklbw %1, %1
- psraw %1, 8
- pshufd %1, %1, q0000
- %endif
-%endmacro
-
%macro PSHUFB_BILIN_H8 2 ; dst, src
%if cpuflag(ssse3)
pshufb %1, %2
%else
- mova %2, %1
- psrldq %1, 1
+ psrldq %2, %1, 1
punpcklbw %1, %2
%endif
%endmacro
@@ -807,8 +810,7 @@
%if cpuflag(ssse3)
pshufb %1, %2
%else
- mova %2, %1
- psrldq %1, 1
+ psrldq %2, %1, 1
punpckhbw %3, %1, %2
punpcklbw %1, %2
punpcklqdq %1, %3
@@ -845,17 +847,15 @@
%endmacro
%macro PREP_BILIN 0
-
-DECLARE_REG_TMP 3, 5, 6
%if ARCH_X86_32
- %define base t2-prep%+SUFFIX
+ %define base r6-prep%+SUFFIX
%else
- %define base 0
+ %define base 0
%endif
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
- LEA t2, prep%+SUFFIX
+ LEA r6, prep%+SUFFIX
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -865,11 +865,12 @@
jnz .v
.prep:
%if notcpuflag(ssse3)
- add t2, prep_ssse3 - prep_sse2
+ add r6, prep_ssse3 - prep_sse2
jmp prep_ssse3
%else
- movzx wd, word [t2+wq*2+table_offset(prep,)]
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ pxor m4, m4
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
@@ -877,17 +878,16 @@
movd m1, [srcq+strideq*1]
movd m2, [srcq+strideq*2]
movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
punpckldq m0, m1
punpckldq m2, m3
- lea srcq, [srcq+strideq*4]
- pxor m1, m1
- punpcklbw m0, m1
- punpcklbw m2, m1
+ punpcklbw m0, m4
+ punpcklbw m2, m4
psllw m0, 4
psllw m2, 4
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m2
- add tmpq, 32
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
sub hd, 4
jg .prep_w4
RET
@@ -897,7 +897,6 @@
movq m2, [srcq+strideq*2]
movq m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- pxor m4, m4
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
@@ -915,16 +914,13 @@
jg .prep_w8
RET
.prep_w16:
- movq m0, [srcq+strideq*0+8*0]
- movq m1, [srcq+strideq*0+8*1]
- movq m2, [srcq+strideq*1+8*0]
- movq m3, [srcq+strideq*1+8*1]
+ movu m1, [srcq+strideq*0]
+ movu m3, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- pxor m4, m4
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
psllw m0, 4
psllw m1, 4
psllw m2, 4
@@ -937,27 +933,25 @@
sub hd, 2
jg .prep_w16
RET
-.prep_w32:
- mov t2d, 1
- jmp .prep_w32_vloop
-.prep_w64:
- mov t2d, 2
- jmp .prep_w32_vloop
.prep_w128:
- mov t2d, 4
+ mov r3, -128
+ jmp .prep_w32_start
+.prep_w64:
+ mov r3, -64
+ jmp .prep_w32_start
+.prep_w32:
+ mov r3, -32
+.prep_w32_start:
+ sub srcq, r3
.prep_w32_vloop:
- mov t1q, srcq
- mov r3d, t2d
+ mov r6, r3
.prep_w32_hloop:
- movq m0, [t1q+8*0]
- movq m1, [t1q+8*1]
- movq m2, [t1q+8*2]
- movq m3, [t1q+8*3]
- pxor m4, m4
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
+ movu m1, [srcq+r6+16*0]
+ movu m3, [srcq+r6+16*1]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
psllw m0, 4
psllw m1, 4
psllw m2, 4
@@ -967,10 +961,9 @@
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
- add t1q, 32
- dec r3d
- jg .prep_w32_hloop
- lea srcq, [srcq+strideq]
+ add r6, 32
+ jl .prep_w32_hloop
+ add srcq, strideq
dec hd
jg .prep_w32_vloop
RET
@@ -978,40 +971,31 @@
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
mova m4, [base+bilin_h_shuf8]
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ add mxyd, 16
%endif
- add mxyd, 16 << 8
movd m5, mxyd
mov mxyd, r6m ; my
-%if cpuflag(ssse3)
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
-%else
- PSHUFB_0X1X m5
-%endif
+ pshufd m5, m5, q0000
test mxyd, mxyd
jnz .hv
-%if ARCH_X86_32
- mov t1, t2 ; save base reg for w4
-%endif
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
%if notcpuflag(ssse3)
WIN64_SPILL_XMM 8
pxor m6, m6
%endif
- add wq, t2
- lea stride3q, [strideq*3]
+ add wq, r6
jmp wq
.h_w4:
%if cpuflag(ssse3)
- %if ARCH_X86_32
- mova m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
- mova m4, [bilin_h_shuf4]
- %endif
+ mova m4, [base+bilin_h_shuf4]
%endif
+ lea stride3q, [strideq*3]
.h_w4_loop:
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
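The SSE2 (non-SSSE3) prep path cannot use pmaddubsw, so it builds word-sized coefficients instead, presumably consumed by the pmaddwd-based PMADDUBSW fallback; a sketch under the same 4-bit mx assumption:

    /* mx*0xffff + 16 == ((16 - mx) | (mx << 16)): low word 16 - mx,
     * high word mx, again broadcast with a single pshufd q0000. */
    static uint32_t pack_bilin_coefs_sse2(int mx)
    {
        return (uint32_t)mx * 0xffffu + 16u;
    }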
@@ -1029,6 +1013,8 @@
jg .h_w4_loop
RET
.h_w8:
+ lea stride3q, [strideq*3]
+.h_w8_loop:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
@@ -1048,7 +1034,7 @@
mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 4
- jg .h_w8
+ jg .h_w8_loop
RET
.h_w16:
movu m0, [srcq+strideq*0+8*0]
@@ -1072,22 +1058,23 @@
sub hd, 2
jg .h_w16
RET
-.h_w32:
- mov t2d, 1 << 0
- jmp .h_w32_vloop
-.h_w64:
- mov t2d, 1 << 1
- jmp .h_w32_vloop
.h_w128:
- mov t2d, 1 << 3
+ mov r3, -128
+ jmp .h_w32_start
+.h_w64:
+ mov r3, -64
+ jmp .h_w32_start
+.h_w32:
+ mov r3, -32
+.h_w32_start:
+ sub srcq, r3
.h_w32_vloop:
- mov t1q, srcq
- mov r3d, t2d
+ mov r6, r3
.h_w32_hloop:
- movu m0, [t1q+8*0]
- movu m1, [t1q+8*1]
- movu m2, [t1q+8*2]
- movu m3, [t1q+8*3]
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ movu m2, [srcq+r6+8*2]
+ movu m3, [srcq+r6+8*3]
PSHUFB_BILIN_H8 m0, m4
PSHUFB_BILIN_H8 m1, m4
PSHUFB_BILIN_H8 m2, m4
@@ -1101,11 +1088,10 @@
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
- add t1q, 32
- shr r3d, 1
- jnz .h_w32_hloop
- lea srcq, [srcq+strideq]
- sub hd, 1
+ add r6, 32
+ jl .h_w32_hloop
+ add srcq, strideq
+ dec hd
jg .h_w32_vloop
RET
.v:
@@ -1113,19 +1099,19 @@
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
%endif
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
- add wq, t2
- lea stride3q, [strideq*3]
- movd m5, mxyd
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
%if cpuflag(ssse3)
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
+ imul mxyd, 0x00ff00ff
+ add mxyd, 0x00100010
%else
- PSHUFB_0X1X m5
+ imul mxyd, 0xffff
pxor m6, m6
+ add mxyd, 16
%endif
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd m5, mxyd
+ pshufd m5, m5, q0000
jmp wq
.v_w4:
movd m0, [srcq+strideq*0]
@@ -1134,20 +1120,18 @@
movd m2, [srcq+strideq*2]
movd m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- punpcklwd m0, m1 ; 0 1 _ _
- punpcklwd m1, m2 ; 1 2 _ _
- punpcklbw m1, m0
- PMADDUBSW m1, m5, m6, m7, 0
- pshufd m1, m1, q3120
- mova [tmpq+16*0], m1
+ punpckldq m0, m1
+ punpckldq m1, m2
+ punpcklbw m0, m1 ; 01 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
movd m0, [srcq+strideq*0]
- punpcklwd m2, m3 ; 2 3 _ _
- punpcklwd m3, m0 ; 3 4 _ _
- punpcklbw m3, m2
- PMADDUBSW m3, m5, m6, m7, 0
- pshufd m3, m3, q3120
- mova [tmpq+16*1], m3
- add tmpq, 32
+ punpckldq m2, m3
+ punpckldq m3, m0
+ punpcklbw m2, m3 ; 23 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
sub hd, 4
jg .v_w4_loop
RET
@@ -1154,26 +1138,23 @@
.v_w8:
movq m0, [srcq+strideq*0]
.v_w8_loop:
- movq m1, [srcq+strideq*2]
- movq m2, [srcq+strideq*1]
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
movq m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2
+ punpcklbw m0, m1 ; 01
+ punpcklbw m1, m2 ; 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
movq m0, [srcq+strideq*0]
- shufpd m2, m3, 0x0c ; 1 3
- shufpd m1, m0, 0x0c ; 2 4
- punpcklbw m3, m2, m4
+ punpcklbw m2, m3 ; 23
+ punpcklbw m3, m0 ; 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m1
PMADDUBSW m3, m5, m6, m7, 0
- mova [tmpq+16*0], m3
- punpckhbw m3, m2, m4
- PMADDUBSW m3, m5, m6, m7, 0
- mova [tmpq+16*2], m3
- punpcklbw m3, m1, m2
- punpckhbw m1, m2
- PMADDUBSW m3, m5, m6, m7, 0
- PMADDUBSW m1, m5, m6, m7, 0
- mova [tmpq+16*1], m3
- mova [tmpq+16*3], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 4
jg .v_w8_loop
@@ -1183,48 +1164,48 @@
.v_w16_loop:
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
- punpcklbw m3, m1, m0
- punpckhbw m4, m1, m0
- PMADDUBSW m3, m5, m6, m7, 0
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*0], m3
- mova [tmpq+16*1], m4
- punpcklbw m3, m2, m1
- punpckhbw m4, m2, m1
- PMADDUBSW m3, m5, m6, m7, 0
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*2], m3
- mova [tmpq+16*3], m4
movu m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m0, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*1], m0
movu m0, [srcq+strideq*0]
- add tmpq, 16*8
- punpcklbw m1, m3, m2
- punpckhbw m4, m3, m2
PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq-16*4], m1
- mova [tmpq-16*3], m4
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*3], m1
PMADDUBSW m2, m5, m6, m7, 0
- mova [tmpq-16*2], m1
- mova [tmpq-16*1], m2
+ mova [tmpq+16*4], m4
+ punpcklbw m4, m3, m0
+ punpckhbw m3, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*5], m2
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*6], m4
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
sub hd, 4
jg .v_w16_loop
RET
-.v_w32:
- lea t2d, [hq+(0<<16)]
- mov t0d, 64
+.v_w128:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 256
jmp .v_w32_start
.v_w64:
- lea t2d, [hq+(1<<16)]
- mov t0d, 128
+ lea r3d, [hq+(1<<8)]
+ mov r6d, 128
jmp .v_w32_start
-.v_w128:
- lea t2d, [hq+(3<<16)]
- mov t0d, 256
+.v_w32:
+ xor r3d, r3d
+ mov r6d, 64
.v_w32_start:
%if ARCH_X86_64
%if WIN64
@@ -1232,7 +1213,7 @@
%endif
mov r7, tmpq
%endif
- mov t1, srcq
+ mov r5, srcq
.v_w32_hloop:
movu m0, [srcq+strideq*0+16*0]
movu m1, [srcq+strideq*0+16*1]
@@ -1240,48 +1221,48 @@
movu m2, [srcq+strideq*1+16*0]
movu m3, [srcq+strideq*1+16*1]
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
mova [tmpq+16*0], m4
- punpckhbw m4, m2, m0
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0+16*0]
+ punpcklbw m4, m1, m3
+ punpckhbw m1, m3
PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*1], m4
- punpcklbw m4, m3, m1
- PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
mova [tmpq+16*2], m4
- punpckhbw m4, m3, m1
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*3], m4
- add tmpq, t0q
- movu m0, [srcq+strideq*0+16*0]
+ mova [tmpq+16*3], m1
movu m1, [srcq+strideq*0+16*1]
- punpcklbw m4, m0, m2
+ add tmpq, r6
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
mova [tmpq+16*0], m4
- punpckhbw m4, m0, m2
+ mova [tmpq+16*1], m2
+ punpcklbw m4, m3, m1
+ punpckhbw m3, m1
PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*1], m4
- punpcklbw m4, m1, m3
- PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
mova [tmpq+16*2], m4
- punpckhbw m4, m1, m3
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*3], m4
- add tmpq, t0q
+ mova [tmpq+16*3], m3
+ add tmpq, r6
sub hd, 2
jg .v_w32_vloop
- movzx hd, t2w
- add t1, 32
- mov srcq, t1
+ add r5, 32
+ movzx hd, r3b
+ mov srcq, r5
%if ARCH_X86_64
- add r7, 2*16*2
+ add r7, 16*4
mov tmpq, r7
%else
mov tmpq, tmpmp
- add tmpq, 2*16*2
+ add tmpq, 16*4
mov tmpmp, tmpq
%endif
- sub t2d, 1<<16
+ sub r3d, 1<<8
jg .v_w32_hloop
%if WIN64
POP r7
@@ -1290,71 +1271,56 @@
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
%assign stack_offset stack_offset - stack_size_padded
%if cpuflag(ssse3)
+ imul mxyd, 0x08000800
WIN64_SPILL_XMM 8
%else
- WIN64_SPILL_XMM 10
-%endif
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
-%if cpuflag(ssse3)
- shl mxyd, 11
-%else
+ or mxyd, 1<<16
+ WIN64_SPILL_XMM 9
%if ARCH_X86_64
- mova m8, [pw_8]
+ mova m8, [base+pw_8]
%else
- %define m8 [t1-prep_sse2+pw_8]
+ %define m8 [base+pw_8]
%endif
pxor m7, m7
%endif
movd m6, mxyd
- add wq, t2
- pshuflw m6, m6, q0000
-%if cpuflag(ssse3)
- punpcklqdq m6, m6
-%elif ARCH_X86_64
- psrlw m0, m8, 3
- punpcklwd m6, m0
-%else
- punpcklwd m6, [base+pw_1]
-%endif
-%if ARCH_X86_32
- mov t1, t2 ; save base reg for w4
-%endif
- lea stride3q, [strideq*3]
+ add wq, r6
+ pshufd m6, m6, q0000
jmp wq
.hv_w4:
%if cpuflag(ssse3)
- %if ARCH_X86_32
- mova m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
- mova m4, [bilin_h_shuf4]
- %endif
-%endif
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+strideq*0]
+%else
movhps m0, [srcq+strideq*0]
+%endif
+ lea r3, [strideq*3]
PSHUFB_BILIN_H4 m0, m4, m3
PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
.hv_w4_loop:
movq m1, [srcq+strideq*1]
movhps m1, [srcq+strideq*2]
- movq m2, [srcq+stride3q ]
+ movq m2, [srcq+r3 ]
lea srcq, [srcq+strideq*4]
movhps m2, [srcq+strideq*0]
PSHUFB_BILIN_H4 m1, m4, m3
PSHUFB_BILIN_H4 m2, m4, m3
PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
- shufpd m3, m0, m1, 0x01 ; 0 1
- mova m0, m2
- PMADDUBSW m0, m5, m7, m4, 0 ; 3 4
- shufpd m2, m1, m0, 0x01 ; 2 3
- psubw m1, m3
+ PMADDUBSW m2, m5, m7, m4, 0 ; 3 4
+ shufpd m0, m1, 0x01 ; 0 1
+ shufpd m3, m1, m2, 0x01 ; 2 3
+ psubw m1, m0
PMULHRSW m1, m6, m4, m8, 4
- paddw m1, m3
- psubw m3, m0, m2
- PMULHRSW m3, m6, m4, m8, 4
- paddw m3, m2
+ paddw m1, m0
+ mova m0, m2
+ psubw m2, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
mova [tmpq+16*0], m1
- mova [tmpq+16*1], m3
+ mova [tmpq+16*1], m2
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
@@ -1365,7 +1331,8 @@
PMADDUBSW m0, m5, m7, m4, 0 ; 0
.hv_w8_loop:
movu m1, [srcq+strideq*1]
- movu m2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*2]
+ movu m2, [srcq+strideq*0]
PSHUFB_BILIN_H8 m1, m4
PSHUFB_BILIN_H8 m2, m4
PMADDUBSW m1, m5, m7, m4, 0 ; 1
@@ -1373,68 +1340,40 @@
psubw m3, m1, m0
PMULHRSW m3, m6, m4, m8, 4
paddw m3, m0
-%if notcpuflag(ssse3) && ARCH_X86_64
- SWAP m9, m7
-%endif
- psubw m7, m2, m1
- PMULHRSW m7, m6, m4, m8, 4
- paddw m7, m1
+ mova m0, m2
+ psubw m2, m1
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m1
mova [tmpq+16*0], m3
- mova [tmpq+16*1], m7
-%if notcpuflag(ssse3) && ARCH_X86_64
- SWAP m7, m9
-%endif
- movu m1, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- movu m0, [srcq+strideq*0]
- PSHUFB_BILIN_H8 m1, m4
- PSHUFB_BILIN_H8 m0, m4
- PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3
- PMADDUBSW m0, m5, m7, m4, 0 ; 4
- psubw m3, m1, m2
- PMULHRSW m3, m6, m4, m8, 4
- paddw m3, m2
-%if notcpuflag(ssse3) && ARCH_X86_64
- SWAP m9, m7
-%endif
- psubw m7, m0, m1
- PMULHRSW m7, m6, m4, m8, 4
- paddw m7, m1
- mova [tmpq+16*2], m3
- mova [tmpq+16*3], m7
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
- SWAP m7, m9
- %else
- pxor m7, m7
- %endif
-%endif
- add tmpq, 16*4
- sub hd, 4
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
jg .hv_w8_loop
RET
-.hv_w16:
- mov t2d, hd
- mov t0d, 32
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r5d, 256
jmp .hv_w16_start
-.hv_w32:
- lea t2d, [hq+(1<<16)]
- mov t0d, 64
- jmp .hv_w16_start
.hv_w64:
- lea t2d, [hq+(3<<16)]
- mov t0d, 128
+ lea r3d, [hq+(3<<8)]
+ mov r5d, 128
jmp .hv_w16_start
-.hv_w128:
- lea t2d, [hq+(7<<16)]
- mov t0d, 256
+.hv_w32:
+ lea r3d, [hq+(1<<8)]
+ mov r5d, 64
+ jmp .hv_w16_start
+.hv_w16:
+ xor r3d, r3d
+ mov r5d, 32
.hv_w16_start:
+%if ARCH_X86_64 || cpuflag(ssse3)
+ mov r6, srcq
+%endif
%if ARCH_X86_64
%if WIN64
PUSH r7
%endif
mov r7, tmpq
- mov r5, srcq
%endif
.hv_w16_hloop:
movu m0, [srcq+strideq*0+8*0]
@@ -1459,7 +1398,7 @@
PMULHRSW m0, m6, m4, m8, 4
paddw m0, m1
mova [tmpq+16*1], m0
- add tmpq, t0q
+ add tmpq, r5
movu m0, [srcq+strideq*0+8*0]
PSHUFB_BILIN_H8 m0, m4
PMADDUBSW m0, m5, m7, m4, 0 ; 2a
@@ -1474,24 +1413,30 @@
PMULHRSW m2, m6, m4, m8, 4
paddw m2, m3
mova [tmpq+16*1], m2
- add tmpq, t0q
+ add tmpq, r5
sub hd, 2
jg .hv_w16_vloop
- movzx hd, t2w
+ movzx hd, r3b
%if ARCH_X86_64
- add r5, 16
+ add r6, 16
add r7, 2*16
- mov srcq, r5
+ mov srcq, r6
mov tmpq, r7
+%elif cpuflag(ssse3)
+ mov tmpq, tmpm
+ add r6, 16
+ add tmpq, 2*16
+ mov srcq, r6
+ mov tmpm, tmpq
%else
- mov srcq, srcmp
- mov tmpq, tmpmp
+ mov srcq, srcm
+ mov tmpq, tmpm
add srcq, 16
add tmpq, 2*16
- mov srcmp, srcq
- mov tmpmp, tmpq
+ mov srcm, srcq
+ mov tmpm, tmpq
%endif
- sub t2d, 1<<16
+ sub r3d, 1<<8
jg .hv_w16_hloop
%if WIN64
POP r7
@@ -1538,13 +1483,9 @@
%if ARCH_X86_32
%define base_reg r1
%define base base_reg-put_ssse3
- %define W32_RESTORE_DSQ mov dsq, dsm
- %define W32_RESTORE_SSQ mov ssq, ssm
%else
%define base_reg r8
%define base 0
- %define W32_RESTORE_DSQ
- %define W32_RESTORE_SSQ
%endif
cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
@@ -1575,10 +1516,9 @@
add wq, base_reg
; put_bilin mangling jump
%assign stack_offset org_stack_offset
-%if ARCH_X86_32
- mov dsq, dsm
- mov ssq, ssm
-%elif WIN64
+ movifnidn dsq, dsmp
+ movifnidn ssq, ssmp
+%if WIN64
pop r8
%endif
lea r6, [ssq*3]
@@ -1590,7 +1530,7 @@
test myd, 0xf00
%endif
jnz .hv
- W32_RESTORE_SSQ
+ movifnidn ssq, ssmp
WIN64_SPILL_XMM 12
cmp wd, 4
jl .h_w2
@@ -1604,11 +1544,10 @@
shr mxd, 16
sub srcq, 3
movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
- movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0]
- pshufd m5, m5, q0000
- movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4]
- pshufd m6, m6, q0000
+ movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
mova m7, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
add wq, base_reg
jmp wq
.h_w2:
@@ -1620,9 +1559,9 @@
dec srcq
mova m4, [base+subpel_h_shuf4]
movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
- pshufd m3, m3, q0000
mova m5, [base+pw_34] ; 2 + (8 << 2)
- W32_RESTORE_DSQ
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
.h_w2_loop:
movq m0, [srcq+ssq*0]
movhps m0, [srcq+ssq*1]
@@ -1633,10 +1572,10 @@
paddw m0, m5 ; pw34
psraw m0, 6
packuswb m0, m0
- movd r4d, m0
- mov [dstq+dsq*0], r4w
- shr r4d, 16
- mov [dstq+dsq*1], r4w
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
@@ -1649,10 +1588,10 @@
%endif
dec srcq
movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
- pshufd m3, m3, q0000
- mova m5, [base+pw_34] ; 2 + (8 << 2)
mova m6, [base+subpel_h_shufA]
- W32_RESTORE_DSQ
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
.h_w4_loop:
movq m0, [srcq+ssq*0] ; 1
movq m1, [srcq+ssq*1] ; 2
@@ -1672,7 +1611,6 @@
sub hd, 2
jg .h_w4_loop
RET
- ;
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
%if ARCH_X86_32
pshufb %2, %1, [base+subpel_h_shufB]
@@ -1693,18 +1631,17 @@
paddw %1, m7 ; pw34
psraw %1, 6
%endmacro
- ;
.h_w8:
- movu m0, [srcq+ssq*0]
- movu m1, [srcq+ssq*1]
- PUT_8TAP_H m0, m2, m3, m4
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H m0, m2, m3, m4
PUT_8TAP_H m1, m2, m3, m4
packuswb m0, m1
%if ARCH_X86_32
- movq [dstq ], m0
+ movq [dstq], m0
add dstq, dsm
- movhps [dstq ], m0
+ movhps [dstq], m0
add dstq, dsm
%else
movq [dstq+dsq*0], m0
@@ -1714,22 +1651,23 @@
sub hd, 2
jg .h_w8
RET
-.h_w16:
- xor r6d, r6d
- jmp .h_start
-.h_w32:
- mov r6, -16*1
- jmp .h_start
-.h_w64:
- mov r6, -16*3
- jmp .h_start
.h_w128:
- mov r6, -16*7
-.h_start:
- sub srcq, r6
- sub dstq, r6
- mov r4, r6
-.h_loop:
+ mov r4, -16*7
+ jmp .h_w16_start
+.h_w64:
+ mov r4, -16*3
+ jmp .h_w16_start
+.h_w32:
+ mov r4, -16*1
+ jmp .h_w16_start
+.h_w16:
+ xor r4d, r4d
+.h_w16_start:
+ sub srcq, r4
+ sub dstq, r4
+.h_w16_loop_v:
+ mov r6, r4
+.h_w16_loop_h:
movu m0, [srcq+r6+8*0]
movu m1, [srcq+r6+8*1]
PUT_8TAP_H m0, m2, m3, m4
@@ -1736,17 +1674,12 @@
PUT_8TAP_H m1, m2, m3, m4
packuswb m0, m1
mova [dstq+r6], m0
- add r6, mmsize
- jle .h_loop
+ add r6, 16
+ jle .h_w16_loop_h
add srcq, ssq
-%if ARCH_X86_32
- add dstq, dsm
-%else
- add dstq, dsq
-%endif
- mov r6, r4
+ add dstq, dsmp
dec hd
- jg .h_loop
+ jg .h_w16_loop_v
RET
.v:
%if ARCH_X86_32
@@ -1754,7 +1687,7 @@
shr ssd, 16
cmp hd, 6
cmovs ssd, mxd
- lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
%assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
@@ -1762,12 +1695,12 @@
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
tzcnt r6d, wd
movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+ punpcklwd m0, m0
mova m7, [base+pw_512]
- psrlw m2, m7, 1 ; 0x0100
add r6, base_reg
%if ARCH_X86_32
%define subpel0 [rsp+mmsize*0]
@@ -1775,20 +1708,16 @@
%define subpel2 [rsp+mmsize*2]
%define subpel3 [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
- ALLOC_STACK -mmsize*4
+ ALLOC_STACK -16*4
%assign regs_used 7
- movd m0, [ssq+0]
- pshufb m0, m2
- mova subpel0, m0
- movd m0, [ssq+2]
- pshufb m0, m2
- mova subpel1, m0
- movd m0, [ssq+4]
- pshufb m0, m2
- mova subpel2, m0
- movd m0, [ssq+6]
- pshufb m0, m2
- mova subpel3, m0
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
mov ssq, [rstk+stack_offset+gprsize*4]
lea ssq, [ssq*3]
sub srcq, ssq
@@ -1799,47 +1728,46 @@
%define subpel1 m9
%define subpel2 m10
%define subpel3 m11
- movd subpel0, [myq+0]
- pshufb subpel0, m2
- movd subpel1, [myq+2]
- pshufb subpel1, m2
- movd subpel2, [myq+4]
- pshufb subpel2, m2
- movd subpel3, [myq+6]
- pshufb subpel3, m2
lea ss3q, [ssq*3]
+ pshufd m8, m0, q0000
sub srcq, ss3q
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
%endif
jmp r6
.v_w2:
- movd m2, [srcq+ssq*0] ; 0
- pinsrw m2, [srcq+ssq*1], 2 ; 0 1
- pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
- pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3
- add srcq, ssq
-%else
- pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
-%endif
- movd m3, [srcq+ssq*0] ; 4
- movd m1, [srcq+ssq*1] ; 5
- movd m0, [srcq+ssq*2] ; 6
-%if ARCH_X86_32
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- add srcq, ssq
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
%else
+ movd m2, [srcq+ssq*2]
add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
+ punpcklwd m1, m0 ; 0 1
+ punpcklwd m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpcklwd m2, m5 ; 2 3
+ punpcklwd m5, m3 ; 3 4
+ punpcklwd m3, m4 ; 4 5
+ punpcklwd m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddubsw m5, m1, subpel0 ; a0 b0
mova m1, m2
pmaddubsw m2, subpel1 ; a1 b1
@@ -1847,17 +1775,14 @@
mova m2, m3
pmaddubsw m3, subpel2 ; a2 b2
paddw m5, m3
- movd m4, [srcq+ssq*0] ; 7
- punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpckldq m4, m0 ; 7 8 _ _
+ punpcklwd m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpcklwd m4, m0 ; 7 8
punpcklbw m3, m4 ; 67 78
pmaddubsw m4, m3, subpel3 ; a3 b3
paddw m5, m4
pmulhrsw m5, m7
packuswb m5, m5
- pshuflw m5, m5, q2020
movd r6d, m5
mov [dstq+dsq*0], r6w
shr r6d, 16
@@ -1873,51 +1798,46 @@
.v_w32:
.v_w64:
.v_w128:
-%endif ; ARCH_X86_32
- lea r6d, [wq - 4] ; horizontal loop
- mov r4, dstq
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define srcm [rsp+mmsize*4+gprsize]
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*4+gprsize]
+ mov dstm, dstq
%endif
- mov srcm, srcq
-%else
- mov r7, srcq
-%endif
- shl r6d, (16 - 2) ; (wq / 4) << 16
- mov r6w, hw
+ lea r6d, [hq+wq-(1<<16)] ; h + ((w>>2)-1)<<16
+ mov r4, srcq
.v_w4_loop0:
- movd m2, [srcq+ssq*0] ; 0
- movhps m2, [srcq+ssq*2] ; 0 _ 2
- movd m3, [srcq+ssq*1] ; 1
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m3, [srcq+ssq*0] ; 1 _ 3
- lea srcq, [srcq+ssq*1]
-%else
- movhps m3, [srcq+ss3q ] ; 1 _ 3
- lea srcq, [srcq+ssq*4]
%endif
- pshufd m2, m2, q2020 ; 0 2 0 2
- pshufd m3, m3, q2020 ; 1 3 1 3
- punpckldq m2, m3 ; 0 1 2 3
- movd m3, [srcq+ssq*0] ; 4
- movd m1, [srcq+ssq*1] ; 5
- movd m0, [srcq+ssq*2] ; 6
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
%else
+ movd m2, [srcq+ssq*2]
add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m4 ; 4 5
+ punpckldq m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
.v_w4_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddubsw m5, m1, subpel0 ; a0 b0
mova m1, m2
pmaddubsw m2, subpel1 ; a1 b1
@@ -1925,10 +1845,8 @@
mova m2, m3
pmaddubsw m3, subpel2 ; a2 b2
paddw m5, m3
- movd m4, [srcq+ssq*0]
punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ movd m0, [srcq+ssq*0]
punpckldq m4, m0 ; 7 8 _ _
punpcklbw m3, m4 ; 67 78
pmaddubsw m4, m3, subpel3 ; a3 b3
@@ -1936,24 +1854,21 @@
pmulhrsw m5, m7
packuswb m5, m5
movd [dstq+dsq*0], m5
- pshufd m5, m5, q0101
+ psrlq m5, 32
movd [dstq+dsq*1], m5
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
- mov hw, r6w ; reset vertical loop
- add r4, 4
- mov dstq, r4
%if ARCH_X86_32
- mov srcq, srcm
- add srcq, 4
- mov srcm, srcq
-%else
- add r7, 4
- mov srcq, r7
-%endif
- sub r6d, 1<<16 ; horizontal--
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+ sub r6d, 1<<16
jg .v_w4_loop0
+%endif
RET
%if ARCH_X86_64
.v_w8:
@@ -1961,56 +1876,51 @@
.v_w32:
.v_w64:
.v_w128:
- lea r6d, [wq - 8] ; horizontal loop
- mov r4, dstq
- mov r7, srcq
- shl r6d, 8 - 3; (wq / 8) << 8
- mov r6b, hb
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4] ; h + ((w>>3)-1)<<8
.v_w8_loop0:
- movq m4, [srcq+ssq*0] ; 0
- movq m5, [srcq+ssq*1] ; 1
- lea srcq, [srcq+ssq*2]
- movq m6, [srcq+ssq*0] ; 2
- movq m0, [srcq+ssq*1] ; 3
- lea srcq, [srcq+ssq*2]
- movq m1, [srcq+ssq*0] ; 4
- movq m2, [srcq+ssq*1] ; 5
- lea srcq, [srcq+ssq*2] ;
- movq m3, [srcq+ssq*0] ; 6
- shufpd m4, m0, 0x0c
- shufpd m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
.v_w8_loop:
- movq m12, [srcq+ssq*1] ; 8
+ movq m13, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- movq m13, [srcq+ssq*0] ; 9
pmaddubsw m14, m1, subpel0 ; a0
- pmaddubsw m15, m2, subpel0 ; b0
mova m1, m3
+ pmaddubsw m15, m2, subpel0 ; b0
mova m2, m4
pmaddubsw m3, subpel1 ; a1
+ mova m12, m0
pmaddubsw m4, subpel1 ; b1
+ movq m0, [srcq+ssq*0]
paddw m14, m3
paddw m15, m4
mova m3, m5
- mova m4, m6
pmaddubsw m5, subpel2 ; a2
+ mova m4, m6
pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m13 ; 67
+ punpcklbw m13, m0 ; 78
paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, subpel3 ; a3
- pmaddubsw m13, m6, subpel3 ; b3
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
paddw m14, m12
paddw m15, m13
pmulhrsw m14, m7
@@ -2021,12 +1931,12 @@
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
- movzx hd, r6b ; reset vertical loop
add r4, 8
add r7, 8
- mov dstq, r4
- mov srcq, r7
- sub r6d, 1<<8 ; horizontal--
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
jg .v_w8_loop0
RET
%endif ;ARCH_X86_64
@@ -2051,7 +1961,7 @@
cmp hd, 6
cmovs ssd, mxd
movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
- W32_RESTORE_SSQ
+ mov ssq, ssmp
lea r6, [ssq*3]
sub srcq, r6
%define base_reg r6
@@ -2064,7 +1974,6 @@
%define subpelv1 [rsp+mmsize*1]
%define subpelv2 [rsp+mmsize*2]
%define subpelv3 [rsp+mmsize*3]
- punpcklqdq m0, m0
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
pshufd m6, m0, q0000
@@ -2088,7 +1997,6 @@
%define subpelv1 m11
%define subpelv2 m12
%define subpelv3 m13
- punpcklqdq m0, m0
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
mova m8, [base+pw_8192]
@@ -2103,22 +2011,21 @@
je .hv_w4
.hv_w2:
mova m6, [base+subpel_h_shuf4]
- ;
movq m2, [srcq+ssq*0] ; 0
movhps m2, [srcq+ssq*1] ; 0 _ 1
- movq m0, [srcq+ssq*2] ; 2
%if ARCH_X86_32
%define w8192reg [base+pw_8192]
%define d512reg [base+pd_512]
lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m0, [srcq+ssq*0] ; 2 _ 3
- lea srcq, [srcq+ssq*1]
+ movq m0, [srcq+ssq*0] ; 2
+ movhps m0, [srcq+ssq*1] ; 2 _ 3
+ lea srcq, [srcq+ssq*2]
%else
%define w8192reg m8
%define d512reg m9
- movhps m0, [srcq+ss3q ] ; 2 _ 3
- lea srcq, [srcq+ssq*4]
+ movq m0, [srcq+ssq*2] ; 2
+ add srcq, ss3q
+ movhps m0, [srcq+ssq*0] ; 2 _ 3
%endif
pshufb m2, m6 ; 0 ~ 1 ~
pshufb m0, m6 ; 2 ~ 3 ~
@@ -2126,16 +2033,16 @@
pmaddubsw m0, m7 ; subpel_filters
phaddw m2, m0 ; 0 1 2 3
pmulhrsw m2, w8192reg
- ;
+%if ARCH_X86_32
movq m3, [srcq+ssq*0] ; 4
movhps m3, [srcq+ssq*1] ; 4 _ 5
- movq m0, [srcq+ssq*2] ; 6
-%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
%else
+ movq m3, [srcq+ssq*1] ; 4
+ movhps m3, [srcq+ssq*2] ; 4 _ 5
add srcq, ss3q
%endif
+ movq m0, [srcq+ssq*0] ; 6
pshufb m3, m6 ; 4 ~ 5 ~
pshufb m0, m6 ; 6 ~
pmaddubsw m3, m7 ; subpel_filters
@@ -2142,7 +2049,6 @@
pmaddubsw m0, m7 ; subpel_filters
phaddw m3, m0 ; 4 5 6 _
pmulhrsw m3, w8192reg
- ;
palignr m4, m3, m2, 4; V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
punpckhwd m2, m4 ; V 23 34 2 3 3 4
@@ -2149,6 +2055,11 @@
pshufd m0, m3, q2121; V 5 6 5 6
punpcklwd m3, m0 ; V 45 56 4 5 5 6
.hv_w2_loop:
+ movq m4, [srcq+ssq*1] ; V 7
+ lea srcq, [srcq+ssq*2] ; V
+ movhps m4, [srcq+ssq*0] ; V 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
pmaddwd m5, m1, subpelv0; V a0 b0
mova m1, m2 ; V
pmaddwd m2, subpelv1 ; V a1 b1
@@ -2155,14 +2066,9 @@
paddd m5, m2 ; V
mova m2, m3 ; V
pmaddwd m3, subpelv2 ; a2 b2
- paddd m5, m3 ; V
- movq m4, [srcq+ssq*0] ; V 7
- movhps m4, [srcq+ssq*1] ; V 7 8
- lea srcq, [srcq+ssq*2] ; V
- pshufb m4, m6
- pmaddubsw m4, m7
phaddw m4, m4
pmulhrsw m4, w8192reg
+ paddd m5, m3 ; V
palignr m3, m4, m0, 12
mova m0, m4
punpcklwd m3, m0 ; V 67 78
@@ -2182,7 +2088,6 @@
RET
%undef w8192reg
%undef d512reg
- ;
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
@@ -2194,7 +2099,6 @@
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
- ;
%macro SAVELINE_W4 3
mova [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
@@ -2201,7 +2105,6 @@
%macro RESTORELINE_W4 3
mova %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
- ;
%if ARCH_X86_32
%define w8192reg [base+pw_8192]
%define d512reg [base+pd_512]
@@ -2213,13 +2116,13 @@
mova m6, [base+subpel_h_shuf4]
movq m5, [srcq+ssq*0] ; 0 _ _ _
movhps m5, [srcq+ssq*1] ; 0 _ 1 _
- movq m4, [srcq+ssq*2] ; 2 _ _ _
%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m4, [srcq+ssq*0] ; 2 _ 3 _
- add srcq, ssq
+ movq m4, [srcq+ssq*0] ; 2 _ _ _
+ movhps m4, [srcq+ssq*1] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*2]
%else
+ movq m4, [srcq+ssq*2] ; 2 _ _ _
movhps m4, [srcq+ss3q ] ; 2 _ 3 _
lea srcq, [srcq+ssq*4]
%endif
@@ -2243,7 +2146,14 @@
mova m6, [base+subpel_h_shuf4]
movq m5, [srcq+ssq*0] ; 4 _ _ _
movhps m5, [srcq+ssq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 6 _ _ _
+ add srcq, ssq
+%else
movq m4, [srcq+ssq*2] ; 6 _ _ _
+ add srcq, ss3q
+%endif
pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
pmaddubsw m3, m7 ;H subpel_filters
@@ -2259,13 +2169,6 @@
pmaddubsw m0, m7 ;H subpel_filters
phaddw m3, m0 ;H 4 5 6 7
pmulhrsw m3, w8192reg ;H pw_8192
- ;
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
-%else
- add srcq, ss3q
-%endif
;process high
palignr m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
@@ -2293,7 +2196,6 @@
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
- ;
mova m6, [base+subpel_h_shuf4]
movq m4, [srcq+ssq*0] ; 7
movhps m4, [srcq+ssq*1] ; 7 _ 8 _
@@ -2325,10 +2227,10 @@
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
- ;
mova m6, [base+subpel_h_shuf4+16]
movq m4, [srcq+ssq*0] ; 7
movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ lea srcq, [srcq+ssq*2]
pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
pmaddubsw m4, m7 ;H subpel_filters
phaddw m4, m4 ;H 7 8 7 8
@@ -2340,12 +2242,10 @@
paddd m5, d512reg ; pd_512
paddd m5, m4
psrad m4, m5, 10
- ;
RESTORELINE_W4 m5, 5, 0
packssdw m5, m4 ; d -> w
packuswb m5, m5 ; w -> b
pshuflw m5, m5, q3120
- lea srcq, [srcq+ssq*2]
movd [dstq+dsq*0], m5
psrlq m5, 32
movd [dstq+dsq*1], m5
@@ -2365,7 +2265,6 @@
%undef subpelv1
%undef subpelv2
%undef subpelv3
- ;
.hv_w8:
%assign stack_offset org_stack_offset
%define hv8_line_1 0
@@ -2400,7 +2299,7 @@
mov ssq, ssmp
ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < 16
- %define srcm [rsp+mmsize*13+gprsize*1]
+ %define dstm [rsp+mmsize*13+gprsize*1]
%define dsm [rsp+mmsize*13+gprsize*2]
mov r6, [rstk+stack_offset+gprsize*2]
mov dsm, r6
@@ -2420,10 +2319,10 @@
mova subpelv2, m4
mova subpelv3, m5
lea r6, [ssq*3]
+ mov dstm, dstq
sub srcq, r6
- mov srcm, srcq
%else
- ALLOC_STACK mmsize*5, 16
+ ALLOC_STACK 16*5, 16
%define subpelh0 m10
%define subpelh1 m11
%define subpelv0 m12
@@ -2440,7 +2339,6 @@
movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
- punpcklqdq m1, m1
punpcklbw m1, m1
psraw m1, 8 ; sign-extend
pshufd subpelv0, m1, q0000
@@ -2448,18 +2346,18 @@
pshufd subpelv2, m1, q2222
pshufd subpelv3, m1, q3333
lea ss3q, [ssq*3]
+ mov r7, dstq
sub srcq, ss3q
- mov r7, srcq
%endif
- lea r6d, [wq-4]
- mov r4, dstq
- shl r6d, (16 - 2)
- mov r6w, hw
+ shl wd, 14
+ lea r6d, [hq+wq-(1<<16)] ; h + ((w>>2)-1)<<16
+ mov r4, srcq
.hv_w8_loop0:
movu m4, [srcq+ssq*0] ; 0 = _ _
movu m5, [srcq+ssq*1] ; 1 = _ _
+%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- ;
+%endif
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
%if ARCH_X86_32
pshufb %3, %1, [base+subpel_h_shufB]
@@ -2478,7 +2376,6 @@
paddw %1, %3 ; A0+C4
phaddw %1, %2
%endmacro
- ;
%if ARCH_X86_64
mova m7, [base+subpel_h_shufA]
mova m8, [base+subpel_h_shufB]
@@ -2486,12 +2383,17 @@
%endif
HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+%if ARCH_X86_32
movu m6, [srcq+ssq*0] ; 2 = _ _
movu m0, [srcq+ssq*1] ; 3 = _ _
lea srcq, [srcq+ssq*2]
+%else
+ movu m6, [srcq+ssq*2] ; 2 = _ _
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0] ; 3 = _ _
+%endif
HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
- ;
mova m7, [base+pw_8192]
pmulhrsw m4, m7 ; H pw_8192
pmulhrsw m5, m7 ; H pw_8192
@@ -2503,11 +2405,16 @@
SAVELINE_W8 1, m1
SAVELINE_W8 2, m2
SAVELINE_W8 3, m3
- ;
mova m7, [base+subpel_h_shufA]
+%if ARCH_X86_32
movu m4, [srcq+ssq*0] ; 4 = _ _
movu m5, [srcq+ssq*1] ; 5 = _ _
lea srcq, [srcq+ssq*2]
+%else
+ movu m4, [srcq+ssq*1] ; 4 = _ _
+ movu m5, [srcq+ssq*2] ; 5 = _ _
+ add srcq, ss3q
+%endif
movu m6, [srcq+ssq*0] ; 6 = _ _
HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
@@ -2519,7 +2426,6 @@
punpcklwd m4, m0, m1 ; 3 4 ~
punpcklwd m5, m1, m2 ; 4 5 ~
punpcklwd m6, m2, m3 ; 5 6 ~
- ;
SAVELINE_W8 6, m3
RESTORELINE_W8 1, m1
RESTORELINE_W8 2, m2
@@ -2603,16 +2509,19 @@
RESTORELINE_W8 4, m4
jmp .hv_w8_loop
.hv_w8_outer:
- movzx hd, r6w
- add r4, 4
- mov dstq, r4
%if ARCH_X86_32
- mov srcq, srcm
- add srcq, 4
- mov srcm, srcq
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
%else
+ add r4, 4
add r7, 4
- mov srcq, r7
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
%endif
sub r6d, 1<<16
jg .hv_w8_loop0
@@ -2836,7 +2745,7 @@
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
- movsxd wq, wm
+ mov wd, wm
movifnidn srcd, srcm
movifnidn hd, hm
test mxd, 0xf00
@@ -2846,6 +2755,7 @@
LEA base_reg, prep_ssse3
tzcnt wd, wd
movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+ pxor m4, m4
add wq, base_reg
movifnidn strided, stridem
lea r6, [strideq*3]
@@ -2885,16 +2795,13 @@
shr mxd, 16
sub srcq, 3
movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
- movd m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
- pshufd m5, m5, q0000
- movd m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
- pshufd m6, m6, q0000
+ movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
%if cpuflag(ssse3)
mova m7, [base+pw_8192]
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
%else
- punpcklbw m5, m5
punpcklbw m6, m6
- psraw m5, 8
psraw m6, 8
%if ARCH_X86_64
mova m7, [pw_2]
@@ -2902,6 +2809,8 @@
%else
%define m15 m4
%endif
+ pshufd m5, m6, q1010
+ punpckhqdq m6, m6
%endif
add wq, base_reg
jmp wq
@@ -2913,10 +2822,10 @@
%endif
dec srcq
movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
- pshufd m4, m4, q0000
%if cpuflag(ssse3)
mova m6, [base+pw_8192]
mova m5, [base+subpel_h_shufA]
+ pshufd m4, m4, q0000
%else
mova m6, [base+pw_2]
%if ARCH_X86_64
@@ -2926,6 +2835,7 @@
%endif
punpcklbw m4, m4
psraw m4, 8
+ punpcklqdq m4, m4
%endif
%if ARCH_X86_64
lea stride3q, [strideq*3]
@@ -3089,11 +2999,14 @@
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- lea myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
%if cpuflag(ssse3)
mova m2, [base+pw_512]
- psrlw m2, m2, 1 ; 0x0100
mova m7, [base+pw_8192]
+ punpcklwd m0, m0
+%else
+ punpcklbw m0, m0
+ psraw m0, 8
%endif
%if ARCH_X86_32
%define subpel0 [rsp+mmsize*0]
@@ -3107,20 +3020,16 @@
ALLOC_STACK -mmsize*5
%endif
%assign regs_used 7
- movd m0, [myq+0]
- PSHUFB_0X1X m0, m2
- mova subpel0, m0
- movd m0, [myq+2]
- PSHUFB_0X1X m0, m2
- mova subpel1, m0
- movd m0, [myq+4]
- PSHUFB_0X1X m0, m2
- mova subpel2, m0
- movd m0, [myq+6]
- PSHUFB_0X1X m0, m2
- mova subpel3, m0
mov strideq, [rstk+stack_offset+gprsize*3]
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
lea r5, [strideq*3]
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
sub srcq, r5
%else
%define subpel0 m8
@@ -3127,15 +3036,11 @@
%define subpel1 m9
%define subpel2 m10
%define subpel3 m11
- movd subpel0, [myq+0]
- PSHUFB_0X1X subpel0, m2
- movd subpel1, [myq+2]
- PSHUFB_0X1X subpel1, m2
- movd subpel2, [myq+4]
- PSHUFB_0X1X subpel2, m2
- movd subpel3, [myq+6]
- PSHUFB_0X1X subpel3, m2
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
lea stride3q, [strideq*3]
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
sub srcq, stride3q
cmp wd, 8
jns .v_w8
@@ -3159,35 +3064,34 @@
mov r5w, hw
.v_w4_loop0:
%endif
- movd m2, [srcq+strideq*0] ; 0
- movhps m2, [srcq+strideq*2] ; 0 _ 2
- movd m3, [srcq+strideq*1] ; 1
+ movd m1, [srcq+strideq*0]
+ movd m0, [srcq+strideq*1]
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
- movhps m3, [srcq+strideq*1] ; 1 _ 3
+ movd m2, [srcq+strideq*0]
+ movd m4, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
-%else
- movhps m3, [srcq+stride3q ] ; 1 _ 3
- lea srcq, [srcq+strideq*4]
-%endif
- pshufd m2, m2, q2020 ; 0 2 0 2
- pshufd m3, m3, q2020 ; 1 3 1 3
- punpckldq m2, m3 ; 0 1 2 3
- movd m3, [srcq+strideq*0] ; 4
- movd m1, [srcq+strideq*1] ; 5
- movd m0, [srcq+strideq*2] ; 6
-%if ARCH_X86_32
+ movd m3, [srcq+strideq*0]
+ movd m5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- add srcq, strideq
%else
+ movd m2, [srcq+strideq*2]
add srcq, stride3q
+ movd m4, [srcq+strideq*0]
+ movd m3, [srcq+strideq*1]
+ movd m5, [srcq+strideq*2]
+ add srcq, stride3q
%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- PALIGNR m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m4 ; 2 3
+ punpckldq m4, m3 ; 3 4
+ punpckldq m3, m5 ; 4 5
+ punpckldq m5, m0 ; 5 6
+ punpcklbw m2, m4 ; 23 34
+ punpcklbw m3, m5 ; 45 56
.v_w4_loop:
%if ARCH_X86_32 && notcpuflag(ssse3)
mova m7, subpel0
@@ -3208,11 +3112,11 @@
%endif
mova m2, m3
PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
paddw m5, m3
- movd m4, [srcq+strideq*0]
punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
+ movd m0, [srcq+strideq*0]
punpckldq m4, m0 ; 7 8 _ _
punpcklbw m3, m4 ; 67 78
%if notcpuflag(ssse3)
@@ -3242,13 +3146,13 @@
sub hd, 2
jg .v_w4_loop
%if ARCH_X86_32
- mov hw, r5w ; reset vertical loop
- mov tmpq, tmpm
mov srcq, srcm
- add tmpq, 8
+ mov tmpq, tmpm
+ movzx hd, r5w
add srcq, 4
- mov tmpm, tmpq
+ add tmpq, 8
mov srcm, srcq
+ mov tmpm, tmpq
sub r5d, 1<<16 ; horizontal--
jg .v_w4_loop0
%endif
@@ -3255,37 +3159,30 @@
RET
%if ARCH_X86_64
.v_w8:
- lea r5d, [wq - 8] ; horizontal loop
+ lea r6d, [wq*8-64]
+ mov r5, srcq
mov r8, tmpq
- mov r6, srcq
- shl r5d, 8 - 3; (wq / 8) << 8
- mov r5b, hb
+ lea r6d, [hq+r6*4] ; h + ((w>>3)-1)<<8
.v_w8_loop0:
- movq m4, [srcq+strideq*0]
- movq m5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movq m6, [srcq+strideq*0]
- movq m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
movq m1, [srcq+strideq*0]
movq m2, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movq m3, [srcq+strideq*0]
- shufpd m4, m0, 0x0c
- shufpd m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
+ movq m3, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m4, [srcq+strideq*0]
+ movq m5, [srcq+strideq*1]
+ movq m6, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m0, [srcq+strideq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
.v_w8_loop:
-%if cpuflag(ssse3)
- movq m12, [srcq+strideq*1]
+ movq m13, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- movq m13, [srcq+strideq*0]
+%if cpuflag(ssse3)
pmaddubsw m14, m1, subpel0 ; a0
pmaddubsw m15, m2, subpel0 ; b0
mova m1, m3
@@ -3298,64 +3195,59 @@
mova m4, m6
pmaddubsw m5, subpel2 ; a2
pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, subpel3 ; a3
- pmaddubsw m13, m6, subpel3 ; b3
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
paddw m14, m12
paddw m15, m13
pmulhrsw m14, m7
pmulhrsw m15, m7
- movu [tmpq+wq*0], m14
- movu [tmpq+wq*2], m15
%else
mova m14, m1
PMADDUBSW m14, subpel0, m7, m12, 1 ; a0
+ mova m15, m2
+ PMADDUBSW m15, subpel0, m7, m12, 0 ; b0
mova m1, m3
PMADDUBSW m3, subpel1, m7, m12, 0 ; a1
+ mova m2, m4
+ PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
paddw m14, m3
mova m3, m5
PMADDUBSW m5, subpel2, m7, m12, 0 ; a2
- paddw m14, m5
- movq m12, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movq m13, [srcq+strideq*0]
- shufpd m15, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m15, m0 ; 67
- punpckhbw m15, m0 ; 78
- mova m13, m5
- PMADDUBSW m13, subpel3, m7, m12, 0 ; a3
- paddw m14, m13
- PMULHRSW_8192 m14, m14, [base+pw_2]
- movu [tmpq+wq*0], m14
- mova m14, m2
- PMADDUBSW m14, subpel0, m7, m12, 0 ; b0
- mova m2, m4
- PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
- paddw m14, m4
+ paddw m15, m4
mova m4, m6
PMADDUBSW m6, subpel2, m7, m12, 0 ; b2
- paddw m14, m6
- mova m6, m15
- PMADDUBSW m15, subpel3, m7, m12, 0 ; b3
- paddw m14, m15
+ paddw m15, m6
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ PMADDUBSW m12, subpel3, m7, m6, 0 ; a3
+ paddw m14, m12
+ mova m6, m13
+ PMADDUBSW m13, subpel3, m7, m12, 0 ; b3
+ paddw m15, m13
PMULHRSW_8192 m14, m14, [base+pw_2]
- movu [tmpq+wq*2], m14
+ PMULHRSW_8192 m15, m15, [base+pw_2]
%endif
+ movu [tmpq+wq*0], m14
+ movu [tmpq+wq*2], m15
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w8_loop
- movzx hd, r5b ; reset vertical loop
+ add r5, 8
add r8, 16
- add r6, 8
+ movzx hd, r6b
+ mov srcq, r5
mov tmpq, r8
- mov srcq, r6
- sub r5d, 1<<8 ; horizontal--
+ sub r6d, 1<<8
jg .v_w8_loop0
RET
%endif ;ARCH_X86_64
@@ -3363,7 +3255,6 @@
%undef subpel1
%undef subpel2
%undef subpel3
- ;
.hv:
%assign stack_offset org_stack_offset
cmp wd, 4
@@ -3466,13 +3357,13 @@
%endif
movq m5, [srcq+strideq*0] ; 0 _ _ _
movhps m5, [srcq+strideq*1] ; 0 _ 1 _
- movq m4, [srcq+strideq*2] ; 2 _ _ _
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
- add srcq, strideq
- movhps m4, [srcq+strideq*0] ; 2 _ 3 _
- add srcq, strideq
+ movq m4, [srcq+strideq*0] ; 2 _ _ _
+ movhps m4, [srcq+strideq*1] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*2]
%else
+ movq m4, [srcq+strideq*2] ; 2 _ _ _
movhps m4, [srcq+stride3q ] ; 2 _ 3 _
lea srcq, [srcq+strideq*4]
%endif
@@ -3506,7 +3397,14 @@
%endif
movq m5, [srcq+strideq*0] ; 4 _ _ _
movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 6 _ _ _
+ add srcq, strideq
+%else
movq m4, [srcq+strideq*2] ; 6 _ _ _
+ add srcq, stride3q
+%endif
PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
@@ -3531,12 +3429,6 @@
mova m2, [esp+mmsize*4]
%endif
%endif
-%if ARCH_X86_32
- lea srcq, [srcq+strideq*2]
- add srcq, strideq
-%else
- add srcq, stride3q
-%endif
;process high
PALIGNR m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
@@ -3572,7 +3464,6 @@
%define m15 m3
%endif
%endif
- ;
%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4]
%endif
@@ -3620,7 +3511,6 @@
mova [esp+0xA0], m5
%endif
%endif
- ;
%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4+16]
%endif
@@ -3644,7 +3534,6 @@
paddd m5, d32reg ; pd_32
paddd m5, m4
psrad m4, m5, 6
- ;
RESTORELINE_W4 m5, 5, 0
packssdw m5, m4
pshufd m5, m5, q3120
@@ -3666,7 +3555,6 @@
%undef subpelv1
%undef subpelv2
%undef subpelv3
- ;
.hv_w8:
%assign stack_offset org_stack_offset
%define hv8_line_1 0
@@ -3699,20 +3587,20 @@
%define tmpm [rsp+mmsize*13+gprsize*1]
%define srcm [rsp+mmsize*13+gprsize*2]
%define stridem [rsp+mmsize*13+gprsize*3]
+ mov tmpm, tmpq
mov stridem, strideq
%endif
+ %if cpuflag(ssse3)
pshufd m0, m1, q0000
pshufd m1, m1, q1111
- punpcklbw m5, m5
- %if notcpuflag(ssse3)
- punpcklbw m0, m0
+ %else
punpcklbw m1, m1
- %endif
- psraw m5, 8
- %if notcpuflag(ssse3)
- psraw m0, 8
psraw m1, 8
+ pshufd m0, m1, q1010
+ punpckhqdq m1, m1
%endif
+ punpcklbw m5, m5
+ psraw m5, 8
pshufd m2, m5, q0000
pshufd m3, m5, q1111
pshufd m4, m5, q2222
@@ -3742,38 +3630,31 @@
cmp hd, 6
cmovs myd, mxd
movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
- punpcklbw m1, m1
- %if notcpuflag(ssse3)
- punpcklbw subpelh0, subpelh0
- punpcklbw subpelh1, subpelh1
+ %else
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd subpelh0, m0, q1010
+ pshufd subpelh1, m0, q3232
+ mova m7, [base+pw_2]
%endif
+ punpcklbw m1, m1
psraw m1, 8
- %if notcpuflag(ssse3)
- psraw subpelh0, 8
- psraw subpelh1, 8
- %endif
pshufd subpelv0, m1, q0000
pshufd subpelv1, m1, q1111
pshufd subpelv2, m1, q2222
pshufd subpelv3, m1, q3333
- %if notcpuflag(ssse3)
- mova m7, [base+pw_2]
- %endif
lea stride3q, [strideq*3]
sub srcq, 3
sub srcq, stride3q
mov r6, srcq
-%endif
- lea r5d, [wq-4]
-%if ARCH_X86_64
mov r8, tmpq
-%else
- mov tmpm, tmpq
%endif
- shl r5d, (16 - 2)
- mov r5w, hw
+ lea r5d, [wq-4]
+ shl r5d, 14
+ add r5d, hd ; h + ((w>>2)-1)<<16
.hv_w8_loop0:
%if cpuflag(ssse3)
%if ARCH_X86_64
@@ -3791,24 +3672,24 @@
%endif
PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+%if ARCH_X86_64
+ PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
+ add srcq, stride3q
+ PREP_8TAP_HV m0, srcq+strideq*0, m7, m9
+%else
lea srcq, [srcq+strideq*2]
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
- SWAP m9, m4
- %else
+ %if notcpuflag(ssse3)
mova [esp], m4
%endif
-%endif
PREP_8TAP_HV m6, srcq+strideq*0, m7, m4
PREP_8TAP_HV m0, srcq+strideq*1, m7, m4
lea srcq, [srcq+strideq*2]
+%endif
%if cpuflag(ssse3)
mova m7, [base+pw_8192]
%else
mova m7, [base+pw_2]
- %if ARCH_X86_64
- SWAP m4, m9
- %else
+ %if ARCH_X86_32
mova m4, [esp]
%endif
%endif
@@ -3824,28 +3705,26 @@
SAVELINE_W8 3, m3
%if cpuflag(ssse3)
mova m7, [base+subpel_h_shufA]
+%endif
+%if ARCH_X86_64
+ PREP_8TAP_HV m4, srcq+strideq*1, m8, m9
+ PREP_8TAP_HV m5, srcq+strideq*2, m8, m9
+ add srcq, stride3q
+ PREP_8TAP_HV m6, srcq+strideq*0, m8, m9
%else
- %if ARCH_X86_64
- SWAP m8, m7
- SWAP m9, m0
- %else
+ %if notcpuflag(ssse3)
mova [esp+0x30], m0
%endif
-%endif
PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
- PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m0
+%endif
%if cpuflag(ssse3)
mova m7, [base+pw_8192]
-%else
- %if ARCH_X86_64
- SWAP m0, m9
- SWAP m7, m8
- %else
+%elif ARCH_X86_32
mova m0, [esp+0x30]
mova m7, [base+pw_2]
- %endif
%endif
PMULHRSW_8192 m1, m4, m7
PMULHRSW_8192 m2, m5, m7
@@ -3902,8 +3781,8 @@
%endif
%endif
PREP_8TAP_HV m0, srcq+strideq*1, m5, m6
- PREP_8TAP_HV m4, srcq+strideq*2, m5, m6
lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m4, srcq+strideq*0, m5, m6
%if cpuflag(ssse3)
mova m5, [base+pw_8192]
%else
@@ -3933,19 +3812,20 @@
RESTORELINE_W8 4, m4
jmp .hv_w8_loop
.hv_w8_outer:
- movzx hd, r5w
%if ARCH_X86_32
mov srcq, srcm
mov tmpq, tmpm
+ movzx hd, r5w
add srcq, 4
add tmpq, 8
mov srcm, srcq
mov tmpm, tmpq
%else
- add r8, 8
- mov tmpq, r8
add r6, 4
+ add r8, 8
+ movzx hd, r5b
mov srcq, r6
+ mov tmpq, r8
%endif
sub r5d, 1<<16
jg .hv_w8_loop0
--- a/tests/checkasm/looprestoration.c
+++ b/tests/checkasm/looprestoration.c
@@ -27,6 +27,7 @@
#include "tests/checkasm/checkasm.h"
+#include <stdio.h>
#include <string.h>
#include "src/levels.h"
@@ -33,6 +34,10 @@
#include "src/looprestoration.h"
#include "src/tables.h"
+static int to_binary(int x) { /* 0-15 -> 0000-1111 */
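+    /* each set bit becomes a decimal digit (e.g. 0xf -> 1111), so the edge mask prints like binary via %04d */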
+ return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
+}
+
static void init_tmp(pixel *buf, const ptrdiff_t stride,
const int w, const int h, const int bitdepth_max)
{
@@ -47,38 +52,30 @@
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
+ ALIGN_STK_16(int16_t, filter, 2, [8]);
pixel left[64][4];
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
- int w, int h, const int16_t filterh[7],
- const int16_t filterv[7], enum LrEdgeFlags edges
- HIGHBD_DECL_SUFFIX);
+ int w, int h, const int16_t filter[2][8],
+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
- for (int pl = 0; pl < 2; pl++) {
- if (check_func(c->wiener, "wiener_%s_%dbpc",
- pl ? "chroma" : "luma", bpc))
- {
- int16_t filter[2][3], filter_v[7], filter_h[7];
+ for (int t = 0; t < 2; t++) {
+ if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) {
+ filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5;
+ filter[0][1] = filter[0][5] = (rnd() & 31) - 23;
+ filter[0][2] = filter[0][4] = (rnd() & 63) - 17;
+ filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
+#if BITDEPTH != 8
+ filter[0][3] += 128;
+#endif
- filter[0][0] = pl ? 0 : (rnd() & 15) - 5;
- filter[0][1] = (rnd() & 31) - 23;
- filter[0][2] = (rnd() & 63) - 17;
- filter[1][0] = pl ? 0 : (rnd() & 15) - 5;
- filter[1][1] = (rnd() & 31) - 23;
- filter[1][2] = (rnd() & 63) - 17;
+ filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5;
+ filter[1][1] = filter[1][5] = (rnd() & 31) - 23;
+ filter[1][2] = filter[1][4] = (rnd() & 63) - 17;
+ filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
- filter_h[0] = filter_h[6] = filter[0][0];
- filter_h[1] = filter_h[5] = filter[0][1];
- filter_h[2] = filter_h[4] = filter[0][2];
- filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2);
-
- filter_v[0] = filter_v[6] = filter[1][0];
- filter_v[1] = filter_v[5] = filter[1][1];
- filter_v[2] = filter_v[4] = filter[1][2];
- filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2);
-
const int base_w = 1 + (rnd() % 384);
const int base_h = 1 + (rnd() & 63);
const int bitdepth_max = (1 << bpc) - 1;
@@ -95,17 +92,22 @@
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
- w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX);
+ w, h, filter, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
- w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX);
- checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
- a_dst + 32, 448 * sizeof(pixel),
- w, h, "dst");
+ w, h, filter, edges HIGHBD_TAIL_SUFFIX);
+ if (checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
+ a_dst + 32, 448 * sizeof(pixel),
+ w, h, "dst"))
+ {
+ fprintf(stderr, "size = %dx%d, edges = %04d\n",
+ w, h, to_binary(edges));
+ break;
+ }
}
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
- 256, 64, filter_h, filter_v, 0xf HIGHBD_TAIL_SUFFIX);
+ 256, 64, filter, 0xf HIGHBD_TAIL_SUFFIX);
}
}
}
--- /dev/null
+++ b/tests/header_test.c
@@ -1,0 +1,33 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include DAV1D_TEST_HEADER
+
+int main()
+{
+ return 0;
+}
--- a/tests/header_test.c.in
+++ /dev/null
@@ -1,33 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <dav1d/INPUT>
-
-int main()
-{
- return 0;
-}
--- a/tests/libfuzzer/dav1d_fuzzer.c
+++ b/tests/libfuzzer/dav1d_fuzzer.c
@@ -31,6 +31,7 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
+#include <stdlib.h>
#include <dav1d/dav1d.h>
#include "src/cpu.h"
@@ -38,8 +39,6 @@
#ifdef DAV1D_ALLOC_FAIL
-#include <stdlib.h>
-
#include "alloc_fail.h"
static unsigned djb_xor(const uint8_t * c, size_t len) {
@@ -56,6 +55,39 @@
#define DAV1D_FUZZ_MAX_SIZE 4096 * 4096
+// search for "--cpumask xxx" in argv and remove both parameters
+int LLVMFuzzerInitialize(int *argc, char ***argv) {
+ int i = 1;
+ for (; i < *argc; i++) {
+ if (!strcmp((*argv)[i], "--cpumask")) {
+ const char * cpumask = (*argv)[i+1];
+ if (cpumask) {
+ char *end;
+ unsigned res;
+ if (!strncmp(cpumask, "0x", 2)) {
+ cpumask += 2;
+ res = (unsigned) strtoul(cpumask, &end, 16);
+ } else {
+ res = (unsigned) strtoul(cpumask, &end, 0);
+ }
+ if (end != cpumask && !end[0]) {
+ dav1d_set_cpu_flags_mask(res);
+ }
+ }
+ break;
+ }
+ }
+
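+    /* shift the remaining arguments left by two to drop "--cpumask" and its value, if present */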
+ for (; i < *argc - 2; i++) {
+ (*argv)[i] = (*argv)[i + 2];
+ }
+
+ *argc = i;
+
+ return 0;
+}
+
+
// expects ivf input
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
@@ -146,12 +178,19 @@
dav1d_data_unref(&buf);
}
- do {
- memset(&pic, 0, sizeof(pic));
- err = dav1d_get_picture(ctx, &pic);
- if (err == 0)
- dav1d_picture_unref(&pic);
- } while (err != DAV1D_ERR(EAGAIN));
+ memset(&pic, 0, sizeof(pic));
+ if ((err = dav1d_get_picture(ctx, &pic)) == 0) {
+ /* Test calling dav1d_picture_unref() after dav1d_close() */
+ do {
+ Dav1dPicture pic2 = { 0 };
+ if ((err = dav1d_get_picture(ctx, &pic2)) == 0)
+ dav1d_picture_unref(&pic2);
+ } while (err != DAV1D_ERR(EAGAIN));
+
+ dav1d_close(&ctx);
+ dav1d_picture_unref(&pic);
+ return 0;
+ }
cleanup:
dav1d_flush(ctx);
--- a/tests/libfuzzer/dav1d_fuzzer.h
+++ b/tests/libfuzzer/dav1d_fuzzer.h
@@ -31,6 +31,7 @@
#include <stddef.h>
#include <stdint.h>
+int LLVMFuzzerInitialize(int *argc, char ***argv);
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
#endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */
--- a/tests/libfuzzer/main.c
+++ b/tests/libfuzzer/main.c
@@ -40,7 +40,7 @@
// expects ivf input
-int main(const int argc, char *const *const argv) {
+int main(int argc, char *argv[]) {
int ret = -1;
FILE *f = NULL;
int64_t fsize;
@@ -47,6 +47,10 @@
const char *filename = NULL;
uint8_t *data = NULL;
size_t size = 0;
+
+ if (LLVMFuzzerInitialize(&argc, &argv)) {
+ return 1;
+ }
if (argc != 2) {
fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]);
--- a/tests/libfuzzer/meson.build
+++ b/tests/libfuzzer/meson.build
@@ -72,8 +72,15 @@
objcopy = find_program('objcopy',
required: false)
+
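+# normalize b_lto to a string ('true'/'false') on older meson so the comparison below works either way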
+if meson.version().version_compare('<0.56.99')
+ lto = get_option('b_lto') ? 'true' : 'false'
+else
+ lto = get_option('b_lto')
+endif
+
if (objcopy.found() and
- not get_option('b_lto') and
+ lto == 'false' and
get_option('default_library') == 'static' and
cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -31,8 +31,6 @@
subdir_done()
endif
-libdav1d_nasm_objs_if_needed = []
-
if is_asm_enabled
checkasm_sources = files(
'checkasm/checkasm.c',
@@ -62,25 +60,27 @@
checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects()
endforeach
- checkasm_nasm_objs = []
+ checkasm_asm_objs = []
+ checkasm_asm_sources = []
if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
- checkasm_sources += files('checkasm/arm/checkasm_64.S')
+ checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
elif host_machine.cpu_family().startswith('arm')
- checkasm_sources += files('checkasm/arm/checkasm_32.S')
+ checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
elif host_machine.cpu_family().startswith('x86')
- checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm'))
+ checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
endif
- m_lib = cc.find_library('m', required: false)
-
- if meson.version().version_compare('< 0.48.999')
- libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs
+ if use_gaspp
+ checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources)
+ else
+ checkasm_sources += checkasm_asm_sources
endif
+ m_lib = cc.find_library('m', required: false)
+
checkasm = executable('checkasm',
checkasm_sources,
- checkasm_nasm_objs,
- libdav1d_nasm_objs_if_needed,
+ checkasm_asm_objs,
objects: [
checkasm_bitdepth_objs,
@@ -98,7 +98,8 @@
],
)
- test('checkasm', checkasm, is_parallel: false)
+ test('checkasm', checkasm, suite: 'checkasm', is_parallel: false)
+ benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench')
endif
c99_extension_flag = cc.first_supported_argument(
@@ -110,31 +111,21 @@
# dav1d_api_headers
foreach header : dav1d_api_headers
- header_file = '@0@'.format(header).split('/')[-1]
- target = header_file + '_test'
+ target = header + '_test'
- header_test_source = custom_target(target,
- output : target + '.c',
- input : 'header_test.c.in',
- capture : true,
- command : ['sed', '-e', 's/INPUT/' + header_file + '/', '@INPUT@']
- )
-
header_test_exe = executable(target,
- header_test_source,
+ 'header_test.c',
include_directories: dav1d_inc_dirs,
- c_args: [c99_extension_flag],
+ c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag],
build_by_default: true
)
- test(target, header_test_exe)
+ test(target, header_test_exe, suite: 'headers')
endforeach
# fuzzing binaries
-if meson.version().version_compare('>=0.49')
- subdir('libfuzzer')
-endif
+subdir('libfuzzer')
# Include dav1d test data repository with additional tests
if get_option('testdata_tests')
--- /dev/null
+++ b/tools/dav1d.manifest
@@ -1,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes"?>
+<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
+ <assemblyIdentity type="win32" name="VideoLAN.dav1d" version="1.0.0.0"/>
+ <application xmlns="urn:schemas-microsoft-com:asm.v3">
+ <windowsSettings>
+ <longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
+ <activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage>
+ </windowsSettings>
+ </application>
+</assembly>
--- /dev/null
+++ b/tools/dav1d.rc.in
@@ -1,0 +1,33 @@
+#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
+
+#include <windows.h>
+
+1 RT_MANIFEST "dav1d.manifest"
+1 VERSIONINFO
+FILETYPE VFT_APP
+FILEOS VOS_NT_WINDOWS32
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
+BEGIN
+ BLOCK "StringFileInfo"
+ BEGIN
+ BLOCK "040904E4"
+ BEGIN
+ VALUE "CompanyName", "VideoLAN"
+ VALUE "ProductName", "dav1d"
+ VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+ VALUE "FileVersion", API_VERSION_NUMBER_STR
+ VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
+ VALUE "InternalName", "dav1d"
+ VALUE "OriginalFilename", "dav1d.exe"
+ VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
+ END
+ END
+ BLOCK "VarFileInfo"
+ BEGIN
+ VALUE "Translation", 0x409, 1252
+ END
+END
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -77,8 +77,24 @@
'dav1d_cli_parse.c',
)
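+# on Windows, embed the version info and the long-path/UTF-8 manifest into the dav1d executable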
+if host_machine.system() == 'windows'
+ rc_file = configure_file(
+ input : 'dav1d.rc.in',
+ output : 'dav1d.rc',
+ configuration : rc_data
+ )
+
+ dav1d_rc_obj = winmod.compile_resources(rc_file,
+ depend_files : files('dav1d.manifest'),
+ include_directories : include_directories('.')
+ )
+else
+ dav1d_rc_obj = []
+endif
+
dav1d = executable('dav1d',
dav1d_sources,
+ dav1d_rc_obj,
rev_target, cli_config_h_target,
link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],