ref: 2989cb3f79e34d561dd2311532b1cb9b182e1170
parent: 8863dabe0a500c9c641f546cd1ae3ac97b6312a1
parent: 3bfe8c7c8a553728e2d6556e4a95f5cd246d1c92
author: Sigrid Haflínudóttir <ftrvxmtrx@gmail.com>
date: Fri Sep 4 05:39:49 EDT 2020
Merge remote-tracking branch 'upstream' into master
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -81,6 +81,15 @@
fi;
done
+x86inc-check:
+ extends: .debian-amd64-common
+ stage: style
+ script:
+ - git remote rm x86inc 2> /dev/null || true
+ - git remote add x86inc https://code.videolan.org/videolan/x86inc.asm.git
+ - git fetch -q x86inc master
+ - git diff --exit-code x86inc/master:x86inc.asm src/ext/x86/x86inc.asm
+ allow_failure: true
build-debian:
extends: .debian-amd64-common
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -12,7 +12,7 @@
The codebase is developed with the following assumptions:
For the library:
-- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
+- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code.
- x86 asm in .asm files, using the NASM syntax,
- arm/arm64 in .S files, using the GAS syntax limited to subset llvm 5.0's internal assembler supports,
- no C++ is allowed, whatever the version.
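The rule added above singles out anonymous structures and unions as the only compiler extension tolerated in internal code. A minimal sketch of what that permits, with hypothetical names not taken from the dav1d sources:

#include <stdint.h>

// An anonymous struct inside a named union lets internal code write mv.x,
// mv.y and mv.n directly, without naming an intermediate member; this is the
// one extension the contribution rule above still allows.
typedef union ExampleMv {
    struct {
        int16_t y, x;
    };                 // anonymous struct: fields are reachable as mv.y / mv.x
    uint32_t n;        // aliases the two fields above through the union
} ExampleMv;

static inline int example_mv_is_zero(const ExampleMv mv) {
    return mv.n == 0;  // same as mv.x == 0 && mv.y == 0
}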
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![dav1d logo](dav1d_logo.png)
+![dav1d logo](doc/dav1d_logo.png)
# dav1d
@@ -30,17 +30,21 @@
1. Complete C implementation of the decoder,
2. Provide a usable API,
3. Port to most platforms,
-4. Make it fast on desktop, by writing asm for AVX-2 chips.
+4. Make it fast on desktop, by writing asm for AVX2 chips.
5. Make it fast on mobile, by writing asm for ARMv8 chips,
-6. Make it fast on older desktop, by writing asm for SSSE3+ chips.
+6. Make it fast on older desktop, by writing asm for SSSE3+ chips,
+7. Make high bit-depth fast on mobile, by writing asm for ARMv8 chips.
### On-going
-7. Make it fast on older mobiles, by writing asm for ARMv7 chips,
-8. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
-9. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
+8. Make it fast on older mobile, by writing asm for ARMv7 chips,
+9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips,
+10. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
+11. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
### After
-10. Use more GPU, when possible.
+12. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
+13. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,
+14. Use more GPU, when possible.
# Contribute
@@ -130,7 +134,7 @@
## I am not a developer. Can I help?
-- Yes. We need testers, bug reporters, and documentation writers.
+- Yes. We need testers, bug reporters and documentation writers.
## What about the AV1 patent license?
@@ -142,3 +146,5 @@
- We do, but we don't have either the time or the knowledge. Therefore, patches and contributions welcome.
+## Where can I find documentation?
+- The current library documentation, built from master, can be found [here](https://videolan.videolan.me/dav1d/).
Binary files a/dav1d_logo.png and /dev/null differ
Binary files /dev/null and b/doc/dav1d_logo.png differ
--- a/examples/dp_renderer_placebo.c
+++ b/examples/dp_renderer_placebo.c
@@ -501,7 +501,7 @@
.num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] },
.scaling_shift = src->scaling_shift,
.ar_coeff_lag = src->ar_coeff_lag,
- .ar_coeff_shift = src->ar_coeff_shift,
+ .ar_coeff_shift = (int)src->ar_coeff_shift,
.grain_scale_shift = src->grain_scale_shift,
.uv_mult = { src->uv_mult[0], src->uv_mult[1] },
.uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
--- a/include/dav1d/dav1d.h
+++ b/include/dav1d/dav1d.h
@@ -68,9 +68,9 @@
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
int all_layers; ///< output all spatial layers of a scalable AV1 biststream
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
- uint8_t reserved[32]; ///< reserved for future use
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
+ uint8_t reserved[32]; ///< reserved for future use
} Dav1dSettings;
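This hunk moves the reserved bytes behind the allocator and logger callbacks, which changes the layout of Dav1dSettings (hence the soname bump to 5.0.0 further down in meson.build). A hedged sketch of the initialization pattern that is unaffected by such reordering, using only the public dav1d_default_settings()/dav1d_open() calls; the helper name is hypothetical:

#include <stddef.h>
#include <dav1d/dav1d.h>

// Fill the settings through the API rather than with a positional initializer,
// so member reordering such as the reserved[] move above stays harmless.
static Dav1dContext *example_open_decoder(void) {
    Dav1dSettings settings;
    dav1d_default_settings(&settings);  // sets every field to its default
    settings.all_layers = 0;            // then override individual fields by name
    Dav1dContext *ctx = NULL;
    return dav1d_open(&ctx, &settings) < 0 ? NULL : ctx;
}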
--- a/include/dav1d/headers.h
+++ b/include/dav1d/headers.h
@@ -28,6 +28,7 @@
#ifndef DAV1D_HEADERS_H
#define DAV1D_HEADERS_H
+#include <stdint.h>
#include <stddef.h>
// Constants from Section 3. "Symbols and abbreviated terms"
@@ -95,9 +96,9 @@
union {
struct {
int16_t alpha, beta, gamma, delta;
- };
+ } p;
int16_t abcd[4];
- };
+ } u;
} Dav1dWarpedMotionParams;
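Naming the formerly anonymous union and struct (u and p) is a source-incompatible change for users of Dav1dWarpedMotionParams. A small hedged sketch of how an access is spelled before and after this hunk:

#include <dav1d/headers.h>

// Before this change the members were anonymous:  wm->alpha, wm->abcd[0]
// After it they are reached through .u and .u.p:
static int16_t example_warp_alpha(const Dav1dWarpedMotionParams *const wm) {
    return wm->u.p.alpha;   // equivalently wm->u.abcd[0]
}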
enum Dav1dPixelLayout {
@@ -127,6 +128,7 @@
DAV1D_COLOR_PRI_SMPTE431 = 11,
DAV1D_COLOR_PRI_SMPTE432 = 12,
DAV1D_COLOR_PRI_EBU3213 = 22,
+ DAV1D_COLOR_PRI_RESERVED = 255,
};
enum Dav1dTransferCharacteristics {
@@ -147,6 +149,7 @@
DAV1D_TRC_SMPTE2084 = 16, ///< PQ
DAV1D_TRC_SMPTE428 = 17,
DAV1D_TRC_HLG = 18, ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
+ DAV1D_TRC_RESERVED = 255,
};
enum Dav1dMatrixCoefficients {
@@ -164,6 +167,7 @@
DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
DAV1D_MC_CHROMAT_CL = 13,
DAV1D_MC_ICTCP = 14,
+ DAV1D_MC_RESERVED = 255,
};
enum Dav1dChromaSamplePosition {
--- a/include/dav1d/meson.build
+++ b/include/dav1d/meson.build
@@ -31,11 +31,15 @@
output: 'version.h',
configuration: version_h_data)
+dav1d_api_headers = files(
+ 'common.h',
+ 'data.h',
+ 'dav1d.h',
+ 'headers.h',
+ 'picture.h',
+ )
+
# install headers
-install_headers('common.h',
- 'data.h',
- 'dav1d.h',
- 'headers.h',
- 'picture.h',
+install_headers(dav1d_api_headers,
version_h_target,
subdir : 'dav1d')
--- a/meson.build
+++ b/meson.build
@@ -30,7 +30,7 @@
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
-dav1d_soname_version = '4.0.2'
+dav1d_soname_version = '5.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -62,7 +62,8 @@
# ASM option
is_asm_enabled = (get_option('enable_asm') == true and
- (host_machine.cpu_family().startswith('x86') or
+ (host_machine.cpu_family() == 'x86' or
+ (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__') == '') or
host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le'))
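The reworked condition keeps the asm enabled on plain x86/x86_64 but turns it off for the x32 ABI, where the compiler defines __ILP32__ while the CPU family still reports x86_64. The equivalent compile-time probe, as a hedged C sketch (the macro name is hypothetical):

// The hand-written x86-64 asm assumes 64-bit pointers (LP64); on the x32 ABI
// (__x86_64__ together with __ILP32__) it has to stay disabled, which is what
// the cc.get_define('__ILP32__') == '' test above checks from meson.
#if defined(__x86_64__) && defined(__ILP32__)
#define EXAMPLE_HAVE_X86_ASM 0
#else
#define EXAMPLE_HAVE_X86_ASM 1
#endif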
@@ -350,6 +351,7 @@
cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
if host_machine.cpu_family().startswith('x86')
+ cdata_asm.set('private_prefix', 'dav1d')
cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
cdata_asm.set10('PIC', true)
--- a/src/arm/32/ipred.S
+++ b/src/arm/32/ipred.S
@@ -1,6 +1,6 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2019, Martin Storsjo
+ * Copyright © 2020, Martin Storsjo
* Copyright © 2019, B Krishnan Iyer
* All rights reserved.
*
@@ -132,7 +132,7 @@
.word 80f - L(ipred_v_tbl) + CONFIG_THUMB
.word 40f - L(ipred_v_tbl) + CONFIG_THUMB
40:
- vld1.32 {d0[0]}, [r2]
+ vld1.32 {d0[]}, [r2]
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
@@ -215,7 +215,7 @@
.word 8f - L(ipred_h_tbl) + CONFIG_THUMB
.word 4f - L(ipred_h_tbl) + CONFIG_THUMB
4:
- vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
vst1.32 {d3[0]}, [r0, :32], r1
vst1.32 {d2[0]}, [r12, :32], r1
subs r4, r4, #4
@@ -224,7 +224,7 @@
bgt 4b
pop {r4-r5, pc}
8:
- vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
vst1.8 {d3}, [r0, :64], r1
vst1.8 {d2}, [r12, :64], r1
subs r4, r4, #4
@@ -453,7 +453,7 @@
.word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
L(ipred_dc_left_h4):
- vld1.32 {d0[]}, [r2]
+ vld1.32 {d0[]}, [r2, :32]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #2
@@ -468,7 +468,7 @@
bgt L(ipred_dc_left_w4)
pop {r4-r5, pc}
L(ipred_dc_left_h8):
- vld1.8 {d0}, [r2]
+ vld1.8 {d0}, [r2, :64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
@@ -484,7 +484,7 @@
bgt L(ipred_dc_left_w8)
pop {r4-r5, pc}
L(ipred_dc_left_h16):
- vld1.8 {d0, d1}, [r2]
+ vld1.8 {d0, d1}, [r2, :128]
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
@@ -501,7 +501,7 @@
bgt L(ipred_dc_left_w16)
pop {r4-r5, pc}
L(ipred_dc_left_h32):
- vld1.8 {d0, d1, d2, d3}, [r2]
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
@@ -522,8 +522,8 @@
bgt 1b
pop {r4-r5, pc}
L(ipred_dc_left_h64):
- vld1.8 {d0, d1, d2, d3}, [r2]!
- vld1.8 {d4, d5, d6, d7}, [r2]
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
@@ -568,7 +568,6 @@
clz r3, r3
clz r12, r4
vdup.16 q15, lr // width + height
- mov r6, #0
adr r5, L(ipred_dc_tbl)
rbit lr, lr // rbit(width + height)
sub r3, r3, #20 // 25 leading bits, minus table offset 5
@@ -599,22 +598,21 @@
.word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
L(ipred_dc_h4):
- vld1.32 {d0[0]}, [r2]!
+ vld1.32 {d0[]}, [r2, :32]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w4):
add r2, r2, #1
- vld1.32 {d1[0]}, [r2]
- vmov.32 d1[1], r6
+ vld1.32 {d1[]}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d1, d1
vpadd.u16 d1, d1
- vpadd.u16 d1, d1
cmp r4, #4
vadd.s16 d0, d0, d1
vshl.u16 d0, d0, d28
- beq 1f // h = 8/16
+ beq 1f
+ // h = 8/16
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
cmp r4, #16
@@ -634,7 +632,7 @@
pop {r4-r6, pc}
L(ipred_dc_h8):
- vld1.8 {d0}, [r2]!
+ vld1.8 {d0}, [r2, :64]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
@@ -649,13 +647,14 @@
cmp r4, #8
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
- beq 1f // h = 4/16/32
+ beq 1f
+ // h = 4/16/32
cmp r4, #32
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
- vdup.16 q12, lr
+ vdup.16 d24, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 d0, d0[0]
@@ -669,7 +668,7 @@
pop {r4-r6, pc}
L(ipred_dc_h16):
- vld1.8 {d0, d1}, [r2]!
+ vld1.8 {d0, d1}, [r2, :128]!
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
@@ -686,13 +685,14 @@
cmp r4, #16
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
- beq 1f // h = 4/8/32/64
+ beq 1f
+ // h = 4/8/32/64
tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
- vdup.16 q12, lr
+ vdup.16 d24, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 q0, d0[0]
@@ -706,7 +706,7 @@
pop {r4-r6, pc}
L(ipred_dc_h32):
- vld1.8 {d0, d1, d2, d3}, [r2]!
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
@@ -718,25 +718,23 @@
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]
vadd.s16 d0, d0, d30
- vaddl.u8 q2, d4, d5
- vadd.u16 d4, d4, d5
vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vadd.u16 q1, q1, q2
vadd.u16 d2, d2, d3
- vpadd.u16 d4, d4
vpadd.u16 d2, d2
- vpadd.u16 d4, d4
vpadd.u16 d2, d2
cmp r4, #32
- vadd.s16 d0, d0, d4
vadd.s16 d0, d0, d2
vshl.u16 d4, d0, d28
- beq 1f // h = 8/16/64
+ beq 1f
+ // h = 8/16/64
cmp r4, #8
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
- vdup.16 q12, lr
+ vdup.16 d24, lr
vqdmulh.s16 d4, d4, d24
1:
vdup.8 q0, d4[0]
@@ -751,9 +749,9 @@
pop {r4-r6, pc}
L(ipred_dc_h64):
- vld1.8 {d0, d1, d2, d3}, [r2]!
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
vaddl.u8 q0, d0, d1
- vld1.8 {d4, d5, d6, d7}, [r2]!
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]!
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
@@ -819,3 +817,2143 @@
pop {r4-r6, pc}
endfunc
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ clz lr, r3
+ adr r5, L(ipred_paeth_tbl)
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[], d5[]}, [r2]
+ add r8, r2, #1
+ sub r2, r2, #4
+ add r5, r5, lr
+ mov r7, #-4
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_paeth_tbl):
+ .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[], d7[]}, [r8]
+ vsubl.u8 q8, d6, d4 // top - topleft
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+ vzip.32 d0, d1
+ vzip.32 d2, d3
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d2
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vmov d1, d2
+ vabd.u8 q10, q3, q9 // tdiff
+ vabd.u8 q11, q2, q9 // tldiff
+ vabd.u8 q9, q0, q9 // ldiff
+ vmin.u8 q12, q10, q11 // min(tdiff, tldiff)
+ vcge.u8 q10, q11, q10 // tldiff >= tdiff
+ vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff
+ vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbit q10, q0, q9 // ldiff <= min ? left : ...
+ vst1.32 {d21[1]}, [r0, :32], r1
+ vst1.32 {d21[0]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d20[1]}, [r0, :32], r1
+ vst1.32 {d20[0]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8]
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6
+8:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ vst1.8 {d25}, [r0, :64], r1
+ vst1.8 {d24}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d23}, [r0, :64], r1
+ vst1.8 {d22}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vld1.8 {d6}, [r8]!
+ mov r12, r3
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+1:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+2:
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ subs r3, r3, #8
+ vst1.8 {d25}, [r0, :64]!
+ vst1.8 {d24}, [r6, :64]!
+ vst1.8 {d23}, [r5, :64]!
+ vst1.8 {d22}, [lr, :64]!
+ ble 8f
+ vld1.8 {d6}, [r8]!
+ b 2b
+8:
+ subs r4, r4, #4
+ ble 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ vld1.8 {d6}, [r8]!
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ pop {r4-r8, pc}
+endfunc
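The comment-annotated comparisons above (ldiff, tdiff, tldiff) encode the Paeth selection rule. For cross-checking, a hypothetical scalar restatement derived from those comments rather than copied from the project's C code:

#include <stdlib.h>  /* abs */

// One Paeth-predicted pixel: pick whichever of left/top/topleft is closest
// to base = left + top - topleft, preferring left, then top.
static unsigned char example_paeth(const int left, const int top, const int topleft) {
    const int base   = left + top - topleft;
    const int ldiff  = abs(left    - base);
    const int tdiff  = abs(top     - base);
    const int tldiff = abs(topleft - base);
    return ldiff <= tdiff && ldiff <= tldiff ? (unsigned char) left
                           : tdiff <= tldiff ? (unsigned char) top
                                             : (unsigned char) topleft;
}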
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1
+ push {r4-r10, lr}
+ ldr r4, [sp, #32]
+ movrel r10, X(sm_weights)
+ add r12, r10, r4
+ add r10, r10, r3
+ clz r9, r3
+ adr r5, L(ipred_smooth_tbl)
+ sub lr, r2, r4
+ sub r9, r9, #25
+ ldr r9, [r5, r9, lsl #2]
+ vld1.8 {d4[]}, [lr] // bottom
+ add r8, r2, #1
+ add r5, r5, r9
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_tbl):
+ .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d16[]}, [r8] // top
+ vld1.32 {d18[]}, [r10, :32] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vdup.8 q3, d16[3] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vzip.32 d1, d0 // left, flipped
+ vzip.32 d3, d2
+ vzip.32 d20, d21 // weights_ver
+ vzip.32 d22, d23
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q0, d1, d6 // left-right
+ vsubl.u8 q1, d3, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q0, q9 // (left flipped)
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vst1.32 {d24[0]}, [r0, :32], r1
+ vst1.32 {d24[1]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d25[0]}, [r0, :32], r1
+ vst1.32 {d25[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r10, pc}
+80:
+ vld1.8 {d16}, [r8] // top
+ vld1.8 {d18}, [r10, :64] // weights_hor
+ sub r2, r2, #2
+ mov r7, #-2
+ vdup.8 q3, d16[7] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+8:
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ subs r4, r4, #2
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r10, pc}
+160:
+320:
+640:
+ add lr, r2, r3
+ sub r2, r2, #2
+ mov r7, #-2
+ vld1.8 {d6[], d7[]}, [lr] // right
+ sub r1, r1, r3
+ mov r9, r3
+
+1:
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+2:
+ vld1.8 {d16}, [r8]! // top
+ vld1.8 {d18}, [r10, :64]! // weights_hor
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vmovl.u8 q9, d18 // weights_hor
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ subs r3, r3, #8
+ vst1.8 {d24}, [r0, :64]!
+ vst1.8 {d25}, [r6, :64]!
+ bgt 2b
+ subs r4, r4, #2
+ ble 9f
+ sub r8, r8, r9
+ sub r10, r10, r9
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, r9
+ b 1b
+9:
+ pop {r4-r10, pc}
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1
+ push {r4-r7, lr}
+ ldr r4, [sp, #20]
+ movrel r7, X(sm_weights)
+ add r7, r7, r4
+ clz lr, r3
+ adr r5, L(ipred_smooth_v_tbl)
+ sub r12, r2, r4
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[]}, [r12] // bottom
+ add r2, r2, #1
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_v_tbl):
+ .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+4:
+ vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+ vshll.i8 q10, d4, #8 // bottom*256
+ vshll.i8 q11, d4, #8
+ vzip.32 d16, d17 // weights_ver
+ vzip.32 d18, d19
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ subs r4, r4, #4
+ vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q11, q3, q9
+ vrshrn.i16 d20, q10, #8
+ vrshrn.i16 d21, q11, #8
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r6, :32], r1
+ vst1.32 {d21[0]}, [r0, :32], r1
+ vst1.32 {d21[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r7, pc}
+80:
+ vld1.8 {d6}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+ vshll.i8 q12, d4, #8 // bottom*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q3, q9
+ vmla.i16 q14, q3, q10
+ vmla.i16 q15, q3, q11
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r7, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+ vmovl.u8 q4, d8 // weights_ver
+ vmovl.u8 q5, d10
+ vmovl.u8 q6, d12
+ vmovl.u8 q7, d14
+2:
+ vld1.8 {q3}, [r2]! // top
+ vshll.i8 q8, d4, #8 // bottom*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vsubl.u8 q0, d6, d4 // top-bottom
+ vsubl.u8 q1, d7, d4
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q9, q1, q4
+ vmla.i16 q10, q0, q5
+ vmla.i16 q11, q1, q5
+ vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q1, q6
+ vmla.i16 q14, q0, q7
+ vmla.i16 q15, q1, q7
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r2, r2, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ movrel r8, X(sm_weights)
+ add r8, r8, r3
+ clz lr, r3
+ adr r5, L(ipred_smooth_h_tbl)
+ add r12, r2, r3
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[]}, [r12] // right
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_h_tbl):
+ .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r8, :32] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vmovl.u8 q3, d6 // weights_hor
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vzip.32 d3, d2 // left, flipped
+ vzip.32 d1, d0
+ vsubl.u8 q1, d3, d4 // left-right
+ vsubl.u8 q0, d1, d4
+ subs r4, r4, #4
+ vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q0, q3
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8, :64] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vmovl.u8 q3, d6 // weights_hor
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left
+ vshll.i8 q12, d4, #8 // right*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q11, d22, d4 // left-right
+ vsubl.u8 q10, d20, d4
+ vsubl.u8 q9, d18, d4
+ vsubl.u8 q8, d16, d4
+ vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q10, q3 // (left flipped)
+ vmla.i16 q14, q9, q3
+ vmla.i16 q15, q8, q3
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ sub r2, r2, #4
+ mov r7, #-4
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left
+ vsubl.u8 q4, d8, d4 // left-right
+ vsubl.u8 q5, d10, d4
+ vsubl.u8 q6, d12, d4
+ vsubl.u8 q7, d14, d4
+2:
+ vld1.8 {q1}, [r8, :128]! // weights_hor
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vmovl.u8 q0, d2 // weights_hor
+ vmovl.u8 q1, d3
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q7, q1 // (left flipped)
+ vmla.i16 q10, q6, q0
+ vmla.i16 q11, q6, q1
+ vmla.i16 q12, q5, q0
+ vmla.i16 q13, q5, q1
+ vmla.i16 q14, q4, q0
+ vmla.i16 q15, q4, q1
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height);
+function ipred_filter_8bpc_neon, export=1
+ push {r4-r8, lr}
+ movw r12, #511
+ ldr r5, [sp, #28]
+ ldr r4, [sp, #24]
+ and r5, r5, r12 // 511
+ movrel r6, X(filter_intra_taps)
+ lsl r5, r5, #6
+ add r6, r6, r5
+ vld1.8 {d20, d21, d22, d23}, [r6, :128]!
+ clz lr, r3
+ adr r5, L(ipred_filter_tbl)
+ vld1.8 {d27, d28, d29}, [r6, :64]
+ sub lr, lr, #26
+ ldr lr, [r5, lr, lsl #2]
+ vmovl.s8 q8, d20
+ vmovl.s8 q9, d21
+ add r5, r5, lr
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmovl.s8 q12, d27
+ vmovl.s8 q13, d28
+ vmovl.s8 q14, d29
+ add r8, r2, #1
+ bx r5
+
+ .align 2
+L(ipred_filter_tbl):
+ .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d0[]}, [r8] // top (0-3)
+ sub r2, r2, #2
+ mov r7, #-2
+ vmovl.u8 q0, d0 // top (0-3)
+4:
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d4, q2, #4
+ subs r4, r4, #2
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vmovl.u8 q0, d4
+ vst1.32 {d4[1]}, [r6, :32], r1
+ vext.8 q0, q0, q0, #8 // move top from [4-7] to [0-3]
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d0}, [r8] // top (0-7)
+ sub r2, r2, #2
+ mov r7, #-2
+ vmovl.u8 q0, d0 // top (0-7)
+8:
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d4, q2, #4
+ vmovl.u8 q1, d4 // first block, in 16 bit
+ vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+ vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d5, q3, #4
+ vzip.32 d4, d5
+ subs r4, r4, #2
+ vst1.64 {d4}, [r0, :64], r1
+ vmovl.u8 q0, d5
+ vst1.64 {d5}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+ vpush {q4-q5}
+ sub r2, r2, #2
+ mov r7, #-2
+ sub r1, r1, r3
+ mov lr, r3
+
+1:
+ vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmovl.u8 q0, d0 // left (0-1) + topleft (2)
+2:
+ vld1.8 {q2}, [r8]! // top(0-15)
+ vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+ vmovl.u8 q1, d4 // top(0-7)
+ vmovl.u8 q2, d5 // top(8-15)
+ vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+ vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+ vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+ vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+ vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+ vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d6, q3, #4
+ vmovl.u8 q0, d6 // first block, in 16 bit
+ vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+ vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+ vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+ vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+ vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d7, q4, #4
+ vmovl.u8 q0, d7 // second block, in 16 bit
+ vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+ vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+ vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1)
+ vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2)
+ vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d8, q5, #4
+ vmovl.u8 q0, d8 // third block, in 16 bit
+ vmov.u8 r12, d5[6]
+ vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4)
+ vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0)
+ vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6)
+ vmov.8 d0[4], r12
+
+ subs r3, r3, #16
+ vqrshrun.s16 d9, q15, #4
+
+ vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]!
+ vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]!
+ ble 8f
+ vmov.u8 r12, d9[7]
+ vmov.8 d0[0], r12
+ vmov.u8 r12, d9[3]
+ vmov.8 d0[2], r12
+ b 2b
+8:
+ subs r4, r4, #2
+
+ ble 9f
+ sub r8, r6, lr
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ b 1b
+9:
+ vpop {q4-q5}
+ pop {r4-r8, pc}
+endfunc
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint16_t *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ vld1.16 {q0}, [r2, :128]
+ clz lr, r4
+ adr r12, L(pal_pred_tbl)
+ sub lr, lr, #25
+ ldr lr, [r12, lr, lsl #2]
+ vmovn.i16 d0, q0
+ add r12, r12, lr
+ add r2, r0, r1
+ bx r12
+
+ .align 2
+L(pal_pred_tbl):
+ .word 640f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 320f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 160f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 80f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 40f - L(pal_pred_tbl) + CONFIG_THUMB
+
+40:
+ lsl r1, r1, #1
+4:
+ vld1.8 {q1}, [r3, :128]!
+ subs r5, r5, #4
+ vtbl.8 d2, {d0}, d2
+ vtbl.8 d3, {d0}, d3
+ vst1.32 {d2[0]}, [r0, :32], r1
+ vst1.32 {d2[1]}, [r2, :32], r1
+ vst1.32 {d3[0]}, [r0, :32], r1
+ vst1.32 {d3[1]}, [r2, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ lsl r1, r1, #1
+8:
+ vld1.8 {q1, q2}, [r3, :128]!
+ subs r5, r5, #4
+ vtbl.8 d2, {d0}, d2
+ vtbl.8 d3, {d0}, d3
+ vst1.8 {d2}, [r0, :64], r1
+ vtbl.8 d4, {d0}, d4
+ vst1.8 {d3}, [r2, :64], r1
+ vtbl.8 d5, {d0}, d5
+ vst1.8 {d4}, [r0, :64], r1
+ vst1.8 {d5}, [r2, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ lsl r1, r1, #1
+16:
+ vld1.8 {q8, q9}, [r3, :128]!
+ subs r5, r5, #4
+ vld1.8 {q10, q11}, [r3, :128]!
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vst1.8 {q9}, [r2, :128], r1
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10}, [r0, :128], r1
+ vst1.8 {q11}, [r2, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ lsl r1, r1, #1
+32:
+ vld1.8 {q8, q9}, [r3, :128]!
+ subs r5, r5, #2
+ vld1.8 {q10, q11}, [r3, :128]!
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r2, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #32
+64:
+ vld1.8 {q8, q9}, [r3, :128]!
+ subs r5, r5, #1
+ vld1.8 {q10, q11}, [r3, :128]!
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128]!
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
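The vtbl.8 lanes above are a plain palette lookup: every index byte selects one entry of the palette that was narrowed to 8 bits at the top of the function. A hypothetical scalar sketch of that mapping, following the signature in the comment:

#include <stddef.h>
#include <stdint.h>

// dst[x] = pal[idx[x]] for a w x h block, one index byte per pixel,
// which is what the NEON table lookups above perform 8 or 16 pixels at a time.
static void example_pal_pred(uint8_t *dst, const ptrdiff_t stride,
                             const uint16_t *const pal, const uint8_t *idx,
                             const int w, const int h) {
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = (uint8_t) pal[idx[x]];
        idx += w;
        dst += stride;
    }
}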
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ ldr r6, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_cfl_128_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vmov.i16 q0, #128 // dc
+ vdup.i16 q1, r6 // alpha
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_splat_w4):
+ vld1.16 {q2, q3}, [r5, :128]!
+ vmul.i16 q2, q2, q1 // diff = ac * alpha
+ vmul.i16 q3, q3, q1
+ vshr.s16 q8, q2, #15 // sign = diff >> 15
+ vshr.s16 q9, q3, #15
+ vadd.i16 q2, q2, q8 // diff + sign
+ vadd.i16 q3, q3, q9
+ vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q3, q3, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d5, q3
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vst1.32 {d4[1]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d5[0]}, [r0, :32], r1
+ vst1.32 {d5[1]}, [r6, :32], r1
+ bgt L(ipred_cfl_splat_w4)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w8):
+ vld1.16 {q8, q9}, [r5, :128]!
+ vld1.16 {q10, q11}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vst1.8 {d16}, [r0, :64], r1
+ vst1.8 {d17}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d18}, [r0, :64], r1
+ vst1.8 {d19}, [r6, :64], r1
+ bgt L(ipred_cfl_splat_w8)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w16):
+ add r12, r5, r3, lsl #1
+ sub r1, r1, r3
+ mov lr, r3
+1:
+ vld1.16 {q8, q9}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vld1.16 {q10, q11}, [r12, :128]!
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ subs r3, r3, #16
+ vst1.16 {q8}, [r0, :128]!
+ vst1.16 {q9}, [r6, :128]!
+ bgt 1b
+ subs r4, r4, #2
+ add r5, r5, lr, lsl #1
+ add r12, r12, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ bgt 1b
+ pop {r4-r8, pc}
+endfunc
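Each splat iteration above follows the same per-sample arithmetic spelled out in its comments: diff = ac * alpha, a sign correction, a rounded shift by 6, then dc is added and the result clipped to 8 bits. A hedged scalar restatement of one sample (hypothetical function name):

#include <stdint.h>

// dst = iclip_pixel(dc + apply_sign()) with diff = ac * alpha, written the way
// the NEON code computes it: add the sign, then rounding-shift by 6.
static uint8_t example_cfl_sample(const int dc, const int16_t ac, const int alpha) {
    const int diff = alpha * ac;
    const int sign = diff < 0 ? -1 : 0;            // "diff >> 15" on the 16-bit lanes
    const int val  = dc + ((diff + sign + 32) >> 6);
    return val < 0 ? 0 : val > 255 ? 255 : (uint8_t) val;
}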
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ ldr r6, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_cfl_top_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r2, r2, #1
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_cfl_top_tbl):
+ .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4:
+ vld1.32 {d0[]}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ vld1.8 {d0}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ vld1.8 {q0}, [r2]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ vld1.8 {q2, q3}, [r2]
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #5
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ ldr r6, [sp, #32]
+ sub r2, r2, r4
+ clz lr, r3
+ clz r8, r4
+ adr r12, L(ipred_cfl_splat_tbl)
+ adr r7, L(ipred_cfl_left_tbl)
+ sub lr, lr, #26
+ sub r8, r8, #26
+ ldr lr, [r12, lr, lsl #2]
+ ldr r8, [r7, r8, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r7, r7, r8
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r7
+
+ .align 2
+L(ipred_cfl_left_tbl):
+ .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+ vld1.32 {d0[]}, [r2, :32]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h8):
+ vld1.8 {d0}, [r2, :64]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h16):
+ vld1.8 {q0}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h32):
+ vld1.8 {q2, q3}, [r2, :128]
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #5
+ vdup.16 q0, d0[0]
+ bx r12
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ ldr r6, [sp, #32]
+ sub r2, r2, r4
+ add r8, r3, r4 // width + height
+ vdup.16 q1, r6 // alpha
+ clz lr, r3
+ clz r6, r4
+ vdup.16 d16, r8 // width + height
+ adr r7, L(ipred_cfl_tbl)
+ rbit r8, r8 // rbit(width + height)
+ sub lr, lr, #22 // 26 leading bits, minus table offset 4
+ sub r6, r6, #26
+ clz r8, r8 // ctz(width + height)
+ ldr lr, [r7, lr, lsl #2]
+ ldr r6, [r7, r6, lsl #2]
+ neg r8, r8 // -ctz(width + height)
+ add r12, r7, lr
+ add r7, r7, r6
+ vshr.u16 d16, d16, #1 // (width + height) >> 1
+ vdup.16 d17, r8 // -ctz(width + height)
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r7
+
+ .align 2
+L(ipred_cfl_tbl):
+ .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_h4):
+ vld1.32 {d0[]}, [r2, :32]!
+ vpaddl.u8 d0, d0
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w4):
+ add r2, r2, #1
+ vld1.32 {d1[]}, [r2]
+ vadd.i16 d0, d0, d16
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1
+ cmp r4, #4
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 8/16
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ cmp r4, #16
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ vld1.8 {d0}, [r2, :64]!
+ vpaddl.u8 d0, d0
+ vpadd.i16 d0, d0
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w8):
+ add r2, r2, #1
+ vld1.8 {d1}, [r2]
+ vadd.i16 d0, d0, d16
+ vpaddl.u8 d1, d1
+ vpadd.i16 d1, d1
+ vpadd.i16 d1, d1
+ cmp r4, #8
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ vld1.8 {q0}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w16):
+ add r2, r2, #1
+ vld1.8 {q2}, [r2]
+ vadd.i16 d0, d0, d16
+ vaddl.u8 q2, d4, d5
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4
+ cmp r4, #16
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ vld1.8 {q2, q3}, [r2, :128]!
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w32):
+ add r2, r2, #1
+ vld1.8 {q2, q3}, [r2]
+ vadd.i16 d0, d0, d16
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q2, q2, q3
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4
+ cmp r4, #32
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_420_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_tbl):
+ .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d2}, [r12, :64], r2
+ vld1.8 {d1}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #2
+ vst1.16 {q0}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d1
+ vmov d2, d1
+ vmov d3, d1
+L(ipred_cfl_ac_420_w4_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q8, q8, q1
+ bgt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ vadd.i16 q0, q8, q9
+ vadd.i16 q1, q10, q11
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
+ vadd.i32 q0, q1
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+L(ipred_cfl_ac_420_w4_subtract_dc):
+6: // Subtract dc from ac
+ vld1.16 {q0, q1}, [r0, :128]
+ subs r6, r6, #4
+ vsub.i16 q0, q0, q8
+ vsub.i16 q1, q1, q8
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 6b
+ pop {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q2, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d1}, [r1, :64], r2
+ vld1.16 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+
+L(ipred_cfl_ac_420_w8_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ adr r7, L(ipred_cfl_ac_420_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_w16_tbl):
+ .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q12, q13}, [r1, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q12, q12
+ vpaddl.u8 q13, q13
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ vshl.i16 q2, q12, #1
+ vshl.i16 q3, q13, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16]
+ vld1.8 {q0}, [r1, :128], r2
+ vldr d6, [r12, #16]
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vldr d26, [r1, #16]
+ vpaddl.u8 q0, q0
+ vld1.8 {q12}, [r1, :128], r2
+ vpaddl.u8 d6, d6
+ vldr d30, [r12, #16]
+ vpaddl.u8 q2, q2
+ vld1.8 {q14}, [r12, :128], r2
+ vpaddl.u8 d26, d26
+ vpaddl.u8 q12, q12
+ vpaddl.u8 d30, d30
+ vpaddl.u8 q14, q14
+ vadd.i16 d2, d2, d6
+ vadd.i16 q0, q0, q2
+ vadd.i16 d26, d26, d30
+ vadd.i16 q12, q12, q14
+ vshl.i16 d2, d2, #1
+ vshl.i16 q0, q0, #1
+ vshl.i16 d6, d26, #1
+ vshl.i16 q2, q12, #1
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q2, q2, #1
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d4}, [r1, :64], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {d5}, [r12, :64], r2
+ vpaddl.u8 q2, q2
+ vadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d5
+ vshl.i16 d0, d0, #1
+ vshl.i16 d4, d4, #1
+ vdup.16 q1, d0[3]
+ vdup.16 q3, d4[3]
+ vdup.16 d1, d0[3]
+ vdup.16 d5, d4[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
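The 4:2:0 AC path above sums each 2x2 luma block, doubles it, keeps running totals in q8..q11, and finally subtracts the rounded block mean via the vrshl by -log2sz. A hypothetical scalar outline of those steps, with the w_pad/h_pad replication paths omitted for brevity (it also assumes power-of-two cw/ch and the GCC/Clang __builtin_ctz):

#include <stddef.h>
#include <stdint.h>

// Subsample 4:2:0 luma into AC coefficients, then make the block zero-mean.
static void example_cfl_ac_420(int16_t *const ac, const uint8_t *ypx,
                               const ptrdiff_t stride, const int cw, const int ch) {
    int sum = 0;
    int16_t *dst = ac;
    for (int y = 0; y < ch; y++) {
        for (int x = 0; x < cw; x++) {
            const int s = ypx[2 * x] + ypx[2 * x + 1] +
                          ypx[2 * x + stride] + ypx[2 * x + stride + 1];
            dst[x] = (int16_t) (s << 1);   // 4:2:0 scaling, matching the vshl #1 above
            sum   += dst[x];
        }
        dst += cw;
        ypx += 2 * stride;
    }
    const int log2sz = __builtin_ctz(cw) + __builtin_ctz(ch);
    const int avg = (sum + (1 << (log2sz - 1))) >> log2sz;   // rounded mean
    for (int i = 0; i < cw * ch; i++)
        ac[i] -= (int16_t) avg;                              // subtract dc from ac
}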
+
+// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_422_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_tbl):
+ .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vdup.16 d7, d3[3]
+ vmov d6, d3
+ vdup.16 d5, d2[3]
+ vmov d4, d2
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ adr r7, L(ipred_cfl_ac_422_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_w16_tbl):
+ .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16]
+ vld1.8 {q0}, [r1, :128], r2
+ vldr d6, [r12, #16]
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 d6, d6
+ vpaddl.u8 q2, q2
+ vshl.i16 d2, d2, #2
+ vshl.i16 q0, q0, #2
+ vshl.i16 d6, d6, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+ vshl.i16 q0, q0, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vshl.i16 q0, q0, #2
+ vdup.16 q3, d1[3]
+ vdup.16 q1, d0[3]
+ vdup.16 d5, d1[3]
+ vmov d4, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+endfunc
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
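+// Rough C model of the 4:4:4 AC path (a hedged sketch, not dav1d's reference
+// code): with no chroma subsampling each luma sample is only scaled, the
+// right/bottom padding is filled by replicating the last column/row, and the
+// rounded block average is subtracted at the end:
+//
+//   ac[y][x] = ypx[y][x] << 3;                        // plus edge replication
+//   avg      = (sum + (1 << (log2sz - 1))) >> log2sz; // log2sz = log2(cw*ch)
+//   ac[y][x] -= avg;                                  // for every x, y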
+function ipred_cfl_ac_444_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_444_tbl)
+ sub r8, r8, #26
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_tbl):
+ .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ vld1.32 {d0[]}, [r1, :32], r2
+ vld1.32 {d0[1]}, [r12, :32], r2
+ vld1.32 {d2[]}, [r1, :32], r2
+ vld1.32 {d2[1]}, [r12, :32], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q1, d2, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d4}, [r1, :64], r2
+ vshll.u8 q0, d0, #3
+ vld1.16 {d6}, [r12, :64], r2
+ vshll.u8 q1, d2, #3
+ vshll.u8 q2, d4, #3
+ vshll.u8 q3, d6, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ vld1.8 {q1}, [r1, :128], r2
+ vld1.8 {q3}, [r12, :128], r2
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d4}, [r12, :64], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q2, d4, #3
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr r7, L(ipred_cfl_ac_444_w32_tbl)
+ ldr r3, [r7, r3, lsl #1] // (r3>>1) << 2
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+ .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1: // Copy and expand input, without padding
+ vld1.8 {q2, q3}, [r1, :128], r2
+ vld1.8 {q13, q14}, [r12, :128], r2
+ vshll.u8 q0, d4, #3
+ vshll.u8 q1, d5, #3
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3
+ vshll.u8 q1, d29, #3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1: // Copy and expand input, padding 8
+ vldr d4, [r1, #16]
+ vld1.8 {q1}, [r1, :128], r2
+ vldr d28, [r12, #16]
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q2, d4, #3
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vdup.16 q1, d1[3]
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ vld1.8 {q1}, [r1, :128], r2
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ vdup.16 q2, d3[3]
+ vdup.16 q3, d3[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q0, d27[3]
+ vdup.16 q1, d27[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d24}, [r12, :64], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q12, d24, #3
+ subs r8, r8, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q2, d1[3]
+ vdup.16 q3, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q13, d25[3]
+ vdup.16 q0, d25[3]
+ vdup.16 q1, d25[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+
+L(ipred_cfl_ac_444_w32_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #1
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl r6, r6, #3
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w8_calc_subtract_dc.
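+ // (Presumably because, at 4:4:4 with the <<3 scaling, a 16-bit lane can
+ // already reach 32 * (255 << 3) = 65280; summing the four accumulators in
+ // 16-bit as that helper does could overflow, so they are widened first.)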
+ vpaddl.u16 q0, q8
+ vpaddl.u16 q1, q9
+ vpaddl.u16 q2, q10
+ vpaddl.u16 q3, q11
+ vadd.i32 q0, q0, q1
+ vadd.i32 q2, q2, q3
+ vadd.i32 q0, q0, q2
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
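+ // The shared 420 epilogue subtracts this rounded mean from every stored
+ // coefficient, leaving a zero-mean AC block for the CfL prediction.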
+ b L(ipred_cfl_ac_420_w4_subtract_dc)
+endfunc
--- /dev/null
+++ b/src/arm/32/mc16.S
@@ -1,0 +1,274 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q2, q3}, [r3, :128]!
+ vqadd.s16 q0, q0, q2
+ vqadd.s16 q1, q1, q3
+ vmax.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vmax.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vshl.s16 \d0, q0, q13 // -(intermediate_bits+1)
+ vshl.s16 \d1, q1, q13 // -(intermediate_bits+1)
+.endm
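+// In rough C terms the avg macro above computes (a hedged sketch; tmp1/tmp2
+// are assumed to hold (pixel << intermediate_bits) - PREP_BIAS from prep):
+//   v      = tmp1[x] + tmp2[x] + 2*PREP_BIAS + (1 << intermediate_bits);
+//   dst[x] = v >> (intermediate_bits + 1);
+// with the clip to [0, bitdepth_max] falling out of the saturating adds/subs.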
+
+.macro w_avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q2, q3}, [r3, :128]!
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q4
+ vmul.s32 \d1, \d1, q4
+ vmul.s32 q1, q1, q4
+ vshr.s32 \d0, \d0, #4
+ vshr.s32 q0, q0, #4
+ vshr.s32 \d1, \d1, #4
+ vshr.s32 q1, q1, #4
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
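+// Hedged sketch of the w_avg macro above (q4 holds the negated weight, set
+// up in bidir_fn below):
+//   wavg   = tmp2[x] + (((tmp1[x] - tmp2[x]) * weight) >> 4)
+//          ~ (tmp1[x]*weight + tmp2[x]*(16 - weight)) >> 4
+//   dst[x] = iclip(((wavg + rnd) >> intermediate_bits)
+//                  + (PREP_BIAS >> intermediate_bits), 0, bitdepth_max);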
+
+.macro mask d0, d00, d01, d1, d10, d11
+ vld1.8 {q7}, [r6, :128]!
+ vld1.16 {q0, q1}, [r2, :128]!
+ vneg.s8 q7, q7
+ vld1.16 {q2, q3}, [r3, :128]!
+ vmovl.s8 q6, d14
+ vmovl.s8 q7, d15
+ vmovl.s16 q4, d12
+ vmovl.s16 q5, d13
+ vmovl.s16 q6, d14
+ vmovl.s16 q7, d15
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q5
+ vmul.s32 \d1, \d1, q6
+ vmul.s32 q1, q1, q7
+ vshr.s32 \d0, \d0, #6
+ vshr.s32 q0, q0, #6
+ vshr.s32 \d1, \d1, #6
+ vshr.s32 q1, q1, #6
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
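+// mask works like w_avg but with a per-pixel weight in [0, 64] loaded from
+// the mask buffer (negated and widened above) and a >>6 instead of the >>4
+// (a hedged reading of the code, not the reference formula).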
+
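+// Assumed C prototypes, mirroring the existing 8 bpc NEON versions (a hedged
+// sketch, not copied from dav1d's headers); the last argument is bitdepth_max:
+//   void dav1d_avg_16bpc_neon  (pixel *dst, ptrdiff_t stride, const int16_t *tmp1,
+//                               const int16_t *tmp2, int w, int h, int bitdepth_max);
+//   void dav1d_w_avg_16bpc_neon(..., int w, int h, int weight, int bitdepth_max);
+//   void dav1d_mask_16bpc_neon (..., int w, int h, const uint8_t *mask, int bitdepth_max);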
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+ push {r4-r7,lr}
+ ldr r4, [sp, #20]
+ ldr r5, [sp, #24]
+ ldr r6, [sp, #28]
+ clz r4, r4
+.ifnc \type, avg
+ ldr r7, [sp, #32]
+ vmov.i16 q14, #0
+ vdup.16 q15, r7 // bitdepth_max
+.endif
+.ifc \type, w_avg
+ vpush {q4}
+.endif
+.ifc \type, mask
+ vpush {q4-q7}
+.endif
+ clz r7, \bdmax
+ sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18
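+ // i.e. 4 for 10 bpc (bitdepth_max 0x3ff) and 2 for 12 bpc (0xfff)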
+.ifc \type, avg
+ mov lr, #1
+ movw r12, #2*PREP_BIAS
+ lsl lr, lr, r7 // 1 << intermediate_bits
+ neg r12, r12 // -2*PREP_BIAS
+ add r7, r7, #1
+ sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits
+ neg r7, r7 // -(intermediate_bits+1)
+ vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vdup.16 q13, r7 // -(intermediate_bits+1)
+.else
+ mov r12, #PREP_BIAS
+ lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits
+ neg r7, r7 // -intermediate_bits
+ vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits
+ vdup.16 q13, r7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ vdup.32 q4, r6
+ vneg.s32 q4, q4
+.endif
+ adr r7, L(\type\()_tbl)
+ sub r4, r4, #24
+ \type q8, d16, d17, q9, d18, d19
+ ldr r4, [r7, r4, lsl #2]
+ add r7, r7, r4
+ bx r7
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_tbl) + CONFIG_THUMB
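+ // Indexed by clz(w) - 24: w = 128 selects the first entry, w = 4 the last.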
+
+40:
+ add r7, r0, r1
+ lsl r1, r1, #1
+4:
+ subs r5, r5, #4
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r7, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r7, :64], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 4b
+80:
+ add r7, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.16 {q8}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q9}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 8b
+160:
+16:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q10, q11}, [r0, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 16b
+320:
+ add r7, r0, #32
+32:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 32b
+640:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #64
+64:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 64b
+1280:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #192
+128:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 128b
+0:
+.ifc \type, mask
+ vpop {q4-q7}
+.endif
+.ifc \type, w_avg
+ vpop {q4}
+.endif
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+bidir_fn avg, r6
+bidir_fn w_avg, r7
+bidir_fn mask, r7
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -884,10 +884,10 @@
lsl x1, x1, #1
br x5
40:
- sub x2, x2, #4
- mov x7, #-4
ld1r {v6.2s}, [x8] // top
ld1r {v7.2s}, [x10] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
dup v5.16b, v6.b[3] // right
usubl v6.8h, v6.8b, v4.8b // top-bottom
uxtl v7.8h, v7.8b // weights_hor
@@ -922,10 +922,10 @@
b.gt 4b
ret
80:
- sub x2, x2, #4
- mov x7, #-4
ld1 {v6.8b}, [x8] // top
ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
dup v5.16b, v6.b[7] // right
usubl v6.8h, v6.8b, v4.8b // top-bottom
uxtl v7.8h, v7.8b // weights_hor
@@ -1460,12 +1460,14 @@
subs w3, w3, #16
sqrshrun v6.8b, v6.8h, #4
- ins v0.h[2], v2.h[7]
st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
- ins v0.b[0], v6.b[7]
st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
- ins v0.b[2], v6.b[3]
- b.gt 2b
+ b.le 8f
+ ins v0.h[2], v2.h[7]
+ ins v0.b[0], v6.b[7]
+ ins v0.b[2], v6.b[3]
+ b 2b
+8:
subs w4, w4, #2
b.le 9f
sub x8, x6, w9, uxtw
@@ -1815,7 +1817,7 @@
dup v16.8h, w8 // width + height
adr x7, L(ipred_cfl_tbl)
rbit w8, w8 // rbit(width + height)
- sub w9, w9, #22 // 22 leading bits, minus table offset 4
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
sub w6, w6, #26
clz w8, w8 // ctz(width + height)
ldrh w9, [x7, w9, uxtw #1]
@@ -2078,6 +2080,7 @@
sub x0, x0, w6, uxtw #4
urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
dup v4.8h, v4.h[0]
+L(ipred_cfl_ac_420_w8_subtract_dc):
6: // Subtract dc from ac
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
subs w6, w6, #4
@@ -2223,7 +2226,6 @@
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
- b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_420_w16_hpad):
cbz w4, 3f
@@ -2244,7 +2246,6 @@
// Double the height and reuse the w8 summing/subtracting
lsl w6, w6, #1
- lsl w9, w9, #1
b L(ipred_cfl_ac_420_w8_calc_subtract_dc)
L(ipred_cfl_ac_420_tbl):
@@ -2473,4 +2474,291 @@
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_444_tbl)
+ sub w8, w8, #26
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ ld1 {v0.s}[0], [x1], x2
+ ld1 {v0.s}[1], [x10], x2
+ ld1 {v1.s}[0], [x1], x2
+ ld1 {v1.s}[1], [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v1.8h, v1.8b, #3
+ subs w8, w8, #4
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x10], x2
+ ld1 {v2.8b}, [x1], x2
+ ushll v0.8h, v0.8b, #3
+ ld1 {v3.8b}, [x10], x2
+ ushll v1.8h, v1.8b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll v3.8h, v3.8b, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v2.16b}, [x10], x2
+ ld1 {v4.16b}, [x1], x2
+ ushll2 v1.8h, v0.16b, #3
+ ushll v0.8h, v0.8b, #3
+ ld1 {v6.16b}, [x10], x2
+ ushll2 v3.8h, v2.16b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll2 v5.8h, v4.16b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll2 v7.8h, v6.16b, #3
+ ushll v6.8h, v6.8b, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v2.8b}, [x10], x2
+ ld1 {v4.8b}, [x1], x2
+ ld1 {v6.8b}, [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll v6.8h, v6.8b, #3
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ dup v5.8h, v4.h[7]
+ dup v7.8h, v6.h[7]
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr x7, L(ipred_cfl_ac_444_w32_tbl)
+ ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1: // Copy and expand input, without padding
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ ld1 {v6.16b, v7.16b}, [x10], x2
+ ushll v0.8h, v2.8b, #3
+ ushll2 v1.8h, v2.16b, #3
+ ushll v2.8h, v3.8b, #3
+ ushll2 v3.8h, v3.16b, #3
+ ushll v4.8h, v6.8b, #3
+ ushll2 v5.8h, v6.16b, #3
+ ushll v6.8h, v7.8b, #3
+ ushll2 v7.8h, v7.16b, #3
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1: // Copy and expand input, padding 8
+ ldr d2, [x1, #16]
+ ld1 {v1.16b}, [x1], x2
+ ldr d6, [x10, #16]
+ ld1 {v5.16b}, [x10], x2
+ ushll v2.8h, v2.8b, #3
+ ushll v0.8h, v1.8b, #3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v6.8h, v6.8b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v3.8h, v2.h[7]
+ dup v7.8h, v6.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ ld1 {v1.16b}, [x1], x2
+ ld1 {v5.16b}, [x10], x2
+ ushll v0.8h, v1.8b, #3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v2.8h, v1.h[7]
+ dup v3.8h, v1.h[7]
+ dup v6.8h, v5.h[7]
+ dup v7.8h, v5.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v4.8b}, [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v4.8h, v4.8b, #3
+ dup v1.8h, v0.h[7]
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ dup v5.8h, v4.h[7]
+ dup v6.8h, v4.h[7]
+ dup v7.8h, v4.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the w8 subtracting
+ lsl w6, w6, #2
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w8_calc_subtract_dc.
+ uaddlp v0.4s, v16.8h
+ uaddlp v1.4s, v17.8h
+ uaddlp v2.4s, v18.8h
+ uaddlp v3.4s, v19.8h
+ add v0.4s, v0.4s, v1.4s
+ add v2.4s, v2.4s, v3.4s
+ add v0.4s, v0.4s, v2.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #4
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+ b L(ipred_cfl_ac_420_w8_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
endfunc
--- a/src/arm/64/ipred16.S
+++ b/src/arm/64/ipred16.S
@@ -920,10 +920,10 @@
lsl x1, x1, #1
br x5
40:
- sub x2, x2, #8
- mov x7, #-8
ld1r {v6.2d}, [x8] // top
ld1r {v7.2s}, [x10] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
dup v5.8h, v6.h[3] // right
sub v6.8h, v6.8h, v4.8h // top-bottom
uxtl v7.8h, v7.8b // weights_hor
@@ -963,10 +963,10 @@
b.gt 4b
ret
80:
- sub x2, x2, #8
- mov x7, #-8
ld1 {v6.8h}, [x8] // top
ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
dup v5.8h, v6.h[7] // right
sub v6.8h, v6.8h, v4.8h // top-bottom
uxtl v7.8h, v7.8b // weights_hor
@@ -2125,7 +2125,7 @@
dup v16.4s, w8 // width + height
adr x7, L(ipred_cfl_tbl)
rbit w8, w8 // rbit(width + height)
- sub w9, w9, #22 // 22 leading bits, minus table offset 4
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
sub w6, w6, #26
clz w8, w8 // ctz(width + height)
ldrh w9, [x7, w9, uxtw #1]
@@ -2398,7 +2398,6 @@
// Double the height and reuse the w4 summing/subtracting
lsl w6, w6, #1
- lsl w9, w9, #1
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
L(ipred_cfl_ac_420_w16):
@@ -2547,7 +2546,6 @@
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
- b L(ipred_cfl_ac_420_w16_hpad)
L(ipred_cfl_ac_420_w16_hpad):
cbz w4, 3f
@@ -2576,7 +2574,6 @@
// Quadruple the height and reuse the w4 summing/subtracting
lsl w6, w6, #2
- lsl w9, w9, #2
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
L(ipred_cfl_ac_420_tbl):
@@ -2831,4 +2828,249 @@
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
.hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
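+// Same structure as the 8 bpc version; note the <<3-scaled samples still fit
+// in int16 even at 12 bpc (4095 << 3 = 32760), but the per-lane sums no
+// longer fit in 16 bits, hence the 32-bit accumulators v24-v27.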
+function ipred_cfl_ac_444_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_444_tbl)
+ sub w8, w8, #26
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ ld1 {v0.4h}, [x1], x2
+ ld1 {v0.d}[1], [x10], x2
+ ld1 {v1.4h}, [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ ld1 {v3.8h}, [x10], x2
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v2.8h, v2.8h, #3
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr x7, L(ipred_cfl_ac_444_w32_tbl)
+ ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ lsr x2, x2, #1 // Restore the stride to one line increments
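+ // (The w32 loops below load a full 64-byte row per iteration through x1
+ // only, one row at a time, so the doubled two-row stride is undone here.)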
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2
+ shl v2.8h, v2.8h, #3
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ shl v1.8h, v1.8h, #3
+ shl v0.8h, v0.8h, #3
+ dup v2.8h, v1.h[7]
+ dup v3.8h, v1.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ ld1 {v0.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ dup v1.8h, v0.h[7]
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl w6, w6, #3
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
endfunc
--- a/src/arm/asm.S
+++ b/src/arm/asm.S
@@ -94,6 +94,8 @@
#ifdef __ELF__
.type EXTERN\name, %function
.hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
#endif
#if HAVE_AS_FUNC
.func EXTERN\name
@@ -129,6 +131,8 @@
.global EXTERN\name
#ifdef __ELF__
.hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
#endif
EXTERN\name:
.endif
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -46,6 +46,7 @@
decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
@@ -61,7 +62,6 @@
c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon);
c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon);
c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon);
-#if ARCH_AARCH64
c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon);
c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
@@ -75,8 +75,8 @@
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
c->pal_pred = BF(dav1d_pal_pred, neon);
-#endif
#endif
}
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -99,10 +99,12 @@
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+#endif
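+ // avg/w_avg/mask are now provided for 16 bpc on 32-bit ARM as well
+ // (arm/32/mc16.S), so they sit outside the 8 bpc / AArch64 guard.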
c->avg = BF(dav1d_avg, neon);
c->w_avg = BF(dav1d_w_avg, neon);
c->mask = BF(dav1d_mask, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
c->blend = BF(dav1d_blend, neon);
c->blend_h = BF(dav1d_blend_h, neon);
c->blend_v = BF(dav1d_blend_v, neon);
--- a/src/decode.c
+++ b/src/decode.c
@@ -776,10 +776,10 @@
signabs(t->warpmv.matrix[3]),
signabs(t->warpmv.matrix[4]),
signabs(t->warpmv.matrix[5]),
- signabs(t->warpmv.alpha),
- signabs(t->warpmv.beta),
- signabs(t->warpmv.gamma),
- signabs(t->warpmv.delta),
+ signabs(t->warpmv.u.p.alpha),
+ signabs(t->warpmv.u.p.beta),
+ signabs(t->warpmv.u.p.gamma),
+ signabs(t->warpmv.u.p.delta),
b->mv2d.y, b->mv2d.x);
#undef signabs
}
@@ -1849,10 +1849,10 @@
signabs(t->warpmv.matrix[3]),
signabs(t->warpmv.matrix[4]),
signabs(t->warpmv.matrix[5]),
- signabs(t->warpmv.alpha),
- signabs(t->warpmv.beta),
- signabs(t->warpmv.gamma),
- signabs(t->warpmv.delta),
+ signabs(t->warpmv.u.p.alpha),
+ signabs(t->warpmv.u.p.beta),
+ signabs(t->warpmv.u.p.gamma),
+ signabs(t->warpmv.u.p.delta),
b->mv[0].y, b->mv[0].x);
#undef signabs
if (f->frame_thread.pass) {
--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -1,5 +1,5 @@
;*****************************************************************************
-;* x86inc.asm: x264asm abstraction layer
+;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2020 x264 project
;*
@@ -21,23 +21,14 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************
-; This is a header file for the x264ASM assembly language, which uses
+; This is a header file for the x86inc.asm assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
-; DSP functions that are most often used in x264.
+; DSP functions that are most often used.
-; Unlike the rest of x264, this file is available under an ISC license, as it
-; has significant usefulness outside of x264 and we want it to be available
-; to the largest audience possible. Of course, if you modify it for your own
-; purposes to add a new feature, we strongly encourage contributing a patch
-; as this feature might be useful for others as well. Send patches or ideas
-; to x264-devel@videolan.org .
-
-%include "config.asm"
-
%ifndef private_prefix
- %define private_prefix dav1d
+ %error private_prefix not defined
%endif
%ifndef public_prefix
@@ -118,7 +109,7 @@
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
-; covers most of x264's asm.
+; covers most use cases.
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
@@ -1522,12 +1513,11 @@
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
-AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
-AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
-AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
@@ -1534,6 +1524,7 @@
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
@@ -1541,71 +1532,71 @@
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4 ; can't be emulated
AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pcmpestri, sse42
-AVX_INSTR pcmpestrm, sse42
-AVX_INSTR pcmpistri, sse42
-AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
-AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
-AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
-AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
-AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
-AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaxsb, sse4, 0, 0, 1
-AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
-AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
-AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
-AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
-AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
-AVX_INSTR pmovsxdq, sse4
-AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
-AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
-AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
@@ -1614,20 +1605,19 @@
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
-AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
-AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
-AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
-AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
-AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
-AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
-AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
@@ -1634,15 +1624,16 @@
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
-AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
-AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1
AVX_INSTR rcpss, sse, 1, 0, 0
@@ -1674,8 +1665,8 @@
; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 3dnow, 1, 0, 1
-AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
;%1 == instruction
;%2 == minimal instruction set
@@ -1740,9 +1731,9 @@
%endmacro
%endmacro
-FMA_INSTR pmacsww, pmullw, paddw
-FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
-FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
--- a/src/meson.build
+++ b/src/meson.build
@@ -147,6 +147,7 @@
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
+ 'arm/32/mc16.S',
)
endif
endif
--- a/src/obu.c
+++ b/src/obu.c
@@ -1201,7 +1201,6 @@
const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
const unsigned init_byte_pos = init_bit_pos >> 3;
- const unsigned pkt_bytelen = init_byte_pos + len;
// We must have read a whole number of bytes at this point (1 byte
// for the header and whole bytes at a time when reading the
@@ -1345,6 +1344,7 @@
// The current bit position is a multiple of 8 (because we
// just aligned it) and less than 8*pkt_bytelen because
// otherwise the overrun check would have fired.
+ const unsigned pkt_bytelen = init_byte_pos + len;
const unsigned bit_pos = dav1d_get_bits_pos(&gb);
assert((bit_pos & 7) == 0);
assert(pkt_bytelen >= (bit_pos >> 3));
@@ -1371,17 +1371,12 @@
const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
if (gb.error) goto error;
- Dav1dRef *ref;
- Dav1dContentLightLevel *content_light;
- Dav1dMasteringDisplay *mastering_display;
- Dav1dITUTT35 *itut_t35_metadata;
switch (meta_type) {
- case OBU_META_HDR_CLL:
- ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
+ case OBU_META_HDR_CLL: {
+ Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
if (!ref) return DAV1D_ERR(ENOMEM);
- content_light = ref->data;
- memset(content_light, 0, sizeof(*content_light));
+ Dav1dContentLightLevel *const content_light = ref->data;
content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
@@ -1398,11 +1393,11 @@
c->content_light = content_light;
c->content_light_ref = ref;
break;
+ }
case OBU_META_HDR_MDCV: {
- ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
+ Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
if (!ref) return DAV1D_ERR(ENOMEM);
- mastering_display = ref->data;
- memset(mastering_display, 0, sizeof(*mastering_display));
+ Dav1dMasteringDisplay *const mastering_display = ref->data;
for (int i = 0; i < 3; i++) {
mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
@@ -1450,9 +1445,9 @@
goto error;
}
- ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
+ Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
if (!ref) return DAV1D_ERR(ENOMEM);
- itut_t35_metadata = ref->data;
+ Dav1dITUTT35 *const itut_t35_metadata = ref->data;
// We need our public headers to be C++ compatible, so payload can't be
// a flexible array member
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -1084,11 +1084,11 @@
const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
const int dx = (int) (mvx >> 16) - 4;
- const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
- wmp->beta * 7) & ~0x3f;
+ const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
+ wmp->u.p.beta * 7) & ~0x3f;
const int dy = (int) (mvy >> 16) - 4;
- const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
- wmp->delta * 4) & ~0x3f;
+ const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
+ wmp->u.p.delta * 4) & ~0x3f;
const pixel *ref_ptr;
ptrdiff_t ref_stride = refp->p.stride[!!pl];
@@ -1110,10 +1110,10 @@
}
if (dst16 != NULL)
dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
- wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+ wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
else
dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
- wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+ wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
}
if (dst8) dst8 += 8 * PXSTRIDE(dstride);
else dst16 += 8 * dstride;
--- a/src/tables.c
+++ b/src/tables.c
@@ -689,7 +689,7 @@
{ 0, -1, 2, -4, -127, 3, -1, 0 }, { 0, 0, 1, -2, -128, 1, 0, 0 },
};
-const uint8_t dav1d_sm_weights[128] = {
+const uint8_t ALIGN(dav1d_sm_weights[128], 16) = {
// Unused, because we always offset by bs, which is at least 2.
0, 0,
// bs = 2
--- a/src/warpmv.c
+++ b/src/warpmv.c
@@ -82,21 +82,21 @@
if (mat[2] <= 0) return 1;
- wm->alpha = iclip_wmp(mat[2] - 0x10000);
- wm->beta = iclip_wmp(mat[3]);
+ wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
+ wm->u.p.beta = iclip_wmp(mat[3]);
int shift;
const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
const int rnd = (1 << shift) >> 1;
- wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
+ wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
- wm->delta = iclip_wmp(mat[5] -
+ wm->u.p.delta = iclip_wmp(mat[5] -
apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
0x10000);
- return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) ||
- (4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000);
+ return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
+ (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
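+ /* Hedged note: the u.p.* / u.abcd accesses in this patch assume the warp
+ * parameters were regrouped into something along the lines of
+ * union { struct { int16_t alpha, beta, gamma, delta; } p; int16_t abcd[4]; } u;
+ * (an assumption for illustration, not copied from dav1d's headers). */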
}
static int resolve_divisor_64(const uint64_t d, int *const shift) {
--- a/src/x86/cdef_avx2.asm
+++ b/src/x86/cdef_avx2.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
--- a/src/x86/cdef_avx512.asm
+++ b/src/x86/cdef_avx512.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if HAVE_AVX512ICL && ARCH_X86_64
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -24,32 +24,36 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
-%if ARCH_X86_32
-pb_0: times 16 db 0
-pb_0xFF: times 16 db 0xFF
-%endif
+%macro DUP8 1-*
+ %rep %0
+ times 8 db %1
+ %rotate 1
+ %endrep
+%endmacro
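+; DUP8 emits each argument eight times as bytes ("times 8 db x"); it is used
+; below to build the shift-emulation masks and weight tables without spelling
+; out every byte.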
+
+div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
+ dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
+ dw 168, 168, 140, 140, 120, 120, 105, 105
+ dw 420, 420, 210, 210, 140, 140, 105, 105
+ dw 105, 105, 105, 105, 105, 105, 105, 105
+shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_8: times 8 dw 8
pw_128: times 8 dw 128
pw_256: times 8 dw 256
pw_2048: times 8 dw 2048
-%if ARCH_X86_32
pw_0x7FFF: times 8 dw 0x7FFF
pw_0x8000: times 8 dw 0x8000
-%endif
-div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
- dd 420, 210, 140, 105, 105, 105, 105, 105
-div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
- dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
-shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
tap_table: ; masks for 8-bit shift emulation
- db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+ DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
; weights
- db 4, 2, 3, 3, 2, 1
+ DUP8 4, 2, 3, 3, 2, 1
; taps indices
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
@@ -75,59 +79,19 @@
%endif
%endmacro
-%macro SAVE_ARG 2 ; varname, argnum
- %define %1_stkloc [rsp+%2*gprsize]
- %define %1_argnum %2
- mov r2, r%2m
- mov %1_stkloc, r2
-%endmacro
-
-%macro LOAD_ARG 1-2 0 ; varname, load_to_varname_register
- %if %2 == 0
- mov r %+ %{1}_argnum, %1_stkloc
+%macro PMOVZXBW 2-3 0 ; %3 = half
+ %if cpuflag(sse4) && %3 == 0
+ pmovzxbw %1, %2
%else
- mov %1q, %1_stkloc
- %endif
-%endmacro
-
-%macro LOAD_ARG32 1-2 ; varname, load_to_varname_register
- %if ARCH_X86_32
- %if %0 == 1
- LOAD_ARG %1
+ %if %3 == 1
+ movd %1, %2
%else
- LOAD_ARG %1, %2
+ movq %1, %2
%endif
+ punpcklbw %1, m7
%endif
%endmacro
-%if ARCH_X86_32
- %define PIC_base_offset $$
- %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
-%else
- %define PIC_sym(sym) sym
-%endif
-
-%macro SAVE_PIC_REG 1
- %if ARCH_X86_32
- mov [esp+%1], PIC_reg
- %endif
-%endmacro
-
-%macro LOAD_PIC_REG 1
- %if ARCH_X86_32
- mov PIC_reg, [esp+%1]
- %endif
-%endmacro
-
-%macro PMOVZXBW 2-3 0 ; %3 = half
- %if %3 == 1
- movd %1, %2
- %else
- movq %1, %2
- %endif
- punpcklbw %1, m15
-%endmacro
-
%macro PSHUFB_0 2
%if cpuflag(ssse3)
pshufb %1, %2
@@ -138,34 +102,33 @@
%endif
%endmacro
-%macro LOAD_SEC_TAP 0
- %if ARCH_X86_64
- movd m3, [secq+kq]
- PSHUFB_0 m3, m15
- %else
- movd m2, [secq+kq] ; sec_taps
- pxor m3, m3
- PSHUFB_0 m2, m3
- %endif
+%macro MOVDDUP 2
+%if cpuflag(ssse3)
+ movddup %1, %2
+%else
+ movq %1, %2
+ punpcklqdq %1, %1
+%endif
%endmacro
-%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
; load p0/p1
- movsx offq, byte [dirq+kq+%1] ; off1
+ movsx offq, byte [dirq+kq+%1+14*8] ; off1
%if %6 == 4
- movq m5, [stkq+offq*2+%7*0] ; p0
- movhps m5, [stkq+offq*2+%7*1]
+ movq m5, [stkq+offq*2+32*0] ; p0
+ movhps m5, [stkq+offq*2+32*1]
%else
- movu m5, [stkq+offq*2+%7*0] ; p0
+ movu m5, [stkq+offq*2+32*0] ; p0
%endif
neg offq ; -off1
%if %6 == 4
- movq m6, [stkq+offq*2+%7*0] ; p1
- movhps m6, [stkq+offq*2+%7*1]
+ movq m6, [stkq+offq*2+32*0] ; p1
+ movhps m6, [stkq+offq*2+32*1]
%else
- movu m6, [stkq+offq*2+%7*0] ; p1
+ movu m6, [stkq+offq*2+32*0] ; p1
%endif
- %if cpuflag(sse4)
+ %if %7
+ %if cpuflag(sse4)
; out of bounds values are set to a value that is a both a large unsigned
; value and a negative signed value.
; use signed max and unsigned min to remove them
@@ -173,40 +136,26 @@
pminuw m8, m5
pmaxsw m7, m6
pminuw m8, m6
- %else
- %if ARCH_X86_64
- pcmpeqw m9, m14, m5
- pcmpeqw m10, m14, m6
- pandn m9, m5
- pandn m10, m6
- pmaxsw m7, m9 ; max after p0
- pminsw m8, m5 ; min after p0
- pmaxsw m7, m10 ; max after p1
- pminsw m8, m6 ; min after p1
%else
- pcmpeqw m9, m5, OUT_OF_BOUNDS_MEM
- pandn m9, m5
- pmaxsw m7, m9 ; max after p0
- pminsw m8, m5 ; min after p0
- pcmpeqw m9, m6, OUT_OF_BOUNDS_MEM
- pandn m9, m6
- pmaxsw m7, m9 ; max after p1
- pminsw m8, m6 ; min after p1
+ pcmpeqw m3, m14, m5
+ pminsw m8, m5 ; min after p0
+ pandn m3, m5
+ pmaxsw m7, m3 ; max after p0
+ pcmpeqw m3, m14, m6
+ pminsw m8, m6 ; min after p1
+ pandn m3, m6
+ pmaxsw m7, m3 ; max after p1
%endif
%endif
; accumulate sum[m13] over p0/p1
- psubw m5, m4 ; diff_p0(p0 - px)
- psubw m6, m4 ; diff_p1(p1 - px)
- packsswb m5, m6 ; convert pixel diff to 8-bit
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+ packsswb m5, m6 ; convert pixel diff to 8-bit
%if cpuflag(ssse3)
- %if ARCH_X86_64 && cpuflag(sse4)
- pshufb m5, m14 ; group diffs p0 and p1 into pairs
- %else
- pshufb m5, [PIC_sym(shufb_lohi)]
- %endif
+ pshufb m5, m13 ; group diffs p0 and p1 into pairs
pabsb m6, m5
- psignb m9, %5, m5
+ psignb m3, %5, m5
%else
movlhps m6, m5
punpckhbw m6, m5
@@ -214,111 +163,113 @@
pcmpgtb m5, m6
paddb m6, m5
pxor m6, m5
- paddb m9, %5, m5
- pxor m9, m5
+ paddb m3, %5, m5
+ pxor m3, m5
%endif
- %if ARCH_X86_64
- psrlw m10, m6, %2 ; emulate 8-bit shift
- pand m10, %3
- psubusb m5, %4, m10
- %else
- psrlw m5, m6, %2 ; emulate 8-bit shift
- pand m5, %3
- paddusb m5, %4
- pxor m5, [PIC_sym(pb_0xFF)]
- %endif
- pminub m5, m6 ; constrain(diff_p)
+ pand m9, %3, m6 ; emulate 8-bit shift
+ psrlw m9, %2
+ psubusb m5, %4, m9
+ pminub m5, m6 ; constrain(diff_p)
%if cpuflag(ssse3)
- pmaddubsw m5, m9 ; constrain(diff_p) * taps
+ pmaddubsw m5, m3 ; constrain(diff_p) * taps
%else
- psrlw m2, m5, 8
- psraw m6, m9, 8
+ psrlw m9, m5, 8
+ psraw m6, m3, 8
psllw m5, 8
- psllw m9, 8
- pmullw m2, m6
- pmulhw m5, m9
- paddw m5, m2
+ psllw m3, 8
+ pmullw m9, m6
+ pmulhw m5, m3
+ paddw m5, m9
%endif
- paddw m13, m5
+ paddw m0, m5
%endmacro
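
ACCUMULATE_TAP is the vector form of one CDEF tap pair: it loads the two pixels at +off and -off from the padded buffer, optionally folds them into the running min/max (the out-of-bounds sentinel is chosen so that signed max plus unsigned min, or a pcmpeqw mask on pre-SSE4.1, discards it), then adds constrain(p - px) * tap to the 16-bit sum, emulating the missing byte-wise shift with a mask plus a word shift. A hedged scalar sketch of the per-pixel math, following the AV1 constrain() definition (names are illustrative; the SIMD code handles 8 pixels, or 2 rows of 4, per call):

#include <stdlib.h>

/* Hedged scalar model of one ACCUMULATE_TAP invocation; the shift is the
 * precomputed max(0, damping - log2(strength)) and p0/p1 are the two pixels
 * at +/-off. Out-of-bounds handling and the 8-bit packing are omitted. */
static int constrain(int diff, int strength, int shift)
{
    const int a = abs(diff);
    int v = strength - (a >> shift);
    if (v < 0) v = 0;
    if (v > a) v = a;                 /* min(|diff|, ...) */
    return diff < 0 ? -v : v;         /* psignb           */
}

static void accumulate_tap(int px, int p0, int p1, int tap,
                           int strength, int shift, int *sum)
{
    *sum += tap * constrain(p0 - px, strength, shift);
    *sum += tap * constrain(p1 - px, strength, shift);
}
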
-%macro LOAD_BODY 4 ; dst, src, block_width, tmp_stride
+%macro LOAD_BODY 3 ; dst, src, block_width
%if %3 == 4
PMOVZXBW m0, [%2+strideq*0]
PMOVZXBW m1, [%2+strideq*1]
PMOVZXBW m2, [%2+strideq*2]
PMOVZXBW m3, [%2+stride3q]
+ mova [%1+32*0], m0
+ mova [%1+32*1], m1
+ mova [%1+32*2], m2
+ mova [%1+32*3], m3
%else
movu m0, [%2+strideq*0]
movu m1, [%2+strideq*1]
movu m2, [%2+strideq*2]
movu m3, [%2+stride3q]
- punpckhbw m4, m0, m15
- punpcklbw m0, m15
- punpckhbw m5, m1, m15
- punpcklbw m1, m15
- punpckhbw m6, m2, m15
- punpcklbw m2, m15
- punpckhbw m7, m3, m15
- punpcklbw m3, m15
+ punpcklbw m4, m0, m7
+ punpckhbw m0, m7
+ mova [%1+32*0+ 0], m4
+ mova [%1+32*0+16], m0
+ punpcklbw m4, m1, m7
+ punpckhbw m1, m7
+ mova [%1+32*1+ 0], m4
+ mova [%1+32*1+16], m1
+ punpcklbw m4, m2, m7
+ punpckhbw m2, m7
+ mova [%1+32*2+ 0], m4
+ mova [%1+32*2+16], m2
+ punpcklbw m4, m3, m7
+ punpckhbw m3, m7
+ mova [%1+32*3+ 0], m4
+ mova [%1+32*3+16], m3
%endif
- mova [%1+0*%4], m0
- mova [%1+1*%4], m1
- mova [%1+2*%4], m2
- mova [%1+3*%4], m3
- %if %3 == 8
- mova [%1+0*%4+2*8], m4
- mova [%1+1*%4+2*8], m5
- mova [%1+2*%4+2*8], m6
- mova [%1+3*%4+2*8], m7
- %endif
%endmacro
-%macro CDEF_FILTER 3 ; w, h, stride
-
- %if cpuflag(sse4)
- %define OUT_OF_BOUNDS 0x80008000
+%macro CDEF_FILTER_END 2 ; w, minmax
+ pxor m6, m6
+ pcmpgtw m6, m0
+ paddw m0, m6
+ %if cpuflag(ssse3)
+ pmulhrsw m0, m15
%else
- %define OUT_OF_BOUNDS 0x7FFF7FFF
+ paddw m0, m15
+ psraw m0, 4
%endif
+ paddw m4, m0
+ %if %2
+ pminsw m4, m7
+ pmaxsw m4, m8
+ %endif
+ packuswb m4, m4
+ %if %1 == 4
+ movd [dstq+strideq*0], m4
+ psrlq m4, 32
+ movd [dstq+strideq*1], m4
+ add stkq, 32*2
+ lea dstq, [dstq+strideq*2]
+ %else
+ movq [dstq], m4
+ add stkq, 32
+ add dstq, strideq
+ %endif
+%endmacro
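
CDEF_FILTER_END turns the accumulated sum into the output row: the pcmpgtw/paddw pair implements the spec's bias for negative sums, pmulhrsw with pw_2048 (or add-8-then-shift on SSE2) performs the >>4 rounding, and the min/max clamp is applied only when the macro's second argument is set, i.e. on the combined primary+secondary path. Roughly, per pixel (scalar sketch, names illustrative):

/* Scalar sketch of CDEF_FILTER_END for one pixel; mn/mx are the tracked
 * minimum/maximum and minmax mirrors the macro's second argument. */
static int cdef_filter_end_sketch(int px, int sum, int mn, int mx, int minmax)
{
    int y = px + ((8 + sum - (sum < 0)) >> 4);  /* round towards zero */
    if (minmax) {
        if (y < mn) y = mn;
        if (y > mx) y = mx;
    }
    return y < 0 ? 0 : y > 255 ? 255 : y;       /* packuswb */
}
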
+%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
-cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \
- dst, stride, left, top, pri, sec, stride3, dst4, edge
- pcmpeqw m14, m14
- %if cpuflag(sse4)
- psllw m14, 15 ; 0x8000
- %else
- psrlw m14, 1 ; 0x7FFF
- %endif
- pxor m15, m15
-
- %define px rsp+3*16+2*%3
+cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
+ dst, stride, left, top, pri, sec, edge, stride3, dst4
+ %define px rsp+3*16+2*32
+ %define base 0
%else
-cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
- dst, stride, left, top, stride3, dst4, edge
- SAVE_ARG left, 2
- SAVE_ARG top, 3
- SAVE_ARG pri, 4
- SAVE_ARG sec, 5
- SAVE_ARG dir, 6
- SAVE_ARG damping, 7
-
- %define PIC_reg r2
- LEA PIC_reg, PIC_base_offset
-
- %if cpuflag(sse4)
- %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)]
- %else
- %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)]
- %endif
-
- %define m15 [PIC_sym(pb_0)]
-
- %define px esp+7*16+2*%3
+cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
+ dst, stride, left, edge, stride3
+ %define topq r2
+ %define dst4q r2
+ LEA r5, tap_table
+ %define px esp+7*16+2*32
+ %define base r5-tap_table
%endif
-
mov edged, r8m
+ %if cpuflag(sse4)
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
+ %else
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
+ %endif
+ mova m6, OUT_OF_BOUNDS_MEM
+ pxor m7, m7
; prepare pixel buffers - body/right
%if %2 == 8
@@ -325,11 +276,11 @@
lea dst4q, [dstq+strideq*4]
%endif
lea stride3q, [strideq*3]
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
jz .no_right
- LOAD_BODY px, dstq, %1, %3
+ LOAD_BODY px, dstq, %1
%if %2 == 8
- LOAD_BODY px+4*%3, dst4q, %1, %3
+ LOAD_BODY px+4*32, dst4q, %1
%endif
jmp .body_done
.no_right:
@@ -337,39 +288,37 @@
PMOVZXBW m1, [dstq+strideq*1], %1 == 4
PMOVZXBW m2, [dstq+strideq*2], %1 == 4
PMOVZXBW m3, [dstq+stride3q ], %1 == 4
+ mova [px+32*0], m0
+ mova [px+32*1], m1
+ mova [px+32*2], m2
+ mova [px+32*3], m3
+ movd [px+32*0+%1*2], m6
+ movd [px+32*1+%1*2], m6
+ movd [px+32*2+%1*2], m6
+ movd [px+32*3+%1*2], m6
%if %2 == 8
- PMOVZXBW m4, [dst4q+strideq*0], %1 == 4
- PMOVZXBW m5, [dst4q+strideq*1], %1 == 4
- PMOVZXBW m6, [dst4q+strideq*2], %1 == 4
- PMOVZXBW m7, [dst4q+stride3q ], %1 == 4
+ PMOVZXBW m0, [dst4q+strideq*0], %1 == 4
+ PMOVZXBW m1, [dst4q+strideq*1], %1 == 4
+ PMOVZXBW m2, [dst4q+strideq*2], %1 == 4
+ PMOVZXBW m3, [dst4q+stride3q ], %1 == 4
+ mova [px+32*4], m0
+ mova [px+32*5], m1
+ mova [px+32*6], m2
+ mova [px+32*7], m3
+ movd [px+32*4+%1*2], m6
+ movd [px+32*5+%1*2], m6
+ movd [px+32*6+%1*2], m6
+ movd [px+32*7+%1*2], m6
%endif
- mova [px+0*%3], m0
- mova [px+1*%3], m1
- mova [px+2*%3], m2
- mova [px+3*%3], m3
- %if %2 == 8
- mova [px+4*%3], m4
- mova [px+5*%3], m5
- mova [px+6*%3], m6
- mova [px+7*%3], m7
- mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS
- %endif
- mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS
.body_done:
; top
- LOAD_ARG32 top
- test edged, 4 ; have_top
+ movifnidn topq, r3mp
+ test edgeb, 4 ; have_top
jz .no_top
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .top_no_left
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
jz .top_no_right
%if %1 == 4
PMOVZXBW m0, [topq+strideq*0-2]
@@ -377,39 +326,39 @@
%else
movu m0, [topq+strideq*0-4]
movu m1, [topq+strideq*1-4]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- movu [px-2*%3+8], m2
- movu [px-1*%3+8], m3
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px-32*2+8], m2
+ movu [px-32*1+8], m3
%endif
- movu [px-2*%3-%1], m0
- movu [px-1*%3-%1], m1
+ movu [px-32*2-%1], m0
+ movu [px-32*1-%1], m1
jmp .top_done
.top_no_right:
%if %1 == 4
PMOVZXBW m0, [topq+strideq*0-%1]
PMOVZXBW m1, [topq+strideq*1-%1]
- movu [px-2*%3-4*2], m0
- movu [px-1*%3-4*2], m1
+ movu [px-32*2-8], m0
+ movu [px-32*1-8], m1
%else
movu m0, [topq+strideq*0-%1]
movu m1, [topq+strideq*1-%2]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- mova [px-2*%3-8*2], m0
- mova [px-2*%3-0*2], m2
- mova [px-1*%3-8*2], m1
- mova [px-1*%3-0*2], m3
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px-32*2-16], m0
+ mova [px-32*2+ 0], m2
+ mova [px-32*1-16], m1
+ mova [px-32*1+ 0], m3
%endif
- mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
jmp .top_done
.top_no_left:
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
jz .top_no_left_right
%if %1 == 4
PMOVZXBW m0, [topq+strideq*0]
@@ -417,102 +366,92 @@
%else
movu m0, [topq+strideq*0]
movu m1, [topq+strideq*1]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- movd [px-2*%3+8*2], m2
- movd [px-1*%3+8*2], m3
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movd [px-32*2+16], m2
+ movd [px-32*1+16], m3
%endif
- mova [px-2*%3], m0
- mova [px-1*%3], m1
- mov dword [px-2*%3-4], OUT_OF_BOUNDS
- mov dword [px-1*%3-4], OUT_OF_BOUNDS
+ movd [px-32*2- 4], m6
+ movd [px-32*1- 4], m6
+ mova [px-32*2+ 0], m0
+ mova [px-32*1+ 0], m1
jmp .top_done
.top_no_left_right:
PMOVZXBW m0, [topq+strideq*0], %1 == 4
PMOVZXBW m1, [topq+strideq*1], %1 == 4
- mova [px-2*%3], m0
- mova [px-1*%3], m1
- mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px-2*%3-4], OUT_OF_BOUNDS
- mov dword [px-1*%3-4], OUT_OF_BOUNDS
+ movd [px-32*2-4], m6
+ movd [px-32*1-4], m6
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
jmp .top_done
.no_top:
- %if ARCH_X86_64
- SWAP m0, m14
- %else
- mova m0, OUT_OF_BOUNDS_MEM
- %endif
- movu [px-2*%3-4], m0
- movu [px-1*%3-4], m0
+ movu [px-32*2- 4], m6
+ movu [px-32*1- 4], m6
%if %1 == 8
- movq [px-2*%3+12], m0
- movq [px-1*%3+12], m0
+ movq [px-32*2+12], m6
+ movq [px-32*1+12], m6
%endif
- %if ARCH_X86_64
- SWAP m0, m14
- %endif
.top_done:
; left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
- SAVE_PIC_REG 0
- LOAD_ARG32 left
+ movifnidn leftq, leftmp
%if %2 == 4
movq m0, [leftq]
%else
movu m0, [leftq]
%endif
- LOAD_PIC_REG 0
%if %2 == 4
- punpcklbw m0, m15
+ punpcklbw m0, m7
%else
- punpckhbw m1, m0, m15
- punpcklbw m0, m15
+ punpckhbw m1, m0, m7
+ punpcklbw m0, m7
movhlps m3, m1
- movd [px+4*%3-4], m1
- movd [px+6*%3-4], m3
+ movd [px+32*4-4], m1
+ movd [px+32*6-4], m3
psrlq m1, 32
psrlq m3, 32
- movd [px+5*%3-4], m1
- movd [px+7*%3-4], m3
+ movd [px+32*5-4], m1
+ movd [px+32*7-4], m3
%endif
movhlps m2, m0
- movd [px+0*%3-4], m0
- movd [px+2*%3-4], m2
+ movd [px+32*0-4], m0
+ movd [px+32*2-4], m2
psrlq m0, 32
psrlq m2, 32
- movd [px+1*%3-4], m0
- movd [px+3*%3-4], m2
+ movd [px+32*1-4], m0
+ movd [px+32*3-4], m2
jmp .left_done
.no_left:
- mov dword [px+0*%3-4], OUT_OF_BOUNDS
- mov dword [px+1*%3-4], OUT_OF_BOUNDS
- mov dword [px+2*%3-4], OUT_OF_BOUNDS
- mov dword [px+3*%3-4], OUT_OF_BOUNDS
+ movd [px+32*0-4], m6
+ movd [px+32*1-4], m6
+ movd [px+32*2-4], m6
+ movd [px+32*3-4], m6
%if %2 == 8
- mov dword [px+4*%3-4], OUT_OF_BOUNDS
- mov dword [px+5*%3-4], OUT_OF_BOUNDS
- mov dword [px+6*%3-4], OUT_OF_BOUNDS
- mov dword [px+7*%3-4], OUT_OF_BOUNDS
+ movd [px+32*4-4], m6
+ movd [px+32*5-4], m6
+ movd [px+32*6-4], m6
+ movd [px+32*7-4], m6
%endif
.left_done:
; bottom
%if ARCH_X86_64
- DEFINE_ARGS dst, stride, dummy1, dst8, pri, sec, stride3, dummy2, edge
+ DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3
%else
- DEFINE_ARGS dst, stride, dummy1, dst8, stride3, dummy2, edge
+ DEFINE_ARGS dst, stride, dst8, edge, stride3
%endif
- test edged, 8 ; have_bottom
+ test edgeb, 8 ; have_bottom
jz .no_bottom
lea dst8q, [dstq+%2*strideq]
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .bottom_no_left
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
jz .bottom_no_right
%if %1 == 4
PMOVZXBW m0, [dst8q-(%1/2)]
@@ -520,40 +459,40 @@
%else
movu m0, [dst8q-4]
movu m1, [dst8q+strideq-4]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- movu [px+(%2+0)*%3+8], m2
- movu [px+(%2+1)*%3+8], m3
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px+32*(%2+0)+8], m2
+ movu [px+32*(%2+1)+8], m3
%endif
- movu [px+(%2+0)*%3-%1], m0
- movu [px+(%2+1)*%3-%1], m1
+ movu [px+32*(%2+0)-%1], m0
+ movu [px+32*(%2+1)-%1], m1
jmp .bottom_done
.bottom_no_right:
%if %1 == 4
PMOVZXBW m0, [dst8q-4]
PMOVZXBW m1, [dst8q+strideq-4]
- movu [px+(%2+0)*%3-4*2], m0
- movu [px+(%2+1)*%3-4*2], m1
+ movu [px+32*(%2+0)-8], m0
+ movu [px+32*(%2+1)-8], m1
%else
movu m0, [dst8q-8]
movu m1, [dst8q+strideq-8]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- mova [px+(%2+0)*%3-8*2], m0
- mova [px+(%2+0)*%3-0*2], m2
- mova [px+(%2+1)*%3-8*2], m1
- mova [px+(%2+1)*%3-0*2], m3
- mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS ; overwritten by first mova
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)-16], m0
+ mova [px+32*(%2+0)+ 0], m2
+ mova [px+32*(%2+1)-16], m1
+ mova [px+32*(%2+1)+ 0], m3
+ movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
%endif
- mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
jmp .bottom_done
.bottom_no_left:
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
jz .bottom_no_left_right
%if %1 == 4
PMOVZXBW m0, [dst8q]
@@ -561,233 +500,245 @@
%else
movu m0, [dst8q]
movu m1, [dst8q+strideq]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- mova [px+(%2+0)*%3+8*2], m2
- mova [px+(%2+1)*%3+8*2], m3
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)+16], m2
+ mova [px+32*(%2+1)+16], m3
%endif
- mova [px+(%2+0)*%3], m0
- mova [px+(%2+1)*%3], m1
- mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
jmp .bottom_done
.bottom_no_left_right:
PMOVZXBW m0, [dst8q+strideq*0], %1 == 4
PMOVZXBW m1, [dst8q+strideq*1], %1 == 4
- mova [px+(%2+0)*%3], m0
- mova [px+(%2+1)*%3], m1
- mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
jmp .bottom_done
.no_bottom:
- %if ARCH_X86_64
- SWAP m0, m14
- %else
- mova m0, OUT_OF_BOUNDS_MEM
- %endif
- movu [px+(%2+0)*%3-4], m0
- movu [px+(%2+1)*%3-4], m0
+ movu [px+32*(%2+0)- 4], m6
+ movu [px+32*(%2+1)- 4], m6
%if %1 == 8
- movq [px+(%2+0)*%3+12], m0
- movq [px+(%2+1)*%3+12], m0
+ movq [px+32*(%2+0)+12], m6
+ movq [px+32*(%2+1)+12], m6
%endif
- %if ARCH_X86_64
- SWAP m0, m14
- %endif
.bottom_done:
; actual filter
- DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp
%if ARCH_X86_64
- movifnidn prid, prim
- movifnidn secd, secm
- mov dampingd, r7m
+ DEFINE_ARGS dst, stride, pridmp, damping, pri, sec
+ mova m13, [shufb_lohi]
+ %if cpuflag(ssse3)
+ mova m15, [pw_2048]
%else
- LOAD_ARG pri
- LOAD_ARG sec
- LOAD_ARG damping, 1
+ mova m15, [pw_8]
%endif
-
- SAVE_PIC_REG 8
- mov pridmpd, prid
- mov secdmpd, secd
- or pridmpd, 1
- or secdmpd, 1
- bsr pridmpd, pridmpd
- bsr secdmpd, secdmpd
+ mova m14, m6
+ %else
+ DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
+ %xdefine m8 m1
+ %xdefine m9 m2
+ %xdefine m10 m0
+ %xdefine m13 [base+shufb_lohi]
+ %xdefine m14 OUT_OF_BOUNDS_MEM
+ %if cpuflag(ssse3)
+ %xdefine m15 [base+pw_2048]
+ %else
+ %xdefine m15 [base+pw_8]
+ %endif
+ %endif
+ movifnidn prid, r4m
+ movifnidn secd, r5m
+ mov dampingd, r7m
+ movif32 [esp+0x3C], r1d
+ test prid, prid
+ jz .sec_only
+ movd m1, prim
+ bsr pridmpd, prid
+ test secd, secd
+ jz .pri_only
+ movd m10, r5m
+ bsr secd, secd
+ and prid, 1
sub pridmpd, dampingd
- sub secdmpd, dampingd
+ sub secd, dampingd
xor dampingd, dampingd
+ add prid, prid
neg pridmpd
cmovs pridmpd, dampingd
- neg secdmpd
- cmovs secdmpd, dampingd
+ neg secd
+ cmovs secd, dampingd
+ PSHUFB_0 m1, m7
+ PSHUFB_0 m10, m7
%if ARCH_X86_64
- mov [rsp+ 0], pridmpq ; pri_shift
- mov [rsp+16], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, pridmp, tap, pri, sec
+ lea tapq, [tap_table]
+ MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask
+ MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask
+ mov [rsp+0x00], pridmpq ; pri_shift
+ mov [rsp+0x10], secq ; sec_shift
+ DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h
%else
+ MOVDDUP m2, [tapq+pridmpq*8]
+ MOVDDUP m3, [tapq+secq*8]
+ mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw
+ mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP
mov [esp+0x00], pridmpd
- mov [esp+0x30], secdmpd
- mov dword [esp+0x04], 0 ; zero upper 32 bits of psrlw
- mov dword [esp+0x34], 0 ; source operand in ACCUMULATE_TAP
- %define PIC_reg r4
- LOAD_PIC_REG 8
+ mov [esp+0x30], secd
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %define offq dstq
+ %define kd strided
+ %define kq strideq
+ mova [esp+0x10], m2
+ mova [esp+0x40], m3
+ mova [esp+0x20], m1
+ mova [esp+0x50], m10
%endif
-
- DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp
- lea tableq, [PIC_sym(tap_table)]
- %if ARCH_X86_64
- SWAP m2, m11
- SWAP m3, m12
+ mov dird, r6m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8] ; pri_taps
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.v_loop:
+ movif32 [esp+0x38], dstd
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0] ; px
%endif
- movd m2, [tableq+pridmpq]
- movd m3, [tableq+secdmpq]
- PSHUFB_0 m2, m15 ; pri_shift_mask
- PSHUFB_0 m3, m15 ; sec_shift_mask
+ pxor m0, m0 ; sum
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+.k_loop:
+ MOVDDUP m2, [priq+kq*8]
%if ARCH_X86_64
- SWAP m2, m11
- SWAP m3, m12
+ ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
+ ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
%else
- %define PIC_reg r6
- mov PIC_reg, r4
- DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp
- LOAD_ARG pri
- LOAD_ARG dir, 1
- mova [esp+0x10], m2
- mova [esp+0x40], m3
+ ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
%endif
+ dec kd
+ jge .k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 1
+ dec hd
+ jg .v_loop
+ RET
- ; pri/sec_taps[k] [4 total]
- DEFINE_ARGS dst, stride, dummy, tap, pri, sec
- movd m0, prid
- movd m1, secd
- %if ARCH_X86_64
- PSHUFB_0 m0, m15
- PSHUFB_0 m1, m15
+.pri_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero
+ lea tapq, [tap_table]
%else
- %if cpuflag(ssse3)
- pxor m2, m2
- %endif
- mova m3, [PIC_sym(pb_0xFF)]
- PSHUFB_0 m0, m2
- PSHUFB_0 m1, m2
- pxor m0, m3
- pxor m1, m3
- mova [esp+0x20], m0
- mova [esp+0x50], m1
+ DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
%endif
and prid, 1
- lea priq, [tapq+8+priq*2] ; pri_taps
- lea secq, [tapq+12] ; sec_taps
-
- %if ARCH_X86_64 && cpuflag(sse4)
- mova m14, [shufb_lohi]
- %endif
-
- ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
- DEFINE_ARGS dst, stride, dir, tap, pri, sec
+ xor zerod, zerod
+ sub dampingd, pridmpd
+ cmovs dampingd, zerod
+ add prid, prid
+ PSHUFB_0 m1, m7
+ MOVDDUP m7, [tapq+dampingq*8]
+ mov [rsp+0x00], dampingq
%if ARCH_X86_64
- mov dird, r6m
- lea dirq, [tapq+14+dirq*2]
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h
%else
- lea dird, [tapd+14+dird*2]
- DEFINE_ARGS dst, stride, dir, stk, pri, sec
- %define hd dword [esp+8]
- %define offq dstq
- %define kq strideq
+ mov [rsp+0x04], zerod
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
%endif
- mov hd, %1*%2*2/mmsize
+ mov dird, r6m
lea stkq, [px]
- movif32 [esp+0x3C], strided
-.v_loop:
+ lea priq, [tapq+8*8+priq*8]
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.pri_v_loop:
movif32 [esp+0x38], dstd
- mov kq, 1
+ mov kd, 1
%if %1 == 4
- movq m4, [stkq+%3*0]
- movhps m4, [stkq+%3*1]
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
%else
- mova m4, [stkq+%3*0] ; px
+ mova m4, [stkq+32*0]
%endif
+ pxor m0, m0
+.pri_k_loop:
+ MOVDDUP m2, [priq+kq*8]
+ ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .pri_k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .pri_v_loop
+ RET
- %if ARCH_X86_32
- %xdefine m9 m3
- %xdefine m13 m7
- %xdefine m7 m0
- %xdefine m8 m1
- %endif
-
- pxor m13, m13 ; sum
- mova m7, m4 ; max
- mova m8, m4 ; min
-.k_loop:
- movd m2, [priq+kq] ; pri_taps
+.sec_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero
+%else
+ DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
+%endif
+ movd m1, r5m
+ bsr secd, secd
+ mov dird, r6m
+ xor zerod, zerod
+ sub dampingd, secd
+ cmovs dampingd, zerod
+ PSHUFB_0 m1, m7
%if ARCH_X86_64
- PSHUFB_0 m2, m15
- %if cpuflag(ssse3)
- LOAD_SEC_TAP ; sec_taps
- %endif
- ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
- %if notcpuflag(ssse3)
- LOAD_SEC_TAP ; sec_taps
- %endif
- ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
- ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
+ lea tapq, [tap_table]
%else
- %if cpuflag(ssse3)
- pxor m3, m3
- %endif
- PSHUFB_0 m2, m3
- ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
- LOAD_SEC_TAP ; sec_taps
- ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
- %if notcpuflag(ssse3)
- LOAD_SEC_TAP ; sec_taps
- %endif
- ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+ mov [rsp+0x04], zerod
%endif
-
- dec kq
- jge .k_loop
-
- pxor m6, m6
- pcmpgtw m6, m13
- paddw m13, m6
- %if cpuflag(ssse3)
- pmulhrsw m13, [PIC_sym(pw_2048)]
+ mov [rsp+0x00], dampingq
+ MOVDDUP m7, [tapq+dampingq*8]
+ lea dirq, [tapq+dirq*2]
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h
%else
- paddw m13, [PIC_sym(pw_8)]
- psraw m13, 4
+ DEFINE_ARGS dst, stride, off, stk, dir, tap, h
%endif
- paddw m4, m13
- pminsw m4, m7
- pmaxsw m4, m8
- packuswb m4, m4
- movif32 dstd, [esp+0x38]
- movif32 strided, [esp+0x3C]
+ lea stkq, [px]
+ mov hd, %1*%2/8
+.sec_v_loop:
+ mov kd, 1
%if %1 == 4
- movd [dstq+strideq*0], m4
- psrlq m4, 32
- movd [dstq+strideq*1], m4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
%else
- movq [dstq], m4
+ mova m4, [stkq+32*0]
%endif
-
- %if %1 == 4
- %define vloop_lines (mmsize/(%1*2))
- lea dstq, [dstq+strideq*vloop_lines]
- add stkq, %3*vloop_lines
- %else
- lea dstq, [dstq+strideq]
- add stkq, %3
+ pxor m0, m0
+.sec_k_loop:
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
+ %if ARCH_X86_32
+ MOVDDUP m2, [tapq+12*8+kq*8]
%endif
+ ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .sec_k_loop
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
dec hd
- jg .v_loop
-
+ jg .sec_v_loop
RET
%endmacro
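
A note on the intermediate buffer this macro builds on the stack (a simplified view; the real code copies whichever top/left/bottom/right neighbours the edge flags make available): pixels are widened to 16 bits and stored with a fixed 32-byte row stride, with two guard rows above and below and two guard columns on each side, and missing neighbours filled with the OUT_OF_BOUNDS sentinel (0x8000 with SSE4.1, 0x7FFF otherwise). A minimal sketch assuming no neighbouring edges are available:

#include <stdint.h>

#define TMP_STRIDE 16                   /* 32 bytes per 16-bit row */
enum { OUT_OF_BOUNDS = 0x8000 };        /* 0x7FFF on pre-SSE4.1    */

/* px points at row 0, column 0 of a buffer with 2 guard rows/columns on
 * every side; the guards are filled with the sentinel here for brevity. */
static void pad_tmp_sketch(uint16_t *px, const uint8_t *src, int stride,
                           int w, int h)
{
    for (int y = -2; y < h + 2; y++)
        for (int x = -2; x < w + 2; x++)
            px[y * TMP_STRIDE + x] = OUT_OF_BOUNDS;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++)
            px[y * TMP_STRIDE + x] = src[y * stride + x];
}
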
@@ -1079,18 +1030,16 @@
shr r1d, 10
mov [varq], r1d
%else
-cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
- %define PIC_reg r4
- LEA PIC_reg, PIC_base_offset
-
+cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
+%define base r2-shufw_6543210x
+ LEA r2, shufw_6543210x
pxor m0, m0
- mova m1, [PIC_sym(pw_128)]
-
lea stride3q, [strideq*3]
movq m5, [srcq+strideq*0]
movhps m5, [srcq+strideq*1]
movq m7, [srcq+strideq*2]
movhps m7, [srcq+stride3q]
+ mova m1, [base+pw_128]
psadbw m2, m5, m0
psadbw m3, m7, m0
packssdw m2, m3
@@ -1143,7 +1092,7 @@
pmaddwd m0, m0
phaddd m2, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
+ MULLD m2, [base+div_table%+SUFFIX+48]
mova [esp+0x30], m2
mova m1, [esp+0x10]
@@ -1176,13 +1125,13 @@
paddw m0, m2 ; partial_sum_diag[0][0-7]
paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
mova m3, [esp+0x50]
- pshufb m1, [PIC_sym(shufw_6543210x)]
+ pshufb m1, [base+shufw_6543210x]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
paddd m0, m2 ; cost[0a-d]
mova [esp+0x40], m0
@@ -1217,13 +1166,13 @@
paddw m0, m2 ; partial_sum_diag[1][0-7]
paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
mova m3, [esp+0x50]
- pshufb m1, [PIC_sym(shufw_6543210x)]
+ pshufb m1, [base+shufw_6543210x]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
paddd m0, m2 ; cost[4a-d]
phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
phaddd m1, [esp+0x30] ; cost[0,4,2,6]
@@ -1259,8 +1208,8 @@
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
paddd m0, m2 ; cost[7a-d]
mova [esp+0x40], m0
@@ -1280,8 +1229,8 @@
punpcklwd m0, m2
pmaddwd m7, m7
pmaddwd m0, m0
- MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
paddd m0, m7 ; cost[5a-d]
mova [esp+0x50], m0
@@ -1303,8 +1252,8 @@
punpcklwd m0, m2
pmaddwd m7, m7
pmaddwd m0, m0
- MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
paddd m0, m7 ; cost[1a-d]
SWAP m0, m4
@@ -1330,8 +1279,8 @@
punpcklwd m4, m2
pmaddwd m0, m0
pmaddwd m4, m4
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m4, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m0, [base+div_table%+SUFFIX+48]
+ MULLD m4, [base+div_table%+SUFFIX+32]
paddd m4, m0 ; cost[3a-d]
mova m1, [esp+0x00]
@@ -1367,6 +1316,7 @@
%endif
; get direction and variance
+ mov vard, varm
punpckhdq m3, m2, m1
punpckldq m2, m1
psubd m1, m0, m3
@@ -1388,18 +1338,18 @@
%endmacro
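
For context on the cdef_dir changes above: after the eight direction costs have been formed from the squared partial sums scaled by div_table, the epilogue picks the direction with the largest cost and stores a variance derived from the gap to the orthogonal direction, shifted down by 10 (the shr r1d, 10 / mov [varq], r1d pair visible in the x86-64 hunk). A hedged scalar sketch of that tail:

/* Hedged sketch of the cdef_dir tail: cost[8] holds the per-direction costs
 * already derived from the squared partial sums and div_table. */
static int cdef_dir_tail_sketch(const unsigned cost[8], unsigned *var)
{
    int best_dir = 0;
    unsigned best_cost = cost[0];
    for (int d = 1; d < 8; d++)
        if (cost[d] > best_cost) {
            best_cost = cost[d];
            best_dir  = d;
        }
    *var = (best_cost - cost[(best_dir + 4) & 7]) >> 10;
    return best_dir;
}
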
INIT_XMM sse4
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
CDEF_DIR
INIT_XMM ssse3
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
CDEF_DIR
INIT_XMM sse2
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
--- a/src/x86/cpuid.asm
+++ b/src/x86/cpuid.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION .text
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA
--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
--- a/src/x86/loopfilter.asm
+++ b/src/x86/loopfilter.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
--- a/src/x86/looprestoration_ssse3.asm
+++ b/src/x86/looprestoration_ssse3.asm
@@ -24,6 +24,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
@@ -2766,9 +2767,9 @@
%ifidn %1, put
%assign isprep 0
%if required_stack_alignment <= STACK_ALIGNMENT
-cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
%else
-cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
%endif
%xdefine base_reg r12
%define rndshift 10
@@ -2775,11 +2776,11 @@
%else
%assign isprep 1
%if required_stack_alignment <= STACK_ALIGNMENT
-cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
%xdefine tmp_stridem r14q
%else
-cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
- %define tmp_stridem qword [rsp+104]
+cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+120]
%endif
%xdefine base_reg r11
%define rndshift 6
@@ -2808,7 +2809,7 @@
%define hm r6m
%endif
%if required_stack_alignment > STACK_ALIGNMENT
- %define dsm [rsp+96]
+ %define dsm [rsp+112]
%define rX r1
%define rXd r1d
%else
@@ -2824,7 +2825,7 @@
%define dxm r7m
%else
DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
- %define hm [rsp+96]
+ %define hm [rsp+112]
%endif
MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%define rX r14
@@ -3104,181 +3105,9 @@
lea srcq, [srcq+ssq*2]
jmp .w4_loop
.w8:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- movd xm15, t0d
- pmaddwd m8, [base+rescale_mul]
- vpbroadcastq m11, [base+pq_0x40000000]
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+r4*8]
- movq xm10, [base+subpel_filters+r6*8]
- movhps xm15, [base+subpel_filters+r7*8]
- movhps xm10, [base+subpel_filters+r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+rX*8]
- psrld m14, 10
- mova [rsp], xm14
- vextracti128 xm7, m14, 1
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- mov dyd, dym
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- vbroadcasti128 m14, [base+wswap]
-.w8_loop:
- and myd, 0x3ff
- mov r6d, 64 << 24
- mov r4d, myd
- shr r4d, 6
- lea r4d, [t1+r4]
- cmovnz r6q, [base+subpel_filters+r4*8]
- movq xm11, r6q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pshufd m8, m11, q2222
- pshufd m11, m11, q3333
- pmaddwd m6, m2, m8
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- dec hd
- jz .ret
- add myd, dyd
- test myd, ~0x3ff
- jz .w8_loop
- test myd, 0x400
- mov [rsp+16], myd
- mov r4d, [rsp+ 0]
- mov r6d, [rsp+ 8]
- mov r7d, [rsp+ 4]
- mov r9d, [rsp+12]
- jz .w8_skip_line
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- add srcq, ssq
- mov myd, [rsp+16]
- mov dyd, dym
- pshufb m0, m14
- pshufb m1, m14
- pshufb m2, m14
- pshufb m3, m14
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, m10
- phaddw m4, m5
- pslld m5, m4, 16
- paddw m4, m5
- pmulhrsw m4, m12
- pblendw m0, m1, 0xaa
- pblendw m1, m2, 0xaa
- pblendw m2, m3, 0xaa
- pblendw m3, m4, 0xaa
- jmp .w8_loop
-.w8_skip_line:
- mova m0, m1
- mova m1, m2
- mova m2, m3
- vpbroadcastq m7, [srcq+r13]
- vpbroadcastq m8, [srcq+ rX]
- movq xm3, [srcq+ r4]
- movq xm4, [srcq+ r6]
- movhps xm3, [srcq+ r7]
- movhps xm4, [srcq+ r9]
- vinserti128 m3, [srcq+r10], 1
- vinserti128 m4, [srcq+r11], 1
- add srcq, ssq
- movq xm5, [srcq+ r4]
- movq xm6, [srcq+ r6]
- movhps xm5, [srcq+ r7]
- movhps xm6, [srcq+ r9]
- vinserti128 m5, [srcq+r10], 1
- vinserti128 m6, [srcq+r11], 1
- vpbroadcastq m9, [srcq+r13]
- vpbroadcastq m11, [srcq+ rX]
- add srcq, ssq
- mov myd, [rsp+16]
- mov dyd, dym
- vpblendd m3, m7, 0xc0
- vpblendd m4, m8, 0xc0
- vpblendd m5, m9, 0xc0
- vpblendd m6, m11, 0xc0
- pmaddubsw m3, m15
- pmaddubsw m4, m10
- pmaddubsw m5, m15
- pmaddubsw m6, m10
- phaddw m3, m4
- phaddw m5, m6
- psrld m4, m3, 16
- pslld m6, m5, 16
- paddw m3, m4
- paddw m5, m6
- pblendw m3, m5, 0xaa
- pmulhrsw m3, m12
- jmp .w8_loop
+ mov dword [rsp+48], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
.w16:
mov dword [rsp+48], 2
movifprep tmp_stridem, 32
@@ -3698,127 +3527,9 @@
jg .dy1_w4_loop
MC_8TAP_SCALED_RET
.dy1_w8:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- movd xm15, t0d
- pmaddwd m8, [base+rescale_mul]
- vpbroadcastq m11, [base+pq_0x40000000]
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+ r4*8]
- movq xm10, [base+subpel_filters+ r6*8]
- movhps xm15, [base+subpel_filters+ r7*8]
- movhps xm10, [base+subpel_filters+ r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+ rX*8]
- psrld m14, 10
- vextracti128 xm7, m14, 1
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- mov [rsp+32], r7d
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- movu [rsp], m10
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- shr myd, 6
- lea myd, [t1+myq]
- mov t1d, 64 << 24
- cmovnz t1q, [base+subpel_filters+myq*8]
- vbroadcasti128 m14, [base+wswap]
- movq xm11, t1q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- mov r7d, [rsp+32]
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pshufd m10, m11, q2222
- pshufd m11, m11, q3333
-.dy1_w8_loop:
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pmaddwd m6, m2, m10
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- dec hd
- jz .ret
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- add srcq, ssq
- pshufb m0, m14
- pshufb m1, m14
- pshufb m2, m14
- pshufb m3, m14
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, [rsp]
- phaddw m4, m5
- pslld m5, m4, 16
- paddw m4, m5
- pmulhrsw m4, m12
- pblendw m0, m1, 0xaa
- pblendw m1, m2, 0xaa
- pblendw m2, m3, 0xaa
- pblendw m3, m4, 0xaa
- jmp .dy1_w8_loop
+ mov dword [rsp+72], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
.dy1_w16:
mov dword [rsp+72], 2
movifprep tmp_stridem, 32
@@ -3835,11 +3546,16 @@
mov dword [rsp+72], 16
movifprep tmp_stridem, 256
.dy1_w_start:
+ mov myd, mym
%ifidn %1, put
movifnidn dsm, dsq
%endif
shr t0d, 16
sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
pmaddwd m8, [base+rescale_mul]
movd xm15, t0d
mov [rsp+76], t0d
@@ -3851,6 +3567,10 @@
shl dword dxm, 3 ; dx*8
vpbroadcastd m15, xm15
paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ punpcklbw xm0, xm0
+ psraw xm0, 8
+ mova [rsp+96], xm0
jmp .dy1_hloop
.dy1_hloop_prep:
dec dword [rsp+72]
@@ -3910,27 +3630,16 @@
MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
movu [rsp], m10
+ vpbroadcastd m8, [rsp+0x60]
+ vpbroadcastd m9, [rsp+0x64]
+ vpbroadcastd m10, [rsp+0x68]
+ vpbroadcastd m11, [rsp+0x6c]
pshufb m0, m14 ; 01a 01b
pshufb m1, m14 ; 23a 23b
pshufb m2, m14 ; 45a 45b
pshufb m3, m14 ; 67a 67b
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
vbroadcasti128 m14, [base+wswap]
- movq xm11, r4q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- mov r4d, [rsp+64]
- mov r7d, [rsp+68]
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pshufd m10, m11, q2222
- pshufd m11, m11, q3333
.dy1_vloop:
pmaddwd m4, m0, m8
pmaddwd m5, m1, m9
@@ -4182,137 +3891,9 @@
jg .dy2_w4_loop
MC_8TAP_SCALED_RET
.dy2_w8:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- movd xm15, t0d
- pmaddwd m8, [base+rescale_mul]
- vpbroadcastq m11, [base+pq_0x40000000]
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+ r4*8]
- movq xm10, [base+subpel_filters+ r6*8]
- movhps xm15, [base+subpel_filters+ r7*8]
- movhps xm10, [base+subpel_filters+ r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+ rX*8]
- psrld m14, 10
- vextracti128 xm7, m14, 1
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- mov [rsp], r7d
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- shr myd, 6
- lea myd, [t1+myq]
- mov t1d, 64 << 24
- cmovnz t1q, [base+subpel_filters+myq*8]
- movq xm11, t1q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- mov r7d, [rsp]
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pshufd m14, m11, q2222
- pshufd m11, m11, q3333
-.dy2_w8_loop:
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pmaddwd m6, m2, m14
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- dec hd
- jz .ret
- mova m0, m1
- mova m1, m2
- mova m2, m3
- movq xm3, [srcq+ r4]
- movq xm4, [srcq+ r6]
- movhps xm3, [srcq+ r7]
- movhps xm4, [srcq+ r9]
- vinserti128 m3, [srcq+r10], 1
- vinserti128 m4, [srcq+r11], 1
- vpbroadcastq m5, [srcq+r13]
- vpbroadcastq m6, [srcq+ rX]
- add srcq, ssq
- vpblendd m3, m5, 0xc0
- vpblendd m4, m6, 0xc0
- pmaddubsw m3, m15
- pmaddubsw m4, m10
- phaddw m3, m4
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- add srcq, ssq
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, m10
- phaddw m4, m5
- psrld m5, m3, 16
- pslld m6, m4, 16
- paddw m3, m5
- paddw m4, m6
- pblendw m3, m4, 0xaa
- pmulhrsw m3, m12
- jmp .dy2_w8_loop
+ mov dword [rsp+40], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
.dy2_w16:
mov dword [rsp+40], 2
movifprep tmp_stridem, 32
@@ -4329,11 +3910,16 @@
mov dword [rsp+40], 16
movifprep tmp_stridem, 256
.dy2_w_start:
+ mov myd, mym
%ifidn %1, put
movifnidn dsm, dsq
%endif
shr t0d, 16
sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
pmaddwd m8, [base+rescale_mul]
movd xm15, t0d
mov [rsp+64], t0d
@@ -4345,6 +3931,10 @@
shl dword dxm, 3 ; dx*8
vpbroadcastd m15, xm15
paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ punpcklbw xm0, xm0
+ psraw xm0, 8
+ mova [rsp+0x50], xm0
jmp .dy2_hloop
.dy2_hloop_prep:
dec dword [rsp+40]
@@ -4384,7 +3974,6 @@
vpbroadcastq m8, [base+subpel_filters+ rX*8]
psrld m14, 10
vextracti128 xm7, m14, 1
- movq [rsp+32], xm14
movd r4d, xm14
pextrd r6d, xm14, 2
pextrd r7d, xm14, 1
@@ -4404,25 +3993,15 @@
MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m11, [rsp+0x58]
+ vpbroadcastd m4, [rsp+0x5c]
pshufb m0, m14 ; 01a 01b
pshufb m1, m14 ; 23a 23b
pshufb m2, m14 ; 45a 45b
pshufb m3, m14 ; 67a 67b
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
- movq xm14, r4q
- punpcklbw xm14, xm14
- psraw xm14, 8
- vinserti128 m14, xm14, 1
- mov r4d, [rsp+32]
- mov r7d, [rsp+36]
- pshufd m8, m14, q0000
- pshufd m9, m14, q1111
- pshufd m11, m14, q2222
- pshufd m14, m14, q3333
+ SWAP m14, m4
.dy2_vloop:
pmaddwd m4, m0, m8
pmaddwd m5, m1, m9
--- a/src/x86/mc_avx512.asm
+++ b/src/x86/mc_avx512.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
%if HAVE_AVX512ICL && ARCH_X86_64
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -91,26 +91,46 @@
decl_mct_fn(dav1d_prep_bilin_sse2);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_ssse3);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_ssse3);
decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_ssse3);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_ssse3);
decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_ssse3);
decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
@@ -206,6 +226,30 @@
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+
+#if ARCH_X86_64
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+#endif
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -24,6 +24,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
@@ -54,12 +55,19 @@
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
pb_8x0_8x8: times 8 db 0
times 8 db 8
-resize_mul: dd 0, 1, 2, 3
+bdct_lb_dw: times 4 db 0
+ times 4 db 4
+ times 4 db 8
+ times 4 db 12
+rescale_mul: dd 0, 1, 2, 3
resize_shuf: times 5 db 0
db 1, 2, 3, 4, 5, 6
times 5+16 db 7
@@ -82,6 +90,9 @@
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
pd_262144:times 4 dd 262144
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000:times 4 dd 0x4000
+pq_0x40000000: times 2 dq 0x40000000
pw_258: times 2 dw 258
@@ -165,6 +176,35 @@
HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
+%macro SCALED_JMP_TABLE 1-*
+ %xdefine %1_table (%%table - %2)
+ %xdefine %%base mangle(private_prefix %+ _%1)
+%%table:
+ %rep %0 - 1
+ dw %%base %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+ %rotate 1
+%%dy_1024:
+ %xdefine %1_dy1_table (%%dy_1024 - %2)
+ %rep %0 - 1
+ dw %%base %+ .dy1_w%2 - %%base
+ %rotate 1
+ %endrep
+ %rotate 1
+%%dy_2048:
+ %xdefine %1_dy2_table (%%dy_2048 - %2)
+ %rep %0 - 1
+ dw %%base %+ .dy2_w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%if ARCH_X86_64
+SCALED_JMP_TABLE put_8tap_scaled_ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_ssse3, 4, 8, 16, 32, 64, 128
+%endif
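
SCALED_JMP_TABLE emits three back-to-back tables of 16-bit displacements measured from the mangled function symbol: one per width for the generic path, one for dy == 1024 (.dy1_w*) and one for dy == 2048 (.dy2_w*); the MC_8TAP_SCALED prologue then dispatches with movzx/add/jmp after comparing dyd against 1024 and 2048. A loose C analogue using a plain function-pointer table (the asm keeps 16-bit offsets instead to stay small and position-independent; names below are illustrative):

/* Loose C analogue of the SCALED_JMP_TABLE dispatch; the asm stores 16-bit
 * offsets from the function entry instead of pointers. Table names and the
 * handler type are illustrative, and __builtin_ctz assumes GCC/Clang. */
typedef void (*scaled_tail_fn)(void);

static void dispatch_sketch(const scaled_tail_fn generic[8],
                            const scaled_tail_fn dy1[8],
                            const scaled_tail_fn dy2[8], int w, int dy)
{
    const int idx = __builtin_ctz(w);   /* tzcnt wd, wm */
    if (dy == 1024)      dy1[idx]();    /* vertical step of exactly 1 */
    else if (dy == 2048) dy2[idx]();    /* vertical step of exactly 2 */
    else                 generic[idx]();
}
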
+
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
cextern mc_warp_filter
@@ -1464,8 +1504,8 @@
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
-%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
-cglobal %1_8tap_%2
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
@@ -1473,7 +1513,7 @@
mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _%1_8tap %+ SUFFIX)
+ jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
%endif
%endmacro
@@ -1485,15 +1525,15 @@
DECLARE_REG_TMP 7, 8
%endif
-MC_8TAP_FN put, sharp, SHARP, SHARP
-MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH
-MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP
-MC_8TAP_FN put, smooth, SMOOTH, SMOOTH
-MC_8TAP_FN put, sharp_regular, SHARP, REGULAR
-MC_8TAP_FN put, regular_sharp, REGULAR, SHARP
-MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR
-MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
-MC_8TAP_FN put, regular, REGULAR, REGULAR
+FN put_8tap, sharp, SHARP, SHARP
+FN put_8tap, sharp_smooth, SHARP, SMOOTH
+FN put_8tap, smooth_sharp, SMOOTH, SHARP
+FN put_8tap, smooth, SMOOTH, SMOOTH
+FN put_8tap, sharp_regular, SHARP, REGULAR
+FN put_8tap, regular_sharp, REGULAR, SHARP
+FN put_8tap, smooth_regular, SMOOTH, REGULAR
+FN put_8tap, regular_smooth, REGULAR, SMOOTH
+FN put_8tap, regular, REGULAR, REGULAR
%if ARCH_X86_32
%define base_reg r1
@@ -2773,15 +2813,15 @@
DECLARE_REG_TMP 6, 7
%endif
-MC_8TAP_FN prep, sharp, SHARP, SHARP
-MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH
-MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP
-MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH
-MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR
-MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP
-MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR
-MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
-MC_8TAP_FN prep, regular, REGULAR, REGULAR
+FN prep_8tap, sharp, SHARP, SHARP
+FN prep_8tap, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap, smooth, SMOOTH, SMOOTH
+FN prep_8tap, sharp_regular, SHARP, REGULAR
+FN prep_8tap, regular_sharp, REGULAR, SHARP
+FN prep_8tap, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap, regular, REGULAR, REGULAR
%if ARCH_X86_32
%define base_reg r2
@@ -3912,6 +3952,1738 @@
RET
%endmacro
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
+ SWAP m%2, m%5
+ movq m%1, [srcq+ r4]
+ movq m%2, [srcq+ r6]
+ movhps m%1, [srcq+ r7]
+ movhps m%2, [srcq+ r9]
+ movq m%3, [srcq+r10]
+ movq m%4, [srcq+r11]
+ movhps m%3, [srcq+r13]
+ movhps m%4, [srcq+ rX]
+ add srcq, ssq
+ movq m%5, [srcq+ r4]
+ movq m%6, [srcq+ r6]
+ movhps m%5, [srcq+ r7]
+ movhps m%6, [srcq+ r9]
+ movq m%7, [srcq+r10]
+ movq m%8, [srcq+r11]
+ movhps m%7, [srcq+r13]
+ movhps m%8, [srcq+ rX]
+ add srcq, ssq
+ pmaddubsw m%1, m%9
+ pmaddubsw m%5, m%9
+ pmaddubsw m%2, m%10
+ pmaddubsw m%6, m%10
+ pmaddubsw m%3, m%11
+ pmaddubsw m%7, m%11
+ pmaddubsw m%4, m%12
+ pmaddubsw m%8, m%12
+ phaddw m%1, m%2
+ phaddw m%5, m%6
+ phaddw m%3, m%4
+ phaddw m%7, m%8
+ phaddw m%1, m%3
+ phaddw m%5, m%7
+ pmulhrsw m%1, m12
+ pmulhrsw m%5, m12
+ SWAP m%2, m%5
+%endmacro
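
MC_8TAP_SCALED_H gathers two source rows at the eight per-column byte positions precomputed in r4..rX (from mx + dx*[0-7]), multiplies them against the four packed per-column filter pairs with pmaddubsw, reduces with phaddw and rounds the horizontal sums with pmulhrsw against pw_8192, i.e. roughly (x + 2) >> 2. A hedged scalar sketch of the same horizontal pass (positions, tap layout and rounding are simplified for illustration):

#include <stdint.h>

/* Hedged scalar sketch of the horizontal pass: every output column has its
 * own source position mx + x*dx (10-bit fraction) and its own 8-tap filter,
 * and the intermediate is rounded by 2 bits (pmulhrsw with pw_8192). */
static void h_8tap_scaled_sketch(int16_t mid[8], const uint8_t *src,
                                 const int8_t fil[8][8], int mx, int dx)
{
    for (int x = 0; x < 8; x++) {
        const int sx = (mx + x * dx) >> 10;
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += fil[x][k] * src[sx + k - 3];
        mid[x] = (int16_t)((sum + 2) >> 2);
    }
}
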
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+0x138]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+ LEA base_reg, %1_8tap_scaled_ssse3
+%define base base_reg-%1_8tap_scaled_ssse3
+ tzcnt wd, wm
+ movd m8, dxm
+ movd m14, mxm
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isprep && UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%endif
+ mov dyd, dym
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x94]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ mova m10, [base+pd_0x3ff]
+ mova m12, [base+pw_8192]
+%ifidn %1, put
+ mova m13, [base+pd_512]
+%else
+ mova m13, [base+pd_32]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ mova m11, [base+pd_0x4000]
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ movd r6d, m15
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r6*8+2]
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ movhps m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 1 2 3
+ pmulhrsw m1, m12 ; 4 5 6 7
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ pmaddwd m8, m4, m11
+ paddd m5, m6
+ paddd m7, m8
+ paddd m5, m13
+ paddd m5, m7
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ pextrw r6d, m5, 0
+ mov [dstq], r6w
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movq m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movhps m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12 ; 6 7 6 7
+ palignr m4, m5, m1, 8 ; 4 5 6 7
+ pshufd m5, m4, q0321 ; 5 6 7 _
+ mova m1, m4
+ punpcklwd m2, m4, m5 ; 45 56
+ punpckhwd m4, m5 ; 67 __
+ jmp .w2_loop
+ SWAP m15, m8, m9
+%endif
+.w4:
+ mov myd, mym
+ mova m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ pmaddwd m8, m7
+ mova m11, [base+pd_0x4000]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ psrldq m7, m15, 8
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+ mova m5, [base+bdct_lb_dw]
+ movq m6, [base+subpel_s_shuf2]
+ pcmpeqd m0, m9
+ psrld m14, 10
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m6, m6
+ punpcklqdq m15, m2
+ pshufb m14, m5
+ paddb m14, m6
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pand m11, m0
+ pandn m0, m15
+ SWAP m15, m0
+ por m15, m11
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pshufb m2, m14
+ pshufb m4, m14
+ pshufb m3, m14
+ pshufb m5, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ pmaddubsw m2, m15
+ pmaddubsw m4, m15
+ pmaddubsw m3, m15
+ pmaddubsw m5, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ phaddw m9, m2, m4
+ phaddw m3, m5
+ pmulhrsw m7, m12 ; 0 1
+ pmulhrsw m8, m12 ; 2 3
+ pmulhrsw m9, m12 ; 4 5
+ pmulhrsw m3, m12 ; 6 7
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ psrldq m11, m3, 8 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m11 ; 67
+ mova [rsp+0x00], m7
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+.w4_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m10
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ packssdw m4, m4
+%ifidn %1, put
+ packuswb m4, m4
+ movd [dstq], m4
+ add dstq, dsq
+%else
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ movu m4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova m0, [rsp+0x00]
+ mova [rsp+0x00], m1
+ mova m1, [rsp+0x10]
+ mova [rsp+0x10], m2
+ mova m2, [rsp+0x20]
+ mova [rsp+0x20], m3
+ pshufb m4, m14
+ pmaddubsw m4, m15
+ phaddw m4, m4
+ pmulhrsw m4, m12
+ punpcklwd m3, m11, m4
+ mova m11, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m6, [rsp+0x10]
+ mova m7, [rsp+0x20]
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m4, m5
+ pmulhrsw m4, m12
+ punpcklwd m9, m11, m4
+ mova [rsp+0x00], m6
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m9
+ psrldq m11, m4, 8
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m11
+ jmp .w4_loop
+ SWAP m0, m15
+.w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
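+; widths >= 8 are processed in vertical strips of 8 pixels; dword [rsp+0x90]
+; counts the remaining strips and .hloop_prep advances dstq/tmpq to the next one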
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ movd m15, t0d
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+ mova m10, [base+pd_0x3ff]
+ mova m15, [rsp+0x120]
+ pxor m9, m9
+ mov srcq, [rsp+0x098]
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+ paddd m14, m7
+.hloop:
+ mova m11, [base+pq_0x40000000]
+ psrld m4, m14, 10
+ mova [rsp], m4
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m4, m5, 8
+ movd r4d, m5
+ movd r6d, m4
+ psrldq m5, 4
+ psrldq m4, 4
+ movd r7d, m5
+ movd r9d, m4
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m11, m4
+ pand m8, m11, m6
+ pand m15, m11, m14
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m11, m5
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m8
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m9, [rsp+0x80]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ SWAP m14, m8
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+ pmaddwd m8, [rsp+0x70], m11
+ pmaddwd m9, [rsp+0x80], m11
+ paddd m4, m6
+ paddd m5, m7
+ paddd m4, m8
+ paddd m5, m9
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x140], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ add srcq, ssq
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ pshufd m9, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m9 ; 3a 2a
+ pshufb m3, m9 ; 3b 2b
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ phaddw m6, m7
+ phaddw m4, m5
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x50], m14 ; 4a 5a
+ pshufb m6, [rsp+0x60], m14 ; 4b 5b
+ pshufb m7, [rsp+0x70], m9 ; 7a 6a
+ pshufb m8, [rsp+0x80], m9 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m8
+ jmp .vloop
+.skip_line:
+ mova m0, [rsp+0x10]
+ mova m1, [rsp+0x20]
+ mova m14, [rsp+0x30]
+ mova m15, [rsp+0x40]
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [rsp+0x50] ; 23a
+ mova m3, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m6, [rsp+0x80] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m4
+ jmp .vloop
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ mova m11, [base+pd_0x4000]
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ movd r6d, m15
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r6*8+2]
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ add srcq, ss3q
+ movq xm10, r4q
+ punpcklbw xm10, xm10
+ psraw xm10, 8
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ pshufd m8, m10, q0000
+ pshufd m9, m10, q1111
+ pshufd m11, m10, q3333
+ pshufd m10, m10, q2222
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ palignr m2, m1, m0, 4
+ pshufd m4, m1, q2121
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ punpcklwd m2, m1, m4 ; 45 56
+.dy1_w2_loop:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ mova m3, m0
+ mova m0, m2
+ paddd m5, m13
+ paddd m6, m7
+ pshufb m1, m14
+ pmaddubsw m1, m15
+ phaddw m1, m1
+ pmulhrsw m1, m12
+ palignr m7, m1, m4, 12
+ punpcklwd m2, m7, m1 ; 67 78
+ pmaddwd m7, m2, m11
+ mova m4, m1
+ paddd m5, m6
+ paddd m5, m7
+ psrad m5, rndshift
+ packssdw m5, m5
+ packuswb m5, m5
+ pextrw r4d, m5, 0
+ pextrw r6d, m5, 1
+ mov [dstq+dsq*0], r4w
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+ SWAP m15, m8, m9
+%endif
+.dy1_w4:
+ mov myd, mym
+ mova m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ pmaddwd m8, m7
+ mova m11, [base+pd_0x4000]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m4, [base+subpel_filters+r11*8+2]
+ movd m5, [base+subpel_filters+ r6*8+2]
+ movd m7, [base+subpel_filters+r13*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m5
+ punpckldq m4, m7
+ punpcklqdq m6, m6
+ punpcklqdq m15, m4
+ pshufb m14, [base+bdct_lb_dw]
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m7, [srcq+ssq*2]
+ add srcq, ss3q
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m10, r4q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ pmaddubsw m7, m15
+ phaddw m0, m1
+ phaddw m2, m3
+ phaddw m4, m5
+ phaddw m6, m7, m7
+ pmulhrsw m0, m12 ; 0 1
+ pmulhrsw m2, m12 ; 2 3
+ pmulhrsw m4, m12 ; 4 5
+ pmulhrsw m6, m12 ; 6 _
+ shufps m1, m0, m2, q1032 ; 1 2
+ shufps m3, m2, m4, q1032 ; 3 4
+ shufps m5, m4, m6, q1032 ; 5 6
+ punpcklwd m7, m0, m1 ; 01
+ punpckhwd m0, m1 ; 12
+ punpcklwd m8, m2, m3 ; 23
+ punpckhwd m2, m3 ; 34
+ punpcklwd m9, m4, m5 ; 45
+ punpckhwd m4, m5 ; 56
+ pshufd m1, m10, q0000
+ pshufd m3, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+.dy1_w4_loop:
+ movu m11, [srcq+ssq*0]
+ pmaddwd m7, m1
+ pmaddwd m8, m3
+ pmaddwd m0, m1
+ pmaddwd m2, m3
+ pmaddwd m9, m5
+ pmaddwd m4, m5
+ paddd m7, m8
+ paddd m0, m2
+ movu m8, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m11, m14
+ pmaddubsw m11, m15
+ paddd m7, m13
+ paddd m0, m13
+ paddd m7, m9
+ paddd m0, m4
+ pshufb m8, m14
+ pmaddubsw m8, m15
+ phaddw m11, m8
+ mova m8, [rsp+0x20]
+ pmulhrsw m11, m12
+ punpcklwd m9, m6, m11 ; 67
+ psrldq m6, m11, 8
+ punpcklwd m4, m11, m6 ; 78
+ pmaddwd m2, m9, m10
+ pmaddwd m11, m4, m10
+ paddd m7, m2
+ mova m2, [rsp+0x30]
+ paddd m0, m11
+ psrad m7, rndshift
+ psrad m0, rndshift
+ packssdw m7, m0
+ mova m0, [rsp+0x10]
+%ifidn %1, put
+ packuswb m7, m7
+ psrldq m11, m7, 4
+ movd [dstq+dsq*0], m7
+ movd [dstq+dsq*1], m11
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m7
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jz .ret
+ mova m7, [rsp+0x00]
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+ jmp .dy1_w4_loop
+ SWAP m8, m15
+.dy1_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if UNIX64
+ mov hm, hd
+%endif
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+ mova m10, [base+pd_0x3ff]
+ mova m15, [rsp+0x120]
+ pxor m9, m9
+ mov srcq, [rsp+0x098]
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+ paddd m14, m7
+.dy1_hloop:
+ mova m11, [base+pq_0x40000000]
+ psrld m4, m14, 10
+ mova [rsp], m4
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m4, m5, 8
+ movd r4d, m5
+ movd r6d, m4
+ psrldq m5, 4
+ psrldq m4, 4
+ movd r7d, m5
+ movd r9d, m4
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ punpcklbw m14, m14
+ psraw m14, 8
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ mova m14, [base+unpckw]
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+ pmaddwd m15, [rsp+0x70], m11
+ paddd m4, m6
+ pmaddwd m6, [rsp+0x80], m11
+ paddd m5, m7
+ paddd m4, m15
+ paddd m5, m6
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ add srcq, ssq
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ phaddw m4, m5
+ phaddw m6, m7
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x70], m15 ; 7a 6a
+ pshufb m7, [rsp+0x80], m15 ; 7b 6b
+ pshufb m6, [rsp+0x50], m14 ; 4a 5a
+ pshufb m15, [rsp+0x60], m14 ; 4b 5b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m15 ; 34b
+ punpckhwd m6, m5 ; 56a
+ punpckhwd m15, m7 ; 56b
+ punpcklwd m5, m4 ; 78a
+ psrldq m4, 8
+ punpcklwd m7, m4 ; 78b
+ mova [rsp+0x50], m6
+ mova [rsp+0x60], m15
+ mova [rsp+0x70], m5
+ mova [rsp+0x80], m7
+ jmp .dy1_vloop
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ mova m11, [base+pd_0x4000]
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ movd r6d, m15
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r6*8+2]
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ movhps m0, [srcq+ssq*2]
+ movhps m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ movq m3, [srcq+ssq*0]
+ movhps m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ movq m11, r4q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pslldq m2, m3, 8
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pshufd m2, m0, q3110 ; 0 2 2 4
+ pshufd m1, m1, q3110 ; 1 3 3 5
+ punpcklwd m3, m2, m1 ; 01 23
+ punpckhwd m2, m1 ; 23 45
+.dy2_w2_loop:
+ movq m6, [srcq+ssq*0]
+ movq m7, [srcq+ssq*1]
+ movhps m6, [srcq+ssq*2]
+ movhps m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m3, m8
+ pmaddwd m5, m2, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ phaddw m6, m7
+ pmulhrsw m6, m12
+ psrldq m7, m6, 8
+ palignr m6, m0, 8
+ palignr m7, m1, 8
+ mova m0, m6
+ mova m1, m7
+ pshufd m6, m6, q3221
+ pshufd m7, m7, q3221
+ punpcklwd m3, m6, m7 ; 45 67
+ punpckhwd m2, m6, m7 ; 67 89
+ pmaddwd m6, m3, m10
+ pmaddwd m7, m2, m11
+ paddd m4, m5
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ packssdw m4, m4
+ packuswb m4, m4
+ movd r4d, m4
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+ SWAP m15, m8, m9
+%endif
+.dy2_w4:
+ mov myd, mym
+ mova m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ pmaddwd m8, m7
+ mova m11, [base+pd_0x4000]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m4, [base+subpel_filters+r11*8+2]
+ movd m5, [base+subpel_filters+ r6*8+2]
+ movd m7, [base+subpel_filters+r13*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*2]
+ movu m1, [srcq+ssq*1]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m5
+ punpckldq m4, m7
+ punpcklqdq m6, m6
+ punpcklqdq m15, m4
+ pshufb m14, [base+bdct_lb_dw]
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m11, r4q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ phaddw m4, m5
+ pmulhrsw m0, m12 ; 0 2
+ pmulhrsw m1, m12 ; 1 3
+ pmulhrsw m4, m12 ; 4 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ psrldq m5, m4, 8 ; 5 _
+ punpckhwd m2, m0, m1 ; 23
+ punpcklwd m0, m1 ; 01
+ punpcklwd m4, m5 ; 45
+.dy2_w4_loop:
+ pmaddwd m0, m8 ; a0
+ pmaddwd m5, m2, m8 ; b0
+ pmaddwd m2, m9 ; a1
+ pmaddwd m7, m4, m9 ; b1
+ pmaddwd m3, m4, m10 ; a2
+ paddd m0, m13
+ paddd m5, m13
+ paddd m0, m2
+ paddd m5, m7
+ paddd m0, m3
+ movu m6, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m6, m14
+ pshufb m7, m14
+ pshufb m3, m14
+ pshufb m1, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ pmaddubsw m3, m15
+ pmaddubsw m1, m15
+ phaddw m6, m7
+ phaddw m3, m1
+ pmulhrsw m6, m12 ; 6 7
+ pmulhrsw m3, m12 ; 8 9
+ psrldq m7, m6, 8
+ psrldq m1, m3, 8
+ punpcklwd m6, m7 ; 67
+ punpcklwd m3, m1 ; 89
+ mova m2, m6
+ pmaddwd m1, m6, m10 ; b2
+ pmaddwd m6, m11 ; a3
+ pmaddwd m7, m3, m11 ; b3
+ paddd m5, m1
+ paddd m0, m6
+ paddd m5, m7
+ psrad m0, rndshift
+ psrad m5, rndshift
+ packssdw m0, m5
+%ifidn %1, put
+ packuswb m0, m0
+ psrldq m1, m0, 4
+ movd [dstq+dsq*0], m0
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m0
+ add tmpq, 16
+%endif
+ mova m0, m4
+ mova m4, m3
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m8, m15
+.dy2_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if UNIX64
+ mov hm, hd
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+ mova m10, [base+pd_0x3ff]
+ mova m15, [rsp+0x120]
+ pxor m9, m9
+ mov srcq, [rsp+0x098]
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+ paddd m14, m7
+.dy2_hloop:
+ mova m11, [base+pq_0x40000000]
+ psrld m4, m14, 10
+ mova [rsp], m4
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m4, m5, 8
+ movd r4d, m5
+ movd r6d, m4
+ psrldq m5, 4
+ psrldq m4, 4
+ movd r7d, m5
+ movd r9d, m4
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+ pmaddwd m15, [rsp+0x70], m11
+ paddd m4, m6
+ pmaddwd m6, [rsp+0x80], m11
+ paddd m5, m7
+ paddd m4, m15
+ paddd m5, m6
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m8, [rsp+0x10]
+ mova m9, [rsp+0x20]
+ mova m10, [rsp+0x30]
+ mova m11, [rsp+0x40]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
+ mova m3, [rsp+0x50] ; 23a
+ mova m4, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m7, [rsp+0x80] ; 45b
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m14, m2, m6 ; 67a
+ punpckhwd m2, m6 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m7
+ mova [rsp+0x70], m14
+ mova [rsp+0x80], m2
+ mova m2, m3
+ mova m3, m4
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
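+; The bilinear scaled functions reuse the 8-tap scaled code: load a fixed
+; filter selector into t0d/t1d and tail-jump into the matching entry point.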
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+
+%if ARCH_X86_64
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+%endif
+
%if ARCH_X86_32
%macro SAVE_ALPHA_BETA 0
mov alpham, alphad
@@ -5715,7 +7487,7 @@
%define m11 [base+pd_63]
%define m10 [base+pb_8x0_8x8]
%endif
- pmaddwd m4, m7, [base+resize_mul] ; dx*[0,1,2,3]
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
pslld m7, 2 ; dx*4
pslld m5, 14
paddd m6, m4 ; mx+[0..3]*dx
--- a/src/x86/msac.asm
+++ b/src/x86/msac.asm
@@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 64 ; avoids cacheline splits
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -518,9 +518,7 @@
}
int main(int argc, char *argv[]) {
- (void)func_new, (void)func_ref;
state.seed = get_seed();
- int ret = 0;
while (argc > 1) {
if (!strncmp(argv[1], "--help", 6)) {
@@ -568,6 +566,24 @@
dav1d_init_cpu();
+#ifdef readtime
+ if (state.bench_pattern) {
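+ /* Probe the cycle counter once before benchmarking: if readtime()
+ * faults, the signal handler resumes at the saved context with
+ * `testing` already set and the counter is reported as inaccessible. */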
+ static int testing = 0;
+ checkasm_save_context();
+ if (!testing) {
+ checkasm_set_signal_handler_state(1);
+ testing = 1;
+ readtime();
+ checkasm_set_signal_handler_state(0);
+ } else {
+ fprintf(stderr, "checkasm: unable to access cycle counter\n");
+ return 1;
+ }
+ }
+#endif
+
+ int ret = 0;
+
if (!state.function_listing) {
fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
#if ARCH_X86_64
@@ -672,7 +688,9 @@
/* Indicate that the current test has failed, return whether verbose printing
* is requested. */
int checkasm_fail_func(const char *const msg, ...) {
- if (state.current_func_ver->cpu && state.current_func_ver->ok) {
+ if (state.current_func_ver && state.current_func_ver->cpu &&
+ state.current_func_ver->ok)
+ {
va_list arg;
print_cpu_name();
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -86,8 +86,6 @@
int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
unsigned max_ulp, int len);
-static void *func_ref, *func_new;
-
#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */
/* Decide whether or not the specified function needs to be tested */
@@ -99,6 +97,7 @@
* is optional. */
#define declare_func(ret, ...)\
declare_new(ret, __VA_ARGS__)\
+ void *func_ref, *func_new;\
typedef ret func_type(__VA_ARGS__);\
checkasm_save_context()
@@ -127,6 +126,9 @@
}
#define readtime readtime
#endif
+#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__)
+#include <mach/mach_time.h>
+#define readtime() mach_absolute_time()
#elif ARCH_AARCH64
#ifdef _MSC_VER
#include <windows.h>
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -140,11 +140,11 @@
report("decode_symbol");
}
-static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
+static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) {
MsacContext s_c, s_a;
+ declare_func(unsigned, MsacContext *s, uint16_t *cdf);
if (check_func(c->bool_adapt, "msac_decode_bool_adapt")) {
- declare_func(unsigned, MsacContext *s, uint16_t *cdf);
uint16_t cdf[2][2];
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
@@ -165,9 +165,13 @@
bench_new(&s_a, cdf[1]);
}
}
+}
+static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf) {
+ MsacContext s_c, s_a;
+
+ declare_func(unsigned, MsacContext *s);
if (check_func(c->bool_equi, "msac_decode_bool_equi")) {
- declare_func(unsigned, MsacContext *s);
dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
s_a = s_c;
for (int i = 0; i < 64; i++) {
@@ -180,9 +184,13 @@
}
bench_new(&s_a);
}
+}
+static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
+ MsacContext s_c, s_a;
+
+ declare_func(unsigned, MsacContext *s, unsigned f);
if (check_func(c->bool, "msac_decode_bool")) {
- declare_func(unsigned, MsacContext *s, unsigned f);
dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
s_a = s_c;
for (int i = 0; i < 64; i++) {
@@ -197,6 +205,12 @@
bench_new(&s_a, 16384);
}
+}
+
+static void check_decode_bool_funcs(MsacDSPContext *const c, uint8_t *const buf) {
+ check_decode_bool_adapt(c, buf);
+ check_decode_bool_equi(c, buf);
+ check_decode_bool(c, buf);
report("decode_bool");
}
@@ -204,8 +218,8 @@
ALIGN_STK_16(uint16_t, cdf, 2, [16]);
MsacContext s_c, s_a;
+ declare_func(unsigned, MsacContext *s, uint16_t *cdf);
if (check_func(c->hi_tok, "msac_decode_hi_tok")) {
- declare_func(unsigned, MsacContext *s, uint16_t *cdf);
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
s_a = s_c;
@@ -272,6 +286,6 @@
buf[i] = rnd();
check_decode_symbol(&c, buf);
- check_decode_bool(&c, buf);
+ check_decode_bool_funcs(&c, buf);
check_decode_hi_tok(&c, buf);
}
--- a/tests/checkasm/x86/checkasm.asm
+++ b/tests/checkasm/x86/checkasm.asm
@@ -23,8 +23,9 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-%define private_prefix checkasm
%include "config.asm"
+%undef private_prefix
+%define private_prefix checkasm
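+; config.asm defines its own private_prefix, so include it first and then
+; override the prefix for checkasm's symbols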
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
--- /dev/null
+++ b/tests/header_test.c.in
@@ -1,0 +1,33 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <dav1d/INPUT>
+
+int main()
+{
+ return 0;
+}
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -101,6 +101,36 @@
test('checkasm', checkasm, is_parallel: false)
endif
+c99_extension_flag = cc.first_supported_argument(
+ '-Werror=c11-extensions',
+ '-Werror=c99-c11-compat',
+ '-Wc11-extensions',
+ '-Wc99-c11-compat',
+)
+
+# dav1d_api_headers
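+# Compile each public header on its own, with the strictest available
+# C99-vs-C11 warning flag, so the installed headers remain usable from
+# plain C99 code.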
+foreach header : dav1d_api_headers
+ header_file = '@0@'.format(header).split('/')[-1]
+ target = header_file + '_test'
+
+ header_test_source = custom_target(target,
+ output : target + '.c',
+ input : 'header_test.c.in',
+ capture : true,
+ command : ['sed', '-e', 's/INPUT/' + header_file + '/', '@INPUT@']
+ )
+
+ header_test_exe = executable(target,
+ header_test_source,
+ include_directories: dav1d_inc_dirs,
+ c_args: [c99_extension_flag],
+ build_by_default: true
+ )
+
+ test(target, header_test_exe)
+endforeach
+
+
# fuzzing binaries
if meson.version().version_compare('>=0.49')
subdir('libfuzzer')
--- a/tools/dav1d.c
+++ b/tools/dav1d.c
@@ -124,11 +124,15 @@
else
b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
n, num, 100.0 * n / num);
- if (i_fps && b < end) {
+ if (b < end) {
const double d_fps = 1e9 * n / elapsed;
- const double speed = d_fps / i_fps;
- b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
- d_fps, i_fps, speed);
+ if (i_fps) {
+ const double speed = d_fps / i_fps;
+ b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
+ d_fps, i_fps, speed);
+ } else {
+ b += snprintf(b, end - b, " - %.2lf fps", d_fps);
+ }
}
if (!istty)
strcpy(b > end - 2 ? end - 2 : b, "\n");
--- a/tools/output/y4m2.c
+++ b/tools/output/y4m2.c
@@ -28,6 +28,7 @@
#include "config.h"
#include <errno.h>
+#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -77,8 +78,17 @@
chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
ss_names[p->p.layout][p->seq_hdr->hbd];
- fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n",
- p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name);
+ const unsigned fw = p->p.w;
+ const unsigned fh = p->p.h;
+ uint64_t aw = (uint64_t)fh * p->frame_hdr->render_width;
+ uint64_t ah = (uint64_t)fw * p->frame_hdr->render_height;
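+ /* aw:ah is the sample aspect ratio implied by the render size; reduce it
+ * with Euclid's algorithm before writing it to the Y4M 'A' field. */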
+ uint64_t gcd = ah;
+ for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
+ aw /= gcd;
+ ah /= gcd;
+
+ fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A%"PRIu64":%"PRIu64" C%s\n",
+ fw, fh, c->fps[0], c->fps[1], aw, ah, ss_name);
return 0;
}