shithub: dav1d


ref: 2989cb3f79e34d561dd2311532b1cb9b182e1170
parent: 8863dabe0a500c9c641f546cd1ae3ac97b6312a1
parent: 3bfe8c7c8a553728e2d6556e4a95f5cd246d1c92
author: Sigrid Haflínudóttir <ftrvxmtrx@gmail.com>
date: Fri Sep 4 05:39:49 EDT 2020

Merge remote-tracking branch 'upstream' into master

--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -81,6 +81,15 @@
               fi;
           done
 
+x86inc-check:
+    extends: .debian-amd64-common
+    stage: style
+    script:
+        - git remote rm x86inc 2> /dev/null || true
+        - git remote add x86inc https://code.videolan.org/videolan/x86inc.asm.git
+        - git fetch -q x86inc master
+        - git diff --exit-code x86inc/master:x86inc.asm src/ext/x86/x86inc.asm
+    allow_failure: true
 
 build-debian:
     extends: .debian-amd64-common
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -12,7 +12,7 @@
 The codebase is developed with the following assumptions:
 
 For the library:
-- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
+- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code.
 - x86 asm in .asm files, using the NASM syntax,
 - arm/arm64 in .S files, using the GAS syntax limited to subset llvm 5.0's internal assembler supports,
 - no C++ is allowed, whatever the version.
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![dav1d logo](dav1d_logo.png)
+![dav1d logo](doc/dav1d_logo.png)
 
 # dav1d
 
@@ -30,17 +30,21 @@
 1. Complete C implementation of the decoder,
 2. Provide a usable API,
 3. Port to most platforms,
-4. Make it fast on desktop, by writing asm for AVX-2 chips.
+4. Make it fast on desktop, by writing asm for AVX2 chips.
 5. Make it fast on mobile, by writing asm for ARMv8 chips,
-6. Make it fast on older desktop, by writing asm for SSSE3+ chips.
+6. Make it fast on older desktop, by writing asm for SSSE3+ chips,
+7. Make high bit-depth fast on mobile, by writing asm for ARMv8 chips.
 
 ### On-going
-7. Make it fast on older mobiles, by writing asm for ARMv7 chips,
-8. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
-9. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
+8. Make it fast on older mobile, by writing asm for ARMv7 chips,
+9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips,
+10. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
+11. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
 
 ### After
-10. Use more GPU, when possible.
+12. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
+13. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,
+14. Use more GPU, when possible.
 
 # Contribute
 
@@ -130,7 +134,7 @@
 
 ## I am not a developer. Can I help?
 
-- Yes. We need testers, bug reporters, and documentation writers.
+- Yes. We need testers, bug reporters and documentation writers.
 
 ## What about the AV1 patent license?
 
@@ -142,3 +146,5 @@
 
 - We do, but we don't have either the time or the knowledge. Therefore, patches and contributions welcome.
 
+## Where can I find documentation?
+- The current library documentation, built from master, can be found [here](https://videolan.videolan.me/dav1d/).
binary files a/dav1d_logo.png /dev/null differ
binary files /dev/null b/doc/dav1d_logo.png differ
--- a/examples/dp_renderer_placebo.c
+++ b/examples/dp_renderer_placebo.c
@@ -501,7 +501,7 @@
             .num_points_uv  = { src->num_uv_points[0], src->num_uv_points[1] },
             .scaling_shift  = src->scaling_shift,
             .ar_coeff_lag   = src->ar_coeff_lag,
-            .ar_coeff_shift = src->ar_coeff_shift,
+            .ar_coeff_shift = (int)src->ar_coeff_shift,
             .grain_scale_shift = src->grain_scale_shift,
             .uv_mult        = { src->uv_mult[0], src->uv_mult[1] },
             .uv_mult_luma   = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
--- a/include/dav1d/dav1d.h
+++ b/include/dav1d/dav1d.h
@@ -68,9 +68,9 @@
     int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
     int all_layers; ///< output all spatial layers of a scalable AV1 biststream
     unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
-    uint8_t reserved[32]; ///< reserved for future use
     Dav1dPicAllocator allocator; ///< Picture allocator callback.
     Dav1dLogger logger; ///< Logger callback.
+    uint8_t reserved[32]; ///< reserved for future use
 } Dav1dSettings;
 #pragma incomplete Dav1dSettings
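
This hunk moves the reserved[] padding behind the allocator and logger callbacks, so their offsets within Dav1dSettings change; that layout break lines up with the soname bump from 4.0.2 to 5.0.0 later in this patch. A minimal usage sketch, assuming the public dav1d calls dav1d_default_settings()/dav1d_open() and a hypothetical open_decoder() helper; code that fills the struct through dav1d_default_settings() is unaffected by the reordering.

    #include <dav1d/dav1d.h>
    #include <stdio.h>

    /* Sketch: open a decoder with defaults, then tweak the fields shown
     * in this hunk.  The struct is always initialized through the API,
     * so the reserved[]/allocator/logger reordering stays an ABI detail. */
    static Dav1dContext *open_decoder(void) {
        Dav1dSettings s;
        dav1d_default_settings(&s);   /* fills allocator, logger, reserved[] */
        s.operating_point  = 0;       /* 0 - 31 */
        s.all_layers       = 1;       /* output all spatial layers */
        s.frame_size_limit = 0;       /* 0 = unlimited */

        Dav1dContext *c = NULL;
        if (dav1d_open(&c, &s) < 0) {
            fprintf(stderr, "dav1d_open failed\n");
            return NULL;
        }
        return c;
    }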
 
--- a/include/dav1d/headers.h
+++ b/include/dav1d/headers.h
@@ -28,6 +28,7 @@
 #ifndef DAV1D_HEADERS_H
 #define DAV1D_HEADERS_H
 
+#include <stdint.h>
 #include <stddef.h>
 
 // Constants from Section 3. "Symbols and abbreviated terms"
@@ -95,9 +96,9 @@
     union {
         struct {
             int16_t alpha, beta, gamma, delta;
-        };
+        } p;
         int16_t abcd[4];
-    };
+    } u;
 } Dav1dWarpedMotionParams;
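
The anonymous struct and union gain the names p and u here, matching the CONTRIBUTING.md change above, which allows anonymous members only for internal code and not for the public headers. What that means for callers, with a hypothetical wm pointer:

    #include <dav1d/headers.h>

    /* Field access before this change:  wm->alpha,     wm->abcd[0]
     * Field access after this change:   wm->u.p.alpha, wm->u.abcd[0]
     * (same storage either way, only the member names are new).     */
    static int16_t warp_alpha(const Dav1dWarpedMotionParams *wm) {
        return wm->u.p.alpha;
    }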
 
 enum Dav1dPixelLayout {
@@ -127,6 +128,7 @@
     DAV1D_COLOR_PRI_SMPTE431 = 11,
     DAV1D_COLOR_PRI_SMPTE432 = 12,
     DAV1D_COLOR_PRI_EBU3213 = 22,
+    DAV1D_COLOR_PRI_RESERVED = 255,
 };
 
 enum Dav1dTransferCharacteristics {
@@ -147,6 +149,7 @@
     DAV1D_TRC_SMPTE2084 = 16,     ///< PQ
     DAV1D_TRC_SMPTE428 = 17,
     DAV1D_TRC_HLG = 18,           ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
+    DAV1D_TRC_RESERVED = 255,
 };
 
 enum Dav1dMatrixCoefficients {
@@ -164,6 +167,7 @@
     DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
     DAV1D_MC_CHROMAT_CL = 13,
     DAV1D_MC_ICTCP = 14,
+    DAV1D_MC_RESERVED = 255,
 };
 
 enum Dav1dChromaSamplePosition {
--- a/include/dav1d/meson.build
+++ b/include/dav1d/meson.build
@@ -31,11 +31,15 @@
                                   output: 'version.h',
                                   configuration: version_h_data)
 
+dav1d_api_headers = files(
+    'common.h',
+    'data.h',
+    'dav1d.h',
+    'headers.h',
+    'picture.h',
+    )
+
 # install headers
-install_headers('common.h',
-                'data.h',
-                'dav1d.h',
-                'headers.h',
-                'picture.h',
+install_headers(dav1d_api_headers,
                 version_h_target,
                 subdir : 'dav1d')
--- a/meson.build
+++ b/meson.build
@@ -30,7 +30,7 @@
                       'b_ndebug=if-release'],
     meson_version: '>= 0.47.0')
 
-dav1d_soname_version       = '4.0.2'
+dav1d_soname_version       = '5.0.0'
 dav1d_api_version_array    = dav1d_soname_version.split('.')
 dav1d_api_version_major    = dav1d_api_version_array[0]
 dav1d_api_version_minor    = dav1d_api_version_array[1]
@@ -62,7 +62,8 @@
 
 # ASM option
 is_asm_enabled = (get_option('enable_asm') == true and
-    (host_machine.cpu_family().startswith('x86') or
+    (host_machine.cpu_family() == 'x86' or
+     (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__') == '') or
      host_machine.cpu_family() == 'aarch64'      or
      host_machine.cpu_family().startswith('arm') or
      host_machine.cpu() == 'ppc64le'))
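
The rewritten condition keeps the asm enabled on plain x86/x86_64 but turns it off for the x32 ABI, which Meson also reports as 'x86_64'; the distinguishing mark is that the compiler predefines __ILP32__ there. A sketch of the same test on the C side, for illustration only (HAVE_X86_64_ASM is a made-up macro, the real gate is this Meson expression), on the assumption that the hand-written x86-64 asm expects 64-bit pointers:

    /* x32 = 64-bit instruction set with 32-bit pointers; GCC and Clang
     * predefine __ILP32__ for it, which is what cc.get_define() probes. */
    #if defined(__x86_64__) && !defined(__ILP32__)
    #define HAVE_X86_64_ASM 1   /* hypothetical: asm usable */
    #else
    #define HAVE_X86_64_ASM 0   /* x32 (or not x86-64): stick to C */
    #endif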
@@ -350,6 +351,7 @@
 cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
 
 if host_machine.cpu_family().startswith('x86')
+    cdata_asm.set('private_prefix', 'dav1d')
     cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
     cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
     cdata_asm.set10('PIC', true)
--- a/src/arm/32/ipred.S
+++ b/src/arm/32/ipred.S
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2019, Martin Storsjo
+ * Copyright © 2020, Martin Storsjo
  * Copyright © 2019, B Krishnan Iyer
  * All rights reserved.
  *
@@ -132,7 +132,7 @@
         .word 80f  - L(ipred_v_tbl) + CONFIG_THUMB
         .word 40f  - L(ipred_v_tbl) + CONFIG_THUMB
 40:
-        vld1.32         {d0[0]},  [r2]
+        vld1.32         {d0[]},   [r2]
 4:
         vst1.32         {d0[0]},  [r0,  :32], r1
         vst1.32         {d0[0]},  [r12, :32], r1
@@ -215,7 +215,7 @@
         .word 8f   - L(ipred_h_tbl) + CONFIG_THUMB
         .word 4f   - L(ipred_h_tbl) + CONFIG_THUMB
 4:
-        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2],  lr
+        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2, :32],  lr
         vst1.32         {d3[0]},  [r0,  :32], r1
         vst1.32         {d2[0]},  [r12, :32], r1
         subs            r4,  r4,  #4
@@ -224,7 +224,7 @@
         bgt             4b
         pop             {r4-r5, pc}
 8:
-        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2],  lr
+        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2, :32],  lr
         vst1.8          {d3},  [r0,  :64], r1
         vst1.8          {d2},  [r12, :64], r1
         subs            r4,  r4,  #4
@@ -453,7 +453,7 @@
         .word L(ipred_dc_left_w4)  - L(ipred_dc_left_tbl) + CONFIG_THUMB
 
 L(ipred_dc_left_h4):
-        vld1.32         {d0[]},  [r2]
+        vld1.32         {d0[]},  [r2, :32]
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         vrshrn.u16      d0,  q0,  #2
@@ -468,7 +468,7 @@
         bgt             L(ipred_dc_left_w4)
         pop             {r4-r5, pc}
 L(ipred_dc_left_h8):
-        vld1.8          {d0},  [r2]
+        vld1.8          {d0},  [r2, :64]
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         vpadd.u16       d0,  d0
@@ -484,7 +484,7 @@
         bgt             L(ipred_dc_left_w8)
         pop             {r4-r5, pc}
 L(ipred_dc_left_h16):
-        vld1.8          {d0,  d1},  [r2]
+        vld1.8          {d0,  d1},  [r2, :128]
         vaddl.u8        q0,  d0,  d1
         vadd.u16        d0,  d0,  d1
         vpadd.u16       d0,  d0
@@ -501,7 +501,7 @@
         bgt             L(ipred_dc_left_w16)
         pop             {r4-r5, pc}
 L(ipred_dc_left_h32):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]
         vaddl.u8        q0,  d0,  d1
         vaddl.u8        q1,  d2,  d3
         vadd.u16        q0,  q0,  q1
@@ -522,8 +522,8 @@
         bgt             1b
         pop             {r4-r5, pc}
 L(ipred_dc_left_h64):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
-        vld1.8          {d4,  d5,  d6,  d7},  [r2]
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]!
+        vld1.8          {d4,  d5,  d6,  d7},  [r2, :128]
         vaddl.u8        q0,  d0,  d1
         vaddl.u8        q1,  d2,  d3
         vaddl.u8        q2,  d4,  d5
@@ -568,7 +568,6 @@
         clz             r3,  r3
         clz             r12, r4
         vdup.16         q15, lr             // width + height
-        mov             r6,  #0
         adr             r5,  L(ipred_dc_tbl)
         rbit            lr,  lr             // rbit(width + height)
         sub             r3,  r3,  #20       // 25 leading bits, minus table offset 5
@@ -599,22 +598,21 @@
         .word L(ipred_dc_w4)  - L(ipred_dc_tbl) + CONFIG_THUMB
 
 L(ipred_dc_h4):
-        vld1.32         {d0[0]},  [r2]!
+        vld1.32         {d0[]},  [r2, :32]!
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         bx              r3
 L(ipred_dc_w4):
         add             r2,  r2,  #1
-        vld1.32         {d1[0]},  [r2]
-        vmov.32         d1[1],  r6
+        vld1.32         {d1[]},  [r2]
         vadd.s16        d0,  d0,  d30
         vpaddl.u8       d1,  d1
         vpadd.u16       d1,  d1
-        vpadd.u16       d1,  d1
         cmp             r4,  #4
         vadd.s16        d0,  d0,  d1
         vshl.u16        d0,  d0,  d28
-        beq             1f                  // h = 8/16
+        beq             1f
+        // h = 8/16
         movw            lr,  #(0x3334/2)
         movw            r5,  #(0x5556/2)
         cmp             r4,  #16
@@ -634,7 +632,7 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h8):
-        vld1.8          {d0},  [r2]!
+        vld1.8          {d0},  [r2, :64]!
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         vpadd.u16       d0,  d0
@@ -649,13 +647,14 @@
         cmp             r4,  #8
         vadd.s16        d0,  d0,  d2
         vshl.u16        d0,  d0,  d28
-        beq             1f                  // h = 4/16/32
+        beq             1f
+        // h = 4/16/32
         cmp             r4,  #32
         movw            lr,  #(0x3334/2)
         movw            r5,  #(0x5556/2)
         it              ne
         movne           lr,  r5
-        vdup.16         q12, lr
+        vdup.16         d24, lr
         vqdmulh.s16     d0,  d0,  d24
 1:
         vdup.8          d0,  d0[0]
@@ -669,7 +668,7 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h16):
-        vld1.8          {d0,  d1},  [r2]!
+        vld1.8          {d0,  d1},  [r2, :128]!
         vaddl.u8        q0,  d0,  d1
         vadd.u16        d0,  d0,  d1
         vpadd.u16       d0,  d0
@@ -686,13 +685,14 @@
         cmp             r4,  #16
         vadd.s16        d0,  d0,  d2
         vshl.u16        d0,  d0,  d28
-        beq             1f                  // h = 4/8/32/64
+        beq             1f
+        // h = 4/8/32/64
         tst             r4,  #(32+16+8)     // 16 added to make a consecutive bitmask
         movw            lr,  #(0x3334/2)
         movw            r5,  #(0x5556/2)
         it              ne
         movne           lr,  r5
-        vdup.16         q12, lr
+        vdup.16         d24, lr
         vqdmulh.s16     d0,  d0,  d24
 1:
         vdup.8          q0,  d0[0]
@@ -706,7 +706,7 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h32):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]!
         vaddl.u8        q0,  d0,  d1
         vaddl.u8        q1,  d2,  d3
         vadd.u16        q0,  q0,  q1
@@ -718,25 +718,23 @@
         add             r2,  r2,  #1
         vld1.8          {d2,  d3,  d4,  d5},  [r2]
         vadd.s16        d0,  d0,  d30
-        vaddl.u8        q2,  d4,  d5
-        vadd.u16        d4,  d4,  d5
         vaddl.u8        q1,  d2,  d3
+        vaddl.u8        q2,  d4,  d5
+        vadd.u16        q1,  q1,  q2
         vadd.u16        d2,  d2,  d3
-        vpadd.u16       d4,  d4
         vpadd.u16       d2,  d2
-        vpadd.u16       d4,  d4
         vpadd.u16       d2,  d2
         cmp             r4,  #32
-        vadd.s16        d0,  d0,  d4
         vadd.s16        d0,  d0,  d2
         vshl.u16        d4,  d0,  d28
-        beq             1f                  // h = 8/16/64
+        beq             1f
+        // h = 8/16/64
         cmp             r4,  #8
         movw            lr,  #(0x3334/2)
         movw            r5,  #(0x5556/2)
         it              ne
         movne           lr,  r5
-        vdup.16         q12, lr
+        vdup.16         d24, lr
         vqdmulh.s16     d4,  d4,  d24
 1:
         vdup.8          q0,  d4[0]
@@ -751,9 +749,9 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h64):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]!
         vaddl.u8        q0,  d0,  d1
-        vld1.8          {d4,  d5,  d6,  d7},  [r2]!
+        vld1.8          {d4,  d5,  d6,  d7},  [r2, :128]!
         vaddl.u8        q1,  d2,  d3
         vaddl.u8        q2,  d4,  d5
         vaddl.u8        q3,  d6,  d7
@@ -819,3 +817,2143 @@
         pop             {r4-r6, pc}
 endfunc
 
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                            const pixel *const topleft,
+//                            const int width, const int height, const int a,
+//                            const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1
+        push            {r4-r8, lr}
+        ldr             r4,  [sp, #24]
+        clz             lr,  r3
+        adr             r5,  L(ipred_paeth_tbl)
+        sub             lr,  lr,  #25
+        ldr             lr,  [r5, lr, lsl #2]
+        vld1.8          {d4[], d5[]},  [r2]
+        add             r8,  r2,  #1
+        sub             r2,  r2,  #4
+        add             r5,  r5,  lr
+        mov             r7,  #-4
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_paeth_tbl):
+        .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 80f  - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 40f  - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32         {d6[], d7[]},  [r8]
+        vsubl.u8        q8,  d6,  d4  // top - topleft
+4:
+        vld4.8          {d0[], d1[], d2[], d3[]},  [r2, :32], r7
+        vzip.32         d0,  d1
+        vzip.32         d2,  d3
+        vaddw.u8        q9,  q8,  d0
+        vaddw.u8        q10, q8,  d2
+        vqmovun.s16     d18, q9       // base
+        vqmovun.s16     d19, q10
+        vmov            d1,  d2
+        vabd.u8         q10, q3,  q9  // tdiff
+        vabd.u8         q11, q2,  q9  // tldiff
+        vabd.u8         q9,  q0,  q9  // ldiff
+        vmin.u8         q12, q10, q11 // min(tdiff, tldiff)
+        vcge.u8         q10, q11, q10 // tldiff >= tdiff
+        vcge.u8         q9,  q12, q9  // min(tdiff, tldiff) >= ldiff
+        vbsl            q10, q3,  q2  // tdiff <= tldiff ? top : topleft
+        vbit            q10, q0,  q9  // ldiff <= min ? left : ...
+        vst1.32         {d21[1]}, [r0, :32], r1
+        vst1.32         {d21[0]}, [r6, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d20[1]}, [r0, :32], r1
+        vst1.32         {d20[0]}, [r6, :32], r1
+        bgt             4b
+        pop             {r4-r8, pc}
+80:
+        vld1.8          {d6},  [r8]
+        vsubl.u8        q8,  d6,  d4  // top - topleft
+        vmov            d7,  d6
+8:
+        vld4.8          {d0[], d1[], d2[], d3[]},  [r2, :32], r7
+        vaddw.u8        q9,  q8,  d0
+        vaddw.u8        q10, q8,  d1
+        vaddw.u8        q11, q8,  d2
+        vaddw.u8        q12, q8,  d3
+        vqmovun.s16     d18, q9       // base
+        vqmovun.s16     d19, q10
+        vqmovun.s16     d20, q11
+        vqmovun.s16     d21, q12
+        vabd.u8         q11, q3,  q9  // tdiff
+        vabd.u8         q12, q3,  q10
+        vabd.u8         q13, q2,  q9  // tldiff
+        vabd.u8         q14, q2,  q10
+        vabd.u8         q10, q1,  q10 // ldiff
+        vabd.u8         q9,  q0,  q9
+        vmin.u8         q15, q12, q14 // min(tdiff, tldiff)
+        vcge.u8         q12, q14, q12 // tldiff >= tdiff
+        vmin.u8         q14, q11, q13 // min(tdiff, tldiff)
+        vcge.u8         q11, q13, q11 // tldiff >= tdiff
+        vcge.u8         q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+        vcge.u8         q9,  q14, q9
+        vbsl            q12, q3,  q2  // tdiff <= tldiff ? top : topleft
+        vbsl            q11, q3,  q2
+        vbit            q12, q1,  q10 // ldiff <= min ? left : ...
+        vbit            q11, q0,  q9
+        vst1.8          {d25}, [r0, :64], r1
+        vst1.8          {d24}, [r6, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d23}, [r0, :64], r1
+        vst1.8          {d22}, [r6, :64], r1
+        bgt             8b
+        pop             {r4-r8, pc}
+160:
+320:
+640:
+        vld1.8          {d6},  [r8]!
+        mov             r12, r3
+        // Set up pointers for four rows in parallel; r0, r6, r5, lr
+        add             r5,  r0,  r1
+        add             lr,  r6,  r1
+        lsl             r1,  r1,  #1
+        sub             r1,  r1,  r3
+1:
+        vld4.8          {d0[], d1[], d2[], d3[]},  [r2, :32], r7
+2:
+        vsubl.u8        q8,  d6,  d4  // top - topleft
+        vmov            d7,  d6
+        vaddw.u8        q9,  q8,  d0
+        vaddw.u8        q10, q8,  d1
+        vaddw.u8        q11, q8,  d2
+        vaddw.u8        q12, q8,  d3
+        vqmovun.s16     d18, q9       // base
+        vqmovun.s16     d19, q10
+        vqmovun.s16     d20, q11
+        vqmovun.s16     d21, q12
+        vabd.u8         q11, q3,  q9  // tdiff
+        vabd.u8         q12, q3,  q10
+        vabd.u8         q13, q2,  q9  // tldiff
+        vabd.u8         q14, q2,  q10
+        vabd.u8         q10, q1,  q10 // ldiff
+        vabd.u8         q9,  q0,  q9
+        vmin.u8         q15, q12, q14 // min(tdiff, tldiff)
+        vcge.u8         q12, q14, q12 // tldiff >= tdiff
+        vmin.u8         q14, q11, q13 // min(tdiff, tldiff)
+        vcge.u8         q11, q13, q11 // tldiff >= tdiff
+        vcge.u8         q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+        vcge.u8         q9,  q14, q9
+        vbsl            q12, q3,  q2  // tdiff <= tldiff ? top : topleft
+        vbsl            q11, q3,  q2
+        vbit            q12, q1,  q10 // ldiff <= min ? left : ...
+        vbit            q11, q0,  q9
+        subs            r3,  r3,  #8
+        vst1.8          {d25}, [r0, :64]!
+        vst1.8          {d24}, [r6, :64]!
+        vst1.8          {d23}, [r5, :64]!
+        vst1.8          {d22}, [lr, :64]!
+        ble             8f
+        vld1.8          {d6},  [r8]!
+        b               2b
+8:
+        subs            r4,  r4,  #4
+        ble             9f
+        // End of horizontal loop, move pointers to next four rows
+        sub             r8,  r8,  r12
+        add             r0,  r0,  r1
+        add             r6,  r6,  r1
+        vld1.8          {d6},  [r8]!
+        add             r5,  r5,  r1
+        add             lr,  lr,  r1
+        mov             r3,  r12
+        b               1b
+9:
+        pop             {r4-r8, pc}
+endfunc
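
For reference, a plain-C sketch of the per-pixel selection the NEON code above vectorizes with the ldiff/tdiff/tldiff comparisons and the vbsl/vbit selects; this follows the usual Paeth rule and is illustrative, not dav1d's C template:

    #include <stdint.h>
    #include <stdlib.h>   /* abs */

    typedef uint8_t pixel;

    /* base = left + top - topleft; pick whichever of left/top/topleft is
     * closest to base, preferring left, then top, then topleft (the same
     * order the vbsl/vbit pair above resolves the two compare masks). */
    static pixel paeth(pixel left, pixel top, pixel topleft) {
        const int base   = left + top - topleft;
        const int ldiff  = abs(base - left);
        const int tdiff  = abs(base - top);
        const int tldiff = abs(base - topleft);
        if (ldiff <= tdiff && ldiff <= tldiff) return left;
        return tdiff <= tldiff ? top : topleft;
    }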
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1
+        push            {r4-r10, lr}
+        ldr             r4,  [sp, #32]
+        movrel          r10, X(sm_weights)
+        add             r12, r10, r4
+        add             r10, r10, r3
+        clz             r9,  r3
+        adr             r5,  L(ipred_smooth_tbl)
+        sub             lr,  r2,  r4
+        sub             r9,  r9,  #25
+        ldr             r9,  [r5, r9, lsl #2]
+        vld1.8          {d4[]},  [lr] // bottom
+        add             r8,  r2,  #1
+        add             r5,  r5,  r9
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_smooth_tbl):
+        .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 80f  - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 40f  - L(ipred_smooth_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32         {d16[]}, [r8]       // top
+        vld1.32         {d18[]}, [r10, :32] // weights_hor
+        sub             r2,  r2,  #4
+        mov             r7,  #-4
+        vdup.8          q3,  d16[3]   // right
+        vsubl.u8        q8,  d16, d4  // top-bottom
+        vmovl.u8        q9,  d18      // weights_hor
+4:
+        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2,  :32], r7 // left
+        vld4.8          {d20[], d21[], d22[], d23[]}, [r12, :32]!    // weights_ver
+        vshll.i8        q12, d6,  #8  // right*256
+        vshll.i8        q13, d6,  #8
+        vzip.32         d1,  d0       // left, flipped
+        vzip.32         d3,  d2
+        vzip.32         d20, d21      // weights_ver
+        vzip.32         d22, d23
+        vshll.i8        q14, d4,  #8  // bottom*256
+        vshll.i8        q15, d4,  #8
+        vsubl.u8        q0,  d1,  d6  // left-right
+        vsubl.u8        q1,  d3,  d6
+        vmovl.u8        q10, d20      // weights_ver
+        vmovl.u8        q11, d22
+        vmla.i16        q12, q1,  q9  // right*256  + (left-right)*weights_hor
+        vmla.i16        q13, q0,  q9  // (left flipped)
+        vmla.i16        q14, q8,  q10 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16        q15, q8,  q11
+        vhadd.u16       q12, q12, q14
+        vhadd.u16       q13, q13, q15
+        vrshrn.i16      d24, q12, #8
+        vrshrn.i16      d25, q13, #8
+        vst1.32         {d24[0]}, [r0, :32], r1
+        vst1.32         {d24[1]}, [r6, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d25[0]}, [r0, :32], r1
+        vst1.32         {d25[1]}, [r6, :32], r1
+        bgt             4b
+        pop             {r4-r10, pc}
+80:
+        vld1.8          {d16}, [r8]       // top
+        vld1.8          {d18}, [r10, :64] // weights_hor
+        sub             r2,  r2,  #2
+        mov             r7,  #-2
+        vdup.8          q3,  d16[7]   // right
+        vsubl.u8        q8,  d16, d4  // top-bottom
+        vmovl.u8        q9,  d18      // weights_hor
+8:
+        vld2.8          {d0[],  d1[]},  [r2,  :16], r7 // left
+        vld2.8          {d20[], d22[]}, [r12, :16]!    // weights_ver
+        vshll.i8        q12, d6,  #8  // right*256
+        vshll.i8        q13, d6,  #8
+        vshll.i8        q14, d4,  #8  // bottom*256
+        vshll.i8        q15, d4,  #8
+        vsubl.u8        q1,  d0,  d6  // left-right (left flipped)
+        vsubl.u8        q0,  d1,  d6
+        vmovl.u8        q10, d20      // weights_ver
+        vmovl.u8        q11, d22
+        vmla.i16        q12, q0,  q9  // right*256  + (left-right)*weights_hor
+        vmla.i16        q13, q1,  q9
+        vmla.i16        q14, q8,  q10 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16        q15, q8,  q11
+        vhadd.u16       q12, q12, q14
+        vhadd.u16       q13, q13, q15
+        vrshrn.i16      d24, q12, #8
+        vrshrn.i16      d25, q13, #8
+        subs            r4,  r4,  #2
+        vst1.8          {d24}, [r0, :64], r1
+        vst1.8          {d25}, [r6, :64], r1
+        bgt             8b
+        pop             {r4-r10, pc}
+160:
+320:
+640:
+        add             lr,  r2,  r3
+        sub             r2,  r2,  #2
+        mov             r7,  #-2
+        vld1.8          {d6[], d7[]}, [lr] // right
+        sub             r1,  r1,  r3
+        mov             r9,  r3
+
+1:
+        vld2.8          {d0[],  d1[]},  [r2,  :16], r7 // left
+        vld2.8          {d20[], d22[]}, [r12, :16]!    // weights_ver
+        vsubl.u8        q1,  d0,  d6  // left-right (left flipped)
+        vsubl.u8        q0,  d1,  d6
+        vmovl.u8        q10, d20      // weights_ver
+        vmovl.u8        q11, d22
+2:
+        vld1.8          {d16}, [r8]!       // top
+        vld1.8          {d18}, [r10, :64]! // weights_hor
+        vshll.i8        q12, d6,  #8  // right*256
+        vshll.i8        q13, d6,  #8
+        vmovl.u8        q9,  d18      // weights_hor
+        vshll.i8        q14, d4,  #8  // bottom*256
+        vshll.i8        q15, d4,  #8
+        vsubl.u8        q8,  d16, d4  // top-bottom
+        vmla.i16        q12, q0,  q9  // right*256  + (left-right)*weights_hor
+        vmla.i16        q13, q1,  q9
+        vmla.i16        q14, q8,  q10 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16        q15, q8,  q11
+        vhadd.u16       q12, q12, q14
+        vhadd.u16       q13, q13, q15
+        vrshrn.i16      d24, q12, #8
+        vrshrn.i16      d25, q13, #8
+        subs            r3,  r3,  #8
+        vst1.8          {d24}, [r0, :64]!
+        vst1.8          {d25}, [r6, :64]!
+        bgt             2b
+        subs            r4,  r4,  #2
+        ble             9f
+        sub             r8,  r8,  r9
+        sub             r10, r10, r9
+        add             r0,  r0,  r1
+        add             r6,  r6,  r1
+        mov             r3,  r9
+        b               1b
+9:
+        pop             {r4-r10, pc}
+endfunc
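
The comments above give the whole per-pixel recipe: a horizontal blend right*256 + (left-right)*weights_hor, a vertical blend bottom*256 + (top-bottom)*weights_ver, a halving add (vhadd) of the two and a rounding narrow by 8 (vrshrn). A scalar sketch of exactly that arithmetic; the weight indexing mirrors the r10/r12 setup (sm_weights offset by the block width and height), everything else is illustrative:

    #include <stdint.h>

    typedef uint8_t pixel;

    /* One SMOOTH pixel: two 8.8 fixed-point blends, averaged with a
     * halving add, then rounded down to 8 bits, as in the asm above.
     * w_hor = sm_weights[width + x], w_ver = sm_weights[height + y]. */
    static pixel smooth_px(pixel top, pixel bottom, pixel left, pixel right,
                           uint8_t w_hor, uint8_t w_ver)
    {
        const int hor = right  * 256 + (left - right)  * w_hor;
        const int ver = bottom * 256 + (top  - bottom) * w_ver;
        return (pixel)((((hor + ver) >> 1) + 128) >> 8);
    }

ipred_smooth_v_8bpc_neon and ipred_smooth_h_8bpc_neon below keep only the vertical or only the horizontal blend respectively and round it directly with vrshrn #8, with no halving add.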
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1
+        push            {r4-r7, lr}
+        ldr             r4,  [sp, #20]
+        movrel          r7,  X(sm_weights)
+        add             r7,  r7,  r4
+        clz             lr,  r3
+        adr             r5,  L(ipred_smooth_v_tbl)
+        sub             r12, r2,  r4
+        sub             lr,  lr,  #25
+        ldr             lr,  [r5, lr, lsl #2]
+        vld1.8          {d4[]},  [r12] // bottom
+        add             r2,  r2,  #1
+        add             r5,  r5,  lr
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_smooth_v_tbl):
+        .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 80f  - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 40f  - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32         {d6[]}, [r2]  // top
+        vsubl.u8        q3,  d6,  d4  // top-bottom
+4:
+        vld4.8          {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+        vshll.i8        q10, d4,  #8  // bottom*256
+        vshll.i8        q11, d4,  #8
+        vzip.32         d16, d17      // weights_ver
+        vzip.32         d18, d19
+        vmovl.u8        q8,  d16      // weights_ver
+        vmovl.u8        q9,  d18
+        subs            r4,  r4,  #4
+        vmla.i16        q10, q3,  q8  // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16        q11, q3,  q9
+        vrshrn.i16      d20, q10, #8
+        vrshrn.i16      d21, q11, #8
+        vst1.32         {d20[0]}, [r0, :32], r1
+        vst1.32         {d20[1]}, [r6, :32], r1
+        vst1.32         {d21[0]}, [r0, :32], r1
+        vst1.32         {d21[1]}, [r6, :32], r1
+        bgt             4b
+        pop             {r4-r7, pc}
+80:
+        vld1.8          {d6}, [r2]    // top
+        vsubl.u8        q3,  d6,  d4  // top-bottom
+8:
+        vld4.8          {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+        vshll.i8        q12, d4,  #8  // bottom*256
+        vshll.i8        q13, d4,  #8
+        vshll.i8        q14, d4,  #8
+        vshll.i8        q15, d4,  #8
+        vmovl.u8        q8,  d16      // weights_ver
+        vmovl.u8        q9,  d18
+        vmovl.u8        q10, d20
+        vmovl.u8        q11, d22
+        vmla.i16        q12, q3,  q8  // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16        q13, q3,  q9
+        vmla.i16        q14, q3,  q10
+        vmla.i16        q15, q3,  q11
+        vrshrn.i16      d24, q12, #8
+        vrshrn.i16      d25, q13, #8
+        vrshrn.i16      d26, q14, #8
+        vrshrn.i16      d27, q15, #8
+        vst1.8          {d24}, [r0, :64], r1
+        vst1.8          {d25}, [r6, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d26}, [r0, :64], r1
+        vst1.8          {d27}, [r6, :64], r1
+        bgt             8b
+        pop             {r4-r7, pc}
+160:
+320:
+640:
+        vpush           {q4-q7}
+        // Set up pointers for four rows in parallel; r0, r6, r5, lr
+        add             r5,  r0,  r1
+        add             lr,  r6,  r1
+        lsl             r1,  r1,  #1
+        sub             r1,  r1,  r3
+        mov             r12, r3
+
+1:
+        vld4.8          {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+        vmovl.u8        q4,  d8       // weights_ver
+        vmovl.u8        q5,  d10
+        vmovl.u8        q6,  d12
+        vmovl.u8        q7,  d14
+2:
+        vld1.8          {q3}, [r2]!   // top
+        vshll.i8        q8,  d4,  #8  // bottom*256
+        vshll.i8        q9,  d4,  #8
+        vshll.i8        q10, d4,  #8
+        vshll.i8        q11, d4,  #8
+        vsubl.u8        q0,  d6,  d4  // top-bottom
+        vsubl.u8        q1,  d7,  d4
+        vshll.i8        q12, d4,  #8
+        vshll.i8        q13, d4,  #8
+        vshll.i8        q14, d4,  #8
+        vshll.i8        q15, d4,  #8
+        vmla.i16        q8,  q0,  q4  // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16        q9,  q1,  q4
+        vmla.i16        q10, q0,  q5
+        vmla.i16        q11, q1,  q5
+        vmla.i16        q12, q0,  q6  // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16        q13, q1,  q6
+        vmla.i16        q14, q0,  q7
+        vmla.i16        q15, q1,  q7
+        vrshrn.i16      d16, q8,  #8
+        vrshrn.i16      d17, q9,  #8
+        vrshrn.i16      d18, q10, #8
+        vrshrn.i16      d19, q11, #8
+        vrshrn.i16      d20, q12, #8
+        vrshrn.i16      d21, q13, #8
+        vrshrn.i16      d22, q14, #8
+        vrshrn.i16      d23, q15, #8
+        subs            r3,  r3,  #16
+        vst1.8          {q8},  [r0, :128]!
+        vst1.8          {q9},  [r6, :128]!
+        vst1.8          {q10}, [r5, :128]!
+        vst1.8          {q11}, [lr, :128]!
+        bgt             2b
+        subs            r4,  r4,  #4
+        ble             9f
+        sub             r2,  r2,  r12
+        add             r0,  r0,  r1
+        add             r6,  r6,  r1
+        add             r5,  r5,  r1
+        add             lr,  lr,  r1
+        mov             r3,  r12
+        b               1b
+9:
+        vpop            {q4-q7}
+        pop             {r4-r7, pc}
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
+        push            {r4-r8, lr}
+        ldr             r4,  [sp, #24]
+        movrel          r8,  X(sm_weights)
+        add             r8,  r8,  r3
+        clz             lr,  r3
+        adr             r5,  L(ipred_smooth_h_tbl)
+        add             r12, r2,  r3
+        sub             lr,  lr,  #25
+        ldr             lr,  [r5, lr, lsl #2]
+        vld1.8          {d4[]},  [r12] // right
+        add             r5,  r5,  lr
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_smooth_h_tbl):
+        .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 80f  - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 40f  - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32         {d6[]}, [r8, :32] // weights_hor
+        sub             r2,  r2,  #4
+        mov             r7,  #-4
+        vmovl.u8        q3,  d6       // weights_hor
+4:
+        vld4.8          {d0[], d1[], d2[], d3[]},  [r2, :32], r7 // left
+        vshll.i8        q8,  d4,  #8  // right*256
+        vshll.i8        q9,  d4,  #8
+        vzip.32         d3,  d2       // left, flipped
+        vzip.32         d1,  d0
+        vsubl.u8        q1,  d3,  d4  // left-right
+        vsubl.u8        q0,  d1,  d4
+        subs            r4,  r4,  #4
+        vmla.i16        q8,  q1,  q3  // right*256  + (left-right)*weights_hor
+        vmla.i16        q9,  q0,  q3
+        vrshrn.i16      d16, q8,  #8
+        vrshrn.i16      d17, q9,  #8
+        vst1.32         {d16[0]}, [r0, :32], r1
+        vst1.32         {d16[1]}, [r6, :32], r1
+        vst1.32         {d17[0]}, [r0, :32], r1
+        vst1.32         {d17[1]}, [r6, :32], r1
+        bgt             4b
+        pop             {r4-r8, pc}
+80:
+        vld1.8          {d6}, [r8, :64] // weights_hor
+        sub             r2,  r2,  #4
+        mov             r7,  #-4
+        vmovl.u8        q3,  d6       // weights_hor
+8:
+        vld4.8          {d16[], d18[], d20[], d22[]},  [r2, :32], r7 // left
+        vshll.i8        q12, d4,  #8  // right*256
+        vshll.i8        q13, d4,  #8
+        vshll.i8        q14, d4,  #8
+        vshll.i8        q15, d4,  #8
+        vsubl.u8        q11, d22, d4  // left-right
+        vsubl.u8        q10, d20, d4
+        vsubl.u8        q9,  d18, d4
+        vsubl.u8        q8,  d16, d4
+        vmla.i16        q12, q11, q3  // right*256  + (left-right)*weights_hor
+        vmla.i16        q13, q10, q3  // (left flipped)
+        vmla.i16        q14, q9,  q3
+        vmla.i16        q15, q8,  q3
+        vrshrn.i16      d24, q12, #8
+        vrshrn.i16      d25, q13, #8
+        vrshrn.i16      d26, q14, #8
+        vrshrn.i16      d27, q15, #8
+        vst1.8          {d24}, [r0, :64], r1
+        vst1.8          {d25}, [r6, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d26}, [r0, :64], r1
+        vst1.8          {d27}, [r6, :64], r1
+        bgt             8b
+        pop             {r4-r8, pc}
+160:
+320:
+640:
+        vpush           {q4-q7}
+        sub             r2,  r2,  #4
+        mov             r7,  #-4
+        // Set up pointers for four rows in parallel; r0, r6, r5, lr
+        add             r5,  r0,  r1
+        add             lr,  r6,  r1
+        lsl             r1,  r1,  #1
+        sub             r1,  r1,  r3
+        mov             r12, r3
+
+1:
+        vld4.8          {d8[], d10[], d12[], d14[]},  [r2, :32], r7 // left
+        vsubl.u8        q4,  d8,  d4  // left-right
+        vsubl.u8        q5,  d10, d4
+        vsubl.u8        q6,  d12, d4
+        vsubl.u8        q7,  d14, d4
+2:
+        vld1.8          {q1}, [r8, :128]! // weights_hor
+        vshll.i8        q8,  d4,  #8  // right*256
+        vshll.i8        q9,  d4,  #8
+        vshll.i8        q10, d4,  #8
+        vshll.i8        q11, d4,  #8
+        vmovl.u8        q0,  d2       // weights_hor
+        vmovl.u8        q1,  d3
+        vshll.i8        q12, d4,  #8
+        vshll.i8        q13, d4,  #8
+        vshll.i8        q14, d4,  #8
+        vshll.i8        q15, d4,  #8
+        vmla.i16        q8,  q7,  q0  // right*256  + (left-right)*weights_hor
+        vmla.i16        q9,  q7,  q1  // (left flipped)
+        vmla.i16        q10, q6,  q0
+        vmla.i16        q11, q6,  q1
+        vmla.i16        q12, q5,  q0
+        vmla.i16        q13, q5,  q1
+        vmla.i16        q14, q4,  q0
+        vmla.i16        q15, q4,  q1
+        vrshrn.i16      d16, q8,  #8
+        vrshrn.i16      d17, q9,  #8
+        vrshrn.i16      d18, q10, #8
+        vrshrn.i16      d19, q11, #8
+        vrshrn.i16      d20, q12, #8
+        vrshrn.i16      d21, q13, #8
+        vrshrn.i16      d22, q14, #8
+        vrshrn.i16      d23, q15, #8
+        subs            r3,  r3,  #16
+        vst1.8          {q8},  [r0, :128]!
+        vst1.8          {q9},  [r6, :128]!
+        vst1.8          {q10}, [r5, :128]!
+        vst1.8          {q11}, [lr, :128]!
+        bgt             2b
+        subs            r4,  r4,  #4
+        ble             9f
+        sub             r8,  r8,  r12
+        add             r0,  r0,  r1
+        add             r6,  r6,  r1
+        add             r5,  r5,  r1
+        add             lr,  lr,  r1
+        mov             r3,  r12
+        b               1b
+9:
+        vpop            {q4-q7}
+        pop             {r4-r8, pc}
+endfunc
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int filt_idx,
+//                             const int max_width, const int max_height);
+function ipred_filter_8bpc_neon, export=1
+        push            {r4-r8, lr}
+        movw            r12, #511
+        ldr             r5, [sp, #28]
+        ldr             r4, [sp, #24]
+        and             r5,  r5,  r12 // 511
+        movrel          r6,  X(filter_intra_taps)
+        lsl             r5,  r5,  #6
+        add             r6,  r6,  r5
+        vld1.8          {d20, d21, d22, d23}, [r6, :128]!
+        clz             lr,  r3
+        adr             r5,  L(ipred_filter_tbl)
+        vld1.8          {d27, d28, d29}, [r6, :64]
+        sub             lr,  lr,  #26
+        ldr             lr,  [r5, lr, lsl #2]
+        vmovl.s8        q8,  d20
+        vmovl.s8        q9,  d21
+        add             r5,  r5,  lr
+        vmovl.s8        q10, d22
+        vmovl.s8        q11, d23
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        vmovl.s8        q12, d27
+        vmovl.s8        q13, d28
+        vmovl.s8        q14, d29
+        add             r8,  r2,  #1
+        bx              r5
+
+        .align 2
+L(ipred_filter_tbl):
+        .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB
+        .word 80f  - L(ipred_filter_tbl) + CONFIG_THUMB
+        .word 40f  - L(ipred_filter_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32         {d0[]}, [r8]     // top (0-3)
+        sub             r2,  r2,  #2
+        mov             r7,  #-2
+        vmovl.u8        q0,  d0          // top (0-3)
+4:
+        vld1.32         {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+        vmul.i16        q2,  q9,  d0[0]  // p1(top[0]) * filter(1)
+        vmla.i16        q2,  q10, d0[1]  // p2(top[1]) * filter(2)
+        vmla.i16        q2,  q11, d0[2]  // p3(top[2]) * filter(3)
+        vmovl.u8        q1,  d2          // left (0-1) + topleft (2)
+        vmla.i16        q2,  q12, d0[3]  // p4(top[3]) * filter(4)
+        vmla.i16        q2,  q8,  d2[2]  // p0(topleft) * filter(0)
+        vmla.i16        q2,  q13, d2[1]  // p5(left[0]) * filter(5)
+        vmla.i16        q2,  q14, d2[0]  // p6(left[1]) * filter(6)
+        vqrshrun.s16    d4,  q2,  #4
+        subs            r4,  r4,  #2
+        vst1.32         {d4[0]}, [r0, :32], r1
+        vmovl.u8        q0,  d4
+        vst1.32         {d4[1]}, [r6, :32], r1
+        vext.8          q0,  q0,  q0,  #8 // move top from [4-7] to [0-3]
+        bgt             4b
+        pop             {r4-r8, pc}
+80:
+        vld1.8          {d0},  [r8]      // top (0-7)
+        sub             r2,  r2,  #2
+        mov             r7,  #-2
+        vmovl.u8        q0,  d0          // top (0-7)
+8:
+        vld1.32         {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+        vmul.i16        q2,  q9,  d0[0]  // p1(top[0]) * filter(1)
+        vmla.i16        q2,  q10, d0[1]  // p2(top[1]) * filter(2)
+        vmla.i16        q2,  q11, d0[2]  // p3(top[2]) * filter(3)
+        vmovl.u8        q1,  d2          // left (0-1) + topleft (2)
+        vmla.i16        q2,  q12, d0[3]  // p4(top[3]) * filter(4)
+        vmla.i16        q2,  q8,  d2[2]  // p0(topleft) * filter(0)
+        vmla.i16        q2,  q13, d2[1]  // p5(left[0]) * filter(5)
+        vmla.i16        q2,  q14, d2[0]  // p6(left[1]) * filter(6)
+        vmul.i16        q3,  q9,  d1[0]  // p1(top[0]) * filter(1)
+        vmla.i16        q3,  q10, d1[1]  // p2(top[1]) * filter(2)
+        vmla.i16        q3,  q11, d1[2]  // p3(top[2]) * filter(3)
+        vqrshrun.s16    d4,  q2,  #4
+        vmovl.u8        q1,  d4          // first block, in 16 bit
+        vmla.i16        q3,  q12, d1[3]  // p4(top[3]) * filter(4)
+        vmla.i16        q3,  q8,  d0[3]  // p0(topleft) * filter(0)
+        vmla.i16        q3,  q13, d2[3]  // p5(left[0]) * filter(5)
+        vmla.i16        q3,  q14, d3[3]  // p6(left[1]) * filter(6)
+        vqrshrun.s16    d5,  q3,  #4
+        vzip.32         d4,  d5
+        subs            r4,  r4,  #2
+        vst1.64         {d4}, [r0, :64], r1
+        vmovl.u8        q0,  d5
+        vst1.64         {d5}, [r6, :64], r1
+        bgt             8b
+        pop             {r4-r8, pc}
+160:
+320:
+        vpush           {q4-q5}
+        sub             r2,  r2,  #2
+        mov             r7,  #-2
+        sub             r1,  r1,  r3
+        mov             lr,  r3
+
+1:
+        vld1.32         {d0[]}, [r2], r7 // left (0-1) + topleft (2)
+        vmovl.u8        q0,  d0          // left (0-1) + topleft (2)
+2:
+        vld1.8          {q2}, [r8]!      // top(0-15)
+        vmul.i16        q3,  q8,  d0[2]  // p0(topleft) * filter(0)
+        vmla.i16        q3,  q13, d0[1]  // p5(left[0]) * filter(5)
+        vmovl.u8        q1,  d4          // top(0-7)
+        vmovl.u8        q2,  d5          // top(8-15)
+        vmla.i16        q3,  q14, d0[0]  // p6(left[1]) * filter(6)
+        vmla.i16        q3,  q9,  d2[0]  // p1(top[0]) * filter(1)
+        vmla.i16        q3,  q10, d2[1]  // p2(top[1]) * filter(2)
+        vmla.i16        q3,  q11, d2[2]  // p3(top[2]) * filter(3)
+        vmla.i16        q3,  q12, d2[3]  // p4(top[3]) * filter(4)
+
+        vmul.i16        q4,  q9,  d3[0]  // p1(top[0]) * filter(1)
+        vmla.i16        q4,  q10, d3[1]  // p2(top[1]) * filter(2)
+        vmla.i16        q4,  q11, d3[2]  // p3(top[2]) * filter(3)
+        vqrshrun.s16    d6,  q3,  #4
+        vmovl.u8        q0,  d6          // first block, in 16 bit
+        vmla.i16        q4,  q12, d3[3]  // p4(top[3]) * filter(4)
+        vmla.i16        q4,  q8,  d2[3]  // p0(topleft) * filter(0)
+        vmla.i16        q4,  q13, d0[3]  // p5(left[0]) * filter(5)
+        vmla.i16        q4,  q14, d1[3]  // p6(left[1]) * filter(6)
+
+        vmul.i16        q5,  q9,  d4[0]  // p1(top[0]) * filter(1)
+        vmla.i16        q5,  q10, d4[1]  // p2(top[1]) * filter(2)
+        vmla.i16        q5,  q11, d4[2]  // p3(top[2]) * filter(3)
+        vqrshrun.s16    d7,  q4,  #4
+        vmovl.u8        q0,  d7          // second block, in 16 bit
+        vmla.i16        q5,  q12, d4[3]  // p4(top[3]) * filter(4)
+        vmla.i16        q5,  q8,  d3[3]  // p0(topleft) * filter(0)
+        vmla.i16        q5,  q13, d0[3]  // p5(left[0]) * filter(5)
+        vmla.i16        q5,  q14, d1[3]  // p6(left[1]) * filter(6)
+
+        vmul.i16        q15, q9,  d5[0]  // p1(top[0]) * filter(1)
+        vmla.i16        q15, q10, d5[1]  // p2(top[1]) * filter(2)
+        vmla.i16        q15, q11, d5[2]  // p3(top[2]) * filter(3)
+        vqrshrun.s16    d8,  q5,  #4
+        vmovl.u8        q0,  d8          // third block, in 16 bit
+        vmov.u8         r12, d5[6]
+        vmla.i16        q15, q12, d5[3]  // p4(top[3]) * filter(4)
+        vmla.i16        q15, q8,  d4[3]  // p0(topleft) * filter(0)
+        vmla.i16        q15, q13, d0[3]  // p5(left[0]) * filter(5)
+        vmla.i16        q15, q14, d1[3]  // p6(left[1]) * filter(6)
+        vmov.8          d0[4], r12
+
+        subs            r3,  r3,  #16
+        vqrshrun.s16    d9,  q15, #4
+
+        vst4.32         {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]!
+        vst4.32         {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]!
+        ble             8f
+        vmov.u8         r12, d9[7]
+        vmov.8          d0[0], r12
+        vmov.u8         r12, d9[3]
+        vmov.8          d0[2], r12
+        b               2b
+8:
+        subs            r4,  r4,  #2
+
+        ble             9f
+        sub             r8,  r6,  lr
+        add             r0,  r0,  r1
+        add             r6,  r6,  r1
+        mov             r3,  lr
+        b               1b
+9:
+        vpop            {q4-q5}
+        pop             {r4-r8, pc}
+endfunc
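
ipred_filter produces 4x2 output blocks: seven reference samples (topleft, top[0..3], left[0..1]) are weighted by a signed 7-tap set from filter_intra_taps, then rounded and clipped with vqrshrun #4. A scalar sketch of one such block under that reading; the flt[8][7] layout is illustrative, the real filter_intra_taps table is packed differently for the vmovl.s8 loads above:

    #include <stdint.h>

    typedef uint8_t pixel;

    static pixel clip_px(const int v) { return v < 0 ? 0 : v > 255 ? 255 : (pixel)v; }

    /* One 4x2 FILTER block.  p0 = topleft, p1..p4 = top[0..3],
     * p5 = left[0], p6 = left[1], matching the "* filter(n)" comments;
     * each output pixel has its own seven signed taps, and the result
     * is rounded as (acc + 8) >> 4 (vqrshrun.s16 #4) and clipped. */
    static void filter_4x2(pixel out[2][4], const int8_t flt[8][7],
                           const pixel *top, const pixel *left,
                           const pixel topleft)
    {
        const int p[7] = { topleft, top[0], top[1], top[2], top[3],
                           left[0], left[1] };
        for (int i = 0; i < 8; i++) {
            int acc = 0;
            for (int j = 0; j < 7; j++)
                acc += flt[i][j] * p[j];
            out[i >> 2][i & 3] = clip_px((acc + 8) >> 4);
        }
    }

For wider blocks the asm chains 4x2 blocks left to right, feeding each block's rightmost output column back in as the left/topleft samples of the next block, which is what the d2[3]/d3[3] operands and the vmov.u8 shuffles handle.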
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const uint16_t *const pal, const uint8_t *idx,
+//                         const int w, const int h);
+function pal_pred_8bpc_neon, export=1
+        push            {r4-r5, lr}
+        ldr             r4,  [sp, #12]
+        ldr             r5,  [sp, #16]
+        vld1.16         {q0}, [r2, :128]
+        clz             lr,  r4
+        adr             r12, L(pal_pred_tbl)
+        sub             lr,  lr,  #25
+        ldr             lr,  [r12, lr, lsl #2]
+        vmovn.i16       d0,  q0
+        add             r12, r12, lr
+        add             r2,  r0,  r1
+        bx              r12
+
+        .align 2
+L(pal_pred_tbl):
+        .word 640f - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 320f - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 160f - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 80f  - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 40f  - L(pal_pred_tbl) + CONFIG_THUMB
+
+40:
+        lsl             r1,  r1,  #1
+4:
+        vld1.8          {q1}, [r3, :128]!
+        subs            r5,  r5,  #4
+        vtbl.8          d2, {d0}, d2
+        vtbl.8          d3, {d0}, d3
+        vst1.32         {d2[0]}, [r0, :32], r1
+        vst1.32         {d2[1]}, [r2, :32], r1
+        vst1.32         {d3[0]}, [r0, :32], r1
+        vst1.32         {d3[1]}, [r2, :32], r1
+        bgt             4b
+        pop             {r4-r5, pc}
+80:
+        lsl             r1,  r1,  #1
+8:
+        vld1.8          {q1, q2}, [r3, :128]!
+        subs            r5,  r5,  #4
+        vtbl.8          d2, {d0}, d2
+        vtbl.8          d3, {d0}, d3
+        vst1.8          {d2}, [r0, :64], r1
+        vtbl.8          d4, {d0}, d4
+        vst1.8          {d3}, [r2, :64], r1
+        vtbl.8          d5, {d0}, d5
+        vst1.8          {d4}, [r0, :64], r1
+        vst1.8          {d5}, [r2, :64], r1
+        bgt             8b
+        pop             {r4-r5, pc}
+160:
+        lsl             r1,  r1,  #1
+16:
+        vld1.8          {q8,  q9},  [r3, :128]!
+        subs            r5,  r5,  #4
+        vld1.8          {q10, q11}, [r3, :128]!
+        vtbl.8          d16, {d0}, d16
+        vtbl.8          d17, {d0}, d17
+        vtbl.8          d18, {d0}, d18
+        vtbl.8          d19, {d0}, d19
+        vtbl.8          d20, {d0}, d20
+        vtbl.8          d21, {d0}, d21
+        vst1.8          {q8},  [r0, :128], r1
+        vtbl.8          d22, {d0}, d22
+        vst1.8          {q9},  [r2, :128], r1
+        vtbl.8          d23, {d0}, d23
+        vst1.8          {q10}, [r0, :128], r1
+        vst1.8          {q11}, [r2, :128], r1
+        bgt             16b
+        pop             {r4-r5, pc}
+320:
+        lsl             r1,  r1,  #1
+32:
+        vld1.8          {q8,  q9},  [r3, :128]!
+        subs            r5,  r5,  #2
+        vld1.8          {q10, q11}, [r3, :128]!
+        vtbl.8          d16, {d0}, d16
+        vtbl.8          d17, {d0}, d17
+        vtbl.8          d18, {d0}, d18
+        vtbl.8          d19, {d0}, d19
+        vtbl.8          d20, {d0}, d20
+        vtbl.8          d21, {d0}, d21
+        vst1.8          {q8,  q9},  [r0, :128], r1
+        vtbl.8          d22, {d0}, d22
+        vtbl.8          d23, {d0}, d23
+        vst1.8          {q10, q11}, [r2, :128], r1
+        bgt             32b
+        pop             {r4-r5, pc}
+640:
+        sub             r1,  r1,  #32
+64:
+        vld1.8          {q8,  q9},  [r3, :128]!
+        subs            r5,  r5,  #1
+        vld1.8          {q10, q11}, [r3, :128]!
+        vtbl.8          d16, {d0}, d16
+        vtbl.8          d17, {d0}, d17
+        vtbl.8          d18, {d0}, d18
+        vtbl.8          d19, {d0}, d19
+        vtbl.8          d20, {d0}, d20
+        vtbl.8          d21, {d0}, d21
+        vst1.8          {q8,  q9},  [r0, :128]!
+        vtbl.8          d22, {d0}, d22
+        vtbl.8          d23, {d0}, d23
+        vst1.8          {q10, q11}, [r0, :128], r1
+        bgt             64b
+        pop             {r4-r5, pc}
+endfunc
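
pal_pred is a palette lookup: the asm narrows the eight 16-bit palette entries to bytes (vmovn.i16) and then runs vtbl over the index bytes, 16 pixels at a time. A scalar sketch, assuming one index byte per pixel as the w=4 path above implies (16 idx bytes per 4x4 block):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint8_t pixel;

    /* dst[x] = pal[idx[x]] over a w x h block; the vtbl.8 instructions
     * above do the same lookup eight indices at a time. */
    static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
                           const uint16_t *const pal, const uint8_t *idx,
                           const int w, const int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = (pixel)pal[idx[x]];
            idx += w;
            dst += stride;
        }
    }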
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1
+        push            {r4-r8, lr}
+        ldr             r4,  [sp, #24]
+        ldr             r5,  [sp, #28]
+        ldr             r6,  [sp, #32]
+        clz             lr,  r3
+        adr             r12, L(ipred_cfl_128_tbl)
+        sub             lr,  lr,  #26
+        ldr             lr,  [r12, lr, lsl #2]
+        vmov.i16        q0,  #128     // dc
+        vdup.i16        q1,  r6       // alpha
+        add             r12, r12, lr
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r12
+
+        .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+        .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_splat_w8)  - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_splat_w4)  - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_splat_w4):
+        vld1.16         {q2, q3}, [r5, :128]!
+        vmul.i16        q2,  q2,  q1  // diff = ac * alpha
+        vmul.i16        q3,  q3,  q1
+        vshr.s16        q8,  q2,  #15 // sign = diff >> 15
+        vshr.s16        q9,  q3,  #15
+        vadd.i16        q2,  q2,  q8  // diff + sign
+        vadd.i16        q3,  q3,  q9
+        vrshr.s16       q2,  q2,  #6  // (diff + sign + 32) >> 6 = apply_sign()
+        vrshr.s16       q3,  q3,  #6
+        vadd.i16        q2,  q2,  q0  // dc + apply_sign()
+        vadd.i16        q3,  q3,  q0
+        vqmovun.s16     d4,  q2       // iclip_pixel(dc + apply_sign())
+        vqmovun.s16     d5,  q3
+        vst1.32         {d4[0]}, [r0, :32], r1
+        vst1.32         {d4[1]}, [r6, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d5[0]}, [r0, :32], r1
+        vst1.32         {d5[1]}, [r6, :32], r1
+        bgt             L(ipred_cfl_splat_w4)
+        pop             {r4-r8, pc}
+L(ipred_cfl_splat_w8):
+        vld1.16         {q8, q9},   [r5, :128]!
+        vld1.16         {q10, q11}, [r5, :128]!
+        vmul.i16        q8,  q8,  q1  // diff = ac * alpha
+        vmul.i16        q9,  q9,  q1
+        vmul.i16        q10, q10, q1
+        vmul.i16        q11, q11, q1
+        vshr.s16        q12, q8,  #15 // sign = diff >> 15
+        vshr.s16        q13, q9,  #15
+        vshr.s16        q14, q10, #15
+        vshr.s16        q15, q11, #15
+        vadd.i16        q8,  q8,  q12 // diff + sign
+        vadd.i16        q9,  q9,  q13
+        vadd.i16        q10, q10, q14
+        vadd.i16        q11, q11, q15
+        vrshr.s16       q8,  q8,  #6  // (diff + sign + 32) >> 6 = apply_sign()
+        vrshr.s16       q9,  q9,  #6
+        vrshr.s16       q10, q10, #6
+        vrshr.s16       q11, q11, #6
+        vadd.i16        q8,  q8,  q0  // dc + apply_sign()
+        vadd.i16        q9,  q9,  q0
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q0
+        vqmovun.s16     d16, q8       // iclip_pixel(dc + apply_sign())
+        vqmovun.s16     d17, q9
+        vqmovun.s16     d18, q10
+        vqmovun.s16     d19, q11
+        vst1.8          {d16}, [r0, :64], r1
+        vst1.8          {d17}, [r6, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d18}, [r0, :64], r1
+        vst1.8          {d19}, [r6, :64], r1
+        bgt             L(ipred_cfl_splat_w8)
+        pop             {r4-r8, pc}
+L(ipred_cfl_splat_w16):
+        add             r12, r5,  r3, lsl #1
+        sub             r1,  r1,  r3
+        mov             lr,  r3
+1:
+        vld1.16         {q8, q9},   [r5, :128]!
+        vmul.i16        q8,  q8,  q1  // diff = ac * alpha
+        vld1.16         {q10, q11}, [r12, :128]!
+        vmul.i16        q9,  q9,  q1
+        vmul.i16        q10, q10, q1
+        vmul.i16        q11, q11, q1
+        vshr.s16        q12, q8,  #15 // sign = diff >> 15
+        vshr.s16        q13, q9,  #15
+        vshr.s16        q14, q10, #15
+        vshr.s16        q15, q11, #15
+        vadd.i16        q8,  q8,  q12 // diff + sign
+        vadd.i16        q9,  q9,  q13
+        vadd.i16        q10, q10, q14
+        vadd.i16        q11, q11, q15
+        vrshr.s16       q8,  q8,  #6  // (diff + sign + 32) >> 6 = apply_sign()
+        vrshr.s16       q9,  q9,  #6
+        vrshr.s16       q10, q10, #6
+        vrshr.s16       q11, q11, #6
+        vadd.i16        q8,  q8,  q0  // dc + apply_sign()
+        vadd.i16        q9,  q9,  q0
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q0
+        vqmovun.s16     d16, q8       // iclip_pixel(dc + apply_sign())
+        vqmovun.s16     d17, q9
+        vqmovun.s16     d18, q10
+        vqmovun.s16     d19, q11
+        subs            r3,  r3,  #16
+        vst1.16         {q8}, [r0, :128]!
+        vst1.16         {q9}, [r6, :128]!
+        bgt             1b
+        subs            r4,  r4,  #2
+        add             r5,  r5,  lr, lsl #1
+        add             r12, r12, lr, lsl #1
+        add             r0,  r0,  r1
+        add             r6,  r6,  r1
+        mov             r3,  lr
+        bgt             1b
+        pop             {r4-r8, pc}
+endfunc
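+
+// Roughly, per output pixel, the splat loops above compute the following
+// (a scalar sketch of the inline comments; iclip_pixel() is the usual
+// saturation to the 8bpc pixel range):
+//
+//   int diff = ac[x] * alpha;           // 16-bit product
+//   int sign = diff < 0 ? -1 : 0;       // "diff >> 15" above
+//   dst[x]   = iclip_pixel(dc + ((diff + sign + 32) >> 6));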
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
+        push            {r4-r8, lr}
+        ldr             r4,  [sp, #24]
+        ldr             r5,  [sp, #28]
+        ldr             r6,  [sp, #32]
+        clz             lr,  r3
+        adr             r12, L(ipred_cfl_top_tbl)
+        sub             lr,  lr,  #26
+        ldr             lr,  [r12, lr, lsl #2]
+        vdup.16         q1,  r6   // alpha
+        add             r2,  r2,  #1
+        add             r12, r12, lr
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r12
+
+        .align 2
+L(ipred_cfl_top_tbl):
+        .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+        .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+        .word 8f  - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+        .word 4f  - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4:
+        vld1.32         {d0[]}, [r2]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #2
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w4)
+8:
+        vld1.8          {d0}, [r2]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #3
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w8)
+16:
+        vld1.8          {q0}, [r2]
+        vaddl.u8        q0,  d0,  d1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #4
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w16)
+32:
+        vld1.8          {q2, q3}, [r2]
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q3,  d6,  d7
+        vadd.u16        q0,  q2,  q3
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #5
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w16)
+endfunc
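+
+// Each width branch above reduces the `width` pixels above the block to a
+// rounded average before falling through to the shared splat loops; for
+// width 4 that is, roughly:
+//
+//   dc = (top[0] + top[1] + top[2] + top[3] + 2) >> 2;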
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
+        push            {r4-r8, lr}
+        ldr             r4,  [sp, #24]
+        ldr             r5,  [sp, #28]
+        ldr             r6,  [sp, #32]
+        sub             r2,  r2,  r4
+        clz             lr,  r3
+        clz             r8,  r4
+        adr             r12, L(ipred_cfl_splat_tbl)
+        adr             r7,  L(ipred_cfl_left_tbl)
+        sub             lr,  lr,  #26
+        sub             r8,  r8,  #26
+        ldr             lr,  [r12, lr, lsl #2]
+        ldr             r8,  [r7,  r8, lsl #2]
+        vdup.16         q1,  r6   // alpha
+        add             r12, r12, lr
+        add             r7,  r7,  r8
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r7
+
+        .align 2
+L(ipred_cfl_left_tbl):
+        .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_left_h8)  - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_left_h4)  - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+        vld1.32         {d0[]}, [r2, :32]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #2
+        vdup.16         q0,  d0[0]
+        bx              r12
+
+L(ipred_cfl_left_h8):
+        vld1.8          {d0}, [r2, :64]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #3
+        vdup.16         q0,  d0[0]
+        bx              r12
+
+L(ipred_cfl_left_h16):
+        vld1.8          {q0}, [r2, :128]
+        vaddl.u8        q0,  d0,  d1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #4
+        vdup.16         q0,  d0[0]
+        bx              r12
+
+L(ipred_cfl_left_h32):
+        vld1.8          {q2, q3}, [r2, :128]
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q3,  d6,  d7
+        vadd.u16        q0,  q2,  q3
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshr.u16       d0,  d0,  #5
+        vdup.16         q0,  d0[0]
+        bx              r12
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const pixel *const topleft,
+//                          const int width, const int height,
+//                          const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1
+        push            {r4-r8, lr}
+        ldr             r4,  [sp, #24]
+        ldr             r5,  [sp, #28]
+        ldr             r6,  [sp, #32]
+        sub             r2,  r2,  r4
+        add             r8,  r3,  r4  // width + height
+        vdup.16         q1,  r6       // alpha
+        clz             lr,  r3
+        clz             r6,  r4
+        vdup.16         d16, r8       // width + height
+        adr             r7,  L(ipred_cfl_tbl)
+        rbit            r8,  r8       // rbit(width + height)
+        sub             lr,  lr,  #22 // 26 leading bits, minus table offset 4
+        sub             r6,  r6,  #26
+        clz             r8,  r8       // ctz(width + height)
+        ldr             lr,  [r7, lr, lsl #2]
+        ldr             r6,  [r7, r6, lsl #2]
+        neg             r8,  r8       // -ctz(width + height)
+        add             r12, r7,  lr
+        add             r7,  r7,  r6
+        vshr.u16        d16, d16, #1  // (width + height) >> 1
+        vdup.16         d17, r8       // -ctz(width + height)
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r7
+
+        .align 2
+L(ipred_cfl_tbl):
+        .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_h8)  - L(ipred_cfl_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_h4)  - L(ipred_cfl_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_w8)  - L(ipred_cfl_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_w4)  - L(ipred_cfl_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_h4):
+        vld1.32         {d0[]}, [r2, :32]!
+        vpaddl.u8       d0,  d0
+        vpadd.i16       d0,  d0
+        bx              r12
+L(ipred_cfl_w4):
+        add             r2,  r2,  #1
+        vld1.32         {d1[]},  [r2]
+        vadd.i16        d0,  d0,  d16
+        vpaddl.u8       d1,  d1
+        vpadd.u16       d1,  d1
+        cmp             r4,  #4
+        vadd.i16        d0,  d0,  d1
+        vshl.u16        d0,  d0,  d17
+        beq             1f
+        // h = 8/16
+        movw            lr,  #(0x3334/2)
+        movw            r8,  #(0x5556/2)
+        cmp             r4,  #16
+        it              ne
+        movne           lr,  r8
+        vdup.16         d18, lr
+        vqdmulh.s16     d0,  d0,  d18
+1:
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+        vld1.8          {d0}, [r2, :64]!
+        vpaddl.u8       d0,  d0
+        vpadd.i16       d0,  d0
+        vpadd.i16       d0,  d0
+        bx              r12
+L(ipred_cfl_w8):
+        add             r2,  r2,  #1
+        vld1.8          {d1}, [r2]
+        vadd.i16        d0,  d0,  d16
+        vpaddl.u8       d1,  d1
+        vpadd.i16       d1,  d1
+        vpadd.i16       d1,  d1
+        cmp             r4,  #8
+        vadd.i16        d0,  d0,  d1
+        vshl.u16        d0,  d0,  d17
+        beq             1f
+        // h = 4/16/32
+        cmp             r4,  #32
+        movw            lr,  #(0x3334/2)
+        movw            r8,  #(0x5556/2)
+        it              ne
+        movne           lr,  r8
+        vdup.16         d18, lr
+        vqdmulh.s16     d0,  d0,  d18
+1:
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+        vld1.8          {q0}, [r2, :128]!
+        vaddl.u8        q0,  d0,  d1
+        vadd.i16        d0,  d0,  d1
+        vpadd.i16       d0,  d0
+        vpadd.i16       d0,  d0
+        bx              r12
+L(ipred_cfl_w16):
+        add             r2,  r2,  #1
+        vld1.8          {q2}, [r2]
+        vadd.i16        d0,  d0,  d16
+        vaddl.u8        q2,  d4,  d5
+        vadd.i16        d4,  d4,  d5
+        vpadd.i16       d4,  d4
+        vpadd.i16       d4,  d4
+        cmp             r4,  #16
+        vadd.i16        d0,  d0,  d4
+        vshl.u16        d0,  d0,  d17
+        beq             1f
+        // h = 4/8/32/64
+        tst             r4,  #(32+16+8)  // 16 added to make a consecutive bitmask
+        movw            lr,  #(0x3334/2)
+        movw            r8,  #(0x5556/2)
+        it              ne
+        movne           lr,  r8
+        vdup.16         d18, lr
+        vqdmulh.s16     d0,  d0,  d18
+1:
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+        vld1.8          {q2, q3}, [r2, :128]!
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q3,  d6,  d7
+        vadd.i16        q0,  q2,  q3
+        vadd.i16        d0,  d0,  d1
+        vpadd.i16       d0,  d0
+        vpadd.i16       d0,  d0
+        bx              r12
+L(ipred_cfl_w32):
+        add             r2,  r2,  #1
+        vld1.8          {q2, q3},  [r2]
+        vadd.i16        d0,  d0,  d16
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q3,  d6,  d7
+        vadd.i16        q2,  q2,  q3
+        vadd.i16        d4,  d4,  d5
+        vpadd.i16       d4,  d4
+        vpadd.i16       d4,  d4
+        cmp             r4,  #32
+        vadd.i16        d0,  d0,  d4
+        vshl.u16        d0,  d0,  d17
+        beq             1f
+        // h = 8/16/64
+        cmp             r4,  #8
+        movw            lr,  #(0x3334/2)
+        movw            r8,  #(0x5556/2)
+        it              ne
+        movne           lr,  r8
+        vdup.16         d18, lr
+        vqdmulh.s16     d0,  d0,  d18
+1:
+        vdup.16         q0,  d0[0]
+        b               L(ipred_cfl_splat_w16)
+endfunc
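+
+// When width != height, width + height is not a power of two: the sum is
+// first shifted down by ctz(width + height) and then scaled by a fixed-point
+// reciprocal through vqdmulh, where 0x5556/2 acts as roughly 1/3 and
+// 0x3334/2 as roughly 1/5 (e.g. width 4, height 8: >> 2, then * 1/3 = /12).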
+
+// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                           const ptrdiff_t stride, const int w_pad,
+//                           const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_8bpc_neon, export=1
+        push            {r4-r8,lr}
+        ldr             r4,  [sp, #24]
+        ldr             r5,  [sp, #28]
+        ldr             r6,  [sp, #32]
+        clz             r8,  r5
+        lsl             r4,  r4,  #2
+        adr             r7,  L(ipred_cfl_ac_420_tbl)
+        sub             r8,  r8,  #27
+        ldr             r8,  [r7, r8, lsl #2]
+        vmov.i16        q8,  #0
+        vmov.i16        q9,  #0
+        vmov.i16        q10, #0
+        vmov.i16        q11, #0
+        add             r7,  r7,  r8
+        sub             r8,  r6,  r4  // height - h_pad
+        rbit            lr,  r5       // rbit(width)
+        rbit            r12, r6       // rbit(height)
+        clz             lr,  lr       // ctz(width)
+        clz             r12, r12      // ctz(height)
+        add             lr,  lr,  r12 // log2sz
+        add             r12, r1,  r2
+        vdup.32         d31, lr
+        lsl             r2,  r2,  #1
+        vneg.s32        d31, d31      // -log2sz
+        bx              r7
+
+        .align 2
+L(ipred_cfl_ac_420_tbl):
+        .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_420_w8)  - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_420_w4)  - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4):
+1:      // Copy and subsample input
+        vld1.8          {d0}, [r1,  :64], r2
+        vld1.8          {d2}, [r12, :64], r2
+        vld1.8          {d1}, [r1,  :64], r2
+        vld1.8          {d3}, [r12, :64], r2
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       q1,  q1
+        vadd.i16        q0,  q0,  q1
+        vshl.i16        q0,  q0,  #1
+        subs            r8,  r8,  #2
+        vst1.16         {q0}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        bgt             1b
+        cmp             r4,  #0
+        vmov            d0,  d1
+        vmov            d2,  d1
+        vmov            d3,  d1
+L(ipred_cfl_ac_420_w4_hpad):
+        beq             3f // This assumes that all callers already did "cmp r4, #0"
+2:      // Vertical padding (h_pad > 0)
+        subs            r4,  r4,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q8,  q8,  q1
+        bgt             2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+        // Aggregate the sums
+        vadd.i16        q0,  q8,  q9
+        vadd.i16        q1,  q10, q11
+        vpaddl.u16      q0,  q0
+        vpaddl.u16      q1,  q1
+        vadd.i32        q0,  q1
+        vadd.i32        d0,  d0,  d1
+        vpadd.i32       d0,  d0,  d0  // sum
+        sub             r0,  r0,  r6, lsl #3
+        vrshl.u32       d16, d0,  d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+        vdup.16         q8,  d16[0]
+L(ipred_cfl_ac_420_w4_subtract_dc):
+6:      // Subtract dc from ac
+        vld1.16         {q0, q1}, [r0, :128]
+        subs            r6,  r6,  #4
+        vsub.i16        q0,  q0,  q8
+        vsub.i16        q1,  q1,  q8
+        vst1.16         {q0, q1}, [r0, :128]!
+        bgt             6b
+        pop             {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8):
+        cmp             r3,  #0
+        bne             L(ipred_cfl_ac_420_w8_wpad)
+1:      // Copy and subsample input, without padding
+        vld1.8          {q0}, [r1,  :128], r2
+        vld1.8          {q1}, [r12, :128], r2
+        vld1.8          {q2}, [r1,  :128], r2
+        vpaddl.u8       q0,  q0
+        vld1.8          {q3}, [r12, :128], r2
+        vpaddl.u8       q1,  q1
+        vpaddl.u8       q2,  q2
+        vpaddl.u8       q3,  q3
+        vadd.i16        q0,  q0,  q1
+        vadd.i16        q2,  q2,  q3
+        vshl.i16        q0,  q0,  #1
+        vshl.i16        q1,  q2,  #1
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q1
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        vld1.16         {d0}, [r1,  :64], r2
+        vld1.16         {d2}, [r12, :64], r2
+        vld1.16         {d1}, [r1,  :64], r2
+        vld1.16         {d3}, [r12, :64], r2
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       q1,  q1
+        vadd.i16        q0,  q0,  q1
+        vshl.i16        q0,  q0,  #1
+        vdup.16         d3,  d1[3]
+        vmov            d2,  d1
+        vdup.16         d1,  d0[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q1
+
+L(ipred_cfl_ac_420_w8_hpad):
+        beq             3f // This assumes that all callers already did "cmp r4, #0"
+2:      // Vertical padding (h_pad > 0)
+        subs            r4,  r4,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q1
+        bgt             2b
+3:
+
+        // Double the height and reuse the w4 summing/subtracting
+        lsl             r6,  r6,  #1
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+        adr             r7,  L(ipred_cfl_ac_420_w16_tbl)
+        ldr             r3,  [r7, r3, lsl #2]
+        add             r7,  r7,  r3
+        bx              r7
+
+        .align 2
+L(ipred_cfl_ac_420_w16_tbl):
+        .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        vld1.8          {q0, q1},   [r1,  :128], r2
+        vld1.8          {q2, q3},   [r12, :128], r2
+        vpaddl.u8       q0,  q0
+        vld1.8          {q12, q13}, [r1,  :128], r2
+        vpaddl.u8       q1,  q1
+        vpaddl.u8       q2,  q2
+        vpaddl.u8       q3,  q3
+        vadd.i16        q0,  q0,  q2
+        vadd.i16        q1,  q1,  q3
+        vld1.8          {q2, q3},   [r12, :128], r2
+        vpaddl.u8       q12, q12
+        vpaddl.u8       q13, q13
+        vpaddl.u8       q2,  q2
+        vpaddl.u8       q3,  q3
+        vadd.i16        q12, q12, q2
+        vadd.i16        q13, q13, q3
+        vshl.i16        q0,  q0,  #1
+        vshl.i16        q1,  q1,  #1
+        vshl.i16        q2,  q12, #1
+        vshl.i16        q3,  q13, #1
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        vldr            d2,    [r1,  #16]
+        vld1.8          {q0},  [r1,  :128], r2
+        vldr            d6,    [r12, #16]
+        vld1.8          {q2},  [r12, :128], r2
+        vpaddl.u8       d2,  d2
+        vldr            d26,   [r1,  #16]
+        vpaddl.u8       q0,  q0
+        vld1.8          {q12}, [r1,  :128], r2
+        vpaddl.u8       d6,  d6
+        vldr            d30,   [r12, #16]
+        vpaddl.u8       q2,  q2
+        vld1.8          {q14}, [r12, :128], r2
+        vpaddl.u8       d26, d26
+        vpaddl.u8       q12, q12
+        vpaddl.u8       d30, d30
+        vpaddl.u8       q14, q14
+        vadd.i16        d2,  d2,  d6
+        vadd.i16        q0,  q0,  q2
+        vadd.i16        d26, d26, d30
+        vadd.i16        q12, q12, q14
+        vshl.i16        d2,  d2,  #1
+        vshl.i16        q0,  q0,  #1
+        vshl.i16        d6,  d26, #1
+        vshl.i16        q2,  q12, #1
+        vdup.16         d3,  d2[3]
+        vdup.16         d7,  d6[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        vld1.8          {q0}, [r1,  :128], r2
+        vld1.8          {q1}, [r12, :128], r2
+        vld1.8          {q2}, [r1,  :128], r2
+        vpaddl.u8       q0,  q0
+        vld1.8          {q3}, [r12, :128], r2
+        vpaddl.u8       q1,  q1
+        vpaddl.u8       q2,  q2
+        vpaddl.u8       q3,  q3
+        vadd.i16        q0,  q0,  q1
+        vadd.i16        q2,  q2,  q3
+        vshl.i16        q0,  q0,  #1
+        vshl.i16        q2,  q2,  #1
+        vdup.16         q1,  d1[3]
+        vdup.16         q3,  d5[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        vld1.8          {d0}, [r1,  :64], r2
+        vld1.8          {d1}, [r12, :64], r2
+        vld1.8          {d4}, [r1,  :64], r2
+        vpaddl.u8       q0,  q0
+        vld1.8          {d5}, [r12, :64], r2
+        vpaddl.u8       q2,  q2
+        vadd.i16        d0,  d0,  d1
+        vadd.i16        d4,  d4,  d5
+        vshl.i16        d0,  d0,  #1
+        vshl.i16        d4,  d4,  #1
+        vdup.16         q1,  d0[3]
+        vdup.16         q3,  d4[3]
+        vdup.16         d1,  d0[3]
+        vdup.16         d5,  d4[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+        beq             3f // This assumes that all callers already did "cmp r4, #0"
+2:      // Vertical padding (h_pad > 0)
+        subs            r4,  r4,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             2b
+3:
+
+        // Quadruple the height and reuse the w4 summing/subtracting
+        lsl             r6,  r6,  #2
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
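+
+// For 4:2:0 each stored AC value above is the sum of a 2x2 luma block shifted
+// left by one, i.e. the luma average scaled by 8; the running sums in q8-q11
+// feed the rounded block mean, which is then subtracted so the AC terms end
+// up (approximately) zero-mean.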
+
+// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                           const ptrdiff_t stride, const int w_pad,
+//                           const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1
+        push            {r4-r8,lr}
+        ldr             r4,  [sp, #24]
+        ldr             r5,  [sp, #28]
+        ldr             r6,  [sp, #32]
+        clz             r8,  r5
+        lsl             r4,  r4,  #2
+        adr             r7,  L(ipred_cfl_ac_422_tbl)
+        sub             r8,  r8,  #27
+        ldr             r8,  [r7, r8, lsl #2]
+        vmov.i16        q8,  #0
+        vmov.i16        q9,  #0
+        vmov.i16        q10, #0
+        vmov.i16        q11, #0
+        add             r7,  r7,  r8
+        sub             r8,  r6,  r4  // height - h_pad
+        rbit            lr,  r5       // rbit(width)
+        rbit            r12, r6       // rbit(height)
+        clz             lr,  lr       // ctz(width)
+        clz             r12, r12      // ctz(height)
+        add             lr,  lr,  r12 // log2sz
+        add             r12, r1,  r2
+        vdup.32         d31, lr
+        lsl             r2,  r2,  #1
+        vneg.s32        d31, d31      // -log2sz
+        bx              r7
+
+        .align 2
+L(ipred_cfl_ac_422_tbl):
+        .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4):
+1:      // Copy and subsample input
+        vld1.8          {d0}, [r1,  :64], r2
+        vld1.8          {d1}, [r12, :64], r2
+        vld1.8          {d2}, [r1,  :64], r2
+        vld1.8          {d3}, [r12, :64], r2
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       q1,  q1
+        vshl.i16        q0,  q0,  #2
+        vshl.i16        q1,  q1,  #2
+        subs            r8,  r8,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        bgt             1b
+        cmp             r4,  #0
+        vmov            d0,  d3
+        vmov            d1,  d3
+        vmov            d2,  d3
+        b               L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+        cmp             r3,  #0
+        bne             L(ipred_cfl_ac_422_w8_wpad)
+1:      // Copy and subsample input, without padding
+        vld1.8          {q0}, [r1,  :128], r2
+        vld1.8          {q1}, [r12, :128], r2
+        vld1.8          {q2}, [r1,  :128], r2
+        vpaddl.u8       q0,  q0
+        vld1.8          {q3}, [r12, :128], r2
+        vpaddl.u8       q1,  q1
+        vpaddl.u8       q2,  q2
+        vpaddl.u8       q3,  q3
+        vshl.i16        q0,  q0,  #2
+        vshl.i16        q1,  q1,  #2
+        vshl.i16        q2,  q2,  #2
+        vshl.i16        q3,  q3,  #2
+        subs            r8,  r8,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q3
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        vld1.8          {d0}, [r1,  :64], r2
+        vld1.8          {d1}, [r12, :64], r2
+        vld1.8          {d2}, [r1,  :64], r2
+        vld1.8          {d3}, [r12, :64], r2
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       q1,  q1
+        vshl.i16        q0,  q0,  #2
+        vshl.i16        q1,  q1,  #2
+        vdup.16         d7,  d3[3]
+        vmov            d6,  d3
+        vdup.16         d5,  d2[3]
+        vmov            d4,  d2
+        vdup.16         d3,  d1[3]
+        vmov            d2,  d1
+        vdup.16         d1,  d0[3]
+        subs            r8,  r8,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q3
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+        adr             r7,  L(ipred_cfl_ac_422_w16_tbl)
+        ldr             r3,  [r7, r3, lsl #2]
+        add             r7,  r7,  r3
+        bx              r7
+
+        .align 2
+L(ipred_cfl_ac_422_w16_tbl):
+        .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        vld1.8          {q0, q1}, [r1,  :128], r2
+        vld1.8          {q2, q3}, [r12, :128], r2
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       q1,  q1
+        vpaddl.u8       q2,  q2
+        vpaddl.u8       q3,  q3
+        vshl.i16        q0,  q0,  #2
+        vshl.i16        q1,  q1,  #2
+        vshl.i16        q2,  q2,  #2
+        vshl.i16        q3,  q3,  #2
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        vldr            d2,   [r1,  #16]
+        vld1.8          {q0}, [r1,  :128], r2
+        vldr            d6,   [r12, #16]
+        vld1.8          {q2}, [r12, :128], r2
+        vpaddl.u8       d2,  d2
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       d6,  d6
+        vpaddl.u8       q2,  q2
+        vshl.i16        d2,  d2,  #2
+        vshl.i16        q0,  q0,  #2
+        vshl.i16        d6,  d6,  #2
+        vshl.i16        q2,  q2,  #2
+        vdup.16         d3,  d2[3]
+        vdup.16         d7,  d6[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        vld1.8          {q0}, [r1,  :128], r2
+        vld1.8          {q2}, [r12, :128], r2
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       q2,  q2
+        vshl.i16        q0,  q0,  #2
+        vshl.i16        q2,  q2,  #2
+        vdup.16         q1,  d1[3]
+        vdup.16         q3,  d5[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        vld1.8          {d0}, [r1,  :64], r2
+        vld1.8          {d1}, [r12, :64], r2
+        vpaddl.u8       q0,  q0
+        vshl.i16        q0,  q0,  #2
+        vdup.16         q3,  d1[3]
+        vdup.16         q1,  d0[3]
+        vdup.16         d5,  d1[3]
+        vmov            d4,  d1
+        vdup.16         d1,  d0[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+endfunc
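+
+// 4:2:2 only subsamples horizontally: each stored AC value is a horizontal
+// pair sum shifted left by two (again luma scaled by 8), before reusing the
+// 4:2:0 padding and dc-subtraction paths.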
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                           const ptrdiff_t stride, const int w_pad,
+//                           const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+        push            {r4-r8,lr}
+        ldr             r4,  [sp, #24]
+        ldr             r5,  [sp, #28]
+        ldr             r6,  [sp, #32]
+        clz             r8,  r5
+        lsl             r4,  r4,  #2
+        adr             r7,  L(ipred_cfl_ac_444_tbl)
+        sub             r8,  r8,  #26
+        ldr             r8,  [r7, r8, lsl #2]
+        vmov.i16        q8,  #0
+        vmov.i16        q9,  #0
+        vmov.i16        q10, #0
+        vmov.i16        q11, #0
+        add             r7,  r7,  r8
+        sub             r8,  r6,  r4  // height - h_pad
+        rbit            lr,  r5       // rbit(width)
+        rbit            r12, r6       // rbit(height)
+        clz             lr,  lr       // ctz(width)
+        clz             r12, r12      // ctz(height)
+        add             lr,  lr,  r12 // log2sz
+        add             r12, r1,  r2
+        vdup.32         d31, lr
+        lsl             r2,  r2,  #1
+        vneg.s32        d31, d31      // -log2sz
+        bx              r7
+
+        .align 2
+L(ipred_cfl_ac_444_tbl):
+        .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_444_w8)  - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_444_w4)  - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1:      // Copy and expand input
+        vld1.32         {d0[]},  [r1,  :32], r2
+        vld1.32         {d0[1]}, [r12, :32], r2
+        vld1.32         {d2[]},  [r1,  :32], r2
+        vld1.32         {d2[1]}, [r12, :32], r2
+        vshll.u8        q0,  d0,  #3
+        vshll.u8        q1,  d2,  #3
+        subs            r8,  r8,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        bgt             1b
+        cmp             r4,  #0
+        vmov            d0,  d3
+        vmov            d1,  d3
+        vmov            d2,  d3
+        b               L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1:      // Copy and expand input
+        vld1.16         {d0}, [r1,  :64], r2
+        vld1.16         {d2}, [r12, :64], r2
+        vld1.16         {d4}, [r1,  :64], r2
+        vshll.u8        q0,  d0,  #3
+        vld1.16         {d6}, [r12, :64], r2
+        vshll.u8        q1,  d2,  #3
+        vshll.u8        q2,  d4,  #3
+        vshll.u8        q3,  d6,  #3
+        subs            r8,  r8,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q3
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+        cmp             r3,  #0
+        bne             L(ipred_cfl_ac_444_w16_wpad)
+1:      // Copy and expand input, without padding
+        vld1.8          {q1}, [r1,  :128], r2
+        vld1.8          {q3}, [r12, :128], r2
+        vshll.u8        q0,  d2,  #3
+        vshll.u8        q1,  d3,  #3
+        vshll.u8        q2,  d6,  #3
+        vshll.u8        q3,  d7,  #3
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1:      // Copy and expand input, padding 8
+        vld1.8          {d0}, [r1,  :64], r2
+        vld1.8          {d4}, [r12, :64], r2
+        vshll.u8        q0,  d0,  #3
+        vshll.u8        q2,  d4,  #3
+        vdup.16         q1,  d1[3]
+        vdup.16         q3,  d5[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1}, [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vst1.16         {q2, q3}, [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        bgt             1b
+        cmp             r4,  #0
+        vmov            q0,  q2
+        vmov            q1,  q3
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+        adr             r7,  L(ipred_cfl_ac_444_w32_tbl)
+        ldr             r3,  [r7, r3, lsl #1] // (w_pad>>1) << 2
+        add             r7,  r7,  r3
+        bx              r7
+
+        .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+        .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1:      // Copy and expand input, without padding
+        vld1.8          {q2, q3},   [r1,  :128], r2
+        vld1.8          {q13, q14}, [r12, :128], r2
+        vshll.u8        q0,  d4,  #3
+        vshll.u8        q1,  d5,  #3
+        vshll.u8        q2,  d6,  #3
+        vshll.u8        q3,  d7,  #3
+        vshll.u8        q12, d26, #3
+        vshll.u8        q13, d27, #3
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vshll.u8        q0,  d28, #3
+        vshll.u8        q1,  d29, #3
+        vst1.16         {q2, q3},   [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        vst1.16         {q12, q13}, [r0, :128]!
+        vadd.i16        q8,  q8,  q12
+        vadd.i16        q9,  q9,  q13
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q1
+        bgt             1b
+        cmp             r4,  #0
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1:      // Copy and expand input, padding 8
+        vldr            d4,    [r1,  #16]
+        vld1.8          {q1},  [r1,  :128], r2
+        vldr            d28,   [r12, #16]
+        vld1.8          {q13}, [r12, :128], r2
+        vshll.u8        q2,  d4,  #3
+        vshll.u8        q0,  d2,  #3
+        vshll.u8        q1,  d3,  #3
+        vshll.u8        q12, d26, #3
+        vshll.u8        q13, d27, #3
+        vdup.16         q3,  d5[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vshll.u8        q0,  d28, #3
+        vst1.16         {q2, q3},   [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        vdup.16         q1,  d1[3]
+        vst1.16         {q12, q13}, [r0, :128]!
+        vadd.i16        q8,  q8,  q12
+        vadd.i16        q9,  q9,  q13
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q1
+        bgt             1b
+        cmp             r4,  #0
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1:      // Copy and expand input, padding 16
+        vld1.8          {q1},  [r1,  :128], r2
+        vld1.8          {q13}, [r12, :128], r2
+        vshll.u8        q0,  d2,  #3
+        vshll.u8        q1,  d3,  #3
+        vshll.u8        q12, d26, #3
+        vshll.u8        q13, d27, #3
+        vdup.16         q2,  d3[3]
+        vdup.16         q3,  d3[3]
+        subs            r8,  r8,  #2
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vdup.16         q0,  d27[3]
+        vdup.16         q1,  d27[3]
+        vst1.16         {q2, q3},   [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        vst1.16         {q12, q13}, [r0, :128]!
+        vadd.i16        q8,  q8,  q12
+        vadd.i16        q9,  q9,  q13
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q1
+        bgt             1b
+        cmp             r4,  #0
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1:      // Copy and expand input, padding 24
+        vld1.8          {d0},  [r1,  :64], r2
+        vld1.8          {d24}, [r12, :64], r2
+        vshll.u8        q0,  d0,  #3
+        vshll.u8        q12, d24, #3
+        subs            r8,  r8,  #2
+        vdup.16         q1,  d1[3]
+        vdup.16         q2,  d1[3]
+        vdup.16         q3,  d1[3]
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q8,  q8,  q0
+        vadd.i16        q9,  q9,  q1
+        vdup.16         q13, d25[3]
+        vdup.16         q0,  d25[3]
+        vdup.16         q1,  d25[3]
+        vst1.16         {q2, q3},   [r0, :128]!
+        vadd.i16        q10, q10, q2
+        vadd.i16        q11, q11, q3
+        vst1.16         {q12, q13}, [r0, :128]!
+        vadd.i16        q8,  q8,  q12
+        vadd.i16        q9,  q9,  q13
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q1
+        bgt             1b
+        cmp             r4,  #0
+
+L(ipred_cfl_ac_444_w32_hpad):
+        beq             3f // This assumes that all callers already did "cmp r4, #0"
+2:      // Vertical padding (h_pad > 0)
+        subs            r4,  r4,  #1
+        vst1.16         {q12, q13}, [r0, :128]!
+        vadd.i16        q8,  q8,  q12
+        vadd.i16        q9,  q9,  q13
+        vst1.16         {q0, q1},   [r0, :128]!
+        vadd.i16        q10, q10, q0
+        vadd.i16        q11, q11, q1
+        bgt             2b
+3:
+
+        //  Multiply the height by eight and reuse the w4 subtracting
+        lsl             r6,  r6,  #3
+        // Aggregate the sums, with wider intermediates earlier than in
+        // ipred_cfl_ac_420_w4_calc_subtract_dc.
+        vpaddl.u16      q0,  q8
+        vpaddl.u16      q1,  q9
+        vpaddl.u16      q2,  q10
+        vpaddl.u16      q3,  q11
+        vadd.i32        q0,  q0,  q1
+        vadd.i32        q2,  q2,  q3
+        vadd.i32        q0,  q0,  q2
+        vadd.i32        d0,  d0,  d1
+        vpadd.i32       d0,  d0,  d0  // sum
+        sub             r0,  r0,  r6, lsl #3
+        vrshl.u32       d16, d0,  d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+        vdup.16         q8,  d16[0]
+        b               L(ipred_cfl_ac_420_w4_subtract_dc)
+endfunc
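+
+// 4:4:4 does no subsampling at all: each luma sample is widened and shifted
+// left by three (luma scaled by 8), and the sums are widened to 32 bit
+// earlier, as noted above, before reusing the shared dc subtraction.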
--- /dev/null
+++ b/src/arm/32/mc16.S
@@ -1,0 +1,274 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
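+
+// For 16 bpc, intermediate_bits = clz(bitdepth_max) - 18 (see below), i.e. 4
+// for 10-bit and 2 for 12-bit input, and the prep stage is expected to have
+// stored (pixel << intermediate_bits) - PREP_BIAS, which is what the
+// 2*PREP_BIAS compensation in avg below undoes.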
+
+.macro avg d0, d00, d01, d1, d10, d11
+        vld1.16         {q0, q1}, [r2, :128]!
+        vld1.16         {q2, q3}, [r3, :128]!
+        vqadd.s16       q0,  q0,  q2
+        vqadd.s16       q1,  q1,  q3
+        vmax.s16        q0,  q0,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vmax.s16        q1,  q1,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vqsub.s16       q0,  q0,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vqsub.s16       q1,  q1,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vshl.s16        \d0, q0,  q13 // -(intermediate_bits+1)
+        vshl.s16        \d1, q1,  q13 // -(intermediate_bits+1)
+.endm
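+
+// Per pixel, avg above is roughly
+//   dst = (tmp1 + tmp2 + 2*PREP_BIAS + (1 << intermediate_bits))
+//             >> (intermediate_bits + 1)
+// i.e. a rounded average of the two de-biased inputs; the saturating
+// add/subtract combined with the final shift also bound the result to the
+// valid pixel range.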
+
+.macro w_avg d0, d00, d01, d1, d10, d11
+        vld1.16         {q0, q1}, [r2, :128]!
+        vld1.16         {q2, q3}, [r3, :128]!
+        // This difference requires a 17 bit range, and all bits are
+        // significant for the following multiplication.
+        vsubl.s16       \d0, d4,  d0
+        vsubl.s16       q0,  d5,  d1
+        vsubl.s16       \d1, d6,  d2
+        vsubl.s16       q1,  d7,  d3
+        vmul.s32        \d0, \d0, q4
+        vmul.s32        q0,  q0,  q4
+        vmul.s32        \d1, \d1, q4
+        vmul.s32        q1,  q1,  q4
+        vshr.s32        \d0, \d0, #4
+        vshr.s32        q0,  q0,  #4
+        vshr.s32        \d1, \d1, #4
+        vshr.s32        q1,  q1,  #4
+        vaddw.s16       \d0, \d0, d4
+        vaddw.s16       q0,  q0,  d5
+        vaddw.s16       \d1, \d1, d6
+        vaddw.s16       q1,  q1,  d7
+        vmovn.i32       \d00, \d0
+        vmovn.i32       \d01, q0
+        vmovn.i32       \d10, \d1
+        vmovn.i32       \d11, q1
+        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
+        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
+        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+        vmin.s16        \d0, \d0, q15 // bitdepth_max
+        vmin.s16        \d1, \d1, q15 // bitdepth_max
+        vmax.s16        \d0, \d0, q14 // 0
+        vmax.s16        \d1, \d1, q14 // 0
+.endm
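+
+// Per pixel, w_avg above roughly computes (the weight in q4 is negated during
+// setup, so the signed multiply yields (tmp1 - tmp2) * weight):
+//
+//   v   = tmp2 + (((tmp1 - tmp2) * weight) >> 4)
+//   dst = clamp(round_shift(v, intermediate_bits)
+//               + (PREP_BIAS >> intermediate_bits), 0, bitdepth_max)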
+
+.macro mask d0, d00, d01, d1, d10, d11
+        vld1.8          {q7},     [r6, :128]!
+        vld1.16         {q0, q1}, [r2, :128]!
+        vneg.s8         q7,  q7
+        vld1.16         {q2, q3}, [r3, :128]!
+        vmovl.s8        q6,  d14
+        vmovl.s8        q7,  d15
+        vmovl.s16       q4,  d12
+        vmovl.s16       q5,  d13
+        vmovl.s16       q6,  d14
+        vmovl.s16       q7,  d15
+        vsubl.s16       \d0, d4,  d0
+        vsubl.s16       q0,  d5,  d1
+        vsubl.s16       \d1, d6,  d2
+        vsubl.s16       q1,  d7,  d3
+        vmul.s32        \d0, \d0, q4
+        vmul.s32        q0,  q0,  q5
+        vmul.s32        \d1, \d1, q6
+        vmul.s32        q1,  q1,  q7
+        vshr.s32        \d0, \d0, #6
+        vshr.s32        q0,  q0,  #6
+        vshr.s32        \d1, \d1, #6
+        vshr.s32        q1,  q1,  #6
+        vaddw.s16       \d0, \d0, d4
+        vaddw.s16       q0,  q0,  d5
+        vaddw.s16       \d1, \d1, d6
+        vaddw.s16       q1,  q1,  d7
+        vmovn.i32       \d00, \d0
+        vmovn.i32       \d01, q0
+        vmovn.i32       \d10, \d1
+        vmovn.i32       \d11, q1
+        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
+        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
+        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+        vmin.s16        \d0, \d0, q15 // bitdepth_max
+        vmin.s16        \d1, \d1, q15 // bitdepth_max
+        vmax.s16        \d0, \d0, q14 // 0
+        vmax.s16        \d1, \d1, q14 // 0
+.endm
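+
+// mask follows the same pattern as w_avg, but with a per-pixel weight taken
+// from the mask buffer (values in 0..64, negated and widened to 32 bit) and
+// a >> 6 in place of w_avg's >> 4.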
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+        push            {r4-r7,lr}
+        ldr             r4,  [sp, #20]
+        ldr             r5,  [sp, #24]
+        ldr             r6,  [sp, #28]
+        clz             r4,  r4
+.ifnc \type, avg
+        ldr             r7,  [sp, #32]
+        vmov.i16        q14, #0
+        vdup.16         q15, r7         // bitdepth_max
+.endif
+.ifc \type, w_avg
+        vpush           {q4}
+.endif
+.ifc \type, mask
+        vpush           {q4-q7}
+.endif
+        clz             r7,  \bdmax
+        sub             r7,  r7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+        mov             lr,  #1
+        movw            r12, #2*PREP_BIAS
+        lsl             lr,  lr,  r7    // 1 << intermediate_bits
+        neg             r12, r12         // -2*PREP_BIAS
+        add             r7,  r7,  #1
+        sub             r12, r12, lr    // -2*PREP_BIAS - 1 << intermediate_bits
+        neg             r7,  r7         // -(intermediate_bits+1)
+        vdup.16         q12, r12         // -2*PREP_BIAS - 1 << intermediate_bits
+        vdup.16         q13, r7         // -(intermediate_bits+1)
+.else
+        mov             r12, #PREP_BIAS
+        lsr             r12, r12, r7    // PREP_BIAS >> intermediate_bits
+        neg             r7,  r7         // -intermediate_bits
+        vdup.16         q12, r12         // PREP_BIAS >> intermediate_bits
+        vdup.16         q13, r7         // -intermediate_bits
+.endif
+.ifc \type, w_avg
+        vdup.32         q4,  r6
+        vneg.s32        q4,  q4
+.endif
+        adr             r7,  L(\type\()_tbl)
+        sub             r4,  r4,  #24
+        \type           q8,  d16, d17, q9,  d18, d19
+        ldr             r4,  [r7, r4, lsl #2]
+        add             r7,  r7,  r4
+        bx              r7
+
+        .align 2
+L(\type\()_tbl):
+        .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_tbl) + CONFIG_THUMB
+
+40:
+        add             r7,  r0,  r1
+        lsl             r1,  r1,  #1
+4:
+        subs            r5,  r5,  #4
+        vst1.16         {d16},  [r0, :64], r1
+        vst1.16         {d17},  [r7, :64], r1
+        vst1.16         {d18},  [r0, :64], r1
+        vst1.16         {d19},  [r7, :64], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               4b
+80:
+        add             r7,  r0,  r1
+        lsl             r1,  r1,  #1
+8:
+        vst1.16         {q8},  [r0, :128], r1
+        subs            r5,  r5,  #2
+        vst1.16         {q9},  [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               8b
+160:
+16:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #2
+        vst1.16         {q10, q11}, [r0, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               16b
+320:
+        add             r7,  r0,  #32
+32:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #1
+        vst1.16         {q10, q11}, [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               32b
+640:
+        add             r7,  r0,  #32
+        mov             r12, #64
+        sub             r1,  r1,  #64
+64:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #1
+        vst1.16         {q10, q11}, [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               64b
+1280:
+        add             r7,  r0,  #32
+        mov             r12, #64
+        sub             r1,  r1,  #192
+128:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #1
+        vst1.16         {q10, q11}, [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               128b
+0:
+.ifc \type, mask
+        vpop            {q4-q7}
+.endif
+.ifc \type, w_avg
+        vpop            {q4}
+.endif
+        pop             {r4-r7,pc}
+endfunc
+.endm
+
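+// The second macro argument is the register holding bitdepth_max: for avg it
+// is the 7th function argument (r6), while w_avg and mask take an extra
+// weight/mask argument, pushing bitdepth_max to the 8th argument (r7).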
+bidir_fn avg, r6
+bidir_fn w_avg, r7
+bidir_fn mask, r7
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -884,10 +884,10 @@
         lsl             x1,  x1,  #1
         br              x5
 40:
-        sub             x2,  x2,  #4
-        mov             x7,  #-4
         ld1r            {v6.2s}, [x8]             // top
         ld1r            {v7.2s}, [x10]            // weights_hor
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
         dup             v5.16b,  v6.b[3]          // right
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
@@ -922,10 +922,10 @@
         b.gt            4b
         ret
 80:
-        sub             x2,  x2,  #4
-        mov             x7,  #-4
         ld1             {v6.8b}, [x8]             // top
         ld1             {v7.8b}, [x10]            // weights_hor
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
         dup             v5.16b,  v6.b[7]          // right
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
@@ -1460,12 +1460,14 @@
         subs            w3,  w3,  #16
         sqrshrun        v6.8b,   v6.8h,   #4
 
-        ins             v0.h[2], v2.h[7]
         st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
-        ins             v0.b[0], v6.b[7]
         st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
-        ins             v0.b[2], v6.b[3]
-        b.gt            2b
+        b.le            8f
+        ins             v0.h[2], v2.h[7]
+        ins             v0.b[0], v6.b[7]
+        ins             v0.b[2], v6.b[3]
+        b               2b
+8:
         subs            w4,  w4,  #2
         b.le            9f
         sub             x8,  x6,  w9, uxtw
@@ -1815,7 +1817,7 @@
         dup             v16.8h, w8               // width + height
         adr             x7,  L(ipred_cfl_tbl)
         rbit            w8,  w8                  // rbit(width + height)
-        sub             w9,  w9,  #22            // 22 leading bits, minus table offset 4
+        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
         sub             w6,  w6,  #26
         clz             w8,  w8                  // ctz(width + height)
         ldrh            w9,  [x7, w9, uxtw #1]
@@ -2078,6 +2080,7 @@
         sub             x0,  x0,  w6, uxtw #4
         urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
         dup             v4.8h,   v4.h[0]
+L(ipred_cfl_ac_420_w8_subtract_dc):
 6:      // Subtract dc from ac
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
         subs            w6,  w6,  #4
@@ -2223,7 +2226,6 @@
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
-        b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_hpad):
         cbz             w4,  3f
@@ -2244,7 +2246,6 @@
 
         // Double the height and reuse the w8 summing/subtracting
         lsl             w6,  w6,  #1
-        lsl             w9,  w9,  #1
         b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
 
 L(ipred_cfl_ac_420_tbl):
@@ -2473,4 +2474,291 @@
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                           const ptrdiff_t stride, const int w_pad,
+//                           const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_444_tbl)
+        sub             w8,  w8,  #26
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_444_w4):
+1:      // Copy and expand input
+        ld1             {v0.s}[0], [x1],  x2
+        ld1             {v0.s}[1], [x10], x2
+        ld1             {v1.s}[0], [x1],  x2
+        ld1             {v1.s}[1], [x10], x2
+        ushll           v0.8h,   v0.8b,   #3
+        ushll           v1.8h,   v1.8b,   #3
+        subs            w8,  w8,  #4
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            1b
+        trn2            v0.2d,   v1.2d,   v1.2d
+        trn2            v1.2d,   v1.2d,   v1.2d
+        b               L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1:      // Copy and expand input
+        ld1             {v0.8b}, [x1],  x2
+        ld1             {v1.8b}, [x10], x2
+        ld1             {v2.8b}, [x1],  x2
+        ushll           v0.8h,   v0.8b,   #3
+        ld1             {v3.8b}, [x10], x2
+        ushll           v1.8h,   v1.8b,   #3
+        ushll           v2.8h,   v2.8b,   #3
+        ushll           v3.8h,   v3.8b,   #3
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
+1:      // Copy and expand input, without padding
+        ld1             {v0.16b}, [x1],  x2
+        ld1             {v2.16b}, [x10], x2
+        ld1             {v4.16b}, [x1],  x2
+        ushll2          v1.8h,   v0.16b,  #3
+        ushll           v0.8h,   v0.8b,   #3
+        ld1             {v6.16b}, [x10], x2
+        ushll2          v3.8h,   v2.16b,  #3
+        ushll           v2.8h,   v2.8b,   #3
+        ushll2          v5.8h,   v4.16b,  #3
+        ushll           v4.8h,   v4.8b,   #3
+        ushll2          v7.8h,   v6.16b,  #3
+        ushll           v6.8h,   v6.8b,   #3
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        mov             v0.16b,  v6.16b
+        mov             v1.16b,  v7.16b
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1:      // Copy and expand input, padding 8
+        ld1             {v0.8b}, [x1],  x2
+        ld1             {v2.8b}, [x10], x2
+        ld1             {v4.8b}, [x1],  x2
+        ld1             {v6.8b}, [x10], x2
+        ushll           v0.8h,   v0.8b,   #3
+        ushll           v2.8h,   v2.8b,   #3
+        ushll           v4.8h,   v4.8b,   #3
+        ushll           v6.8h,   v6.8b,   #3
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        dup             v5.8h,   v4.h[7]
+        dup             v7.8h,   v6.h[7]
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        mov             v0.16b,  v6.16b
+        mov             v1.16b,  v7.16b
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
+        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1:      // Copy and expand input, without padding
+        ld1             {v2.16b, v3.16b}, [x1],  x2
+        ld1             {v6.16b, v7.16b}, [x10], x2
+        ushll           v0.8h,   v2.8b,   #3
+        ushll2          v1.8h,   v2.16b,  #3
+        ushll           v2.8h,   v3.8b,   #3
+        ushll2          v3.8h,   v3.16b,  #3
+        ushll           v4.8h,   v6.8b,   #3
+        ushll2          v5.8h,   v6.16b,  #3
+        ushll           v6.8h,   v7.8b,   #3
+        ushll2          v7.8h,   v7.16b,  #3
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1:      // Copy and expand input, padding 8
+        ldr             d2,  [x1,  #16]
+        ld1             {v1.16b}, [x1],  x2
+        ldr             d6,  [x10, #16]
+        ld1             {v5.16b}, [x10], x2
+        ushll           v2.8h,   v2.8b,   #3
+        ushll           v0.8h,   v1.8b,   #3
+        ushll2          v1.8h,   v1.16b,  #3
+        ushll           v6.8h,   v6.8b,   #3
+        ushll           v4.8h,   v5.8b,   #3
+        ushll2          v5.8h,   v5.16b,  #3
+        dup             v3.8h,   v2.h[7]
+        dup             v7.8h,   v6.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1:      // Copy and expand input, padding 16
+        ld1             {v1.16b}, [x1],  x2
+        ld1             {v5.16b}, [x10], x2
+        ushll           v0.8h,   v1.8b,   #3
+        ushll2          v1.8h,   v1.16b,  #3
+        ushll           v4.8h,   v5.8b,   #3
+        ushll2          v5.8h,   v5.16b,  #3
+        dup             v2.8h,   v1.h[7]
+        dup             v3.8h,   v1.h[7]
+        dup             v6.8h,   v5.h[7]
+        dup             v7.8h,   v5.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1:      // Copy and expand input, padding 24
+        ld1             {v0.8b}, [x1],  x2
+        ld1             {v4.8b}, [x10], x2
+        ushll           v0.8h,   v0.8b,   #3
+        ushll           v4.8h,   v4.8b,   #3
+        dup             v1.8h,   v0.h[7]
+        dup             v2.8h,   v0.h[7]
+        dup             v3.8h,   v0.h[7]
+        dup             v5.8h,   v4.h[7]
+        dup             v6.8h,   v4.h[7]
+        dup             v7.8h,   v4.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v4.8h
+        add             v17.8h,  v17.8h,  v5.8h
+        add             v18.8h,  v18.8h,  v6.8h
+        add             v19.8h,  v19.8h,  v7.8h
+        b.gt            2b
+3:
+
+        // Quadruple the height and reuse the w8 subtracting
+        lsl             w6,  w6,  #2
+        // Aggregate the sums, with wider intermediates earlier than in
+        // ipred_cfl_ac_420_w8_calc_subtract_dc.
+        uaddlp          v0.4s,   v16.8h
+        uaddlp          v1.4s,   v17.8h
+        uaddlp          v2.4s,   v18.8h
+        uaddlp          v3.4s,   v19.8h
+        add             v0.4s,   v0.4s,   v1.4s
+        add             v2.4s,   v2.4s,   v3.4s
+        add             v0.4s,   v0.4s,   v2.4s
+        addv            s0,  v0.4s                // sum
+        sub             x0,  x0,  w6, uxtw #4
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
+        dup             v4.8h,   v4.h[0]
+        b               L(ipred_cfl_ac_420_w8_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
 endfunc
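
The ipred_cfl_ac_444_8bpc_neon function added above fills the chroma-from-luma AC buffer for 4:4:4 layouts. As a reading aid, here is a simplified scalar model of the same computation (a sketch only, not dav1d's actual C reference in src/ipred_tmpl.c; names are illustrative):

    #include <stdint.h>
    #include <stddef.h>

    static void cfl_ac_444_model(int16_t *ac, const uint8_t *ypx, ptrdiff_t stride,
                                 int w_pad, int h_pad, int width, int height)
    {
        int16_t *const ac_start = ac;

        /* Copy and expand: each luma sample scaled by 8 (the ushll #3 above). */
        for (int y = 0; y < height - 4 * h_pad; y++, ac += width, ypx += stride) {
            int x;
            for (x = 0; x < width - 4 * w_pad; x++)
                ac[x] = ypx[x] << 3;
            for (; x < width; x++)           /* horizontal padding: repeat last column */
                ac[x] = ac[x - 1];
        }
        for (int y = height - 4 * h_pad; y < height; y++, ac += width)
            for (int x = 0; x < width; x++)  /* vertical padding: repeat last row */
                ac[x] = ac[x - width];

        /* Average with rounding: (sum + (1 << (log2sz - 1))) >> log2sz. */
        const int log2sz = __builtin_ctz(width) + __builtin_ctz(height);
        int sum = (1 << log2sz) >> 1;
        for (int i = 0; i < width * height; i++)
            sum += ac_start[i];
        sum >>= log2sz;

        /* Subtract the DC average so the AC buffer is zero-mean. */
        for (int i = 0; i < width * height; i++)
            ac_start[i] -= sum;
    }

The w4/w8/w16/w32 labels are width-specialised versions of the copy-and-expand loop, the wpad/hpad paths implement the column/row replication, and the final average-and-subtract step is shared with the 4:2:0 code via the ipred_cfl_ac_420_* labels.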
--- a/src/arm/64/ipred16.S
+++ b/src/arm/64/ipred16.S
@@ -920,10 +920,10 @@
         lsl             x1,  x1,  #1
         br              x5
 40:
-        sub             x2,  x2,  #8
-        mov             x7,  #-8
         ld1r            {v6.2d}, [x8]             // top
         ld1r            {v7.2s}, [x10]            // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
         dup             v5.8h,   v6.h[3]          // right
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
@@ -963,10 +963,10 @@
         b.gt            4b
         ret
 80:
-        sub             x2,  x2,  #8
-        mov             x7,  #-8
         ld1             {v6.8h}, [x8]             // top
         ld1             {v7.8b}, [x10]            // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
         dup             v5.8h,   v6.h[7]          // right
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
@@ -2125,7 +2125,7 @@
         dup             v16.4s, w8               // width + height
         adr             x7,  L(ipred_cfl_tbl)
         rbit            w8,  w8                  // rbit(width + height)
-        sub             w9,  w9,  #22            // 22 leading bits, minus table offset 4
+        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
         sub             w6,  w6,  #26
         clz             w8,  w8                  // ctz(width + height)
         ldrh            w9,  [x7, w9, uxtw #1]
@@ -2398,7 +2398,6 @@
 
         // Double the height and reuse the w4 summing/subtracting
         lsl             w6,  w6,  #1
-        lsl             w9,  w9,  #1
         b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
 
 L(ipred_cfl_ac_420_w16):
@@ -2547,7 +2546,6 @@
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
-        b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_hpad):
         cbz             w4,  3f
@@ -2576,7 +2574,6 @@
 
         // Quadruple the height and reuse the w4 summing/subtracting
         lsl             w6,  w6,  #2
-        lsl             w9,  w9,  #2
         b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
 
 L(ipred_cfl_ac_420_tbl):
@@ -2831,4 +2828,249 @@
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                            const ptrdiff_t stride, const int w_pad,
+//                            const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_16bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_444_tbl)
+        sub             w8,  w8,  #26
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v24.4s,  #0
+        movi            v25.4s,  #0
+        movi            v26.4s,  #0
+        movi            v27.4s,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_444_w4):
+1:      // Copy and expand input
+        ld1             {v0.4h},   [x1],  x2
+        ld1             {v0.d}[1], [x10], x2
+        ld1             {v1.4h},   [x1],  x2
+        ld1             {v1.d}[1], [x10], x2
+        shl             v0.8h,   v0.8h,   #3
+        shl             v1.8h,   v1.8h,   #3
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            1b
+        trn2            v0.2d,   v1.2d,   v1.2d
+        trn2            v1.2d,   v1.2d,   v1.2d
+        b               L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1:      // Copy and expand input
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        shl             v0.8h,   v0.8h,   #3
+        ld1             {v3.8h}, [x10], x2
+        shl             v1.8h,   v1.8h,   #3
+        shl             v2.8h,   v2.8h,   #3
+        shl             v3.8h,   v3.8h,   #3
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
+1:      // Copy and expand input, without padding
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        shl             v0.8h,   v0.8h,   #3
+        shl             v1.8h,   v1.8h,   #3
+        shl             v2.8h,   v2.8h,   #3
+        shl             v3.8h,   v3.8h,   #3
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1:      // Copy and expand input, padding 8
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v2.8h}, [x10], x2
+        shl             v0.8h,   v0.8h,   #3
+        shl             v2.8h,   v2.8h,   #3
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
+        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
+        lsr             x2,  x2,  #1 // Restore the stride to one line increments
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1:      // Copy and expand input, without padding
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
+        shl             v0.8h,   v0.8h,   #3
+        shl             v1.8h,   v1.8h,   #3
+        shl             v2.8h,   v2.8h,   #3
+        shl             v3.8h,   v3.8h,   #3
+        subs            w8,  w8,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1:      // Copy and expand input, padding 8
+        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
+        shl             v2.8h,   v2.8h,   #3
+        shl             v0.8h,   v0.8h,   #3
+        shl             v1.8h,   v1.8h,   #3
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1:      // Copy and expand input, padding 16
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        shl             v1.8h,   v1.8h,   #3
+        shl             v0.8h,   v0.8h,   #3
+        dup             v2.8h,   v1.h[7]
+        dup             v3.8h,   v1.h[7]
+        subs            w8,  w8,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        b               L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1:      // Copy and expand input, padding 24
+        ld1             {v0.8h}, [x1],  x2
+        shl             v0.8h,   v0.8h,   #3
+        dup             v1.8h,   v0.h[7]
+        dup             v2.8h,   v0.h[7]
+        dup             v3.8h,   v0.h[7]
+        subs            w8,  w8,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            2b
+3:
+
+        //  Multiply the height by eight and reuse the w4 subtracting
+        lsl             w6,  w6,  #3
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
 endfunc
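
Unlike the 8 bpc version, which accumulates into 16-bit lanes (v16-v19) and only widens to 32 bits at the end, the 16 bpc version above sums straight into 32-bit lanes (v24-v27). A back-of-the-envelope check shows why, assuming the worst case of a 32x32 block with 12-bit input and one lane per column (the numbers are illustrative):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const int64_t max_ac_8bpc  = 255  << 3;  /*  2040 */
        const int64_t max_ac_12bpc = 4095 << 3;  /* 32760 */
        /* 8 bpc: a column of 32 rows just fits an unsigned 16-bit lane... */
        assert(max_ac_8bpc  * 32 <= UINT16_MAX);     /* 65280 <= 65535 */
        /* ...but at 10 or 12 bpc it overflows, hence the 32-bit accumulators. */
        assert(max_ac_12bpc * 32 >  UINT16_MAX);     /* 1048320 */
        /* The full-block sum still fits comfortably in 32 bits. */
        assert(max_ac_12bpc * 32 * 32 < INT32_MAX);  /* ~33.5 million */
        return 0;
    }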
--- a/src/arm/asm.S
+++ b/src/arm/asm.S
@@ -94,6 +94,8 @@
 #ifdef __ELF__
         .type   EXTERN\name, %function
         .hidden EXTERN\name
+#elif defined(__MACH__)
+        .private_extern EXTERN\name
 #endif
 #if HAVE_AS_FUNC
         .func   EXTERN\name
@@ -129,6 +131,8 @@
         .global EXTERN\name
 #ifdef __ELF__
         .hidden EXTERN\name
+#elif defined(__MACH__)
+        .private_extern EXTERN\name
 #endif
 EXTERN\name:
     .endif
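
The .private_extern directives added above are the Mach-O counterpart of ELF's .hidden: they keep dav1d's internal asm symbols out of the exported symbol table on Apple platforms. A rough C-level analogue of the same idea (illustrative only; the macro and symbol names below are not dav1d's):

    /* clang/gcc map hidden visibility to .hidden on ELF and to .private_extern
     * on Mach-O, which is what the asm macro does by hand. */
    #if defined(__ELF__) || defined(__MACH__)
    #define LOCAL_SYM __attribute__((visibility("hidden")))
    #else
    #define LOCAL_SYM
    #endif

    LOCAL_SYM void some_internal_helper(void);  /* hypothetical symbol */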
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -46,6 +46,7 @@
 
 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
 
 decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
 
@@ -61,7 +62,6 @@
     c->intra_pred[LEFT_DC_PRED]  = BF(dav1d_ipred_dc_left, neon);
     c->intra_pred[HOR_PRED]      = BF(dav1d_ipred_h, neon);
     c->intra_pred[VERT_PRED]     = BF(dav1d_ipred_v, neon);
-#if ARCH_AARCH64
     c->intra_pred[PAETH_PRED]    = BF(dav1d_ipred_paeth, neon);
     c->intra_pred[SMOOTH_PRED]   = BF(dav1d_ipred_smooth, neon);
     c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
@@ -75,8 +75,8 @@
 
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
 
     c->pal_pred                  = BF(dav1d_pal_pred, neon);
-#endif
 #endif
 }
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -99,10 +99,12 @@
     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
+#endif
 
     c->avg = BF(dav1d_avg, neon);
     c->w_avg = BF(dav1d_w_avg, neon);
     c->mask = BF(dav1d_mask, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
     c->blend = BF(dav1d_blend, neon);
     c->blend_h = BF(dav1d_blend_h, neon);
     c->blend_v = BF(dav1d_blend_v, neon);
--- a/src/decode.c
+++ b/src/decode.c
@@ -776,10 +776,10 @@
                                signabs(t->warpmv.matrix[3]),
                                signabs(t->warpmv.matrix[4]),
                                signabs(t->warpmv.matrix[5]),
-                               signabs(t->warpmv.alpha),
-                               signabs(t->warpmv.beta),
-                               signabs(t->warpmv.gamma),
-                               signabs(t->warpmv.delta),
+                               signabs(t->warpmv.u.p.alpha),
+                               signabs(t->warpmv.u.p.beta),
+                               signabs(t->warpmv.u.p.gamma),
+                               signabs(t->warpmv.u.p.delta),
                                b->mv2d.y, b->mv2d.x);
 #undef signabs
                 }
@@ -1849,10 +1849,10 @@
                                signabs(t->warpmv.matrix[3]),
                                signabs(t->warpmv.matrix[4]),
                                signabs(t->warpmv.matrix[5]),
-                               signabs(t->warpmv.alpha),
-                               signabs(t->warpmv.beta),
-                               signabs(t->warpmv.gamma),
-                               signabs(t->warpmv.delta),
+                               signabs(t->warpmv.u.p.alpha),
+                               signabs(t->warpmv.u.p.beta),
+                               signabs(t->warpmv.u.p.gamma),
+                               signabs(t->warpmv.u.p.delta),
                                b->mv[0].y, b->mv[0].x);
 #undef signabs
                     if (f->frame_thread.pass) {
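
The t->warpmv.u.p.* renames here (and the matching wmp->u.p.* / wmp->u.abcd changes in recon_tmpl.c and warpmv.c further down) reflect the shear parameters being wrapped in a union, so that C code can use named fields while the asm receives them as a flat array. A sketch of the layout implied by these accesses; the authoritative definition lives in dav1d's public headers, which are not part of this patch:

    /* Implied by the .u.p.alpha / .u.abcd[] accesses in this patch
     * (field type presumably int16_t, since the values are iclip_wmp()-clipped): */
    union {
        struct {
            int16_t alpha, beta, gamma, delta;
        } p;             /* named access, used by the C code */
        int16_t abcd[4]; /* array view, handed to mc.warp8x8 / warp8x8t */
    } u;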
--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -1,5 +1,5 @@
 ;*****************************************************************************
-;* x86inc.asm: x264asm abstraction layer
+;* x86inc.asm: x86 abstraction layer
 ;*****************************************************************************
 ;* Copyright (C) 2005-2020 x264 project
 ;*
@@ -21,23 +21,14 @@
 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 ;*****************************************************************************
 
-; This is a header file for the x264ASM assembly language, which uses
+; This is a header file for the x86inc.asm assembly language, which uses
 ; NASM/YASM syntax combined with a large number of macros to provide easy
 ; abstraction between different calling conventions (x86_32, win64, linux64).
 ; It also has various other useful features to simplify writing the kind of
-; DSP functions that are most often used in x264.
+; DSP functions that are most often used.
 
-; Unlike the rest of x264, this file is available under an ISC license, as it
-; has significant usefulness outside of x264 and we want it to be available
-; to the largest audience possible.  Of course, if you modify it for your own
-; purposes to add a new feature, we strongly encourage contributing a patch
-; as this feature might be useful for others as well.  Send patches or ideas
-; to x264-devel@videolan.org .
-
-%include "config.asm"
-
 %ifndef private_prefix
-    %define private_prefix dav1d
+    %error private_prefix not defined
 %endif
 
 %ifndef public_prefix
@@ -118,7 +109,7 @@
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
-; covers most of x264's asm.
+; covers most use cases.
 
 ; PROLOGUE:
 ; %1 = number of arguments. loads them from stack if needed.
@@ -1522,12 +1513,11 @@
 AVX_INSTR pabsb, ssse3
 AVX_INSTR pabsd, ssse3
 AVX_INSTR pabsw, ssse3
-AVX_INSTR packsswb, mmx, 0, 0, 0
 AVX_INSTR packssdw, mmx, 0, 0, 0
-AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packsswb, mmx, 0, 0, 0
 AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
 AVX_INSTR paddb, mmx, 0, 0, 1
-AVX_INSTR paddw, mmx, 0, 0, 1
 AVX_INSTR paddd, mmx, 0, 0, 1
 AVX_INSTR paddq, sse2, 0, 0, 1
 AVX_INSTR paddsb, mmx, 0, 0, 1
@@ -1534,6 +1524,7 @@
 AVX_INSTR paddsw, mmx, 0, 0, 1
 AVX_INSTR paddusb, mmx, 0, 0, 1
 AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
 AVX_INSTR palignr, ssse3, 0, 1, 0
 AVX_INSTR pand, mmx, 0, 0, 1
 AVX_INSTR pandn, mmx, 0, 0, 0
@@ -1541,71 +1532,71 @@
 AVX_INSTR pavgw, mmx2, 0, 0, 1
 AVX_INSTR pblendvb, sse4 ; can't be emulated
 AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
 AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
 AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pcmpestri, sse42
-AVX_INSTR pcmpestrm, sse42
-AVX_INSTR pcmpistri, sse42
-AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
 AVX_INSTR pcmpeqb, mmx, 0, 0, 1
-AVX_INSTR pcmpeqw, mmx, 0, 0, 1
 AVX_INSTR pcmpeqd, mmx, 0, 0, 1
 AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
 AVX_INSTR pcmpgtb, mmx, 0, 0, 0
-AVX_INSTR pcmpgtw, mmx, 0, 0, 0
 AVX_INSTR pcmpgtd, mmx, 0, 0, 0
 AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
 AVX_INSTR pextrb, sse4
 AVX_INSTR pextrd, sse4
 AVX_INSTR pextrq, sse4
 AVX_INSTR pextrw, mmx2
-AVX_INSTR phaddw, ssse3, 0, 0, 0
 AVX_INSTR phaddd, ssse3, 0, 0, 0
 AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phaddw, ssse3, 0, 0, 0
 AVX_INSTR phminposuw, sse4
-AVX_INSTR phsubw, ssse3, 0, 0, 0
 AVX_INSTR phsubd, ssse3, 0, 0, 0
 AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR phsubw, ssse3, 0, 0, 0
 AVX_INSTR pinsrb, sse4, 0, 1, 0
 AVX_INSTR pinsrd, sse4, 0, 1, 0
 AVX_INSTR pinsrq, sse4, 0, 1, 0
 AVX_INSTR pinsrw, mmx2, 0, 1, 0
-AVX_INSTR pmaddwd, mmx, 0, 0, 1
 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
 AVX_INSTR pmaxsb, sse4, 0, 0, 1
-AVX_INSTR pmaxsw, mmx2, 0, 0, 1
 AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
 AVX_INSTR pmaxub, mmx2, 0, 0, 1
-AVX_INSTR pmaxuw, sse4, 0, 0, 1
 AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
 AVX_INSTR pminsb, sse4, 0, 0, 1
-AVX_INSTR pminsw, mmx2, 0, 0, 1
 AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
 AVX_INSTR pminub, mmx2, 0, 0, 1
-AVX_INSTR pminuw, sse4, 0, 0, 1
 AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
 AVX_INSTR pmovmskb, mmx2
-AVX_INSTR pmovsxbw, sse4
 AVX_INSTR pmovsxbd, sse4
 AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxdq, sse4
 AVX_INSTR pmovsxwd, sse4
 AVX_INSTR pmovsxwq, sse4
-AVX_INSTR pmovsxdq, sse4
-AVX_INSTR pmovzxbw, sse4
 AVX_INSTR pmovzxbd, sse4
 AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxdq, sse4
 AVX_INSTR pmovzxwd, sse4
 AVX_INSTR pmovzxwq, sse4
-AVX_INSTR pmovzxdq, sse4
 AVX_INSTR pmuldq, sse4, 0, 0, 1
 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
 AVX_INSTR pmulhuw, mmx2, 0, 0, 1
 AVX_INSTR pmulhw, mmx, 0, 0, 1
-AVX_INSTR pmullw, mmx, 0, 0, 1
 AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
 AVX_INSTR pmuludq, sse2, 0, 0, 1
 AVX_INSTR por, mmx, 0, 0, 1
 AVX_INSTR psadbw, mmx2, 0, 0, 1
@@ -1614,20 +1605,19 @@
 AVX_INSTR pshufhw, sse2
 AVX_INSTR pshuflw, sse2
 AVX_INSTR psignb, ssse3, 0, 0, 0
-AVX_INSTR psignw, ssse3, 0, 0, 0
 AVX_INSTR psignd, ssse3, 0, 0, 0
-AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
 AVX_INSTR pslld, mmx, 0, 0, 0
-AVX_INSTR psllq, mmx, 0, 0, 0
 AVX_INSTR pslldq, sse2, 0, 0, 0
-AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
 AVX_INSTR psrad, mmx, 0, 0, 0
-AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
 AVX_INSTR psrld, mmx, 0, 0, 0
-AVX_INSTR psrlq, mmx, 0, 0, 0
 AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
 AVX_INSTR psubb, mmx, 0, 0, 0
-AVX_INSTR psubw, mmx, 0, 0, 0
 AVX_INSTR psubd, mmx, 0, 0, 0
 AVX_INSTR psubq, sse2, 0, 0, 0
 AVX_INSTR psubsb, mmx, 0, 0, 0
@@ -1634,15 +1624,16 @@
 AVX_INSTR psubsw, mmx, 0, 0, 0
 AVX_INSTR psubusb, mmx, 0, 0, 0
 AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
 AVX_INSTR ptest, sse4
 AVX_INSTR punpckhbw, mmx, 0, 0, 0
-AVX_INSTR punpckhwd, mmx, 0, 0, 0
 AVX_INSTR punpckhdq, mmx, 0, 0, 0
 AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
 AVX_INSTR punpcklbw, mmx, 0, 0, 0
-AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR punpckldq, mmx, 0, 0, 0
 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR pxor, mmx, 0, 0, 1
 AVX_INSTR rcpps, sse, 1
 AVX_INSTR rcpss, sse, 1, 0, 0
@@ -1674,8 +1665,8 @@
 
 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
 AVX_INSTR pfadd, 3dnow, 1, 0, 1
-AVX_INSTR pfsub, 3dnow, 1, 0, 0
 AVX_INSTR pfmul, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
 
 ;%1 == instruction
 ;%2 == minimal instruction set
@@ -1740,9 +1731,9 @@
     %endmacro
 %endmacro
 
-FMA_INSTR  pmacsww,  pmullw, paddw
-FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
-FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsdd,  pmulld,  paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq,  paddq ; sse4 emulation
+FMA_INSTR pmacsww,  pmullw,  paddw
 FMA_INSTR pmadcswd, pmaddwd, paddd
 
 ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
--- a/src/meson.build
+++ b/src/meson.build
@@ -147,6 +147,7 @@
 
             if dav1d_bitdepths.contains('16')
                 libdav1d_sources += files(
+                    'arm/32/mc16.S',
                 )
             endif
         endif
--- a/src/obu.c
+++ b/src/obu.c
@@ -1201,7 +1201,6 @@
 
     const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
     const unsigned init_byte_pos = init_bit_pos >> 3;
-    const unsigned pkt_bytelen = init_byte_pos + len;
 
     // We must have read a whole number of bytes at this point (1 byte
     // for the header and whole bytes at a time when reading the
@@ -1345,6 +1344,7 @@
         // The current bit position is a multiple of 8 (because we
         // just aligned it) and less than 8*pkt_bytelen because
         // otherwise the overrun check would have fired.
+        const unsigned pkt_bytelen = init_byte_pos + len;
         const unsigned bit_pos = dav1d_get_bits_pos(&gb);
         assert((bit_pos & 7) == 0);
         assert(pkt_bytelen >= (bit_pos >> 3));
@@ -1371,17 +1371,12 @@
         const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
         const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
         if (gb.error) goto error;
-        Dav1dRef *ref;
-        Dav1dContentLightLevel *content_light;
-        Dav1dMasteringDisplay *mastering_display;
-        Dav1dITUTT35 *itut_t35_metadata;
 
         switch (meta_type) {
-        case OBU_META_HDR_CLL:
-            ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
+        case OBU_META_HDR_CLL: {
+            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
             if (!ref) return DAV1D_ERR(ENOMEM);
-            content_light = ref->data;
-            memset(content_light, 0, sizeof(*content_light));
+            Dav1dContentLightLevel *const content_light = ref->data;
 
             content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
             content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
@@ -1398,11 +1393,11 @@
             c->content_light = content_light;
             c->content_light_ref = ref;
             break;
+        }
         case OBU_META_HDR_MDCV: {
-            ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
+            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
             if (!ref) return DAV1D_ERR(ENOMEM);
-            mastering_display = ref->data;
-            memset(mastering_display, 0, sizeof(*mastering_display));
+            Dav1dMasteringDisplay *const mastering_display = ref->data;
 
             for (int i = 0; i < 3; i++) {
                 mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
@@ -1450,9 +1445,9 @@
                 goto error;
             }
 
-            ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
+            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
             if (!ref) return DAV1D_ERR(ENOMEM);
-            itut_t35_metadata = ref->data;
+            Dav1dITUTT35 *const itut_t35_metadata = ref->data;
 
             // We need our public headers to be C++ compatible, so payload can't be
             // a flexible array member
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -1084,11 +1084,11 @@
             const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
 
             const int dx = (int) (mvx >> 16) - 4;
-            const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
-                                                   wmp->beta  * 7) & ~0x3f;
+            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
+                                                   wmp->u.p.beta  * 7) & ~0x3f;
             const int dy = (int) (mvy >> 16) - 4;
-            const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
-                                                   wmp->delta * 4) & ~0x3f;
+            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
+                                                   wmp->u.p.delta * 4) & ~0x3f;
 
             const pixel *ref_ptr;
             ptrdiff_t ref_stride = refp->p.stride[!!pl];
@@ -1110,10 +1110,10 @@
             }
             if (dst16 != NULL)
                 dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
-                                 wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
             else
                 dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
-                                wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
         }
         if (dst8) dst8  += 8 * PXSTRIDE(dstride);
         else      dst16 += 8 * dstride;
--- a/src/tables.c
+++ b/src/tables.c
@@ -689,7 +689,7 @@
     { 0, -1,  2,   -4, -127,  3, -1, 0 }, { 0,  0,  1,   -2, -128,  1,  0, 0 },
 };
 
-const uint8_t dav1d_sm_weights[128] = {
+const uint8_t ALIGN(dav1d_sm_weights[128], 16) = {
     // Unused, because we always offset by bs, which is at least 2.
       0,   0,
     // bs = 2
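
Aligning dav1d_sm_weights lets SIMD code load the smooth-prediction weights with aligned 16-byte accesses. The ALIGN helper is defined elsewhere in the tree; it typically expands to something like the following (a sketch, not the project's exact definition):

    #include <stdint.h>

    #ifdef _MSC_VER
    #define ALIGN(decl, al) __declspec(align(al)) decl
    #else
    #define ALIGN(decl, al) decl __attribute__((aligned(al)))
    #endif

    /* usage mirroring the change above (hypothetical table): */
    static const uint8_t ALIGN(example_weights[128], 16) = { 0 };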
--- a/src/warpmv.c
+++ b/src/warpmv.c
@@ -82,21 +82,21 @@
 
     if (mat[2] <= 0) return 1;
 
-    wm->alpha = iclip_wmp(mat[2] - 0x10000);
-    wm->beta = iclip_wmp(mat[3]);
+    wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
+    wm->u.p.beta = iclip_wmp(mat[3]);
 
     int shift;
     const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
     const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
     const int rnd = (1 << shift) >> 1;
-    wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
+    wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
     const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
-    wm->delta = iclip_wmp(mat[5] -
+    wm->u.p.delta = iclip_wmp(mat[5] -
                           apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
                           0x10000);
 
-    return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) ||
-           (4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000);
+    return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
+           (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
 }
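
In real-number terms, the fixed-point code above derives the shears from the affine matrix as alpha = m2 - 1, beta = m3, gamma = m4 / m2 and delta = m5 - m3 * m4 / m2 - 1 (with mat[] in 16.16 fixed point, so 0x10000 represents 1.0), then reports the warp as invalid when the shears are too large. A floating-point sketch of the same derivation, for intuition only; the real code stays in 16.16 integers, divides via resolve_divisor_32() and clips each parameter with iclip_wmp():

    #include <stdint.h>

    static void warp_shear_params_model(const int32_t mat[6], double p[4])
    {
        const double m2 = mat[2] / 65536.0, m3 = mat[3] / 65536.0;
        const double m4 = mat[4] / 65536.0, m5 = mat[5] / 65536.0;
        p[0] = m2 - 1.0;                  /* alpha */
        p[1] = m3;                        /* beta  */
        p[2] = m4 / m2;                   /* gamma */
        p[3] = m5 - (m3 * m4) / m2 - 1.0; /* delta */
    }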
 
 static int resolve_divisor_64(const uint64_t d, int *const shift) {
--- a/src/x86/cdef_avx2.asm
+++ b/src/x86/cdef_avx2.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
--- a/src/x86/cdef_avx512.asm
+++ b/src/x86/cdef_avx512.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if HAVE_AVX512ICL && ARCH_X86_64
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -24,32 +24,36 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 16
 
-%if ARCH_X86_32
-pb_0: times 16 db 0
-pb_0xFF: times 16 db 0xFF
-%endif
+%macro DUP8 1-*
+    %rep %0
+        times 8 db %1
+        %rotate 1
+    %endrep
+%endmacro
+
+div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
+                 dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
+                 dw 168, 168, 140, 140, 120, 120, 105, 105
+                 dw 420, 420, 210, 210, 140, 140, 105, 105
+                 dw 105, 105, 105, 105, 105, 105, 105, 105
+shufw_6543210x:  db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
+shufb_lohi:      db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
 pw_8: times 8 dw 8
 pw_128: times 8 dw 128
 pw_256: times 8 dw 256
 pw_2048: times 8 dw 2048
-%if ARCH_X86_32
 pw_0x7FFF: times 8 dw 0x7FFF
 pw_0x8000: times 8 dw 0x8000
-%endif
-div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
-                dd 420, 210, 140, 105, 105, 105, 105, 105
-div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
-                 dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
-shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
 tap_table: ; masks for 8-bit shift emulation
-           db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
            ; weights
-           db 4, 2, 3, 3, 2, 1
+           DUP8 4, 2, 3, 3, 2, 1
            ; taps indices
            db -1 * 16 + 1, -2 * 16 + 2
            db  0 * 16 + 1, -1 * 16 + 2
@@ -75,59 +79,19 @@
  %endif
 %endmacro
 
-%macro SAVE_ARG 2   ; varname, argnum
- %define %1_stkloc  [rsp+%2*gprsize]
- %define %1_argnum  %2
-    mov             r2, r%2m
-    mov      %1_stkloc, r2
-%endmacro
-
-%macro LOAD_ARG 1-2 0 ; varname, load_to_varname_register
- %if %2 == 0
-    mov r %+ %{1}_argnum, %1_stkloc
+%macro PMOVZXBW 2-3 0 ; %3 = half
+ %if cpuflag(sse4) && %3 == 0
+    pmovzxbw        %1, %2
  %else
-    mov            %1q, %1_stkloc
- %endif
-%endmacro
-
-%macro LOAD_ARG32 1-2 ; varname, load_to_varname_register
- %if ARCH_X86_32
-  %if %0 == 1
-    LOAD_ARG %1
+  %if %3 == 1
+    movd            %1, %2
   %else
-    LOAD_ARG %1, %2
+    movq            %1, %2
   %endif
+    punpcklbw       %1, m7
  %endif
 %endmacro
 
-%if ARCH_X86_32
- %define PIC_base_offset $$
- %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
-%else
- %define PIC_sym(sym) sym
-%endif
-
-%macro SAVE_PIC_REG 1
- %if ARCH_X86_32
-    mov       [esp+%1], PIC_reg
- %endif
-%endmacro
-
-%macro LOAD_PIC_REG 1
- %if ARCH_X86_32
-    mov        PIC_reg, [esp+%1]
- %endif
-%endmacro
-
-%macro PMOVZXBW 2-3 0 ; %3 = half
- %if %3 == 1
-    movd            %1, %2
- %else
-    movq            %1, %2
- %endif
-    punpcklbw       %1, m15
-%endmacro
-
 %macro PSHUFB_0 2
  %if cpuflag(ssse3)
     pshufb          %1, %2
@@ -138,34 +102,33 @@
  %endif
 %endmacro
 
-%macro LOAD_SEC_TAP 0
- %if ARCH_X86_64
-    movd            m3, [secq+kq]
-    PSHUFB_0        m3, m15
- %else
-    movd            m2, [secq+kq]             ; sec_taps
-    pxor            m3, m3
-    PSHUFB_0        m2, m3
- %endif
+%macro MOVDDUP 2
+%if cpuflag(ssse3)
+    movddup         %1, %2
+%else
+    movq            %1, %2
+    punpcklqdq      %1, %1
+%endif
 %endmacro
 
-%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
     ; load p0/p1
-    movsx         offq, byte [dirq+kq+%1]       ; off1
+    movsx         offq, byte [dirq+kq+%1+14*8]  ; off1
  %if %6 == 4
-    movq            m5, [stkq+offq*2+%7*0]      ; p0
-    movhps          m5, [stkq+offq*2+%7*1]
+    movq            m5, [stkq+offq*2+32*0]      ; p0
+    movhps          m5, [stkq+offq*2+32*1]
  %else
-    movu            m5, [stkq+offq*2+%7*0]      ; p0
+    movu            m5, [stkq+offq*2+32*0]      ; p0
  %endif
     neg           offq                          ; -off1
  %if %6 == 4
-    movq            m6, [stkq+offq*2+%7*0]      ; p1
-    movhps          m6, [stkq+offq*2+%7*1]
+    movq            m6, [stkq+offq*2+32*0]      ; p1
+    movhps          m6, [stkq+offq*2+32*1]
  %else
-    movu            m6, [stkq+offq*2+%7*0]      ; p1
+    movu            m6, [stkq+offq*2+32*0]      ; p1
  %endif
- %if cpuflag(sse4)
+ %if %7
+  %if cpuflag(sse4)
     ; out of bounds values are set to a value that is a both a large unsigned
     ; value and a negative signed value.
     ; use signed max and unsigned min to remove them
@@ -173,40 +136,26 @@
     pminuw          m8, m5
     pmaxsw          m7, m6
     pminuw          m8, m6
- %else
-  %if ARCH_X86_64
-    pcmpeqw         m9, m14, m5
-    pcmpeqw        m10, m14, m6
-    pandn           m9, m5
-    pandn          m10, m6
-    pmaxsw          m7, m9                      ; max after p0
-    pminsw          m8, m5                      ; min after p0
-    pmaxsw          m7, m10                     ; max after p1
-    pminsw          m8, m6                      ; min after p1
   %else
-    pcmpeqw         m9, m5, OUT_OF_BOUNDS_MEM
-    pandn           m9, m5
-    pmaxsw          m7, m9                      ; max after p0
-    pminsw          m8, m5                      ; min after p0
-    pcmpeqw         m9, m6, OUT_OF_BOUNDS_MEM
-    pandn           m9, m6
-    pmaxsw          m7, m9                      ; max after p1
-    pminsw          m8, m6                      ; min after p1
+    pcmpeqw         m3, m14, m5
+    pminsw          m8, m5     ; min after p0
+    pandn           m3, m5
+    pmaxsw          m7, m3     ; max after p0
+    pcmpeqw         m3, m14, m6
+    pminsw          m8, m6     ; min after p1
+    pandn           m3, m6
+    pmaxsw          m7, m3     ; max after p1
   %endif
  %endif
 
     ; accumulate sum[m13] over p0/p1
-    psubw           m5, m4          ; diff_p0(p0 - px)
-    psubw           m6, m4          ; diff_p1(p1 - px)
-    packsswb        m5, m6          ; convert pixel diff to 8-bit
+    psubw           m5, m4     ; diff_p0(p0 - px)
+    psubw           m6, m4     ; diff_p1(p1 - px)
+    packsswb        m5, m6     ; convert pixel diff to 8-bit
  %if cpuflag(ssse3)
-  %if ARCH_X86_64 && cpuflag(sse4)
-    pshufb          m5, m14         ; group diffs p0 and p1 into pairs
-  %else
-    pshufb          m5, [PIC_sym(shufb_lohi)]
-  %endif
+    pshufb          m5, m13    ; group diffs p0 and p1 into pairs
     pabsb           m6, m5
-    psignb          m9, %5, m5
+    psignb          m3, %5, m5
  %else
     movlhps         m6, m5
     punpckhbw       m6, m5
@@ -214,111 +163,113 @@
     pcmpgtb         m5, m6
     paddb           m6, m5
     pxor            m6, m5
-    paddb           m9, %5, m5
-    pxor            m9, m5
+    paddb           m3, %5, m5
+    pxor            m3, m5
  %endif
- %if ARCH_X86_64
-    psrlw          m10, m6, %2      ; emulate 8-bit shift
-    pand           m10, %3
-    psubusb         m5, %4, m10
- %else
-    psrlw           m5, m6, %2      ; emulate 8-bit shift
-    pand            m5, %3
-    paddusb         m5, %4
-    pxor            m5, [PIC_sym(pb_0xFF)]
- %endif
-    pminub          m5, m6          ; constrain(diff_p)
+    pand            m9, %3, m6 ; emulate 8-bit shift
+    psrlw           m9, %2
+    psubusb         m5, %4, m9
+    pminub          m5, m6     ; constrain(diff_p)
  %if cpuflag(ssse3)
-    pmaddubsw       m5, m9          ; constrain(diff_p) * taps
+    pmaddubsw       m5, m3     ; constrain(diff_p) * taps
  %else
-    psrlw           m2, m5, 8
-    psraw           m6, m9, 8
+    psrlw           m9, m5, 8
+    psraw           m6, m3, 8
     psllw           m5, 8
-    psllw           m9, 8
-    pmullw          m2, m6
-    pmulhw          m5, m9
-    paddw           m5, m2
+    psllw           m3, 8
+    pmullw          m9, m6
+    pmulhw          m5, m3
+    paddw           m5, m9
  %endif
-    paddw          m13, m5
+    paddw           m0, m5
 %endmacro
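
The out-of-bounds comment in ACCUMULATE_TAP refers to the 0x8000 sentinel used by the SSE4 path (OUT_OF_BOUNDS_MEM below): it is simultaneously the most negative int16_t and a very large uint16_t, so a signed max (pmaxsw) never selects it and an unsigned min (pminuw) never selects it either. A tiny standalone check of that property (two's complement assumed):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const uint16_t sentinel = 0x8000;
        const int16_t  pixel    = 200;      /* any in-range CDEF pixel value */
        assert((int16_t)sentinel < pixel);  /* loses every signed max */
        assert(sentinel > (uint16_t)pixel); /* loses every unsigned min */
        return 0;
    }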
 
-%macro LOAD_BODY 4  ; dst, src, block_width, tmp_stride
+%macro LOAD_BODY 3 ; dst, src, block_width
  %if %3 == 4
     PMOVZXBW        m0, [%2+strideq*0]
     PMOVZXBW        m1, [%2+strideq*1]
     PMOVZXBW        m2, [%2+strideq*2]
     PMOVZXBW        m3, [%2+stride3q]
+    mova     [%1+32*0], m0
+    mova     [%1+32*1], m1
+    mova     [%1+32*2], m2
+    mova     [%1+32*3], m3
  %else
     movu            m0, [%2+strideq*0]
     movu            m1, [%2+strideq*1]
     movu            m2, [%2+strideq*2]
     movu            m3, [%2+stride3q]
-    punpckhbw       m4, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m5, m1, m15
-    punpcklbw       m1, m15
-    punpckhbw       m6, m2, m15
-    punpcklbw       m2, m15
-    punpckhbw       m7, m3, m15
-    punpcklbw       m3, m15
+    punpcklbw       m4, m0, m7
+    punpckhbw       m0, m7
+    mova  [%1+32*0+ 0], m4
+    mova  [%1+32*0+16], m0
+    punpcklbw       m4, m1, m7
+    punpckhbw       m1, m7
+    mova  [%1+32*1+ 0], m4
+    mova  [%1+32*1+16], m1
+    punpcklbw       m4, m2, m7
+    punpckhbw       m2, m7
+    mova  [%1+32*2+ 0], m4
+    mova  [%1+32*2+16], m2
+    punpcklbw       m4, m3, m7
+    punpckhbw       m3, m7
+    mova  [%1+32*3+ 0], m4
+    mova  [%1+32*3+16], m3
  %endif
-    mova     [%1+0*%4], m0
-    mova     [%1+1*%4], m1
-    mova     [%1+2*%4], m2
-    mova     [%1+3*%4], m3
- %if %3 == 8
-    mova [%1+0*%4+2*8], m4
-    mova [%1+1*%4+2*8], m5
-    mova [%1+2*%4+2*8], m6
-    mova [%1+3*%4+2*8], m7
- %endif
 %endmacro
 
-%macro CDEF_FILTER 3 ; w, h, stride
-
- %if cpuflag(sse4)
-  %define OUT_OF_BOUNDS 0x80008000
+%macro CDEF_FILTER_END 2 ; w, minmax
+    pxor            m6, m6
+    pcmpgtw         m6, m0
+    paddw           m0, m6
+ %if cpuflag(ssse3)
+    pmulhrsw        m0, m15
  %else
-  %define OUT_OF_BOUNDS 0x7FFF7FFF
+    paddw           m0, m15
+    psraw           m0, 4
  %endif
+    paddw           m4, m0
+ %if %2
+    pminsw          m4, m7
+    pmaxsw          m4, m8
+ %endif
+    packuswb        m4, m4
+ %if %1 == 4
+    movd [dstq+strideq*0], m4
+    psrlq           m4, 32
+    movd [dstq+strideq*1], m4
+    add           stkq, 32*2
+    lea           dstq, [dstq+strideq*2]
+ %else
+    movq        [dstq], m4
+    add           stkq, 32
+    add           dstq, strideq
+ %endif
+%endmacro
 
+%macro CDEF_FILTER 2 ; w, h
  %if ARCH_X86_64
-cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \
-                           dst, stride, left, top, pri, sec, stride3, dst4, edge
-    pcmpeqw        m14, m14
-  %if cpuflag(sse4)
-    psllw          m14, 15                  ; 0x8000
-  %else
-    psrlw          m14, 1                   ; 0x7FFF
-  %endif
-    pxor           m15, m15
-
-  %define px rsp+3*16+2*%3
+cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
+                           dst, stride, left, top, pri, sec, edge, stride3, dst4
+  %define px rsp+3*16+2*32
+  %define base 0
  %else
-cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
-                           dst, stride, left, top, stride3, dst4, edge
-    SAVE_ARG      left, 2
-    SAVE_ARG       top, 3
-    SAVE_ARG       pri, 4
-    SAVE_ARG       sec, 5
-    SAVE_ARG       dir, 6
-    SAVE_ARG   damping, 7
-
-  %define PIC_reg r2
-    LEA        PIC_reg, PIC_base_offset
-
-  %if cpuflag(sse4)
-   %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)]
-  %else
-   %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)]
-  %endif
-
-  %define m15 [PIC_sym(pb_0)]
-
-  %define px esp+7*16+2*%3
+cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
+                           dst, stride, left, edge, stride3
+    %define       topq  r2
+    %define      dst4q  r2
+    LEA             r5, tap_table
+  %define px esp+7*16+2*32
+  %define base r5-tap_table
  %endif
-
     mov          edged, r8m
+ %if cpuflag(sse4)
+   %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
+ %else
+   %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
+ %endif
+    mova            m6, OUT_OF_BOUNDS_MEM
+    pxor            m7, m7
 
     ; prepare pixel buffers - body/right
  %if %2 == 8
@@ -325,11 +276,11 @@
     lea          dst4q, [dstq+strideq*4]
  %endif
     lea       stride3q, [strideq*3]
-    test         edged, 2                   ; have_right
+    test         edgeb, 2 ; have_right
     jz .no_right
-    LOAD_BODY       px, dstq, %1, %3
+    LOAD_BODY       px, dstq, %1
  %if %2 == 8
-    LOAD_BODY  px+4*%3, dst4q, %1, %3
+    LOAD_BODY  px+4*32, dst4q, %1
  %endif
     jmp .body_done
 .no_right:
@@ -337,39 +288,37 @@
     PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
     PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
     PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
+    mova     [px+32*0], m0
+    mova     [px+32*1], m1
+    mova     [px+32*2], m2
+    mova     [px+32*3], m3
+    movd [px+32*0+%1*2], m6
+    movd [px+32*1+%1*2], m6
+    movd [px+32*2+%1*2], m6
+    movd [px+32*3+%1*2], m6
  %if %2 == 8
-    PMOVZXBW        m4, [dst4q+strideq*0], %1 == 4
-    PMOVZXBW        m5, [dst4q+strideq*1], %1 == 4
-    PMOVZXBW        m6, [dst4q+strideq*2], %1 == 4
-    PMOVZXBW        m7, [dst4q+stride3q ], %1 == 4
+    PMOVZXBW        m0, [dst4q+strideq*0], %1 == 4
+    PMOVZXBW        m1, [dst4q+strideq*1], %1 == 4
+    PMOVZXBW        m2, [dst4q+strideq*2], %1 == 4
+    PMOVZXBW        m3, [dst4q+stride3q ], %1 == 4
+    mova     [px+32*4], m0
+    mova     [px+32*5], m1
+    mova     [px+32*6], m2
+    mova     [px+32*7], m3
+    movd [px+32*4+%1*2], m6
+    movd [px+32*5+%1*2], m6
+    movd [px+32*6+%1*2], m6
+    movd [px+32*7+%1*2], m6
  %endif
-    mova     [px+0*%3], m0
-    mova     [px+1*%3], m1
-    mova     [px+2*%3], m2
-    mova     [px+3*%3], m3
- %if %2 == 8
-    mova     [px+4*%3], m4
-    mova     [px+5*%3], m5
-    mova     [px+6*%3], m6
-    mova     [px+7*%3], m7
-    mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS
- %endif
-    mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS
 .body_done:
 
     ; top
-    LOAD_ARG32     top
-    test         edged, 4                    ; have_top
+    movifnidn     topq, r3mp
+    test         edgeb, 4 ; have_top
     jz .no_top
-    test         edged, 1                    ; have_left
+    test         edgeb, 1 ; have_left
     jz .top_no_left
-    test         edged, 2                    ; have_right
+    test         edgeb, 2 ; have_right
     jz .top_no_right
  %if %1 == 4
     PMOVZXBW        m0, [topq+strideq*0-2]
@@ -377,39 +326,39 @@
  %else
     movu            m0, [topq+strideq*0-4]
     movu            m1, [topq+strideq*1-4]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    movu  [px-2*%3+8], m2
-    movu  [px-1*%3+8], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    movu  [px-32*2+8], m2
+    movu  [px-32*1+8], m3
  %endif
-    movu  [px-2*%3-%1], m0
-    movu  [px-1*%3-%1], m1
+    movu  [px-32*2-%1], m0
+    movu  [px-32*1-%1], m1
     jmp .top_done
 .top_no_right:
  %if %1 == 4
     PMOVZXBW        m0, [topq+strideq*0-%1]
     PMOVZXBW        m1, [topq+strideq*1-%1]
-    movu [px-2*%3-4*2], m0
-    movu [px-1*%3-4*2], m1
+    movu   [px-32*2-8], m0
+    movu   [px-32*1-8], m1
  %else
     movu            m0, [topq+strideq*0-%1]
     movu            m1, [topq+strideq*1-%2]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    mova [px-2*%3-8*2], m0
-    mova [px-2*%3-0*2], m2
-    mova [px-1*%3-8*2], m1
-    mova [px-1*%3-0*2], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    mova  [px-32*2-16], m0
+    mova  [px-32*2+ 0], m2
+    mova  [px-32*1-16], m1
+    mova  [px-32*1+ 0], m3
  %endif
-    mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
+    movd [px-32*2+%1*2], m6
+    movd [px-32*1+%1*2], m6
     jmp .top_done
 .top_no_left:
-    test         edged, 2                   ; have_right
+    test         edgeb, 2 ; have_right
     jz .top_no_left_right
  %if %1 == 4
     PMOVZXBW        m0, [topq+strideq*0]
@@ -417,102 +366,92 @@
  %else
     movu            m0, [topq+strideq*0]
     movu            m1, [topq+strideq*1]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    movd [px-2*%3+8*2], m2
-    movd [px-1*%3+8*2], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    movd  [px-32*2+16], m2
+    movd  [px-32*1+16], m3
  %endif
-    mova     [px-2*%3], m0
-    mova     [px-1*%3], m1
-    mov dword [px-2*%3-4], OUT_OF_BOUNDS
-    mov dword [px-1*%3-4], OUT_OF_BOUNDS
+    movd  [px-32*2- 4], m6
+    movd  [px-32*1- 4], m6
+    mova  [px-32*2+ 0], m0
+    mova  [px-32*1+ 0], m1
     jmp .top_done
 .top_no_left_right:
     PMOVZXBW        m0, [topq+strideq*0], %1 == 4
     PMOVZXBW        m1, [topq+strideq*1], %1 == 4
-    mova     [px-2*%3], m0
-    mova     [px-1*%3], m1
-    mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px-2*%3-4], OUT_OF_BOUNDS
-    mov dword [px-1*%3-4], OUT_OF_BOUNDS
+    movd   [px-32*2-4], m6
+    movd   [px-32*1-4], m6
+    mova   [px-32*2+0], m0
+    mova   [px-32*1+0], m1
+    movd [px-32*2+%1*2], m6
+    movd [px-32*1+%1*2], m6
     jmp .top_done
 .no_top:
- %if ARCH_X86_64
-    SWAP            m0, m14
- %else
-    mova            m0, OUT_OF_BOUNDS_MEM
- %endif
-    movu   [px-2*%3-4], m0
-    movu   [px-1*%3-4], m0
+    movu  [px-32*2- 4], m6
+    movu  [px-32*1- 4], m6
  %if %1 == 8
-    movq   [px-2*%3+12], m0
-    movq   [px-1*%3+12], m0
+    movq  [px-32*2+12], m6
+    movq  [px-32*1+12], m6
  %endif
- %if ARCH_X86_64
-    SWAP            m0, m14
- %endif
 .top_done:
 
     ; left
-    test         edged, 1                   ; have_left
+    test         edgeb, 1 ; have_left
     jz .no_left
-    SAVE_PIC_REG     0
-    LOAD_ARG32    left
+    movifnidn    leftq, leftmp
  %if %2 == 4
     movq            m0, [leftq]
  %else
     movu            m0, [leftq]
  %endif
-    LOAD_PIC_REG     0
  %if %2 == 4
-    punpcklbw       m0, m15
+    punpcklbw       m0, m7
  %else
-    punpckhbw       m1, m0, m15
-    punpcklbw       m0, m15
+    punpckhbw       m1, m0, m7
+    punpcklbw       m0, m7
     movhlps         m3, m1
-    movd   [px+4*%3-4], m1
-    movd   [px+6*%3-4], m3
+    movd   [px+32*4-4], m1
+    movd   [px+32*6-4], m3
     psrlq           m1, 32
     psrlq           m3, 32
-    movd   [px+5*%3-4], m1
-    movd   [px+7*%3-4], m3
+    movd   [px+32*5-4], m1
+    movd   [px+32*7-4], m3
  %endif
     movhlps         m2, m0
-    movd   [px+0*%3-4], m0
-    movd   [px+2*%3-4], m2
+    movd   [px+32*0-4], m0
+    movd   [px+32*2-4], m2
     psrlq           m0, 32
     psrlq           m2, 32
-    movd   [px+1*%3-4], m0
-    movd   [px+3*%3-4], m2
+    movd   [px+32*1-4], m0
+    movd   [px+32*3-4], m2
     jmp .left_done
 .no_left:
-    mov dword [px+0*%3-4], OUT_OF_BOUNDS
-    mov dword [px+1*%3-4], OUT_OF_BOUNDS
-    mov dword [px+2*%3-4], OUT_OF_BOUNDS
-    mov dword [px+3*%3-4], OUT_OF_BOUNDS
+    movd   [px+32*0-4], m6
+    movd   [px+32*1-4], m6
+    movd   [px+32*2-4], m6
+    movd   [px+32*3-4], m6
  %if %2 == 8
-    mov dword [px+4*%3-4], OUT_OF_BOUNDS
-    mov dword [px+5*%3-4], OUT_OF_BOUNDS
-    mov dword [px+6*%3-4], OUT_OF_BOUNDS
-    mov dword [px+7*%3-4], OUT_OF_BOUNDS
+    movd   [px+32*4-4], m6
+    movd   [px+32*5-4], m6
+    movd   [px+32*6-4], m6
+    movd   [px+32*7-4], m6
  %endif
 .left_done:
 
     ; bottom
  %if ARCH_X86_64
-    DEFINE_ARGS dst, stride, dummy1, dst8, pri, sec, stride3, dummy2, edge
+    DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3
  %else
-    DEFINE_ARGS dst, stride, dummy1, dst8, stride3, dummy2, edge
+    DEFINE_ARGS dst, stride, dst8, edge, stride3
  %endif
-    test         edged, 8                   ; have_bottom
+    test         edgeb, 8 ; have_bottom
     jz .no_bottom
     lea          dst8q, [dstq+%2*strideq]
-    test         edged, 1                   ; have_left
+    test         edgeb, 1 ; have_left
     jz .bottom_no_left
-    test         edged, 2                   ; have_right
+    test         edgeb, 2 ; have_right
     jz .bottom_no_right
  %if %1 == 4
     PMOVZXBW        m0, [dst8q-(%1/2)]
@@ -520,40 +459,40 @@
  %else
     movu            m0, [dst8q-4]
     movu            m1, [dst8q+strideq-4]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    movu [px+(%2+0)*%3+8], m2
-    movu [px+(%2+1)*%3+8], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    movu [px+32*(%2+0)+8], m2
+    movu [px+32*(%2+1)+8], m3
  %endif
-    movu [px+(%2+0)*%3-%1], m0
-    movu [px+(%2+1)*%3-%1], m1
+    movu [px+32*(%2+0)-%1], m0
+    movu [px+32*(%2+1)-%1], m1
     jmp .bottom_done
 .bottom_no_right:
  %if %1 == 4
     PMOVZXBW        m0, [dst8q-4]
     PMOVZXBW        m1, [dst8q+strideq-4]
-    movu [px+(%2+0)*%3-4*2], m0
-    movu [px+(%2+1)*%3-4*2], m1
+    movu [px+32*(%2+0)-8], m0
+    movu [px+32*(%2+1)-8], m1
  %else
     movu            m0, [dst8q-8]
     movu            m1, [dst8q+strideq-8]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    mova [px+(%2+0)*%3-8*2], m0
-    mova [px+(%2+0)*%3-0*2], m2
-    mova [px+(%2+1)*%3-8*2], m1
-    mova [px+(%2+1)*%3-0*2], m3
-    mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS     ; overwritten by first mova
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    mova [px+32*(%2+0)-16], m0
+    mova [px+32*(%2+0)+ 0], m2
+    mova [px+32*(%2+1)-16], m1
+    mova [px+32*(%2+1)+ 0], m3
+    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
  %endif
-    mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
+    movd [px+32*(%2+0)+%1*2], m6
+    movd [px+32*(%2+1)+%1*2], m6
     jmp .bottom_done
 .bottom_no_left:
-    test          edged, 2                  ; have_right
+    test         edgeb, 2 ; have_right
     jz .bottom_no_left_right
  %if %1 == 4
     PMOVZXBW        m0, [dst8q]
@@ -561,233 +500,245 @@
  %else
     movu            m0, [dst8q]
     movu            m1, [dst8q+strideq]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    mova [px+(%2+0)*%3+8*2], m2
-    mova [px+(%2+1)*%3+8*2], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    mova [px+32*(%2+0)+16], m2
+    mova [px+32*(%2+1)+16], m3
  %endif
-    mova [px+(%2+0)*%3], m0
-    mova [px+(%2+1)*%3], m1
-    mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+    mova [px+32*(%2+0)+ 0], m0
+    mova [px+32*(%2+1)+ 0], m1
+    movd [px+32*(%2+0)- 4], m6
+    movd [px+32*(%2+1)- 4], m6
     jmp .bottom_done
 .bottom_no_left_right:
     PMOVZXBW        m0, [dst8q+strideq*0], %1 == 4
     PMOVZXBW        m1, [dst8q+strideq*1], %1 == 4
-    mova [px+(%2+0)*%3], m0
-    mova [px+(%2+1)*%3], m1
-    mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+    mova [px+32*(%2+0)+ 0], m0
+    mova [px+32*(%2+1)+ 0], m1
+    movd [px+32*(%2+0)+%1*2], m6
+    movd [px+32*(%2+1)+%1*2], m6
+    movd [px+32*(%2+0)- 4], m6
+    movd [px+32*(%2+1)- 4], m6
     jmp .bottom_done
 .no_bottom:
- %if ARCH_X86_64
-    SWAP            m0, m14
- %else
-    mova            m0, OUT_OF_BOUNDS_MEM
- %endif
-    movu [px+(%2+0)*%3-4], m0
-    movu [px+(%2+1)*%3-4], m0
+    movu [px+32*(%2+0)- 4], m6
+    movu [px+32*(%2+1)- 4], m6
  %if %1 == 8
-    movq [px+(%2+0)*%3+12], m0
-    movq [px+(%2+1)*%3+12], m0
+    movq [px+32*(%2+0)+12], m6
+    movq [px+32*(%2+1)+12], m6
  %endif
- %if ARCH_X86_64
-    SWAP            m0, m14
- %endif
 .bottom_done:
 
     ; actual filter
-    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp
  %if ARCH_X86_64
-    movifnidn     prid, prim
-    movifnidn     secd, secm
-    mov       dampingd, r7m
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec
+    mova           m13, [shufb_lohi]
+ %if cpuflag(ssse3)
+    mova           m15, [pw_2048]
  %else
-    LOAD_ARG       pri
-    LOAD_ARG       sec
-    LOAD_ARG   damping, 1
+    mova           m15, [pw_8]
  %endif
-
-    SAVE_PIC_REG     8
-    mov        pridmpd, prid
-    mov        secdmpd, secd
-    or         pridmpd, 1
-    or         secdmpd, 1
-    bsr        pridmpd, pridmpd
-    bsr        secdmpd, secdmpd
+    mova           m14, m6
+ %else
+    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
+    %xdefine        m8  m1
+    %xdefine        m9  m2
+    %xdefine       m10  m0
+    %xdefine       m13  [base+shufb_lohi]
+    %xdefine       m14  OUT_OF_BOUNDS_MEM
+ %if cpuflag(ssse3)
+    %xdefine       m15  [base+pw_2048]
+ %else
+    %xdefine       m15  [base+pw_8]
+ %endif
+ %endif
+    movifnidn     prid, r4m
+    movifnidn     secd, r5m
+    mov       dampingd, r7m
+    movif32 [esp+0x3C], r1d
+    test          prid, prid
+    jz .sec_only
+    movd            m1, prim
+    bsr        pridmpd, prid
+    test          secd, secd
+    jz .pri_only
+    movd           m10, r5m
+    bsr           secd, secd
+    and           prid, 1
     sub        pridmpd, dampingd
-    sub        secdmpd, dampingd
+    sub           secd, dampingd
     xor       dampingd, dampingd
+    add           prid, prid
     neg        pridmpd
     cmovs      pridmpd, dampingd
-    neg        secdmpd
-    cmovs      secdmpd, dampingd
+    neg           secd
+    cmovs         secd, dampingd
+    PSHUFB_0        m1, m7
+    PSHUFB_0       m10, m7
  %if ARCH_X86_64
-    mov       [rsp+ 0], pridmpq                 ; pri_shift
-    mov       [rsp+16], secdmpq                 ; sec_shift
+    DEFINE_ARGS dst, stride, pridmp, tap, pri, sec
+    lea           tapq, [tap_table]
+    MOVDDUP        m11, [tapq+pridmpq*8] ; pri_shift_mask
+    MOVDDUP        m12, [tapq+secq*8]    ; sec_shift_mask
+    mov     [rsp+0x00], pridmpq          ; pri_shift
+    mov     [rsp+0x10], secq             ; sec_shift
+    DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h
  %else
+    MOVDDUP         m2, [tapq+pridmpq*8]
+    MOVDDUP         m3, [tapq+secq*8]
+    mov     [esp+0x04], dampingd         ; zero upper 32 bits of psrlw
+    mov     [esp+0x34], dampingd         ; source operand in ACCUMULATE_TAP
     mov     [esp+0x00], pridmpd
-    mov     [esp+0x30], secdmpd
-    mov dword [esp+0x04], 0                     ; zero upper 32 bits of psrlw
-    mov dword [esp+0x34], 0                     ; source operand in ACCUMULATE_TAP
-  %define PIC_reg r4
-    LOAD_PIC_REG     8
+    mov     [esp+0x30], secd
+    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+  %define         offq  dstq
+  %define           kd  strided
+  %define           kq  strideq
+    mova    [esp+0x10], m2
+    mova    [esp+0x40], m3
+    mova    [esp+0x20], m1
+    mova    [esp+0x50], m10
  %endif
-
-    DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp
-    lea         tableq, [PIC_sym(tap_table)]
- %if ARCH_X86_64
-    SWAP            m2, m11
-    SWAP            m3, m12
+    mov           dird, r6m
+    lea           stkq, [px]
+    lea           priq, [tapq+8*8+priq*8] ; pri_taps
+    mov             hd, %1*%2/8
+    lea           dirq, [tapq+dirq*2]
+.v_loop:
+    movif32 [esp+0x38], dstd
+    mov             kd, 1
+ %if %1 == 4
+    movq            m4, [stkq+32*0]
+    movhps          m4, [stkq+32*1]
+ %else
+    mova            m4, [stkq+32*0]       ; px
  %endif
-    movd            m2, [tableq+pridmpq]
-    movd            m3, [tableq+secdmpq]
-    PSHUFB_0        m2, m15                     ; pri_shift_mask
-    PSHUFB_0        m3, m15                     ; sec_shift_mask
+    pxor            m0, m0                ; sum
+    mova            m7, m4                ; max
+    mova            m8, m4                ; min
+.k_loop:
+    MOVDDUP         m2, [priq+kq*8]
  %if ARCH_X86_64
-    SWAP            m2, m11
-    SWAP            m3, m12
+    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
+    MOVDDUP         m2, [tapq+12*8+kq*8]
+    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
+    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
  %else
-  %define PIC_reg r6
-    mov        PIC_reg, r4
-    DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp
-    LOAD_ARG       pri
-    LOAD_ARG       dir, 1
-    mova    [esp+0x10], m2
-    mova    [esp+0x40], m3
+    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
+    MOVDDUP         m2, [tapq+12*8+kq*8]
+    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+    MOVDDUP         m2, [tapq+12*8+kq*8]
+    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
  %endif
+    dec             kd
+    jge .k_loop
+    movif32       dstq, [esp+0x38]
+    movif32    strideq, [esp+0x3C]
+    CDEF_FILTER_END %1, 1
+    dec             hd
+    jg .v_loop
+    RET
 
-    ; pri/sec_taps[k] [4 total]
-    DEFINE_ARGS dst, stride, dummy, tap, pri, sec
-    movd            m0, prid
-    movd            m1, secd
- %if ARCH_X86_64
-    PSHUFB_0        m0, m15
-    PSHUFB_0        m1, m15
+.pri_only:
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero
+    lea           tapq, [tap_table]
  %else
-  %if cpuflag(ssse3)
-    pxor            m2, m2
-  %endif
-    mova            m3, [PIC_sym(pb_0xFF)]
-    PSHUFB_0        m0, m2
-    PSHUFB_0        m1, m2
-    pxor            m0, m3
-    pxor            m1, m3
-    mova    [esp+0x20], m0
-    mova    [esp+0x50], m1
+    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
  %endif
     and           prid, 1
-    lea           priq, [tapq+8+priq*2]         ; pri_taps
-    lea           secq, [tapq+12]               ; sec_taps
-
- %if ARCH_X86_64 && cpuflag(sse4)
-    mova           m14, [shufb_lohi]
- %endif
-
-    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
-    DEFINE_ARGS dst, stride, dir, tap, pri, sec
+    xor          zerod, zerod
+    sub       dampingd, pridmpd
+    cmovs     dampingd, zerod
+    add           prid, prid
+    PSHUFB_0        m1, m7
+    MOVDDUP         m7, [tapq+dampingq*8]
+    mov     [rsp+0x00], dampingq
  %if ARCH_X86_64
-    mov           dird, r6m
-    lea           dirq, [tapq+14+dirq*2]
-    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+    DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h
  %else
-    lea           dird, [tapd+14+dird*2]
-    DEFINE_ARGS dst, stride, dir, stk, pri, sec
-  %define hd    dword [esp+8]
-  %define offq  dstq
-  %define kq    strideq
+    mov     [rsp+0x04], zerod
+    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
  %endif
-    mov             hd, %1*%2*2/mmsize
+    mov           dird, r6m
     lea           stkq, [px]
-    movif32 [esp+0x3C], strided
-.v_loop:
+    lea           priq, [tapq+8*8+priq*8]
+    mov             hd, %1*%2/8
+    lea           dirq, [tapq+dirq*2]
+.pri_v_loop:
     movif32 [esp+0x38], dstd
-    mov             kq, 1
+    mov             kd, 1
  %if %1 == 4
-    movq            m4, [stkq+%3*0]
-    movhps          m4, [stkq+%3*1]
+    movq            m4, [stkq+32*0]
+    movhps          m4, [stkq+32*1]
  %else
-    mova            m4, [stkq+%3*0]             ; px
+    mova            m4, [stkq+32*0]
  %endif
+    pxor            m0, m0
+.pri_k_loop:
+    MOVDDUP         m2, [priq+kq*8]
+    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
+    dec             kd
+    jge .pri_k_loop
+    movif32       dstq, [esp+0x38]
+    movif32    strideq, [esp+0x3C]
+    CDEF_FILTER_END %1, 0
+    dec             hd
+    jg .pri_v_loop
+    RET
 
- %if ARCH_X86_32
-  %xdefine m9   m3
-  %xdefine m13  m7
-  %xdefine  m7  m0
-  %xdefine  m8  m1
- %endif
-
-    pxor           m13, m13                     ; sum
-    mova            m7, m4                      ; max
-    mova            m8, m4                      ; min
-.k_loop:
-    movd            m2, [priq+kq]               ; pri_taps
+.sec_only:
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero
+%else
+    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
+%endif
+    movd            m1, r5m
+    bsr           secd, secd
+    mov           dird, r6m
+    xor          zerod, zerod
+    sub       dampingd, secd
+    cmovs     dampingd, zerod
+    PSHUFB_0        m1, m7
  %if ARCH_X86_64
-    PSHUFB_0        m2, m15
-  %if cpuflag(ssse3)
-    LOAD_SEC_TAP                                ; sec_taps
-  %endif
-    ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
-  %if notcpuflag(ssse3)
-    LOAD_SEC_TAP                                ; sec_taps
-  %endif
-    ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
-    ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
+    lea           tapq, [tap_table]
  %else
-  %if cpuflag(ssse3)
-    pxor            m3, m3
-  %endif
-    PSHUFB_0        m2, m3
-    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
-    LOAD_SEC_TAP                                ; sec_taps
-    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
-  %if notcpuflag(ssse3)
-    LOAD_SEC_TAP                                ; sec_taps
-  %endif
-    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+    mov     [rsp+0x04], zerod
  %endif
-
-    dec             kq
-    jge .k_loop
-
-    pxor            m6, m6
-    pcmpgtw         m6, m13
-    paddw          m13, m6
- %if cpuflag(ssse3)
-    pmulhrsw       m13, [PIC_sym(pw_2048)]
+    mov     [rsp+0x00], dampingq
+    MOVDDUP         m7, [tapq+dampingq*8]
+    lea           dirq, [tapq+dirq*2]
+ %if ARCH_X86_64
+    DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h
  %else
-    paddw          m13, [PIC_sym(pw_8)]
-    psraw          m13, 4
+    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
  %endif
-    paddw           m4, m13
-    pminsw          m4, m7
-    pmaxsw          m4, m8
-    packuswb        m4, m4
-    movif32       dstd, [esp+0x38]
-    movif32    strided, [esp+0x3C]
+    lea           stkq, [px]
+    mov             hd, %1*%2/8
+.sec_v_loop:
+    mov             kd, 1
  %if %1 == 4
-    movd [dstq+strideq*0], m4
-    psrlq           m4, 32
-    movd [dstq+strideq*1], m4
+    movq            m4, [stkq+32*0]
+    movhps          m4, [stkq+32*1]
  %else
-    movq [dstq], m4
+    mova            m4, [stkq+32*0]
  %endif
-
- %if %1 == 4
- %define vloop_lines (mmsize/(%1*2))
-    lea           dstq, [dstq+strideq*vloop_lines]
-    add           stkq, %3*vloop_lines
- %else
-    lea           dstq, [dstq+strideq]
-    add           stkq, %3
+    pxor            m0, m0
+.sec_k_loop:
+    MOVDDUP         m2, [tapq+12*8+kq*8]
+    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
+ %if ARCH_X86_32
+    MOVDDUP         m2, [tapq+12*8+kq*8]
  %endif
+    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
+    dec             kd
+    jge .sec_k_loop
+    movif32    strideq, [esp+0x3C]
+    CDEF_FILTER_END %1, 0
     dec             hd
-    jg .v_loop
-
+    jg .sec_v_loop
     RET
 %endmacro
 
@@ -1079,18 +1030,16 @@
     shr            r1d, 10
     mov         [varq], r1d
  %else
-cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
-  %define PIC_reg r4
-    LEA        PIC_reg, PIC_base_offset
-
+cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
+%define base r2-shufw_6543210x
+    LEA             r2, shufw_6543210x
     pxor            m0, m0
-    mova            m1, [PIC_sym(pw_128)]
-
     lea       stride3q, [strideq*3]
     movq            m5, [srcq+strideq*0]
     movhps          m5, [srcq+strideq*1]
     movq            m7, [srcq+strideq*2]
     movhps          m7, [srcq+stride3q]
+    mova            m1, [base+pw_128]
     psadbw          m2, m5, m0
     psadbw          m3, m7, m0
     packssdw        m2, m3
@@ -1143,7 +1092,7 @@
     pmaddwd         m0, m0
 
     phaddd          m2, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+48]
+    MULLD           m2, [base+div_table%+SUFFIX+48]
     mova    [esp+0x30], m2
 
     mova            m1, [esp+0x10]
@@ -1176,13 +1125,13 @@
     paddw           m0, m2                  ; partial_sum_diag[0][0-7]
     paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
     mova            m3, [esp+0x50]
-    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    pshufb          m1, [base+shufw_6543210x]
     punpckhwd       m2, m0, m1
     punpcklwd       m0, m1
     pmaddwd         m2, m2
     pmaddwd         m0, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+16]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+0]
+    MULLD           m2, [base+div_table%+SUFFIX+16]
+    MULLD           m0, [base+div_table%+SUFFIX+ 0]
     paddd           m0, m2                  ; cost[0a-d]
     mova    [esp+0x40], m0
 
@@ -1217,13 +1166,13 @@
     paddw           m0, m2                  ; partial_sum_diag[1][0-7]
     paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
     mova            m3, [esp+0x50]
-    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    pshufb          m1, [base+shufw_6543210x]
     punpckhwd       m2, m0, m1
     punpcklwd       m0, m1
     pmaddwd         m2, m2
     pmaddwd         m0, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+16]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+0]
+    MULLD           m2, [base+div_table%+SUFFIX+16]
+    MULLD           m0, [base+div_table%+SUFFIX+ 0]
     paddd           m0, m2                  ; cost[4a-d]
     phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
     phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
@@ -1259,8 +1208,8 @@
     punpcklwd       m0, m1
     pmaddwd         m2, m2
     pmaddwd         m0, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m2, [base+div_table%+SUFFIX+48]
+    MULLD           m0, [base+div_table%+SUFFIX+32]
     paddd           m0, m2                  ; cost[7a-d]
     mova    [esp+0x40], m0
 
@@ -1280,8 +1229,8 @@
     punpcklwd       m0, m2
     pmaddwd         m7, m7
     pmaddwd         m0, m0
-    MULLD           m7, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m7, [base+div_table%+SUFFIX+48]
+    MULLD           m0, [base+div_table%+SUFFIX+32]
     paddd           m0, m7                  ; cost[5a-d]
     mova    [esp+0x50], m0
 
@@ -1303,8 +1252,8 @@
     punpcklwd       m0, m2
     pmaddwd         m7, m7
     pmaddwd         m0, m0
-    MULLD           m7, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m7, [base+div_table%+SUFFIX+48]
+    MULLD           m0, [base+div_table%+SUFFIX+32]
     paddd           m0, m7                  ; cost[1a-d]
     SWAP            m0, m4
 
@@ -1330,8 +1279,8 @@
     punpcklwd       m4, m2
     pmaddwd         m0, m0
     pmaddwd         m4, m4
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m4, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m0, [base+div_table%+SUFFIX+48]
+    MULLD           m4, [base+div_table%+SUFFIX+32]
     paddd           m4, m0                   ; cost[3a-d]
 
     mova            m1, [esp+0x00]
@@ -1367,6 +1316,7 @@
   %endif
 
     ; get direction and variance
+    mov           vard, varm
     punpckhdq       m3, m2, m1
     punpckldq       m2, m1
     psubd           m1, m0, m3
@@ -1388,18 +1338,18 @@
 %endmacro
 
 INIT_XMM sse4
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
 CDEF_DIR
 
 INIT_XMM ssse3
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
 CDEF_DIR
 
 INIT_XMM sse2
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
--- a/src/x86/cpuid.asm
+++ b/src/x86/cpuid.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION .text
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA
--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 16
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 
--- a/src/x86/loopfilter.asm
+++ b/src/x86/loopfilter.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
--- a/src/x86/looprestoration_ssse3.asm
+++ b/src/x86/looprestoration_ssse3.asm
@@ -24,6 +24,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 16
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
@@ -2766,9 +2767,9 @@
 %ifidn %1, put
  %assign isprep 0
  %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
  %else
-cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
  %endif
  %xdefine base_reg r12
  %define rndshift 10
@@ -2775,11 +2776,11 @@
 %else
  %assign isprep 1
  %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
   %xdefine tmp_stridem r14q
  %else
-cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
-  %define tmp_stridem qword [rsp+104]
+cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+  %define tmp_stridem qword [rsp+120]
  %endif
  %xdefine base_reg r11
  %define rndshift 6
@@ -2808,7 +2809,7 @@
   %define hm r6m
  %endif
  %if required_stack_alignment > STACK_ALIGNMENT
-  %define dsm [rsp+96]
+  %define dsm [rsp+112]
   %define rX r1
   %define rXd r1d
  %else
@@ -2824,7 +2825,7 @@
   %define dxm r7m
  %else
   DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
-  %define hm [rsp+96]
+  %define hm [rsp+112]
  %endif
  MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
  %define rX r14
@@ -3104,181 +3105,9 @@
     lea                srcq, [srcq+ssq*2]
     jmp .w4_loop
 .w8:
-%ifidn %1, put
-    movifnidn           dsm, dsq
-%endif
-    shr                 t0d, 16
-    sub                srcq, 3
-    movd               xm15, t0d
-    pmaddwd              m8, [base+rescale_mul]
-    vpbroadcastq        m11, [base+pq_0x40000000]
-    vpbroadcastd        m15, xm15
-    paddd               m14, m8 ; mx+dx*[0-7]
-    pand                 m6, m14, m10
-    psrld                m6, 6
-    paddd               m15, m6
-    pcmpeqd              m6, m9
-    vextracti128        xm7, m15, 1
-    movd                r4d, xm15
-    pextrd              r6d, xm15, 2
-    pextrd              r7d, xm15, 1
-    pextrd              r9d, xm15, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+r4*8]
-    movq               xm10, [base+subpel_filters+r6*8]
-    movhps             xm15, [base+subpel_filters+r7*8]
-    movhps             xm10, [base+subpel_filters+r9*8]
-    vinserti128         m15, [base+subpel_filters+r10*8], 1
-    vinserti128         m10, [base+subpel_filters+r11*8], 1
-    vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+rX*8]
-    psrld               m14, 10
-    mova              [rsp], xm14
-    vextracti128        xm7, m14, 1
-    movd                r4d, xm14
-    pextrd              r6d, xm14, 2
-    pextrd              r7d, xm14, 1
-    pextrd              r9d, xm14, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    pshufd               m5, m6, q1100
-    pshufd               m6, m6, q3322
-    vpblendd            m15, m9, 0xc0
-    vpblendd            m10, m8, 0xc0
-    pblendvb            m15, m11, m5
-    pblendvb            m10, m11, m6
-    vbroadcasti128      m14, [base+subpel_s_shuf8]
-    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
-    mov                 dyd, dym
-    pshufb               m0, m14    ; 01a 01b
-    pshufb               m1, m14    ; 23a 23b
-    pshufb               m2, m14    ; 45a 45b
-    pshufb               m3, m14    ; 67a 67b
-    vbroadcasti128      m14, [base+wswap]
-.w8_loop:
-    and                 myd, 0x3ff
-    mov                 r6d, 64 << 24
-    mov                 r4d, myd
-    shr                 r4d, 6
-    lea                 r4d, [t1+r4]
-    cmovnz              r6q, [base+subpel_filters+r4*8]
-    movq               xm11, r6q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pmaddwd              m4, m0, m8
-    pmaddwd              m5, m1, m9
-    pshufd               m8, m11, q2222
-    pshufd              m11, m11, q3333
-    pmaddwd              m6, m2, m8
-    pmaddwd              m7, m3, m11
-    paddd                m4, m5
-    paddd                m6, m7
-    paddd                m4, m13
-    paddd                m4, m6
-    psrad                m4, rndshift
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-%ifidn %1, put
-    packuswb            xm4, xm4
-    movq             [dstq], xm4
-    add                dstq, dsm
-%else
-    mova             [tmpq], xm4
-    add                tmpq, 16
-%endif
-    dec                  hd
-    jz .ret
-    add                 myd, dyd
-    test                myd, ~0x3ff
-    jz .w8_loop
-    test                myd, 0x400
-    mov            [rsp+16], myd
-    mov                 r4d, [rsp+ 0]
-    mov                 r6d, [rsp+ 8]
-    mov                 r7d, [rsp+ 4]
-    mov                 r9d, [rsp+12]
-    jz .w8_skip_line
-    vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+ rX]
-    movq                xm4, [srcq+ r4]
-    movq                xm5, [srcq+ r6]
-    movhps              xm4, [srcq+ r7]
-    movhps              xm5, [srcq+ r9]
-    vinserti128          m4, [srcq+r10], 1
-    vinserti128          m5, [srcq+r11], 1
-    add                srcq, ssq
-    mov                 myd, [rsp+16]
-    mov                 dyd, dym
-    pshufb               m0, m14
-    pshufb               m1, m14
-    pshufb               m2, m14
-    pshufb               m3, m14
-    vpblendd             m4, m6, 0xc0
-    vpblendd             m5, m7, 0xc0
-    pmaddubsw            m4, m15
-    pmaddubsw            m5, m10
-    phaddw               m4, m5
-    pslld                m5, m4, 16
-    paddw                m4, m5
-    pmulhrsw             m4, m12
-    pblendw              m0, m1, 0xaa
-    pblendw              m1, m2, 0xaa
-    pblendw              m2, m3, 0xaa
-    pblendw              m3, m4, 0xaa
-    jmp .w8_loop
-.w8_skip_line:
-    mova                 m0, m1
-    mova                 m1, m2
-    mova                 m2, m3
-    vpbroadcastq         m7, [srcq+r13]
-    vpbroadcastq         m8, [srcq+ rX]
-    movq                xm3, [srcq+ r4]
-    movq                xm4, [srcq+ r6]
-    movhps              xm3, [srcq+ r7]
-    movhps              xm4, [srcq+ r9]
-    vinserti128          m3, [srcq+r10], 1
-    vinserti128          m4, [srcq+r11], 1
-    add                srcq, ssq
-    movq                xm5, [srcq+ r4]
-    movq                xm6, [srcq+ r6]
-    movhps              xm5, [srcq+ r7]
-    movhps              xm6, [srcq+ r9]
-    vinserti128          m5, [srcq+r10], 1
-    vinserti128          m6, [srcq+r11], 1
-    vpbroadcastq         m9, [srcq+r13]
-    vpbroadcastq        m11, [srcq+ rX]
-    add                srcq, ssq
-    mov                 myd, [rsp+16]
-    mov                 dyd, dym
-    vpblendd             m3, m7, 0xc0
-    vpblendd             m4, m8, 0xc0
-    vpblendd             m5, m9, 0xc0
-    vpblendd             m6, m11, 0xc0
-    pmaddubsw            m3, m15
-    pmaddubsw            m4, m10
-    pmaddubsw            m5, m15
-    pmaddubsw            m6, m10
-    phaddw               m3, m4
-    phaddw               m5, m6
-    psrld                m4, m3, 16
-    pslld                m6, m5, 16
-    paddw                m3, m4
-    paddw                m5, m6
-    pblendw              m3, m5, 0xaa
-    pmulhrsw             m3, m12
-    jmp .w8_loop
+    mov      dword [rsp+48], 1
+    movifprep   tmp_stridem, 16
+    jmp .w_start
 .w16:
     mov      dword [rsp+48], 2
     movifprep   tmp_stridem, 32
@@ -3698,127 +3527,9 @@
     jg .dy1_w4_loop
     MC_8TAP_SCALED_RET
 .dy1_w8:
-%ifidn %1, put
-    movifnidn           dsm, dsq
-%endif
-    shr                 t0d, 16
-    sub                srcq, 3
-    movd               xm15, t0d
-    pmaddwd              m8, [base+rescale_mul]
-    vpbroadcastq        m11, [base+pq_0x40000000]
-    vpbroadcastd        m15, xm15
-    paddd               m14, m8 ; mx+dx*[0-7]
-    pand                 m6, m14, m10
-    psrld                m6, 6
-    paddd               m15, m6
-    pcmpeqd              m6, m9
-    vextracti128        xm7, m15, 1
-    movd                r4d, xm15
-    pextrd              r6d, xm15, 2
-    pextrd              r7d, xm15, 1
-    pextrd              r9d, xm15, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+ r4*8]
-    movq               xm10, [base+subpel_filters+ r6*8]
-    movhps             xm15, [base+subpel_filters+ r7*8]
-    movhps             xm10, [base+subpel_filters+ r9*8]
-    vinserti128         m15, [base+subpel_filters+r10*8], 1
-    vinserti128         m10, [base+subpel_filters+r11*8], 1
-    vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
-    psrld               m14, 10
-    vextracti128        xm7, m14, 1
-    movd                r4d, xm14
-    pextrd              r6d, xm14, 2
-    pextrd              r7d, xm14, 1
-    pextrd              r9d, xm14, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    mov            [rsp+32], r7d
-    pshufd               m5, m6, q1100
-    pshufd               m6, m6, q3322
-    vpblendd            m15, m9, 0xc0
-    vpblendd            m10, m8, 0xc0
-    pblendvb            m15, m11, m5
-    pblendvb            m10, m11, m6
-    vbroadcasti128      m14, [base+subpel_s_shuf8]
-    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
-    movu              [rsp], m10
-    pshufb               m0, m14    ; 01a 01b
-    pshufb               m1, m14    ; 23a 23b
-    pshufb               m2, m14    ; 45a 45b
-    pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    lea                 myd, [t1+myq]
-    mov                 t1d, 64 << 24
-    cmovnz              t1q, [base+subpel_filters+myq*8]
-    vbroadcasti128      m14, [base+wswap]
-    movq               xm11, t1q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    mov                 r7d, [rsp+32]
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pshufd              m10, m11, q2222
-    pshufd              m11, m11, q3333
-.dy1_w8_loop:
-    pmaddwd              m4, m0, m8
-    pmaddwd              m5, m1, m9
-    pmaddwd              m6, m2, m10
-    pmaddwd              m7, m3, m11
-    paddd                m4, m5
-    paddd                m6, m7
-    paddd                m4, m13
-    paddd                m4, m6
-    psrad                m4, rndshift
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-%ifidn %1, put
-    packuswb            xm4, xm4
-    movq             [dstq], xm4
-    add                dstq, dsm
-%else
-    mova             [tmpq], xm4
-    add                tmpq, 16
-%endif
-    dec                  hd
-    jz .ret
-    movq                xm4, [srcq+ r4]
-    movq                xm5, [srcq+ r6]
-    movhps              xm4, [srcq+ r7]
-    movhps              xm5, [srcq+ r9]
-    vinserti128          m4, [srcq+r10], 1
-    vinserti128          m5, [srcq+r11], 1
-    vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+ rX]
-    add                srcq, ssq
-    pshufb               m0, m14
-    pshufb               m1, m14
-    pshufb               m2, m14
-    pshufb               m3, m14
-    vpblendd             m4, m6, 0xc0
-    vpblendd             m5, m7, 0xc0
-    pmaddubsw            m4, m15
-    pmaddubsw            m5, [rsp]
-    phaddw               m4, m5
-    pslld                m5, m4, 16
-    paddw                m4, m5
-    pmulhrsw             m4, m12
-    pblendw              m0, m1, 0xaa
-    pblendw              m1, m2, 0xaa
-    pblendw              m2, m3, 0xaa
-    pblendw              m3, m4, 0xaa
-    jmp .dy1_w8_loop
+    mov      dword [rsp+72], 1
+    movifprep   tmp_stridem, 16
+    jmp .dy1_w_start
 .dy1_w16:
     mov      dword [rsp+72], 2
     movifprep   tmp_stridem, 32
@@ -3835,11 +3546,16 @@
     mov      dword [rsp+72], 16
     movifprep   tmp_stridem, 256
 .dy1_w_start:
+    mov                 myd, mym
 %ifidn %1, put
     movifnidn           dsm, dsq
 %endif
     shr                 t0d, 16
     sub                srcq, 3
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
     pmaddwd              m8, [base+rescale_mul]
     movd               xm15, t0d
     mov            [rsp+76], t0d
@@ -3851,6 +3567,10 @@
     shl           dword dxm, 3 ; dx*8
     vpbroadcastd        m15, xm15
     paddd               m14, m8 ; mx+dx*[0-7]
+    movq                xm0, r4q
+    punpcklbw           xm0, xm0
+    psraw               xm0, 8
+    mova           [rsp+96], xm0
     jmp .dy1_hloop
 .dy1_hloop_prep:
     dec      dword [rsp+72]
@@ -3910,27 +3630,16 @@
     MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
     MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
     MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
     movu              [rsp], m10
+    vpbroadcastd         m8, [rsp+0x60]
+    vpbroadcastd         m9, [rsp+0x64]
+    vpbroadcastd        m10, [rsp+0x68]
+    vpbroadcastd        m11, [rsp+0x6c]
     pshufb               m0, m14    ; 01a 01b
     pshufb               m1, m14    ; 23a 23b
     pshufb               m2, m14    ; 45a 45b
     pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    mov                 r4d, 64 << 24
-    lea                 myd, [t1+myq]
-    cmovnz              r4q, [base+subpel_filters+myq*8]
     vbroadcasti128      m14, [base+wswap]
-    movq               xm11, r4q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    mov                 r4d, [rsp+64]
-    mov                 r7d, [rsp+68]
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pshufd              m10, m11, q2222
-    pshufd              m11, m11, q3333
 .dy1_vloop:
     pmaddwd              m4, m0, m8
     pmaddwd              m5, m1, m9
@@ -4182,137 +3891,9 @@
     jg .dy2_w4_loop
     MC_8TAP_SCALED_RET
 .dy2_w8:
-%ifidn %1, put
-    movifnidn           dsm, dsq
-%endif
-    shr                 t0d, 16
-    sub                srcq, 3
-    movd               xm15, t0d
-    pmaddwd              m8, [base+rescale_mul]
-    vpbroadcastq        m11, [base+pq_0x40000000]
-    vpbroadcastd        m15, xm15
-    paddd               m14, m8 ; mx+dx*[0-7]
-    pand                 m6, m14, m10
-    psrld                m6, 6
-    paddd               m15, m6
-    pcmpeqd              m6, m9
-    vextracti128        xm7, m15, 1
-    movd                r4d, xm15
-    pextrd              r6d, xm15, 2
-    pextrd              r7d, xm15, 1
-    pextrd              r9d, xm15, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+ r4*8]
-    movq               xm10, [base+subpel_filters+ r6*8]
-    movhps             xm15, [base+subpel_filters+ r7*8]
-    movhps             xm10, [base+subpel_filters+ r9*8]
-    vinserti128         m15, [base+subpel_filters+r10*8], 1
-    vinserti128         m10, [base+subpel_filters+r11*8], 1
-    vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
-    psrld               m14, 10
-    vextracti128        xm7, m14, 1
-    movd                r4d, xm14
-    pextrd              r6d, xm14, 2
-    pextrd              r7d, xm14, 1
-    pextrd              r9d, xm14, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    mov               [rsp], r7d
-    pshufd               m5, m6, q1100
-    pshufd               m6, m6, q3322
-    vpblendd            m15, m9, 0xc0
-    vpblendd            m10, m8, 0xc0
-    pblendvb            m15, m11, m5
-    pblendvb            m10, m11, m6
-    vbroadcasti128      m14, [base+subpel_s_shuf8]
-    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
-    pshufb               m0, m14    ; 01a 01b
-    pshufb               m1, m14    ; 23a 23b
-    pshufb               m2, m14    ; 45a 45b
-    pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    lea                 myd, [t1+myq]
-    mov                 t1d, 64 << 24
-    cmovnz              t1q, [base+subpel_filters+myq*8]
-    movq               xm11, t1q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    mov                 r7d, [rsp]
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pshufd              m14, m11, q2222
-    pshufd              m11, m11, q3333
-.dy2_w8_loop:
-    pmaddwd              m4, m0, m8
-    pmaddwd              m5, m1, m9
-    pmaddwd              m6, m2, m14
-    pmaddwd              m7, m3, m11
-    paddd                m4, m5
-    paddd                m6, m7
-    paddd                m4, m13
-    paddd                m4, m6
-    psrad                m4, rndshift
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-%ifidn %1, put
-    packuswb            xm4, xm4
-    movq             [dstq], xm4
-    add                dstq, dsm
-%else
-    mova             [tmpq], xm4
-    add                tmpq, 16
-%endif
-    dec                  hd
-    jz .ret
-    mova                 m0, m1
-    mova                 m1, m2
-    mova                 m2, m3
-    movq                xm3, [srcq+ r4]
-    movq                xm4, [srcq+ r6]
-    movhps              xm3, [srcq+ r7]
-    movhps              xm4, [srcq+ r9]
-    vinserti128          m3, [srcq+r10], 1
-    vinserti128          m4, [srcq+r11], 1
-    vpbroadcastq         m5, [srcq+r13]
-    vpbroadcastq         m6, [srcq+ rX]
-    add                srcq, ssq
-    vpblendd             m3, m5, 0xc0
-    vpblendd             m4, m6, 0xc0
-    pmaddubsw            m3, m15
-    pmaddubsw            m4, m10
-    phaddw               m3, m4
-    movq                xm4, [srcq+ r4]
-    movq                xm5, [srcq+ r6]
-    movhps              xm4, [srcq+ r7]
-    movhps              xm5, [srcq+ r9]
-    vinserti128          m4, [srcq+r10], 1
-    vinserti128          m5, [srcq+r11], 1
-    vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+ rX]
-    add                srcq, ssq
-    vpblendd             m4, m6, 0xc0
-    vpblendd             m5, m7, 0xc0
-    pmaddubsw            m4, m15
-    pmaddubsw            m5, m10
-    phaddw               m4, m5
-    psrld                m5, m3, 16
-    pslld                m6, m4, 16
-    paddw                m3, m5
-    paddw                m4, m6
-    pblendw              m3, m4, 0xaa
-    pmulhrsw             m3, m12
-    jmp .dy2_w8_loop
+    mov      dword [rsp+40], 1
+    movifprep   tmp_stridem, 16
+    jmp .dy2_w_start
 .dy2_w16:
     mov      dword [rsp+40], 2
     movifprep   tmp_stridem, 32
@@ -4329,11 +3910,16 @@
     mov      dword [rsp+40], 16
     movifprep   tmp_stridem, 256
 .dy2_w_start:
+    mov                 myd, mym
 %ifidn %1, put
     movifnidn           dsm, dsq
 %endif
     shr                 t0d, 16
     sub                srcq, 3
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
     pmaddwd              m8, [base+rescale_mul]
     movd               xm15, t0d
     mov            [rsp+64], t0d
@@ -4345,6 +3931,10 @@
     shl           dword dxm, 3 ; dx*8
     vpbroadcastd        m15, xm15
     paddd               m14, m8 ; mx+dx*[0-7]
+    movq                xm0, r4q
+    punpcklbw           xm0, xm0
+    psraw               xm0, 8
+    mova         [rsp+0x50], xm0
     jmp .dy2_hloop
 .dy2_hloop_prep:
     dec      dword [rsp+40]
@@ -4384,7 +3974,6 @@
     vpbroadcastq         m8, [base+subpel_filters+ rX*8]
     psrld               m14, 10
     vextracti128        xm7, m14, 1
-    movq           [rsp+32], xm14
     movd                r4d, xm14
     pextrd              r6d, xm14, 2
     pextrd              r7d, xm14, 1
@@ -4404,25 +3993,15 @@
     MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
     MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
     MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
+    vpbroadcastd         m8, [rsp+0x50]
+    vpbroadcastd         m9, [rsp+0x54]
+    vpbroadcastd        m11, [rsp+0x58]
+    vpbroadcastd         m4, [rsp+0x5c]
     pshufb               m0, m14    ; 01a 01b
     pshufb               m1, m14    ; 23a 23b
     pshufb               m2, m14    ; 45a 45b
     pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    mov                 r4d, 64 << 24
-    lea                 myd, [t1+myq]
-    cmovnz              r4q, [base+subpel_filters+myq*8]
-    movq               xm14, r4q
-    punpcklbw          xm14, xm14
-    psraw              xm14, 8
-    vinserti128         m14, xm14, 1
-    mov                 r4d, [rsp+32]
-    mov                 r7d, [rsp+36]
-    pshufd               m8, m14, q0000
-    pshufd               m9, m14, q1111
-    pshufd              m11, m14, q2222
-    pshufd              m14, m14, q3333
+    SWAP                m14, m4
 .dy2_vloop:
     pmaddwd              m4, m0, m8
     pmaddwd              m5, m1, m9
--- a/src/x86/mc_avx512.asm
+++ b/src/x86/mc_avx512.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if HAVE_AVX512ICL && ARCH_X86_64
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -91,26 +91,46 @@
 decl_mct_fn(dav1d_prep_bilin_sse2);
 
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_ssse3);
 decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_ssse3);
 
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_ssse3);
 decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_ssse3);
 
 decl_avg_fn(dav1d_avg_avx512icl);
 decl_avg_fn(dav1d_avg_avx2);
@@ -206,6 +226,30 @@
     init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
+
+#if ARCH_X86_64
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          ssse3);
+    init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               ssse3);
+
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          ssse3);
+    init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               ssse3);
+#endif
 
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -24,6 +24,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 16
@@ -54,12 +55,19 @@
 subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
 subpel_h_shufB: db 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
 subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_s_shuf2: db 0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
+subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
 bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
 bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
 
 pb_8x0_8x8: times 8 db 0
             times 8 db 8
-resize_mul: dd 0, 1, 2, 3
+bdct_lb_dw: times 4 db 0
+            times 4 db 4
+            times 4 db 8
+            times 4 db 12
+rescale_mul: dd 0, 1, 2, 3
 resize_shuf: times 5 db 0
              db 1, 2, 3, 4, 5, 6
              times 5+16 db 7
@@ -82,6 +90,9 @@
 pd_16384: times 4 dd 16384
 pd_32768: times 4 dd 32768
 pd_262144:times 4 dd 262144
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000:times 4 dd 0x4000
+pq_0x40000000: times 2 dq 0x40000000
 
 pw_258:  times 2 dw 258
 
@@ -165,6 +176,35 @@
 HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128
 
+%macro SCALED_JMP_TABLE 1-*
+    %xdefine %1_table (%%table - %2)
+    %xdefine %%base mangle(private_prefix %+ _%1)
+%%table:
+    %rep %0 - 1
+        dw %%base %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+    %rotate 1
+%%dy_1024:
+    %xdefine %1_dy1_table (%%dy_1024 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy1_w%2 - %%base
+        %rotate 1
+    %endrep
+    %rotate 1
+%%dy_2048:
+    %xdefine %1_dy2_table (%%dy_2048 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy2_w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%if ARCH_X86_64
+SCALED_JMP_TABLE put_8tap_scaled_ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_ssse3,   4, 8, 16, 32, 64, 128
+%endif
+
 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
 
 cextern mc_warp_filter
@@ -1464,8 +1504,8 @@
 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
 %assign FILTER_SHARP   (2*15 << 16) | 3*15
 
-%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
-cglobal %1_8tap_%2
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2
     mov                 t0d, FILTER_%3
 %ifidn %3, %4
     mov                 t1d, t0d
@@ -1473,7 +1513,7 @@
     mov                 t1d, FILTER_%4
 %endif
 %ifnidn %2, regular ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _%1_8tap %+ SUFFIX)
+    jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
 %endif
 %endmacro
 
@@ -1485,15 +1525,15 @@
 DECLARE_REG_TMP 7, 8
 %endif
 
-MC_8TAP_FN put, sharp,          SHARP,   SHARP
-MC_8TAP_FN put, sharp_smooth,   SHARP,   SMOOTH
-MC_8TAP_FN put, smooth_sharp,   SMOOTH,  SHARP
-MC_8TAP_FN put, smooth,         SMOOTH,  SMOOTH
-MC_8TAP_FN put, sharp_regular,  SHARP,   REGULAR
-MC_8TAP_FN put, regular_sharp,  REGULAR, SHARP
-MC_8TAP_FN put, smooth_regular, SMOOTH,  REGULAR
-MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
-MC_8TAP_FN put, regular,        REGULAR, REGULAR
+FN put_8tap, sharp,          SHARP,   SHARP
+FN put_8tap, sharp_smooth,   SHARP,   SMOOTH
+FN put_8tap, smooth_sharp,   SMOOTH,  SHARP
+FN put_8tap, smooth,         SMOOTH,  SMOOTH
+FN put_8tap, sharp_regular,  SHARP,   REGULAR
+FN put_8tap, regular_sharp,  REGULAR, SHARP
+FN put_8tap, smooth_regular, SMOOTH,  REGULAR
+FN put_8tap, regular_smooth, REGULAR, SMOOTH
+FN put_8tap, regular,        REGULAR, REGULAR
 
 %if ARCH_X86_32
  %define base_reg r1
@@ -2773,15 +2813,15 @@
  DECLARE_REG_TMP 6, 7
 %endif
 
-MC_8TAP_FN prep, sharp,          SHARP,   SHARP
-MC_8TAP_FN prep, sharp_smooth,   SHARP,   SMOOTH
-MC_8TAP_FN prep, smooth_sharp,   SMOOTH,  SHARP
-MC_8TAP_FN prep, smooth,         SMOOTH,  SMOOTH
-MC_8TAP_FN prep, sharp_regular,  SHARP,   REGULAR
-MC_8TAP_FN prep, regular_sharp,  REGULAR, SHARP
-MC_8TAP_FN prep, smooth_regular, SMOOTH,  REGULAR
-MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
-MC_8TAP_FN prep, regular,        REGULAR, REGULAR
+FN prep_8tap, sharp,          SHARP,   SHARP
+FN prep_8tap, sharp_smooth,   SHARP,   SMOOTH
+FN prep_8tap, smooth_sharp,   SMOOTH,  SHARP
+FN prep_8tap, smooth,         SMOOTH,  SMOOTH
+FN prep_8tap, sharp_regular,  SHARP,   REGULAR
+FN prep_8tap, regular_sharp,  REGULAR, SHARP
+FN prep_8tap, smooth_regular, SMOOTH,  REGULAR
+FN prep_8tap, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap, regular,        REGULAR, REGULAR
 
 %if ARCH_X86_32
  %define base_reg r2
@@ -3912,6 +3952,1738 @@
     RET
 %endmacro
 
+%macro movifprep 2
+ %if isprep
+    mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1  r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+  %xdefine r14_save r14
+  %assign %%i 14
+  %rep 14
+   %assign %%j %%i-1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i-1
+  %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+  %assign %%i 1
+  %rep 13
+   %assign %%j %%i+1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i+1
+  %endrep
+  %xdefine r14 r14_save
+  %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+    RET
+ %if %1
+    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
+    SWAP                m%2, m%5
+    movq                m%1, [srcq+ r4]
+    movq                m%2, [srcq+ r6]
+    movhps              m%1, [srcq+ r7]
+    movhps              m%2, [srcq+ r9]
+    movq                m%3, [srcq+r10]
+    movq                m%4, [srcq+r11]
+    movhps              m%3, [srcq+r13]
+    movhps              m%4, [srcq+ rX]
+    add                srcq, ssq
+    movq                m%5, [srcq+ r4]
+    movq                m%6, [srcq+ r6]
+    movhps              m%5, [srcq+ r7]
+    movhps              m%6, [srcq+ r9]
+    movq                m%7, [srcq+r10]
+    movq                m%8, [srcq+r11]
+    movhps              m%7, [srcq+r13]
+    movhps              m%8, [srcq+ rX]
+    add                srcq, ssq
+    pmaddubsw           m%1, m%9
+    pmaddubsw           m%5, m%9
+    pmaddubsw           m%2, m%10
+    pmaddubsw           m%6, m%10
+    pmaddubsw           m%3, m%11
+    pmaddubsw           m%7, m%11
+    pmaddubsw           m%4, m%12
+    pmaddubsw           m%8, m%12
+    phaddw              m%1, m%2
+    phaddw              m%5, m%6
+    phaddw              m%3, m%4
+    phaddw              m%7, m%8
+    phaddw              m%1, m%3
+    phaddw              m%5, m%7
+    pmulhrsw            m%1, m12
+    pmulhrsw            m%5, m12
+    SWAP                m%2, m%5
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+  %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+  %define tmp_stridem qword [rsp+0x138]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+    LEA            base_reg, %1_8tap_scaled_ssse3
+%define base base_reg-%1_8tap_scaled_ssse3
+    tzcnt                wd, wm
+    movd                 m8, dxm
+    movd                m14, mxm
+    pshufd               m8, m8, q0000
+    pshufd              m14, m14, q0000
+%if isprep && UNIX64
+    mov                 r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%endif
+    mov                 dyd, dym
+%ifidn %1, put
+ %if WIN64
+    mov                 r8d, hm
+  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r5m
+  %define dxm r8m
+ %else
+  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+  %define dsm [rsp+0x138]
+  %define rX r1
+  %define rXd r1d
+ %else
+  %define dsm dsq
+  %define rX r14
+  %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+    mov                 r7d, hm
+  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r4m
+  %define dxm r7m
+ %else
+  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm [rsp+0x94]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+    mova                m10, [base+pd_0x3ff]
+    mova                m12, [base+pw_8192]
+%ifidn %1, put
+    mova                m13, [base+pd_512]
+%else
+    mova                m13, [base+pd_32]
+%endif
+    pxor                 m9, m9
+    lea                ss3q, [ssq*3]
+    movzx               r7d, t1b
+    shr                 t1d, 16
+    cmp                  hd, 6
+    cmovs               t1d, r7d
+    sub                srcq, ss3q
+    cmp                 dyd, 1024
+    je .dy1
+    cmp                 dyd, 2048
+    je .dy2
+    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+    add                  wq, base_reg
+    jmp                  wq
+%ifidn %1, put
+.w2:
+    mov                 myd, mym
+    movzx               t0d, t0b
+    dec                srcq
+    movd                m15, t0d
+    punpckldq            m9, m8
+    SWAP                 m8, m9
+    paddd               m14, m8 ; mx+dx*[0-1]
+    mova                m11, [base+pd_0x4000]
+    pshufd              m15, m15, q0000
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd               m15, m8
+    movd                r4d, m15
+    psrldq              m15, 4
+    movd                r6d, m15
+    mova                 m5, [base+bdct_lb_dw]
+    mova                 m6, [base+subpel_s_shuf2]
+    movd                m15, [base+subpel_filters+r4*8+2]
+    movd                 m7, [base+subpel_filters+r6*8+2]
+    pxor                 m9, m9
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                 m0, [srcq+ssq*0]
+    movq                 m2, [srcq+ssq*2]
+    movhps               m0, [srcq+ssq*1]
+    movhps               m2, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6
+    movq                 m1, [srcq+ssq*0]
+    movq                 m3, [srcq+ssq*2]
+    movhps               m1, [srcq+ssq*1]
+    movhps               m3, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    punpckldq           m15, m7
+    punpcklqdq          m15, m15
+    pand                m11, m8
+    pandn                m8, m15
+    SWAP                m15, m8
+    por                 m15, m11
+    pshufb               m0, m14
+    pshufb               m2, m14
+    pshufb               m1, m14
+    pshufb               m3, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m2, m15
+    pmaddubsw            m1, m15
+    pmaddubsw            m3, m15
+    phaddw               m0, m2
+    phaddw               m1, m3
+    pmulhrsw             m0, m12       ; 0 1 2 3
+    pmulhrsw             m1, m12       ; 4 5 6 7
+    palignr              m2, m1, m0, 4 ; 1 2 3 4
+    punpcklwd            m3, m0, m2    ; 01 12
+    punpckhwd            m0, m2        ; 23 34
+    pshufd               m5, m1, q0321 ; 5 6 7 _
+    punpcklwd            m2, m1, m5    ; 45 56
+    punpckhwd            m4, m1, m5    ; 67 __
+.w2_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq                m11, r6q
+    punpcklbw           m11, m11
+    psraw               m11, 8
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+    pmaddwd              m5, m3, m8
+    pmaddwd              m6, m0, m9
+    pmaddwd              m7, m2, m10
+    pmaddwd              m8, m4, m11
+    paddd                m5, m6
+    paddd                m7, m8
+    paddd                m5, m13
+    paddd                m5, m7
+    psrad                m5, 10
+    packssdw             m5, m5
+    packuswb             m5, m5
+    pextrw              r6d, m5, 0
+    mov              [dstq], r6w
+    add                dstq, dsq
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w2_loop
+    movq                 m5, [srcq]
+    test                myd, 0x400
+    jz .w2_skip_line
+    add                srcq, ssq
+    shufps               m3, m0, q1032      ; 01 12
+    shufps               m0, m2, q1032      ; 23 34
+    shufps               m2, m4, q1032      ; 45 56
+    pshufb               m5, m14
+    pmaddubsw            m5, m15
+    phaddw               m5, m5
+    pmulhrsw             m5, m12
+    palignr              m4, m5, m1, 12
+    punpcklqdq           m1, m4, m4         ; 6 7 6 7
+    punpcklwd            m4, m1, m5         ; 67 __
+    jmp .w2_loop
+.w2_skip_line:
+    movhps               m5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mova                 m3, m0             ; 01 12
+    mova                 m0, m2             ; 23 34
+    pshufb               m5, m14
+    pmaddubsw            m5, m15
+    phaddw               m5, m5
+    pmulhrsw             m5, m12            ; 6 7 6 7
+    palignr              m4, m5, m1, 8      ; 4 5 6 7
+    pshufd               m5, m4, q0321      ; 5 6 7 _
+    mova                 m1, m4
+    punpcklwd            m2, m4, m5         ; 45 56
+    punpckhwd            m4, m5             ; 67 __
+    jmp .w2_loop
+    SWAP                m15, m8, m9
+%endif
+.w4:
+    mov                 myd, mym
+    mova                 m7, [base+rescale_mul]
+    movzx               t0d, t0b
+    dec                srcq
+    movd                m15, t0d
+    pmaddwd              m8, m7
+    mova                m11, [base+pd_0x4000]
+    pshufd              m15, m15, q0000
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m0, m14, m10
+    psrld                m0, 6
+    paddd               m15, m0
+    psrldq               m7, m15, 8
+    movd                r4d, m15
+    movd               r11d, m7
+    psrldq              m15, 4
+    psrldq               m7, 4
+    movd                r6d, m15
+    movd               r13d, m7
+    movd                m15, [base+subpel_filters+ r4*8+2]
+    movd                 m2, [base+subpel_filters+r11*8+2]
+    movd                 m3, [base+subpel_filters+ r6*8+2]
+    movd                 m4, [base+subpel_filters+r13*8+2]
+    mova                 m5, [base+bdct_lb_dw]
+    movq                 m6, [base+subpel_s_shuf2]
+    pcmpeqd              m0, m9
+    psrld               m14, 10
+    movu                 m7, [srcq+ssq*0]
+    movu                 m9, [srcq+ssq*1]
+    movu                 m8, [srcq+ssq*2]
+    movu                m10, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    punpckldq           m15, m3
+    punpckldq            m2, m4
+    punpcklqdq           m6, m6
+    punpcklqdq          m15, m2
+    pshufb              m14, m5
+    paddb               m14, m6
+    movu                 m2, [srcq+ssq*0]
+    movu                 m4, [srcq+ssq*1]
+    movu                 m3, [srcq+ssq*2]
+    movu                 m5, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pand                m11, m0
+    pandn                m0, m15
+    SWAP                m15, m0
+    por                 m15, m11
+    pshufb               m7, m14
+    pshufb               m9, m14
+    pshufb               m8, m14
+    pshufb              m10, m14
+    pshufb               m2, m14
+    pshufb               m4, m14
+    pshufb               m3, m14
+    pshufb               m5, m14
+    pmaddubsw            m7, m15
+    pmaddubsw            m9, m15
+    pmaddubsw            m8, m15
+    pmaddubsw           m10, m15
+    pmaddubsw            m2, m15
+    pmaddubsw            m4, m15
+    pmaddubsw            m3, m15
+    pmaddubsw            m5, m15
+    phaddw               m7, m9
+    phaddw               m8, m10
+    phaddw               m9, m2, m4
+    phaddw               m3, m5
+    pmulhrsw             m7, m12            ; 0 1
+    pmulhrsw             m8, m12            ; 2 3
+    pmulhrsw             m9, m12            ; 4 5
+    pmulhrsw             m3, m12            ; 6 7
+    shufps               m4, m7, m8, q1032  ; 1 2
+    shufps               m5, m8, m9, q1032  ; 3 4
+    shufps               m6, m9, m3, q1032  ; 5 6
+    psrldq              m11, m3, 8          ; 7 _
+    punpcklwd            m0, m7, m4 ; 01
+    punpckhwd            m7, m4     ; 12
+    punpcklwd            m1, m8, m5 ; 23
+    punpckhwd            m8, m5     ; 34
+    punpcklwd            m2, m9, m6 ; 45
+    punpckhwd            m9, m6     ; 56
+    punpcklwd            m3, m11    ; 67
+    mova         [rsp+0x00], m7
+    mova         [rsp+0x10], m8
+    mova         [rsp+0x20], m9
+.w4_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq                m10, r6q
+    punpcklbw           m10, m10
+    psraw               m10, 8
+    pshufd               m7, m10, q0000
+    pshufd               m8, m10, q1111
+    pshufd               m9, m10, q2222
+    pshufd              m10, m10, q3333
+    pmaddwd              m4, m0, m7
+    pmaddwd              m5, m1, m8
+    pmaddwd              m6, m2, m9
+    pmaddwd              m7, m3, m10
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    packssdw             m4, m4
+%ifidn %1, put
+    packuswb             m4, m4
+    movd             [dstq], m4
+    add                dstq, dsq
+%else
+    movq             [tmpq], m4
+    add                tmpq, 8
+%endif
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w4_loop
+    movu                 m4, [srcq]
+    test                myd, 0x400
+    jz .w4_skip_line
+    mova                 m0, [rsp+0x00]
+    mova         [rsp+0x00], m1
+    mova                 m1, [rsp+0x10]
+    mova         [rsp+0x10], m2
+    mova                 m2, [rsp+0x20]
+    mova         [rsp+0x20], m3
+    pshufb               m4, m14
+    pmaddubsw            m4, m15
+    phaddw               m4, m4
+    pmulhrsw             m4, m12
+    punpcklwd            m3, m11, m4
+    mova                m11, m4
+    add                srcq, ssq
+    jmp .w4_loop
+.w4_skip_line:
+    movu                 m5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mova                 m6, [rsp+0x10]
+    mova                 m7, [rsp+0x20]
+    pshufb               m4, m14
+    pshufb               m5, m14
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m15
+    phaddw               m4, m5
+    pmulhrsw             m4, m12
+    punpcklwd            m9, m11, m4
+    mova         [rsp+0x00], m6
+    mova         [rsp+0x10], m7
+    mova         [rsp+0x20], m9
+    psrldq              m11, m4, 8
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    punpcklwd            m3, m4, m11
+    jmp .w4_loop
+    SWAP                 m0, m15
+.w8:
+    mov    dword [rsp+0x90], 1
+    movifprep   tmp_stridem, 16
+    jmp .w_start
+.w16:
+    mov    dword [rsp+0x90], 2
+    movifprep   tmp_stridem, 32
+    jmp .w_start
+.w32:
+    mov    dword [rsp+0x90], 4
+    movifprep   tmp_stridem, 64
+    jmp .w_start
+.w64:
+    mov    dword [rsp+0x90], 8
+    movifprep   tmp_stridem, 128
+    jmp .w_start
+.w128:
+    mov    dword [rsp+0x90], 16
+    movifprep   tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    movd                m15, t0d
+    pslld                m7, m8, 2 ; dx*4
+    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
+    pshufd              m15, m15, q0000
+    paddd               m14, m8 ; mx+dx*[0-3]
+    mova        [rsp+0x100], m7
+    mova        [rsp+0x120], m15
+    mov         [rsp+0x098], srcq
+    mov         [rsp+0x130], r0q ; dstq / tmpq
+%if UNIX64
+    mov                  hm, hd
+%endif
+    jmp .hloop
+.hloop_prep:
+    dec   dword [rsp+0x090]
+    jz .ret
+    add   qword [rsp+0x130], 8*(isprep+1)
+    mov                  hd, hm
+    mova                 m7, [rsp+0x100]
+    mova                m14, [rsp+0x110]
+    mova                m10, [base+pd_0x3ff]
+    mova                m15, [rsp+0x120]
+    pxor                 m9, m9
+    mov                srcq, [rsp+0x098]
+    mov                 r0q, [rsp+0x130] ; dstq / tmpq
+    paddd               m14, m7
+.hloop:
+    mova                m11, [base+pq_0x40000000]
+    psrld                m4, m14, 10
+    mova              [rsp], m4
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd                m5, m15, m6
+    pcmpeqd              m6, m9
+    psrldq               m4, m5, 8
+    movd                r4d, m5
+    movd                r6d, m4
+    psrldq               m5, 4
+    psrldq               m4, 4
+    movd                r7d, m5
+    movd                r9d, m4
+    movq                 m0, [base+subpel_filters+r4*8]
+    movq                 m1, [base+subpel_filters+r6*8]
+    movhps               m0, [base+subpel_filters+r7*8]
+    movhps               m1, [base+subpel_filters+r9*8]
+    paddd               m14, m7 ; mx+dx*[4-7]
+    pand                 m5, m14, m10
+    psrld                m5, 6
+    paddd               m15, m5
+    pcmpeqd              m5, m9
+    mova        [rsp+0x110], m14
+    psrldq               m4, m15, 8
+    movd               r10d, m15
+    movd               r11d, m4
+    psrldq              m15, 4
+    psrldq               m4, 4
+    movd               r13d, m15
+    movd                rXd, m4
+    movq                 m2, [base+subpel_filters+r10*8]
+    movq                 m3, [base+subpel_filters+r11*8]
+    movhps               m2, [base+subpel_filters+r13*8]
+    movhps               m3, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    psrldq               m4, m14, 8
+    movd               r10d, m14
+    movd               r11d, m4
+    psrldq              m14, 4
+    psrldq               m4, 4
+    movd               r13d, m14
+    movd                rXd, m4
+    mov                 r4d, [rsp+ 0]
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    pshufd               m4, m6, q1100
+    pshufd               m6, m6, q3322
+    pshufd              m14, m5, q1100
+    pshufd               m5, m5, q3322
+    pand                 m7, m11, m4
+    pand                 m8, m11, m6
+    pand                m15, m11, m14
+    pand                m11, m11, m5
+    pandn                m4, m0
+    pandn                m6, m1
+    pandn               m14, m2
+    pandn                m5, m3
+    por                  m7, m4
+    por                  m8, m6
+    por                 m15, m14
+    por                 m11, m5
+    mova         [rsp+0x10], m7
+    mova         [rsp+0x20], m8
+    mova         [rsp+0x30], m15
+    mova         [rsp+0x40], m11
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
+    mova         [rsp+0x50], m1
+    mova         [rsp+0x60], m2
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
+    mova         [rsp+0x70], m3
+    mova         [rsp+0x80], m4
+    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
+    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
+    SWAP                 m7, m0
+    SWAP                 m8, m14
+    mova                 m1, [rsp+0x50]
+    mova                 m2, [rsp+0x60]
+    mova                 m3, [rsp+0x70]
+    mova                 m9, [rsp+0x80]
+    mov                 myd, mym
+    mov                 dyd, dym
+    punpcklwd            m4, m5, m6 ; 45a
+    punpckhwd            m5, m6     ; 45b
+    punpcklwd            m6, m7, m8 ; 67a
+    punpckhwd            m7, m8     ; 67b
+    punpcklwd            m0, m1, m2 ; 01a
+    punpckhwd            m1, m2     ; 01b
+    punpcklwd            m2, m3, m9 ; 23a
+    punpckhwd            m3, m9     ; 23b
+    mova         [rsp+0x50], m4
+    mova         [rsp+0x60], m5
+    mova         [rsp+0x70], m6
+    mova         [rsp+0x80], m7
+    SWAP                m14, m8
+.vloop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq                m11, r6q
+    punpcklbw           m11, m11
+    psraw               m11, 8
+    pshufd               m5, m11, q0000
+    pshufd               m7, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+    pmaddwd              m4, m5, m0
+    pmaddwd              m5, m5, m1
+    pmaddwd              m6, m7, m2
+    pmaddwd              m7, m7, m3
+    paddd                m4, m13
+    paddd                m5, m13
+    paddd                m4, m6
+    paddd                m5, m7
+    pmaddwd              m6, [rsp+0x50], m10
+    pmaddwd              m7, [rsp+0x60], m10
+    pmaddwd              m8, [rsp+0x70], m11
+    pmaddwd              m9, [rsp+0x80], m11
+    paddd                m4, m6
+    paddd                m5, m7
+    paddd                m4, m8
+    paddd                m5, m9
+    psrad                m4, rndshift
+    psrad                m5, rndshift
+    packssdw             m4, m5
+%ifidn %1, put
+    packuswb             m4, m4
+    movq             [dstq], m4
+    add                dstq, dsm
+%else
+    mova             [tmpq], m4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .hloop_prep
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .vloop
+    test                myd, 0x400
+    mov         [rsp+0x140], myd
+    mov                 r4d, [rsp+ 0]
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    jz .skip_line
+    mova                m14, [base+unpckw]
+    movq                 m6, [srcq+r10]
+    movq                 m7, [srcq+r11]
+    movhps               m6, [srcq+r13]
+    movhps               m7, [srcq+ rX]
+    movq                 m4, [srcq+ r4]
+    movq                 m5, [srcq+ r6]
+    movhps               m4, [srcq+ r7]
+    movhps               m5, [srcq+ r9]
+    add                srcq, ssq
+    mov                 myd, [rsp+0x140]
+    mov                 dyd, dym
+    pshufd               m9, m14, q1032
+    pshufb               m0, m14                ; 0a 1a
+    pshufb               m1, m14                ; 0b 1b
+    pshufb               m2, m9                 ; 3a 2a
+    pshufb               m3, m9                 ; 3b 2b
+    pmaddubsw            m6, [rsp+0x30]
+    pmaddubsw            m7, [rsp+0x40]
+    pmaddubsw            m4, [rsp+0x10]
+    pmaddubsw            m5, [rsp+0x20]
+    phaddw               m6, m7
+    phaddw               m4, m5
+    phaddw               m4, m6
+    pmulhrsw             m4, m12
+    pshufb               m5, [rsp+0x50], m14    ; 4a 5a
+    pshufb               m6, [rsp+0x60], m14    ; 4b 5b
+    pshufb               m7, [rsp+0x70], m9     ; 7a 6a
+    pshufb               m8, [rsp+0x80], m9     ; 7b 6b
+    punpckhwd            m0, m2 ; 12a
+    punpckhwd            m1, m3 ; 12b
+    punpcklwd            m2, m5 ; 34a
+    punpcklwd            m3, m6 ; 34b
+    punpckhwd            m5, m7 ; 56a
+    punpckhwd            m6, m8 ; 56b
+    punpcklwd            m7, m4 ; 78a
+    punpckhqdq           m4, m4
+    punpcklwd            m8, m4 ; 78b
+    mova         [rsp+0x50], m5
+    mova         [rsp+0x60], m6
+    mova         [rsp+0x70], m7
+    mova         [rsp+0x80], m8
+    jmp .vloop
+.skip_line:
+    mova                 m0, [rsp+0x10]
+    mova                 m1, [rsp+0x20]
+    mova                m14, [rsp+0x30]
+    mova                m15, [rsp+0x40]
+    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
+    mov                 myd, [rsp+0x140]
+    mov                 dyd, dym
+    mova                 m0, m2         ; 01a
+    mova                 m1, m3         ; 01b
+    mova                 m2, [rsp+0x50] ; 23a
+    mova                 m3, [rsp+0x60] ; 23b
+    mova                 m5, [rsp+0x70] ; 45a
+    mova                 m6, [rsp+0x80] ; 45b
+    punpcklwd            m7, m4, m8     ; 67a
+    punpckhwd            m4, m8         ; 67b
+    mova         [rsp+0x50], m5
+    mova         [rsp+0x60], m6
+    mova         [rsp+0x70], m7
+    mova         [rsp+0x80], m4
+    jmp .vloop
+.dy1:
+    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+    add                  wq, base_reg
+    jmp                  wq
+%ifidn %1, put
+.dy1_w2:
+    mov                 myd, mym
+    movzx               t0d, t0b
+    dec                srcq
+    movd                m15, t0d
+    punpckldq            m9, m8
+    SWAP                 m8, m9
+    paddd               m14, m8 ; mx+dx*[0-1]
+    mova                m11, [base+pd_0x4000]
+    pshufd              m15, m15, q0000
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd               m15, m8
+    movd                r4d, m15
+    psrldq              m15, 4
+    movd                r6d, m15
+    mova                 m5, [base+bdct_lb_dw]
+    mova                 m6, [base+subpel_s_shuf2]
+    movd                m15, [base+subpel_filters+r4*8+2]
+    movd                 m7, [base+subpel_filters+r6*8+2]
+    pxor                 m9, m9
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                 m0, [srcq+ssq*0]
+    movq                 m2, [srcq+ssq*2]
+    movhps               m0, [srcq+ssq*1]
+    movhps               m2, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pshufb              m14, m5
+    paddb               m14, m6
+    movq                 m1, [srcq+ssq*0]
+    movq                 m3, [srcq+ssq*2]
+    movhps               m1, [srcq+ssq*1]
+    add                srcq, ss3q
+    movq               xm10, r4q
+    punpcklbw          xm10, xm10
+    psraw              xm10, 8
+    punpckldq           m15, m7
+    punpcklqdq          m15, m15
+    pand                m11, m8
+    pandn                m8, m15
+    SWAP                m15, m8
+    por                 m15, m11
+    pshufd               m8, m10, q0000
+    pshufd               m9, m10, q1111
+    pshufd              m11, m10, q3333
+    pshufd              m10, m10, q2222
+    pshufb               m0, m14
+    pshufb               m2, m14
+    pshufb               m1, m14
+    pshufb               m3, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m2, m15
+    pmaddubsw            m1, m15
+    pmaddubsw            m3, m15
+    phaddw               m0, m2
+    phaddw               m1, m3
+    pmulhrsw             m0, m12
+    pmulhrsw             m1, m12
+    palignr              m2, m1, m0, 4
+    pshufd               m4, m1, q2121
+    punpcklwd            m3, m0, m2     ; 01 12
+    punpckhwd            m0, m2         ; 23 34
+    punpcklwd            m2, m1, m4     ; 45 56
+.dy1_w2_loop:
+    movq                 m1, [srcq+ssq*0]
+    movhps               m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd              m5, m3, m8
+    pmaddwd              m6, m0, m9
+    pmaddwd              m7, m2, m10
+    mova                 m3, m0
+    mova                 m0, m2
+    paddd                m5, m13
+    paddd                m6, m7
+    pshufb               m1, m14
+    pmaddubsw            m1, m15
+    phaddw               m1, m1
+    pmulhrsw             m1, m12
+    palignr              m7, m1, m4, 12
+    punpcklwd            m2, m7, m1     ; 67 78
+    pmaddwd              m7, m2, m11
+    mova                 m4, m1
+    paddd                m5, m6
+    paddd                m5, m7
+    psrad                m5, rndshift
+    packssdw             m5, m5
+    packuswb             m5, m5
+    pextrw              r4d, m5, 0
+    pextrw              r6d, m5, 1
+    mov        [dstq+dsq*0], r4w
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .dy1_w2_loop
+    RET
+    SWAP                m15, m8, m9
+%endif
+.dy1_w4:
+    mov                 myd, mym
+    mova                 m7, [base+rescale_mul]
+    movzx               t0d, t0b
+    dec                srcq
+    movd                m15, t0d
+    pmaddwd              m8, m7
+    mova                m11, [base+pd_0x4000]
+    pshufd              m15, m15, q0000
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd               m15, m8
+    psrldq               m7, m15, 8
+    movd                r4d, m15
+    movd               r11d, m7
+    psrldq              m15, 4
+    psrldq               m7, 4
+    movd                r6d, m15
+    movd               r13d, m7
+    movd                m15, [base+subpel_filters+ r4*8+2]
+    movd                 m4, [base+subpel_filters+r11*8+2]
+    movd                 m5, [base+subpel_filters+ r6*8+2]
+    movd                 m7, [base+subpel_filters+r13*8+2]
+    movq                 m6, [base+subpel_s_shuf2]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    movu                 m2, [srcq+ssq*2]
+    movu                 m3, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    punpckldq           m15, m5
+    punpckldq            m4, m7
+    punpcklqdq           m6, m6
+    punpcklqdq          m15, m4
+    pshufb              m14, [base+bdct_lb_dw]
+    movu                 m4, [srcq+ssq*0]
+    movu                 m5, [srcq+ssq*1]
+    movu                 m7, [srcq+ssq*2]
+    add                srcq, ss3q
+    pand                m11, m8
+    pandn                m8, m15
+    SWAP                m15, m8
+    por                 m15, m11
+    paddb               m14, m6
+    movq                m10, r4q
+    punpcklbw           m10, m10
+    psraw               m10, 8
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    pshufb               m4, m14
+    pshufb               m5, m14
+    pshufb               m7, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m1, m15
+    pmaddubsw            m2, m15
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m15
+    pmaddubsw            m7, m15
+    phaddw               m0, m1
+    phaddw               m2, m3
+    phaddw               m4, m5
+    phaddw               m6, m7, m7
+    pmulhrsw             m0, m12    ; 0 1
+    pmulhrsw             m2, m12    ; 2 3
+    pmulhrsw             m4, m12    ; 4 5
+    pmulhrsw             m6, m12    ; 6 _
+    shufps               m1, m0, m2, q1032  ; 1 2
+    shufps               m3, m2, m4, q1032  ; 3 4
+    shufps               m5, m4, m6, q1032  ; 5 6
+    punpcklwd            m7, m0, m1 ; 01
+    punpckhwd            m0, m1     ; 12
+    punpcklwd            m8, m2, m3 ; 23
+    punpckhwd            m2, m3     ; 34
+    punpcklwd            m9, m4, m5 ; 45
+    punpckhwd            m4, m5     ; 56
+    pshufd               m1, m10, q0000
+    pshufd               m3, m10, q1111
+    pshufd               m5, m10, q2222
+    pshufd              m10, m10, q3333
+    mova         [rsp+0x00], m8
+    mova         [rsp+0x10], m2
+    mova         [rsp+0x20], m9
+    mova         [rsp+0x30], m4
+.dy1_w4_loop:
+    movu                m11, [srcq+ssq*0]
+    pmaddwd              m7, m1
+    pmaddwd              m8, m3
+    pmaddwd              m0, m1
+    pmaddwd              m2, m3
+    pmaddwd              m9, m5
+    pmaddwd              m4, m5
+    paddd                m7, m8
+    paddd                m0, m2
+    movu                 m8, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              m11, m14
+    pmaddubsw           m11, m15
+    paddd                m7, m13
+    paddd                m0, m13
+    paddd                m7, m9
+    paddd                m0, m4
+    pshufb               m8, m14
+    pmaddubsw            m8, m15
+    phaddw              m11, m8
+    mova                 m8, [rsp+0x20]
+    pmulhrsw            m11, m12
+    punpcklwd            m9, m6, m11    ; 67
+    psrldq               m6, m11, 8
+    punpcklwd            m4, m11, m6    ; 78
+    pmaddwd              m2, m9, m10
+    pmaddwd             m11, m4, m10
+    paddd                m7, m2
+    mova                 m2, [rsp+0x30]
+    paddd                m0, m11
+    psrad                m7, rndshift
+    psrad                m0, rndshift
+    packssdw             m7, m0
+    mova                 m0, [rsp+0x10]
+%ifidn %1, put
+    packuswb             m7, m7
+    psrldq              m11, m7, 4
+    movd       [dstq+dsq*0], m7
+    movd       [dstq+dsq*1], m11
+    lea                dstq, [dstq+dsq*2]
+%else
+    mova             [tmpq], m7
+    add                tmpq, 16
+%endif
+    sub                  hd, 2
+    jz .ret
+    mova                 m7, [rsp+0x00]
+    mova         [rsp+0x00], m8
+    mova         [rsp+0x10], m2
+    mova         [rsp+0x20], m9
+    mova         [rsp+0x30], m4
+    jmp .dy1_w4_loop
+    SWAP                 m8, m15
+.dy1_w8:
+    mov    dword [rsp+0x90], 1
+    movifprep   tmp_stridem, 16
+    jmp .dy1_w_start
+.dy1_w16:
+    mov    dword [rsp+0x90], 2
+    movifprep   tmp_stridem, 32
+    jmp .dy1_w_start
+.dy1_w32:
+    mov    dword [rsp+0x90], 4
+    movifprep   tmp_stridem, 64
+    jmp .dy1_w_start
+.dy1_w64:
+    mov    dword [rsp+0x90], 8
+    movifprep   tmp_stridem, 128
+    jmp .dy1_w_start
+.dy1_w128:
+    mov    dword [rsp+0x90], 16
+    movifprep   tmp_stridem, 256
+.dy1_w_start:
+    mov                 myd, mym
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    movd                m15, t0d
+    pslld                m7, m8, 2 ; dx*4
+    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
+    pshufd              m15, m15, q0000
+    paddd               m14, m8 ; mx+dx*[0-3]
+    movq                 m3, r4q
+    punpcklbw            m3, m3
+    psraw                m3, 8
+    mova        [rsp+0x100], m7
+    mova        [rsp+0x120], m15
+    mov         [rsp+0x098], srcq
+    mov         [rsp+0x130], r0q ; dstq / tmpq
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    mova        [rsp+0x140], m0
+    mova        [rsp+0x150], m1
+    mova        [rsp+0x160], m2
+    mova        [rsp+0x170], m3
+%if UNIX64
+    mov                  hm, hd
+%endif
+    jmp .dy1_hloop
+.dy1_hloop_prep:
+    dec   dword [rsp+0x090]
+    jz .ret
+    add   qword [rsp+0x130], 8*(isprep+1)
+    mov                  hd, hm
+    mova                 m7, [rsp+0x100]
+    mova                m14, [rsp+0x110]
+    mova                m10, [base+pd_0x3ff]
+    mova                m15, [rsp+0x120]
+    pxor                 m9, m9
+    mov                srcq, [rsp+0x098]
+    mov                 r0q, [rsp+0x130] ; dstq / tmpq
+    paddd               m14, m7
+.dy1_hloop:
+    mova                m11, [base+pq_0x40000000]
+    psrld                m4, m14, 10
+    mova              [rsp], m4
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd                m5, m15, m6
+    pcmpeqd              m6, m9
+    psrldq               m4, m5, 8
+    movd                r4d, m5
+    movd                r6d, m4
+    psrldq               m5, 4
+    psrldq               m4, 4
+    movd                r7d, m5
+    movd                r9d, m4
+    movq                 m0, [base+subpel_filters+r4*8]
+    movq                 m1, [base+subpel_filters+r6*8]
+    movhps               m0, [base+subpel_filters+r7*8]
+    movhps               m1, [base+subpel_filters+r9*8]
+    paddd               m14, m7 ; mx+dx*[4-7]
+    pand                 m5, m14, m10
+    psrld                m5, 6
+    paddd               m15, m5
+    pcmpeqd              m5, m9
+    mova        [rsp+0x110], m14
+    psrldq               m4, m15, 8
+    movd               r10d, m15
+    movd               r11d, m4
+    psrldq              m15, 4
+    psrldq               m4, 4
+    movd               r13d, m15
+    movd                rXd, m4
+    movq                 m2, [base+subpel_filters+r10*8]
+    movq                 m3, [base+subpel_filters+r11*8]
+    movhps               m2, [base+subpel_filters+r13*8]
+    movhps               m3, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    psrldq               m4, m14, 8
+    movd               r10d, m14
+    movd               r11d, m4
+    psrldq              m14, 4
+    psrldq               m4, 4
+    movd               r13d, m14
+    movd                rXd, m4
+    punpcklbw           m14, m14
+    psraw               m14, 8
+    mov                 r4d, [rsp+ 0]
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    pshufd               m4, m6, q1100
+    pshufd               m6, m6, q3322
+    pshufd               m7, m5, q1100
+    pshufd               m5, m5, q3322
+    pand                 m8, m11, m4
+    pand                 m9, m11, m6
+    pand                m15, m11, m7
+    pand                m11, m11, m5
+    pandn                m4, m0
+    pandn                m6, m1
+    pandn                m7, m2
+    pandn                m5, m3
+    por                  m8, m4
+    por                  m9, m6
+    por                 m15, m7
+    por                 m11, m5
+    mova         [rsp+0x10], m8
+    mova         [rsp+0x20], m9
+    mova         [rsp+0x30], m15
+    mova         [rsp+0x40], m11
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+    mova         [rsp+0x50], m1
+    mova         [rsp+0x60], m2
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+    mova         [rsp+0x70], m3
+    mova         [rsp+0x80], m4
+    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+    SWAP                 m7, m0
+    SWAP                 m8, m14
+    mova                 m1, [rsp+0x50]
+    mova                 m2, [rsp+0x60]
+    mova                 m3, [rsp+0x70]
+    mova                m15, [rsp+0x80]
+    punpcklwd            m4, m5, m6 ; 45a
+    punpckhwd            m5, m6     ; 45b
+    punpcklwd            m6, m7, m8 ; 67a
+    punpckhwd            m7, m8     ; 67b
+    SWAP                m14, m8
+    mova                 m8, [rsp+0x140]
+    mova                 m9, [rsp+0x150]
+    mova                m10, [rsp+0x160]
+    mova                m11, [rsp+0x170]
+    punpcklwd            m0, m1, m2 ; 01a
+    punpckhwd            m1, m2     ; 01b
+    punpcklwd            m2, m3, m15; 23a
+    punpckhwd            m3, m15    ; 23b
+    mova         [rsp+0x50], m4
+    mova         [rsp+0x60], m5
+    mova         [rsp+0x70], m6
+    mova         [rsp+0x80], m7
+    mova                m14, [base+unpckw]
+.dy1_vloop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m8
+    pmaddwd              m6, m2, m9
+    pmaddwd              m7, m3, m9
+    paddd                m4, m13
+    paddd                m5, m13
+    paddd                m4, m6
+    paddd                m5, m7
+    pmaddwd              m6, [rsp+0x50], m10
+    pmaddwd              m7, [rsp+0x60], m10
+    pmaddwd             m15, [rsp+0x70], m11
+    paddd                m4, m6
+    pmaddwd              m6, [rsp+0x80], m11
+    paddd                m5, m7
+    paddd                m4, m15
+    paddd                m5, m6
+    psrad                m4, rndshift
+    psrad                m5, rndshift
+    packssdw             m4, m5
+%ifidn %1, put
+    packuswb             m4, m4
+    movq             [dstq], m4
+    add                dstq, dsm
+%else
+    mova             [tmpq], m4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .dy1_hloop_prep
+    movq                 m4, [srcq+ r4]
+    movq                 m5, [srcq+ r6]
+    movhps               m4, [srcq+ r7]
+    movhps               m5, [srcq+ r9]
+    movq                 m6, [srcq+r10]
+    movq                 m7, [srcq+r11]
+    movhps               m6, [srcq+r13]
+    movhps               m7, [srcq+ rX]
+    add                srcq, ssq
+    pshufd              m15, m14, q1032
+    pshufb               m0, m14                ; 0a 1a
+    pshufb               m1, m14                ; 0b 1b
+    pshufb               m2, m15                ; 3a 2a
+    pshufb               m3, m15                ; 3b 2b
+    pmaddubsw            m4, [rsp+0x10]
+    pmaddubsw            m5, [rsp+0x20]
+    pmaddubsw            m6, [rsp+0x30]
+    pmaddubsw            m7, [rsp+0x40]
+    phaddw               m4, m5
+    phaddw               m6, m7
+    phaddw               m4, m6
+    pmulhrsw             m4, m12
+    pshufb               m5, [rsp+0x70], m15    ; 7a 6a
+    pshufb               m7, [rsp+0x80], m15    ; 7b 6b
+    pshufb               m6, [rsp+0x50], m14    ; 4a 5a
+    pshufb              m15, [rsp+0x60], m14    ; 4b 5b
+    punpckhwd            m0, m2  ; 12a
+    punpckhwd            m1, m3  ; 12b
+    punpcklwd            m2, m6  ; 34a
+    punpcklwd            m3, m15 ; 34b
+    punpckhwd            m6, m5  ; 56a
+    punpckhwd           m15, m7  ; 56b
+    punpcklwd            m5, m4  ; 78a
+    psrldq               m4, 8
+    punpcklwd            m7, m4  ; 78b
+    mova         [rsp+0x50], m6
+    mova         [rsp+0x60], m15
+    mova         [rsp+0x70], m5
+    mova         [rsp+0x80], m7
+    jmp .dy1_vloop
+.dy2:
+    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+    add                  wq, base_reg
+    jmp                  wq
+%ifidn %1, put
+.dy2_w2:
+    mov                 myd, mym
+    movzx               t0d, t0b
+    dec                srcq
+    movd                m15, t0d
+    punpckldq            m9, m8
+    SWAP                 m8, m9
+    paddd               m14, m8 ; mx+dx*[0-1]
+    mova                m11, [base+pd_0x4000]
+    pshufd              m15, m15, q0000
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd               m15, m8
+    movd                r4d, m15
+    psrldq              m15, 4
+    movd                r6d, m15
+    mova                 m5, [base+bdct_lb_dw]
+    mova                 m6, [base+subpel_s_shuf2]
+    movd                m15, [base+subpel_filters+r4*8+2]
+    movd                 m7, [base+subpel_filters+r6*8+2]
+    pxor                 m9, m9
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                 m0, [srcq+ssq*0]
+    movq                 m1, [srcq+ssq*1]
+    movhps               m0, [srcq+ssq*2]
+    movhps               m1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6
+    punpckldq           m15, m7
+    punpcklqdq          m15, m15
+    pand                m11, m8
+    pandn                m8, m15
+    SWAP                m15, m8
+    por                 m15, m11
+    movq                 m3, [srcq+ssq*0]
+    movhps               m3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pshufb               m3, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m1, m15
+    pmaddubsw            m3, m15
+    movq                m11, r4q
+    punpcklbw           m11,  m11
+    psraw               m11, 8
+    pslldq               m2, m3, 8
+    phaddw               m0, m2
+    phaddw               m1, m3
+    pmulhrsw             m0, m12            ; 0 2 _ 4
+    pmulhrsw             m1, m12            ; 1 3 _ 5
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+    pshufd               m2, m0, q3110      ; 0 2 2 4
+    pshufd               m1, m1, q3110      ; 1 3 3 5
+    punpcklwd            m3, m2, m1         ; 01 23
+    punpckhwd            m2, m1             ; 23 45
+.dy2_w2_loop:
+    movq                 m6, [srcq+ssq*0]
+    movq                 m7, [srcq+ssq*1]
+    movhps               m6, [srcq+ssq*2]
+    movhps               m7, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pmaddwd              m4, m3, m8
+    pmaddwd              m5, m2, m9
+    pshufb               m6, m14
+    pshufb               m7, m14
+    pmaddubsw            m6, m15
+    pmaddubsw            m7, m15
+    phaddw               m6, m7
+    pmulhrsw             m6, m12
+    psrldq               m7, m6, 8
+    palignr              m6, m0, 8
+    palignr              m7, m1, 8
+    mova                 m0, m6
+    mova                 m1, m7
+    pshufd               m6, m6, q3221
+    pshufd               m7, m7, q3221
+    punpcklwd            m3, m6, m7       ; 45 67
+    punpckhwd            m2, m6, m7       ; 67 89
+    pmaddwd              m6, m3, m10
+    pmaddwd              m7, m2, m11
+    paddd                m4, m5
+    paddd                m4, m13
+    paddd                m6, m7
+    paddd                m4, m6
+    psrad                m4, rndshift
+    packssdw             m4, m4
+    packuswb             m4, m4
+    movd                r4d, m4
+    mov        [dstq+dsq*0], r4w
+    shr                 r4d, 16
+    mov        [dstq+dsq*1], r4w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .dy2_w2_loop
+    RET
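+    ; assembly-time SWAP below (x86inc macro): it only renames registers for
+    ; the code that follows and has no runtime effect after this RET.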
+    SWAP                m15, m8, m9
+%endif
+.dy2_w4:
+    mov                 myd, mym
+    mova                 m7, [base+rescale_mul]
+    movzx               t0d, t0b
+    dec                srcq
+    movd                m15, t0d
+    pmaddwd              m8, m7
+    mova                m11, [base+pd_0x4000]
+    pshufd              m15, m15, q0000
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd               m15, m8
+    psrldq               m7, m15, 8
+    movd                r4d, m15
+    movd               r11d, m7
+    psrldq              m15, 4
+    psrldq               m7, 4
+    movd                r6d, m15
+    movd               r13d, m7
+    movd                m15, [base+subpel_filters+ r4*8+2]
+    movd                 m4, [base+subpel_filters+r11*8+2]
+    movd                 m5, [base+subpel_filters+ r6*8+2]
+    movd                 m7, [base+subpel_filters+r13*8+2]
+    movq                 m6, [base+subpel_s_shuf2]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movu                 m0, [srcq+ssq*0]
+    movu                 m2, [srcq+ssq*2]
+    movu                 m1, [srcq+ssq*1]
+    movu                 m3, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    punpckldq           m15, m5
+    punpckldq            m4, m7
+    punpcklqdq           m6, m6
+    punpcklqdq          m15, m4
+    pshufb              m14, [base+bdct_lb_dw]
+    movu                 m4, [srcq+ssq*0]
+    movu                 m5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pand                m11, m8
+    pandn                m8, m15
+    SWAP                m15, m8
+    por                 m15, m11
+    paddb               m14, m6
+    movq                m11, r4q
+    punpcklbw           m11, m11
+    psraw               m11, 8
+    pshufb               m0, m14
+    pshufb               m2, m14
+    pshufb               m1, m14
+    pshufb               m3, m14
+    pshufb               m4, m14
+    pshufb               m5, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m2, m15
+    pmaddubsw            m1, m15
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m15
+    phaddw               m0, m2
+    phaddw               m1, m3
+    phaddw               m4, m5
+    pmulhrsw             m0, m12    ; 0 2
+    pmulhrsw             m1, m12    ; 1 3
+    pmulhrsw             m4, m12    ; 4 5
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+    psrldq               m5, m4, 8  ; 5 _
+    punpckhwd            m2, m0, m1 ; 23
+    punpcklwd            m0, m1     ; 01
+    punpcklwd            m4, m5     ; 45
+.dy2_w4_loop:
+    pmaddwd              m0, m8         ; a0
+    pmaddwd              m5, m2, m8     ; b0
+    pmaddwd              m2, m9         ; a1
+    pmaddwd              m7, m4, m9     ; b1
+    pmaddwd              m3, m4, m10    ; a2
+    paddd                m0, m13
+    paddd                m5, m13
+    paddd                m0, m2
+    paddd                m5, m7
+    paddd                m0, m3
+    movu                 m6, [srcq+ssq*0]
+    movu                 m7, [srcq+ssq*1]
+    movu                 m3, [srcq+ssq*2]
+    movu                 m1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pshufb               m6, m14
+    pshufb               m7, m14
+    pshufb               m3, m14
+    pshufb               m1, m14
+    pmaddubsw            m6, m15
+    pmaddubsw            m7, m15
+    pmaddubsw            m3, m15
+    pmaddubsw            m1, m15
+    phaddw               m6, m7
+    phaddw               m3, m1
+    pmulhrsw             m6, m12    ; 6 7
+    pmulhrsw             m3, m12    ; 8 9
+    psrldq               m7, m6, 8
+    psrldq               m1, m3, 8
+    punpcklwd            m6, m7     ; 67
+    punpcklwd            m3, m1     ; 89
+    mova                 m2, m6
+    pmaddwd              m1, m6, m10    ; b2
+    pmaddwd              m6, m11        ; a3
+    pmaddwd              m7, m3, m11    ; b3
+    paddd                m5, m1
+    paddd                m0, m6
+    paddd                m5, m7
+    psrad                m0, rndshift
+    psrad                m5, rndshift
+    packssdw             m0, m5
+%ifidn %1, put
+    packuswb             m0, m0
+    psrldq               m1, m0, 4
+    movd       [dstq+dsq*0], m0
+    movd       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+%else
+    mova             [tmpq], m0
+    add                tmpq, 16
+%endif
+    mova                 m0, m4
+    mova                 m4, m3
+    sub                  hd, 2
+    jg .dy2_w4_loop
+    MC_8TAP_SCALED_RET
+    SWAP                 m8, m15
+.dy2_w8:
+    mov    dword [rsp+0x90], 1
+    movifprep   tmp_stridem, 16
+    jmp .dy2_w_start
+.dy2_w16:
+    mov    dword [rsp+0x90], 2
+    movifprep   tmp_stridem, 32
+    jmp .dy2_w_start
+.dy2_w32:
+    mov    dword [rsp+0x90], 4
+    movifprep   tmp_stridem, 64
+    jmp .dy2_w_start
+.dy2_w64:
+    mov    dword [rsp+0x90], 8
+    movifprep   tmp_stridem, 128
+    jmp .dy2_w_start
+.dy2_w128:
+    mov    dword [rsp+0x90], 16
+    movifprep   tmp_stridem, 256
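+    ; .dy2_w_start: shared setup for widths >= 8. dword [rsp+0x90] holds the
+    ; number of 8-pixel column blocks; .dy2_hloop handles one block per pass
+    ; and .dy2_hloop_prep advances to the next until the count reaches zero.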
+.dy2_w_start:
+    mov                 myd, mym
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    movd                m15, t0d
+    pslld                m7, m8, 2 ; dx*4
+    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
+    pshufd              m15, m15, q0000
+    paddd               m14, m8 ; mx+dx*[0-3]
+    movq                 m3, r4q
+    punpcklbw            m3, m3
+    psraw                m3, 8
+    mova        [rsp+0x100], m7
+    mova        [rsp+0x120], m15
+    mov         [rsp+0x098], srcq
+    mov         [rsp+0x130], r0q ; dstq / tmpq
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    mova        [rsp+0x140], m0
+    mova        [rsp+0x150], m1
+    mova        [rsp+0x160], m2
+    mova        [rsp+0x170], m3
+%if UNIX64
+    mov                  hm, hd
+%endif
+    jmp .dy2_hloop
+.dy2_hloop_prep:
+    dec   dword [rsp+0x090]
+    jz .ret
+    add   qword [rsp+0x130], 8*(isprep+1)
+    mov                  hd, hm
+    mova                 m7, [rsp+0x100]
+    mova                m14, [rsp+0x110]
+    mova                m10, [base+pd_0x3ff]
+    mova                m15, [rsp+0x120]
+    pxor                 m9, m9
+    mov                srcq, [rsp+0x098]
+    mov                 r0q, [rsp+0x130] ; dstq / tmpq
+    paddd               m14, m7
+.dy2_hloop:
+    mova                m11, [base+pq_0x40000000]
+    psrld                m4, m14, 10
+    mova              [rsp], m4
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd                m5, m15, m6
+    pcmpeqd              m6, m9
+    psrldq               m4, m5, 8
+    movd                r4d, m5
+    movd                r6d, m4
+    psrldq               m5, 4
+    psrldq               m4, 4
+    movd                r7d, m5
+    movd                r9d, m4
+    movq                 m0, [base+subpel_filters+r4*8]
+    movq                 m1, [base+subpel_filters+r6*8]
+    movhps               m0, [base+subpel_filters+r7*8]
+    movhps               m1, [base+subpel_filters+r9*8]
+    paddd               m14, m7 ; mx+dx*[4-7]
+    pand                 m5, m14, m10
+    psrld                m5, 6
+    paddd               m15, m5
+    pcmpeqd              m5, m9
+    mova        [rsp+0x110], m14
+    psrldq               m4, m15, 8
+    movd               r10d, m15
+    movd               r11d, m4
+    psrldq              m15, 4
+    psrldq               m4, 4
+    movd               r13d, m15
+    movd                rXd, m4
+    movq                 m2, [base+subpel_filters+r10*8]
+    movq                 m3, [base+subpel_filters+r11*8]
+    movhps               m2, [base+subpel_filters+r13*8]
+    movhps               m3, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    psrldq               m4, m14, 8
+    movd               r10d, m14
+    movd               r11d, m4
+    psrldq              m14, 4
+    psrldq               m4, 4
+    movd               r13d, m14
+    movd                rXd, m4
+    mov                 r4d, [rsp+ 0]
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    pshufd               m4, m6, q1100
+    pshufd               m6, m6, q3322
+    pshufd               m7, m5, q1100
+    pshufd               m5, m5, q3322
+    pand                 m8, m11, m4
+    pand                 m9, m11, m6
+    pand                m15, m11, m7
+    pand                m11, m11, m5
+    pandn                m4, m0
+    pandn                m6, m1
+    pandn                m7, m2
+    pandn                m5, m3
+    por                  m8, m4
+    por                  m9, m6
+    por                 m15, m7
+    por                 m11, m5
+    mova         [rsp+0x10], m8
+    mova         [rsp+0x20], m9
+    mova         [rsp+0x30], m15
+    mova         [rsp+0x40], m11
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+    mova         [rsp+0x50], m1
+    mova         [rsp+0x60], m2
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+    mova         [rsp+0x70], m3
+    mova         [rsp+0x80], m4
+    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+    SWAP                 m7, m0
+    SWAP                 m8, m14
+    mova                 m1, [rsp+0x50]
+    mova                 m2, [rsp+0x60]
+    mova                 m3, [rsp+0x70]
+    mova                m15, [rsp+0x80]
+    punpcklwd            m4, m5, m6 ; 45a
+    punpckhwd            m5, m6     ; 45b
+    punpcklwd            m6, m7, m8 ; 67a
+    punpckhwd            m7, m8     ; 67b
+    SWAP                m14, m8
+    mova                 m8, [rsp+0x140]
+    mova                 m9, [rsp+0x150]
+    mova                m10, [rsp+0x160]
+    mova                m11, [rsp+0x170]
+    punpcklwd            m0, m1, m2 ; 01a
+    punpckhwd            m1, m2     ; 01b
+    punpcklwd            m2, m3, m15; 23a
+    punpckhwd            m3, m15    ; 23b
+    mova         [rsp+0x50], m4
+    mova         [rsp+0x60], m5
+    mova         [rsp+0x70], m6
+    mova         [rsp+0x80], m7
+.dy2_vloop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m8
+    pmaddwd              m6, m2, m9
+    pmaddwd              m7, m3, m9
+    paddd                m4, m13
+    paddd                m5, m13
+    paddd                m4, m6
+    paddd                m5, m7
+    pmaddwd              m6, [rsp+0x50], m10
+    pmaddwd              m7, [rsp+0x60], m10
+    pmaddwd             m15, [rsp+0x70], m11
+    paddd                m4, m6
+    pmaddwd              m6, [rsp+0x80], m11
+    paddd                m5, m7
+    paddd                m4, m15
+    paddd                m5, m6
+    psrad                m4, rndshift
+    psrad                m5, rndshift
+    packssdw             m4, m5
+%ifidn %1, put
+    packuswb             m4, m4
+    movq             [dstq], m4
+    add                dstq, dsm
+%else
+    mova             [tmpq], m4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .dy2_hloop_prep
+    mova                 m8, [rsp+0x10]
+    mova                 m9, [rsp+0x20]
+    mova                m10, [rsp+0x30]
+    mova                m11, [rsp+0x40]
+    mova                 m0, m2             ; 01a
+    mova                 m1, m3             ; 01b
+    MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
+    mova                 m3, [rsp+0x50] ; 23a
+    mova                 m4, [rsp+0x60] ; 23b
+    mova                 m5, [rsp+0x70] ; 45a
+    mova                 m7, [rsp+0x80] ; 45b
+    mova                 m8, [rsp+0x140]
+    mova                 m9, [rsp+0x150]
+    mova                m10, [rsp+0x160]
+    mova                m11, [rsp+0x170]
+    punpcklwd           m14, m2, m6     ; 67a
+    punpckhwd            m2, m6         ; 67b
+    mova         [rsp+0x50], m5
+    mova         [rsp+0x60], m7
+    mova         [rsp+0x70], m14
+    mova         [rsp+0x80], m2
+    mova                 m2, m3
+    mova                 m3, m4
+    jmp .dy2_vloop
+.ret:
+    MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
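+; BILIN_SCALED_FN %1 emits %1_bilin_scaled as a thin wrapper: it loads the
+; filter-selection constants into t0d/t1d and tail-jumps into the matching
+; %1_8tap_scaled entry point, so scaled bilinear MC reuses the 8-tap scaled
+; code path above.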
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+    mov                 t0d, (5*15 << 16) | 5*15
+    mov                 t1d, (5*15 << 16) | 5*15
+    jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+
+%if ARCH_X86_64
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp,          SHARP,   SHARP
+FN put_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
+FN put_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
+FN put_8tap_scaled, smooth,         SMOOTH,  SMOOTH
+FN put_8tap_scaled, sharp_regular,  SHARP,   REGULAR
+FN put_8tap_scaled, regular_sharp,  REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular,        REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp,          SHARP,   SHARP
+FN prep_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
+FN prep_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
+FN prep_8tap_scaled, smooth,         SMOOTH,  SMOOTH
+FN prep_8tap_scaled, sharp_regular,  SHARP,   REGULAR
+FN prep_8tap_scaled, regular_sharp,  REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular,        REGULAR, REGULAR
+MC_8TAP_SCALED prep
+%endif
+
 %if ARCH_X86_32
  %macro SAVE_ALPHA_BETA 0
     mov              alpham, alphad
@@ -5715,7 +7487,7 @@
 %define m11 [base+pd_63]
 %define m10 [base+pb_8x0_8x8]
 %endif
-    pmaddwd              m4, m7, [base+resize_mul]  ; dx*[0,1,2,3]
+    pmaddwd              m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
     pslld                m7, 2                      ; dx*4
     pslld                m5, 14
     paddd                m6, m4                     ; mx+[0..3]*dx
--- a/src/x86/msac.asm
+++ b/src/x86/msac.asm
@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 64 ; avoids cacheline splits
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -518,9 +518,7 @@
 }
 
 int main(int argc, char *argv[]) {
-    (void)func_new, (void)func_ref;
     state.seed = get_seed();
-    int ret = 0;
 
     while (argc > 1) {
         if (!strncmp(argv[1], "--help", 6)) {
@@ -568,6 +566,24 @@
 
     dav1d_init_cpu();
 
+#ifdef readtime
+    if (state.bench_pattern) {
+        static int testing = 0;
+        checkasm_save_context();
+        if (!testing) {
+            checkasm_set_signal_handler_state(1);
+            testing = 1;
+            readtime();
+            checkasm_set_signal_handler_state(0);
+        } else {
+            fprintf(stderr, "checkasm: unable to access cycle counter\n");
+            return 1;
+        }
+    }
+#endif
+
+    int ret = 0;
+
     if (!state.function_listing) {
         fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
 #if ARCH_X86_64
@@ -672,7 +688,9 @@
 /* Indicate that the current test has failed, return whether verbose printing
  * is requested. */
 int checkasm_fail_func(const char *const msg, ...) {
-    if (state.current_func_ver->cpu && state.current_func_ver->ok) {
+    if (state.current_func_ver && state.current_func_ver->cpu &&
+        state.current_func_ver->ok)
+    {
         va_list arg;
 
         print_cpu_name();
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -86,8 +86,6 @@
 int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
                                  unsigned max_ulp, int len);
 
-static void *func_ref, *func_new;
-
 #define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */
 
 /* Decide whether or not the specified function needs to be tested */
@@ -99,6 +97,7 @@
  * is optional. */
 #define declare_func(ret, ...)\
     declare_new(ret, __VA_ARGS__)\
+    void *func_ref, *func_new;\
     typedef ret func_type(__VA_ARGS__);\
     checkasm_save_context()
 
@@ -127,6 +126,9 @@
 }
 #define readtime readtime
 #endif
+#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__)
+#include <mach/mach_time.h>
+#define readtime() mach_absolute_time()
 #elif ARCH_AARCH64
 #ifdef _MSC_VER
 #include <windows.h>
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -140,11 +140,11 @@
     report("decode_symbol");
 }
 
-static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
+static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) {
     MsacContext s_c, s_a;
 
+    declare_func(unsigned, MsacContext *s, uint16_t *cdf);
     if (check_func(c->bool_adapt, "msac_decode_bool_adapt")) {
-        declare_func(unsigned, MsacContext *s, uint16_t *cdf);
         uint16_t cdf[2][2];
         for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
             dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
@@ -165,9 +165,13 @@
                 bench_new(&s_a, cdf[1]);
         }
     }
+}
 
+static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf) {
+    MsacContext s_c, s_a;
+
+    declare_func(unsigned, MsacContext *s);
     if (check_func(c->bool_equi, "msac_decode_bool_equi")) {
-        declare_func(unsigned, MsacContext *s);
         dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
         s_a = s_c;
         for (int i = 0; i < 64; i++) {
@@ -180,9 +184,13 @@
         }
         bench_new(&s_a);
     }
+}
 
+static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
+    MsacContext s_c, s_a;
+
+    declare_func(unsigned, MsacContext *s, unsigned f);
     if (check_func(c->bool, "msac_decode_bool")) {
-        declare_func(unsigned, MsacContext *s, unsigned f);
         dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
         s_a = s_c;
         for (int i = 0; i < 64; i++) {
@@ -197,6 +205,12 @@
         bench_new(&s_a, 16384);
     }
 
+}
+
+static void check_decode_bool_funcs(MsacDSPContext *const c, uint8_t *const buf) {
+    check_decode_bool_adapt(c, buf);
+    check_decode_bool_equi(c, buf);
+    check_decode_bool(c, buf);
     report("decode_bool");
 }
 
@@ -204,8 +218,8 @@
     ALIGN_STK_16(uint16_t, cdf, 2, [16]);
     MsacContext s_c, s_a;
 
+    declare_func(unsigned, MsacContext *s, uint16_t *cdf);
     if (check_func(c->hi_tok, "msac_decode_hi_tok")) {
-        declare_func(unsigned, MsacContext *s, uint16_t *cdf);
         for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
             dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
             s_a = s_c;
@@ -272,6 +286,6 @@
         buf[i] = rnd();
 
     check_decode_symbol(&c, buf);
-    check_decode_bool(&c, buf);
+    check_decode_bool_funcs(&c, buf);
     check_decode_hi_tok(&c, buf);
 }
--- a/tests/checkasm/x86/checkasm.asm
+++ b/tests/checkasm/x86/checkasm.asm
@@ -23,8 +23,9 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-%define private_prefix checkasm
 %include "config.asm"
+%undef private_prefix
+%define private_prefix checkasm
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 16
--- /dev/null
+++ b/tests/header_test.c.in
@@ -1,0 +1,33 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <dav1d/INPUT>
+
+int main()
+{
+    return 0;
+}
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -101,6 +101,36 @@
     test('checkasm', checkasm, is_parallel: false)
 endif
 
+c99_extension_flag = cc.first_supported_argument(
+    '-Werror=c11-extensions',
+    '-Werror=c99-c11-compat',
+    '-Wc11-extensions',
+    '-Wc99-c11-compat',
+)
+
+# dav1d_api_headers
+foreach header : dav1d_api_headers
+    header_file = '@0@'.format(header).split('/')[-1]
+    target = header_file + '_test'
+
+    header_test_source = custom_target(target,
+        output : target + '.c',
+        input : 'header_test.c.in',
+        capture : true,
+        command : ['sed', '-e', 's/INPUT/' + header_file + '/', '@INPUT@']
+    )
+
+    header_test_exe = executable(target,
+        header_test_source,
+        include_directories: dav1d_inc_dirs,
+        c_args: [c99_extension_flag],
+        build_by_default: true
+    )
+
+    test(target, header_test_exe)
+endforeach
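+# Each generated <header>_test.c is header_test.c.in with INPUT replaced by the
+# header file name, i.e. a single public header include plus an empty main(),
+# so building it verifies that every installed header compiles standalone under
+# the C99 (no C11 extensions) flag selected above.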
+
+
 # fuzzing binaries
 if meson.version().version_compare('>=0.49')
     subdir('libfuzzer')
--- a/tools/dav1d.c
+++ b/tools/dav1d.c
@@ -124,11 +124,15 @@
     else
         b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
                       n, num, 100.0 * n / num);
-    if (i_fps && b < end) {
+    if (b < end) {
         const double d_fps = 1e9 * n / elapsed;
-        const double speed = d_fps / i_fps;
-        b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
-                      d_fps, i_fps, speed);
+        if (i_fps) {
+            const double speed = d_fps / i_fps;
+            b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
+                          d_fps, i_fps, speed);
+        } else {
+            b += snprintf(b, end - b, " - %.2lf fps", d_fps);
+        }
     }
     if (!istty)
         strcpy(b > end - 2 ? end - 2 : b, "\n");
--- a/tools/output/y4m2.c
+++ b/tools/output/y4m2.c
@@ -28,6 +28,7 @@
 #include "config.h"
 
 #include <errno.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -77,8 +78,17 @@
         chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
         ss_names[p->p.layout][p->seq_hdr->hbd];
 
-    fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n",
-            p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name);
+    const unsigned fw = p->p.w;
+    const unsigned fh = p->p.h;
+    uint64_t aw = (uint64_t)fh * p->frame_hdr->render_width;
+    uint64_t ah = (uint64_t)fw * p->frame_hdr->render_height;
+    uint64_t gcd = ah;
+    for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
+    aw /= gcd;
+    ah /= gcd;
+
+    fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A%"PRIu64":%"PRIu64" C%s\n",
+            fw, fh, c->fps[0], c->fps[1], aw, ah, ss_name);
 
     return 0;
 }
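
For reference, the new Y4M header code above derives the 'A' (pixel aspect
ratio) field by cross-multiplying the coded frame size with the render size
and reducing the result with a Euclidean GCD. A minimal standalone sketch of
that arithmetic, using hypothetical dimensions (1920x1080 coded frame shown at
1440x1080) and local variable names of its own, is:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* hypothetical sizes: 1920x1080 coded frame, 1440x1080 render size */
        const unsigned fw = 1920, fh = 1080;
        const unsigned render_w = 1440, render_h = 1080;

        uint64_t aw = (uint64_t)fh * render_w; /* 1080 * 1440 = 1555200 */
        uint64_t ah = (uint64_t)fw * render_h; /* 1920 * 1080 = 2073600 */

        /* Euclidean algorithm: reduce aw:ah to lowest terms (same loop shape
         * as the y4m2.c hunk) */
        uint64_t gcd = ah;
        for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
        aw /= gcd;
        ah /= gcd;

        /* prints "A3:4", i.e. narrow pixels for the anamorphic render size */
        printf("A%"PRIu64":%"PRIu64"\n", aw, ah);
        return 0;
    }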