shithub: dav1d

Download patch

ref: 75558f8b2fadb34bda1481dd13c76066b2f979cb
parent: 664c6a5fa208998435c6fc1634924a453cc6e17b
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu May 16 15:42:53 EDT 2019

x86: Enable msac asm on x86-32

--- /dev/null
+++ b/src/arm/msac.h
@@ -1,0 +1,50 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_MSAC_H
+#define DAV1D_SRC_ARM_MSAC_H
+/* Included by src/msac.h once MsacContext is declared; not standalone. */
+unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
+                                               size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+
+#if ARCH_AARCH64 /* 32-bit ARM keeps the C fallbacks from src/msac.h */
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_neon
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_neon
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_neon
+#endif /* ARCH_AARCH64 */
+
+#endif /* DAV1D_SRC_ARM_MSAC_H */
--- a/src/msac.h
+++ b/src/msac.h
@@ -43,6 +43,14 @@
     int allow_update_cdf;
 } MsacContext;
 
+#if HAVE_ASM /* arch headers may #define the dav1d_msac_decode_* names */
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/msac.h"
+#elif ARCH_X86
+#include "src/x86/msac.h"
+#endif /* arch */
+#endif /* HAVE_ASM */
+
 void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
                      int disable_cdf_update_flag);
 unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
@@ -53,44 +61,22 @@
 int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
 
 /* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
-#if ARCH_AARCH64 && HAVE_ASM
-unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
-                                              size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
-                                              size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
-                                               size_t n_symbols);
-unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
-unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
-unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
-#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_neon
-#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_neon
-#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
-#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_neon
-#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_neon
-#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_neon
-#elif ARCH_X86_64 && HAVE_ASM
-unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
-                                              size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
-                                              size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
-                                               size_t n_symbols);
-unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
-unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
-unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
-#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
-#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
-#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
-#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
-#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
-#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_sse2
-#else
+#ifndef dav1d_msac_decode_symbol_adapt4 /* not claimed by an arch header */
 #define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt8
 #define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt16
 #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_adapt
 #define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_equi
 #define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_c
+#endif
+#ifndef dav1d_msac_decode_bool
 #define dav1d_msac_decode_bool           dav1d_msac_decode_bool_c
 #endif
 
--- a/src/x86/msac.asm
+++ b/src/x86/msac.asm
@@ -26,8 +26,6 @@
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
-%if ARCH_X86_64
-
 SECTION_RODATA 64 ; avoids cacheline splits
 
 dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
@@ -34,10 +32,34 @@
 pw_0xff00: times 8 dw 0xff00
 pw_32:     times 8 dw 32
 
+%if ARCH_X86_64
+%define resp   resq ; pointer-sized struc field
+%define movp   movq ; pointer-sized vector load
+%define c_shuf q3333 ; word 3 = top word of the 64-bit dif
+%define DECODE_SYMBOL_ADAPT_INIT
+%else
+%define resp   resd ; pointer-sized struc field
+%define movp   movd ; pointer-sized vector load
+%define c_shuf q1111 ; word 1 = top word of the 32-bit dif
+%macro DECODE_SYMBOL_ADAPT_INIT 0
+    mov            t0, r0m ; s
+    mov            t1, r1m ; cdf
+    mov            t2, r2m ; n_symbols
+%if STACK_ALIGNMENT >= 16
+    sub           esp, 40 ; scratch for buf; undone at .renorm2
+%else
+    mov           eax, esp ; caller's esp, reloaded at .renorm2
+    and           esp, ~15
+    sub           esp, 40 ; 40 = 8 mod 16, so buf (esp+8) is 16-byte aligned
+    mov         [esp], eax
+%endif
+%endmacro
+%endif
+
 struc msac
-    .buf:        resq 1
-    .end:        resq 1
-    .dif:        resq 1
+    .buf:        resp 1
+    .end:        resp 1
+    .dif:        resp 1
     .rng:        resd 1
     .cnt:        resd 1
     .update_cdf: resd 1
@@ -48,22 +70,26 @@
 SECTION .text
 
 %if WIN64
-DECLARE_REG_TMP 3
-%define buf rsp+8 ; shadow space
-%else
-DECLARE_REG_TMP 0
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3 ; t0-t5 = r0-r5, t6 = r7, t7 = r3
+%define buf rsp+8  ; shadow space
+%elif UNIX64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0 ; as above, but t7 is t0 itself
 %define buf rsp-40 ; red zone
+%else
+DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2 ; x86-32: t6 aliases t4, t7 aliases t0
+%define buf esp+8 ; inside the 40 bytes reserved by DECODE_SYMBOL_ADAPT_INIT
 %endif
 
 INIT_XMM sse2
-cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns
-    movd           m2, [sq+msac.rng]
-    movq           m1, [cdfq]
-    lea           rax, [pw_0xff00]
-    movq           m3, [sq+msac.dif]
-    mov           r3d, [sq+msac.update_cdf]
-    mov           r4d, nsd
-    neg           nsq
+cglobal msac_decode_symbol_adapt4, 0, 6, 6
+    DECODE_SYMBOL_ADAPT_INIT
+    LEA           rax, pw_0xff00
+    movd           m2, [t0+msac.rng]
+    movq           m1, [t1]
+    movp           m3, [t0+msac.dif]
+    mov           t3d, [t0+msac.update_cdf]
+    mov           t4d, t2d
+    neg            t2
     pshuflw        m2, m2, q0000
     movd     [buf+12], m2
     pand           m2, [rax]
@@ -71,8 +97,8 @@
     psrlw          m1, 6
     psllw          m1, 7
     pmulhuw        m1, m2
-    movq           m2, [rax+nsq*2]
-    pshuflw        m3, m3, q3333
+    movq           m2, [rax+t2*2]
+    pshuflw        m3, m3, c_shuf
     paddw          m1, m2
     mova     [buf+16], m1
     psubusw        m1, m3
@@ -79,104 +105,121 @@
     pxor           m2, m2
     pcmpeqw        m1, m2 ; c >= v
     pmovmskb      eax, m1
-    test          r3d, r3d
+    test          t3d, t3d
     jz .renorm ; !allow_update_cdf
 
 ; update_cdf:
-    movzx         r3d, word [cdfq+r4*2] ; count
+    movzx         t3d, word [t1+t4*2] ; count
     pcmpeqw        m2, m2
-    mov           r2d, r3d
-    shr           r3d, 4
-    cmp           r4d, 4
-    sbb           r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
-    cmp           r2d, 32
-    adc           r2d, 0  ; count + (count < 32)
-    movd           m3, r3d
+    mov           t2d, t3d
+    shr           t3d, 4
+    cmp           t4d, 4
+    sbb           t3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
+    cmp           t2d, 32
+    adc           t2d, 0  ; count + (count < 32)
+    movd           m3, t3d
     pavgw          m2, m1 ; i >= val ? -1 : 32768
     psubw          m2, m0 ; for (i = 0; i < val; i++)
     psubw          m0, m1 ;     cdf[i] += (32768 - cdf[i]) >> rate;
     psraw          m2, m3 ; for (; i < n_symbols - 1; i++)
     paddw          m0, m2 ;     cdf[i] += ((  -1 - cdf[i]) >> rate) + 1;
-    movq       [cdfq], m0
-    mov   [cdfq+r4*2], r2w
+    movq         [t1], m0
+    mov     [t1+t4*2], t2w
 
 .renorm:
     tzcnt         eax, eax
-    mov            r4, [sq+msac.dif]
-    movzx         r1d, word [buf+rax+16] ; v
-    movzx         r2d, word [buf+rax+14] ; u
+    mov            t4, [t0+msac.dif]
+    movzx         t1d, word [buf+rax+16] ; v
+    movzx         t2d, word [buf+rax+14] ; u
     shr           eax, 1
 .renorm2:
-    not            r4
-    sub           r2d, r1d ; rng
-    shl            r1, 48
-    add            r4, r1  ; ~dif
+%if ARCH_X86_64 == 0
+%if STACK_ALIGNMENT >= 16
+    add           esp, 40
+%else
+    mov           esp, [esp]
+%endif
+%endif
+    not            t4
+    sub           t2d, t1d ; rng
+    shl            t1, gprsize*8-16
+    add            t4, t1  ; ~dif
 .renorm3:
-    mov           r1d, [sq+msac.cnt]
-    movifnidn      t0, sq
+    mov           t1d, [t0+msac.cnt]
+    movifnidn      t7, t0
 .renorm4:
-    bsr           ecx, r2d
+    bsr           ecx, t2d
     xor           ecx, 15  ; d
-    shl           r2d, cl
-    shl            r4, cl
-    mov [t0+msac.rng], r2d
-    not            r4
-    sub           r1d, ecx
+    shl           t2d, cl
+    shl            t4, cl
+    mov [t7+msac.rng], t2d
+    not            t4
+    sub           t1d, ecx
     jge .end ; no refill required
 
 ; refill:
-    mov            r2, [t0+msac.buf]
-    mov           rcx, [t0+msac.end]
-    lea            r5, [r2+8]
-    cmp            r5, rcx
+    mov            t2, [t7+msac.buf]
+    mov           rcx, [t7+msac.end]
+%if ARCH_X86_64 == 0
+    push           t5 ; t5 is r6 here, outside the 6 GPRs requested via cglobal
+%endif
+    lea            t5, [t2+gprsize] ; end of a full-word read from buf
+    cmp            t5, rcx
-    jg .refill_eob
+    ja .refill_eob ; unsigned: buf/end are addresses, not signed values
-    mov            r2, [r2]
-    lea           ecx, [r1+23]
-    add           r1d, 16
+    mov            t2, [t2]
+    lea           ecx, [t1+23]
+    add           t1d, 16
     shr           ecx, 3   ; shift_bytes
-    bswap          r2
-    sub            r5, rcx
+    bswap          t2
+    sub            t5, rcx
     shl           ecx, 3   ; shift_bits
-    shr            r2, cl
-    sub           ecx, r1d ; shift_bits - 16 - cnt
-    mov           r1d, 48
-    shl            r2, cl
-    mov [t0+msac.buf], r5
-    sub           r1d, ecx ; cnt + 64 - shift_bits
-    xor            r4, r2
+    shr            t2, cl
+    sub           ecx, t1d ; shift_bits - 16 - cnt
+    mov           t1d, gprsize*8-16
+    shl            t2, cl
+    mov [t7+msac.buf], t5
+    sub           t1d, ecx ; cnt + gprsize*8 - shift_bits
+    xor            t4, t2
+%if ARCH_X86_64 == 0
+    pop            t5
+%endif
 .end:
-    mov [t0+msac.cnt], r1d
-    mov [t0+msac.dif], r4
+    mov [t7+msac.cnt], t1d
+    mov [t7+msac.dif], t4
     RET
 .refill_eob: ; avoid overreading the input buffer
-    mov            r5, rcx
-    mov           ecx, 40
-    sub           ecx, r1d ; c
+    mov            t5, rcx ; t5 = end
+    mov           ecx, gprsize*8-24
+    sub           ecx, t1d ; c
 .refill_eob_loop:
-    cmp            r2, r5
+    cmp            t2, t5
-    jge .refill_eob_end    ; eob reached
+    jae .refill_eob_end    ; eob reached (unsigned address compare)
-    movzx         r1d, byte [r2]
-    inc            r2
-    shl            r1, cl
-    xor            r4, r1
+    movzx         t1d, byte [t2]
+    inc            t2
+    shl            t1, cl
+    xor            t4, t1
     sub           ecx, 8
     jge .refill_eob_loop
 .refill_eob_end:
-    mov           r1d, 40
-    sub           r1d, ecx
-    mov [t0+msac.buf], r2
-    mov [t0+msac.dif], r4
-    mov [t0+msac.cnt], r1d
+    mov           t1d, gprsize*8-24
+%if ARCH_X86_64 == 0
+    pop            t5
+%endif
+    sub           t1d, ecx
+    mov [t7+msac.buf], t2
+    mov [t7+msac.dif], t4
+    mov [t7+msac.cnt], t1d
     RET
 
-cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns
-    movd           m2, [sq+msac.rng]
-    movu           m1, [cdfq]
-    lea           rax, [pw_0xff00]
-    movq           m3, [sq+msac.dif]
-    mov           r3d, [sq+msac.update_cdf]
-    mov           r4d, nsd
-    neg           nsq
+cglobal msac_decode_symbol_adapt8, 0, 6, 6
+    DECODE_SYMBOL_ADAPT_INIT
+    LEA           rax, pw_0xff00
+    movd           m2, [t0+msac.rng]
+    movu           m1, [t1]
+    movp           m3, [t0+msac.dif]
+    mov           t3d, [t0+msac.update_cdf]
+    mov           t4d, t2d
+    neg            t2
     pshuflw        m2, m2, q0000
     movd     [buf+12], m2
     punpcklqdq     m2, m2
@@ -185,8 +228,8 @@
     pand           m2, [rax]
     psllw          m1, 7
     pmulhuw        m1, m2
-    movu           m2, [rax+nsq*2]
-    pshuflw        m3, m3, q3333
+    movu           m2, [rax+t2*2]
+    pshuflw        m3, m3, c_shuf
     paddw          m1, m2
     punpcklqdq     m3, m3
     mova     [buf+16], m1
@@ -194,35 +237,36 @@
     pxor           m2, m2
     pcmpeqw        m1, m2
     pmovmskb      eax, m1
-    test          r3d, r3d
+    test          t3d, t3d
     jz m(msac_decode_symbol_adapt4).renorm
-    movzx         r3d, word [cdfq+r4*2]
+    movzx         t3d, word [t1+t4*2]
     pcmpeqw        m2, m2
-    mov           r2d, r3d
-    shr           r3d, 4
-    cmp           r4d, 4 ; may be called with n_symbols < 4
-    sbb           r3d, -5
-    cmp           r2d, 32
-    adc           r2d, 0
-    movd           m3, r3d
+    mov           t2d, t3d
+    shr           t3d, 4
+    cmp           t4d, 4 ; may be called with n_symbols < 4
+    sbb           t3d, -5
+    cmp           t2d, 32
+    adc           t2d, 0
+    movd           m3, t3d
     pavgw          m2, m1
     psubw          m2, m0
     psubw          m0, m1
     psraw          m2, m3
     paddw          m0, m2
-    movu       [cdfq], m0
-    mov   [cdfq+r4*2], r2w
+    movu         [t1], m0
+    mov     [t1+t4*2], t2w
     jmp m(msac_decode_symbol_adapt4).renorm
 
-cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns
-    movd           m4, [sq+msac.rng]
-    movu           m2, [cdfq]
-    lea           rax, [pw_0xff00]
-    movu           m3, [cdfq+16]
-    movq           m5, [sq+msac.dif]
-    mov           r3d, [sq+msac.update_cdf]
-    mov           r4d, nsd
-    neg           nsq
+cglobal msac_decode_symbol_adapt16, 0, 6, 6
+    DECODE_SYMBOL_ADAPT_INIT
+    LEA           rax, pw_0xff00
+    movd           m4, [t0+msac.rng]
+    movu           m2, [t1]
+    movu           m3, [t1+16]
+    movp           m5, [t0+msac.dif]
+    mov           t3d, [t0+msac.update_cdf]
+    mov           t4d, t2d
+    neg            t2
 %if WIN64
     sub           rsp, 48 ; need 36 bytes, shadow space is only 32
 %endif
@@ -238,8 +282,8 @@
     psllw          m3, 7
     pmulhuw        m2, m4
     pmulhuw        m3, m4
-    movu           m4, [rax+nsq*2]
-    pshuflw        m5, m5, q3333
+    movu           m4, [rax+t2*2]
+    pshuflw        m5, m5, c_shuf
     paddw          m2, m4
     psubw          m4, [rax-pw_0xff00+pw_32]
     punpcklqdq     m5, m5
@@ -253,20 +297,20 @@
     pcmpeqw        m3, m4
     packsswb       m5, m2, m3
     pmovmskb      eax, m5
-    test          r3d, r3d
+    test          t3d, t3d
     jz .renorm
-    movzx         r3d, word [cdfq+r4*2]
+    movzx         t3d, word [t1+t4*2]
     pcmpeqw        m4, m4
     mova           m5, m4
-    lea           r2d, [r3+80] ; only support n_symbols >= 4
-    shr           r2d, 4
-    cmp           r3d, 32
-    adc           r3d, 0
+    lea           t2d, [t3+80] ; only support n_symbols >= 4
+    shr           t2d, 4
+    cmp           t3d, 32
+    adc           t3d, 0
     pavgw          m4, m2
     pavgw          m5, m3
     psubw          m4, m0
     psubw          m0, m2
-    movd           m2, r2d
+    movd           m2, t2d
     psubw          m5, m1
     psubw          m1, m3
     psraw          m4, m2
@@ -273,105 +317,127 @@
     psraw          m5, m2
     paddw          m0, m4
     paddw          m1, m5
-    movu       [cdfq], m0
-    movu    [cdfq+16], m1
-    mov   [cdfq+r4*2], r3w
+    movu         [t1], m0
+    movu      [t1+16], m1
+    mov     [t1+t4*2], t3w
 .renorm:
     tzcnt         eax, eax
-    mov            r4, [sq+msac.dif]
-    movzx         r1d, word [buf+rax*2]
-    movzx         r2d, word [buf+rax*2-2]
+    mov            t4, [t0+msac.dif]
+    movzx         t1d, word [buf+rax*2]
+    movzx         t2d, word [buf+rax*2-2]
 %if WIN64
     add           rsp, 48
 %endif
     jmp m(msac_decode_symbol_adapt4).renorm2
 
-cglobal msac_decode_bool_adapt, 2, 7, 0, s, cdf
-    movzx         eax, word [cdfq]
-    movzx         r3d, byte [sq+msac.rng+1]
-    mov            r4, [sq+msac.dif]
-    mov           r2d, [sq+msac.rng]
-    mov           r5d, eax
+cglobal msac_decode_bool_adapt, 0, 6, 0
+    movifnidn      t1, r1mp ; cdf
+    movifnidn      t0, r0mp ; s
+    movzx         eax, word [t1] ; cdf[0]
+    movzx         t3d, byte [t0+msac.rng+1]
+    mov            t4, [t0+msac.dif]
+    mov           t2d, [t0+msac.rng]
+%if ARCH_X86_64
+    mov           t5d, eax ; keep cdf[0] for the update; x86-32 reloads it later
+%endif
     and           eax, ~63
-    imul          eax, r3d
+    imul          eax, t3d
 %if UNIX64
-    mov            r7, r4
+    mov            t6, t4
 %endif
     shr           eax, 7
-    add           eax, 4   ; v
-    mov           r3d, eax
-    shl           rax, 48  ; vw
-    sub           r2d, r3d ; r - v
-    sub            r4, rax ; dif - vw
-    cmovb         r2d, r3d
-    mov           r3d, [sq+msac.update_cdf]
+    add           eax, 4            ; v
+    mov           t3d, eax
+    shl           rax, gprsize*8-16 ; vw
+    sub           t2d, t3d          ; r - v
+    sub            t4, rax          ; dif - vw
+    setb           al ; bit = (dif < vw)
+    cmovb         t2d, t3d
+    mov           t3d, [t0+msac.update_cdf]
 %if UNIX64
-    cmovb          r4, r7
+    cmovb          t4, t6
 %else
-    cmovb          r4, [sq+msac.dif]
+    cmovb          t4, [t0+msac.dif]
 %endif
-    setb           al
-    not            r4
-    test          r3d, r3d
+%if ARCH_X86_64 == 0
+    movzx         eax, al ; eax[31:16] still holds v after the 16-bit shift
+%endif
+    not            t4
+    test          t3d, t3d
     jz m(msac_decode_symbol_adapt4).renorm3
-%if WIN64
-    push           r7
+%if UNIX64 == 0
+    push           t6 ; on x86-32 t6 aliases t4, so this also preserves ~dif
 %endif
-    movzx         r7d, word [cdfq+2]
-    movifnidn      t0, sq
-    lea           ecx, [r7+64]
-    cmp           r7d, 32
-    adc           r7d, 0
-    mov      [cdfq+2], r7w
-    imul          r7d, eax, -32769
+    movzx         t6d, word [t1+2] ; cdf[1], the adaptation counter
+%if ARCH_X86_64 == 0
+    push           t5
+    movzx         t5d, word [t1] ; reload cdf[0]; t5 was not set above
+%endif
+    movifnidn      t7, t0
+    lea           ecx, [t6+64]
+    cmp           t6d, 32
+    adc           t6d, 0 ; count + (count < 32)
+    mov        [t1+2], t6w
+    imul          t6d, eax, -32769
     shr           ecx, 4   ; rate
-    add           r7d, r5d ; if (bit)
-    sub           r5d, eax ;     cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
-    sar           r7d, cl  ; else
-    sub           r5d, r7d ;     cdf[0] -= cdf[0] >> rate;
-    mov        [cdfq], r5w
+    add           t6d, t5d ; if (bit)
+    sub           t5d, eax ;     cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
+    sar           t6d, cl  ; else
+    sub           t5d, t6d ;     cdf[0] -= cdf[0] >> rate;
+    mov          [t1], t5w
 %if WIN64
-    mov           r1d, [t0+msac.cnt]
-    pop            r7
+    mov           t1d, [t7+msac.cnt]
+    pop            t6
     jmp m(msac_decode_symbol_adapt4).renorm4
 %else
+%if ARCH_X86_64 == 0
+    pop            t5
+    pop            t6 ; restores t4 (~dif) for .renorm3
+%endif
     jmp m(msac_decode_symbol_adapt4).renorm3
 %endif
 
-cglobal msac_decode_bool_equi, 1, 7, 0, s
-    mov           r1d, [sq+msac.rng]
-    mov            r4, [sq+msac.dif]
-    mov           r2d, r1d
-    mov           r1b, 8
-    mov            r3, r4
-    mov           eax, r1d
-    shr           r1d, 1   ; v
-    shl           rax, 47  ; vw
-    sub           r2d, r1d ; r - v
-    sub            r4, rax ; dif - vw
-    cmovb         r2d, r1d
-    cmovb          r4, r3
+cglobal msac_decode_bool_equi, 0, 6, 0
+    movifnidn      t0, r0mp ; s
+    mov           t1d, [t0+msac.rng]
+    mov            t4, [t0+msac.dif]
+    mov           t2d, t1d
+    mov           t1b, 8 ; low byte := 8, so the shr yields ((rng >> 8) << 7) + 4
+    mov            t3, t4 ; copy of dif for the cmovb below
+    mov           eax, t1d
+    shr           t1d, 1            ; v
+    shl           rax, gprsize*8-17 ; vw
+    sub           t2d, t1d          ; r - v
+    sub            t4, rax          ; dif - vw
+    cmovb         t2d, t1d
+    cmovb          t4, t3
     setb           al ; the upper 32 bits contains garbage but that's OK
-    not            r4
+    not            t4
+%if ARCH_X86_64 == 0
+    movzx         eax, al ; upper bits of eax still hold vw on x86-32
+%endif
     jmp m(msac_decode_symbol_adapt4).renorm3
 
-cglobal msac_decode_bool, 2, 7, 0, s, f
-    movzx         eax, byte [sq+msac.rng+1] ; r >> 8
-    mov            r4, [sq+msac.dif]
-    mov           r2d, [sq+msac.rng]
-    and           r1d, ~63
-    imul          eax, r1d
-    mov            r3, r4
+cglobal msac_decode_bool, 0, 6, 0
+    movifnidn      t0, r0mp ; s
+    movifnidn     t1d, r1m ; f
+    movzx         eax, byte [t0+msac.rng+1] ; r >> 8
+    mov            t4, [t0+msac.dif]
+    mov           t2d, [t0+msac.rng]
+    and           t1d, ~63 ; clear the low 6 bits of f
+    imul          eax, t1d
+    mov            t3, t4 ; copy of dif for the cmovb below
     shr           eax, 7
-    add           eax, 4   ; v
-    mov           r1d, eax
-    shl           rax, 48  ; vw
-    sub           r2d, r1d ; r - v
-    sub            r4, rax ; dif - vw
-    cmovb         r2d, r1d
-    cmovb          r4, r3
+    add           eax, 4            ; v
+    mov           t1d, eax
+    shl           rax, gprsize*8-16 ; vw
+    sub           t2d, t1d          ; r - v
+    sub            t4, rax          ; dif - vw
+    cmovb         t2d, t1d
+    cmovb          t4, t3
     setb           al
-    not            r4
-    jmp m(msac_decode_symbol_adapt4).renorm3
-
+    not            t4
+%if ARCH_X86_64 == 0
+    movzx         eax, al ; eax[31:16] still holds v after the 16-bit shift
 %endif
+    jmp m(msac_decode_symbol_adapt4).renorm3
--- /dev/null
+++ b/src/x86/msac.h
@@ -1,0 +1,51 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_MSAC_H
+#define DAV1D_SRC_X86_MSAC_H
+/* Included by src/msac.h once MsacContext is declared; not standalone. */
+unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
+                                               size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
+unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
+
+#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
+#endif /* SSE2 guaranteed by the compile target */
+/* The bool decoders in msac.asm touch no XMM registers, so they are ungated. */
+#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
+#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
+#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_sse2
+
+#endif /* DAV1D_SRC_X86_MSAC_H */
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -175,7 +175,7 @@
         c.bool_equi      = dav1d_msac_decode_bool_equi_neon;
         c.bool           = dav1d_msac_decode_bool_neon;
     }
-#elif ARCH_X86_64 && HAVE_ASM
+#elif ARCH_X86 && HAVE_ASM
     if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
         c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_sse2;
         c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_sse2;