ref: 75558f8b2fadb34bda1481dd13c76066b2f979cb
parent: 664c6a5fa208998435c6fc1634924a453cc6e17b
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu May 16 15:42:53 EDT 2019
x86: Enable msac asm on x86-32
--- /dev/null
+++ b/src/arm/msac.h
@@ -1,0 +1,50 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_MSAC_H
+#define DAV1D_SRC_ARM_MSAC_H
+/* Prototypes of the *_neon msac decoder functions. No types are declared here: MsacContext, uint16_t and size_t must already be visible (src/msac.h includes this file right after the MsacContext typedef). */
+unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols); /* supported n_symbols per the note in src/msac.h: 1-5 */
+unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols); /* supported n_symbols per the note in src/msac.h: 1-8 */
+unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols); /* supported n_symbols per the note in src/msac.h: 4-16 */
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+/* Only AArch64 remaps the generic names below. src/msac.h also includes this file for 32-bit ARM, where the prototypes are visible but every name keeps the C fallback selected in src/msac.h. */
+#if ARCH_AARCH64
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
+#endif
+
+#endif /* DAV1D_SRC_ARM_MSAC_H */
--- a/src/msac.h
+++ b/src/msac.h
@@ -43,6 +43,14 @@
int allow_update_cdf;
} MsacContext;
+#if HAVE_ASM /* the arch header may #define some or all dav1d_msac_decode_* names to asm implementations */
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/msac.h" /* must stay after the MsacContext typedef: the header uses that type without declaring it */
+#elif ARCH_X86
+#include "src/x86/msac.h" /* same ordering requirement as the ARM header */
+#endif
+#endif
+
void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
int disable_cdf_update_flag);
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
@@ -53,44 +61,22 @@
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
-#if ARCH_AARCH64 && HAVE_ASM
-unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
-unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
-unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
-#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
-#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
-#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
-#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
-#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
-#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
-#elif ARCH_X86_64 && HAVE_ASM
-unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
-unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
-unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
-#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
-#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
-#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
-#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2
-#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
-#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2
-#else
+#ifndef dav1d_msac_decode_symbol_adapt4 /* each name falls back to C on its own unless an arch header included above already #defined it */
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt8
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt16 /* the three symbol_adapt sizes share one C implementation */
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_adapt /* per-name guards allow a partial override, e.g. src/x86/msac.h can remap the bool decoders without remapping the symbol decoders */
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_equi
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_c
+#endif
+#ifndef dav1d_msac_decode_bool
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
#endif
--- a/src/x86/msac.asm
+++ b/src/x86/msac.asm
@@ -26,8 +26,6 @@
%include "config.asm"
%include "ext/x86/x86inc.asm"
-%if ARCH_X86_64
-
SECTION_RODATA 64 ; avoids cacheline splits
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
@@ -34,10 +32,34 @@
pw_0xff00: times 8 dw 0xff00
pw_32: times 8 dw 32
+%if ARCH_X86_64
+%define resp resq ; pointer-sized struc reservation (buf/end/dif fields of msac)
+%define movp movq ; pointer-sized load into an xmm register, used for msac.dif
+%define c_shuf q3333 ; pshuflw selector that broadcasts the top 16-bit word of the 64-bit dif
+%define DECODE_SYMBOL_ADAPT_INIT
+%else
+%define resp resd
+%define movp movd
+%define c_shuf q1111 ; top 16-bit word of the 32-bit dif
+%macro DECODE_SYMBOL_ADAPT_INIT 0
+ mov t0, r0m ; x86-32 prologue of the symbol_adapt functions: t0 = s
+ mov t1, r1m ; t1 = cdf
+ mov t2, r2m ; t2 = n_symbols
+%if STACK_ALIGNMENT >= 16
+ sub esp, 40 ; scratch addressed through buf; released by "add esp, 40" at .renorm2
+%else
+ mov eax, esp ; stack alignment is not guaranteed: remember the incoming esp
+ and esp, ~15
+ sub esp, 40 ; esp is now 8 mod 16, so buf+16 (esp+24), the mova target, is 16-byte aligned
+ mov [esp], eax ; reloaded by "mov esp, [esp]" at .renorm2
+%endif
+%endmacro
+%endif
+
struc msac
- .buf: resq 1
- .end: resq 1
- .dif: resq 1
+ .buf: resp 1
+ .end: resp 1
+ .dif: resp 1
.rng: resd 1
.cnt: resd 1
.update_cdf: resd 1
@@ -48,22 +70,26 @@
SECTION .text
%if WIN64
-DECLARE_REG_TMP 3
-%define buf rsp+8 ; shadow space
-%else
-DECLARE_REG_TMP 0
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3 ; t0-t5 = r0-r5, t6 = r7; t7 = r3, so it differs from t0 and shares a register with t3
+%define buf rsp+8 ; shadow space
+%elif UNIX64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0 ; as for WIN64, except that t7 is the same register as t0
%define buf rsp-40 ; red zone
+%else
+DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2 ; x86-32: t0-t5 = r2, r3, r4, r1, r5, r6; t6 aliases t4 (r5) and t7 aliases t0 (r2)
+%define buf esp+8 ; inside the 40 bytes reserved by DECODE_SYMBOL_ADAPT_INIT; [esp] may hold the saved esp
%endif
INIT_XMM sse2
-cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns
- movd m2, [sq+msac.rng]
- movq m1, [cdfq]
- lea rax, [pw_0xff00]
- movq m3, [sq+msac.dif]
- mov r3d, [sq+msac.update_cdf]
- mov r4d, nsd
- neg nsq
+cglobal msac_decode_symbol_adapt4, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ movq m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ neg t2
pshuflw m2, m2, q0000
movd [buf+12], m2
pand m2, [rax]
@@ -71,8 +97,8 @@
psrlw m1, 6
psllw m1, 7
pmulhuw m1, m2
- movq m2, [rax+nsq*2]
- pshuflw m3, m3, q3333
+ movq m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
paddw m1, m2
mova [buf+16], m1
psubusw m1, m3
@@ -79,104 +105,121 @@
pxor m2, m2
pcmpeqw m1, m2 ; c >= v
pmovmskb eax, m1
- test r3d, r3d
+ test t3d, t3d
jz .renorm ; !allow_update_cdf
; update_cdf:
- movzx r3d, word [cdfq+r4*2] ; count
+ movzx t3d, word [t1+t4*2] ; count
pcmpeqw m2, m2
- mov r2d, r3d
- shr r3d, 4
- cmp r4d, 4
- sbb r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
- cmp r2d, 32
- adc r2d, 0 ; count + (count < 32)
- movd m3, r3d
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 4
+ sbb t3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
+ cmp t2d, 32
+ adc t2d, 0 ; count + (count < 32)
+ movd m3, t3d
pavgw m2, m1 ; i >= val ? -1 : 32768
psubw m2, m0 ; for (i = 0; i < val; i++)
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
- movq [cdfq], m0
- mov [cdfq+r4*2], r2w
+ movq [t1], m0
+ mov [t1+t4*2], t2w
.renorm:
tzcnt eax, eax
- mov r4, [sq+msac.dif]
- movzx r1d, word [buf+rax+16] ; v
- movzx r2d, word [buf+rax+14] ; u
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax+16] ; v
+ movzx t2d, word [buf+rax+14] ; u
shr eax, 1
.renorm2:
- not r4
- sub r2d, r1d ; rng
- shl r1, 48
- add r4, r1 ; ~dif
+%if ARCH_X86_64 == 0
+%if STACK_ALIGNMENT >= 16
+ add esp, 40
+%else
+ mov esp, [esp]
+%endif
+%endif
+ not t4
+ sub t2d, t1d ; rng
+ shl t1, gprsize*8-16
+ add t4, t1 ; ~dif
.renorm3:
- mov r1d, [sq+msac.cnt]
- movifnidn t0, sq
+ mov t1d, [t0+msac.cnt]
+ movifnidn t7, t0
.renorm4:
- bsr ecx, r2d
+ bsr ecx, t2d
xor ecx, 15 ; d
- shl r2d, cl
- shl r4, cl
- mov [t0+msac.rng], r2d
- not r4
- sub r1d, ecx
+ shl t2d, cl
+ shl t4, cl
+ mov [t7+msac.rng], t2d
+ not t4
+ sub t1d, ecx
jge .end ; no refill required
; refill:
- mov r2, [t0+msac.buf]
- mov rcx, [t0+msac.end]
- lea r5, [r2+8]
- cmp r5, rcx
+ mov t2, [t7+msac.buf]
+ mov rcx, [t7+msac.end]
+%if ARCH_X86_64 == 0
+ push t5
+%endif
+ lea t5, [t2+gprsize]
+ cmp t5, rcx
jg .refill_eob
- mov r2, [r2]
- lea ecx, [r1+23]
- add r1d, 16
+ mov t2, [t2]
+ lea ecx, [t1+23]
+ add t1d, 16
shr ecx, 3 ; shift_bytes
- bswap r2
- sub r5, rcx
+ bswap t2
+ sub t5, rcx
shl ecx, 3 ; shift_bits
- shr r2, cl
- sub ecx, r1d ; shift_bits - 16 - cnt
- mov r1d, 48
- shl r2, cl
- mov [t0+msac.buf], r5
- sub r1d, ecx ; cnt + 64 - shift_bits
- xor r4, r2
+ shr t2, cl
+ sub ecx, t1d ; shift_bits - 16 - cnt
+ mov t1d, gprsize*8-16
+ shl t2, cl
+ mov [t7+msac.buf], t5
+ sub t1d, ecx ; cnt + gprsize*8 - shift_bits
+ xor t4, t2
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
.end:
- mov [t0+msac.cnt], r1d
- mov [t0+msac.dif], r4
+ mov [t7+msac.cnt], t1d
+ mov [t7+msac.dif], t4
RET
.refill_eob: ; avoid overreading the input buffer
- mov r5, rcx
- mov ecx, 40
- sub ecx, r1d ; c
+ mov t5, rcx
+ mov ecx, gprsize*8-24
+ sub ecx, t1d ; c
.refill_eob_loop:
- cmp r2, r5
+ cmp t2, t5
jge .refill_eob_end ; eob reached
- movzx r1d, byte [r2]
- inc r2
- shl r1, cl
- xor r4, r1
+ movzx t1d, byte [t2]
+ inc t2
+ shl t1, cl
+ xor t4, t1
sub ecx, 8
jge .refill_eob_loop
.refill_eob_end:
- mov r1d, 40
- sub r1d, ecx
- mov [t0+msac.buf], r2
- mov [t0+msac.dif], r4
- mov [t0+msac.cnt], r1d
+ mov t1d, gprsize*8-24
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
+ sub t1d, ecx
+ mov [t7+msac.buf], t2
+ mov [t7+msac.dif], t4
+ mov [t7+msac.cnt], t1d
RET
-cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns
- movd m2, [sq+msac.rng]
- movu m1, [cdfq]
- lea rax, [pw_0xff00]
- movq m3, [sq+msac.dif]
- mov r3d, [sq+msac.update_cdf]
- mov r4d, nsd
- neg nsq
+cglobal msac_decode_symbol_adapt8, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ movu m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ neg t2
pshuflw m2, m2, q0000
movd [buf+12], m2
punpcklqdq m2, m2
@@ -185,8 +228,8 @@
pand m2, [rax]
psllw m1, 7
pmulhuw m1, m2
- movu m2, [rax+nsq*2]
- pshuflw m3, m3, q3333
+ movu m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
paddw m1, m2
punpcklqdq m3, m3
mova [buf+16], m1
@@ -194,35 +237,36 @@
pxor m2, m2
pcmpeqw m1, m2
pmovmskb eax, m1
- test r3d, r3d
+ test t3d, t3d
jz m(msac_decode_symbol_adapt4).renorm
- movzx r3d, word [cdfq+r4*2]
+ movzx t3d, word [t1+t4*2]
pcmpeqw m2, m2
- mov r2d, r3d
- shr r3d, 4
- cmp r4d, 4 ; may be called with n_symbols < 4
- sbb r3d, -5
- cmp r2d, 32
- adc r2d, 0
- movd m3, r3d
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 4 ; may be called with n_symbols < 4
+ sbb t3d, -5
+ cmp t2d, 32
+ adc t2d, 0
+ movd m3, t3d
pavgw m2, m1
psubw m2, m0
psubw m0, m1
psraw m2, m3
paddw m0, m2
- movu [cdfq], m0
- mov [cdfq+r4*2], r2w
+ movu [t1], m0
+ mov [t1+t4*2], t2w
jmp m(msac_decode_symbol_adapt4).renorm
-cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns
- movd m4, [sq+msac.rng]
- movu m2, [cdfq]
- lea rax, [pw_0xff00]
- movu m3, [cdfq+16]
- movq m5, [sq+msac.dif]
- mov r3d, [sq+msac.update_cdf]
- mov r4d, nsd
- neg nsq
+cglobal msac_decode_symbol_adapt16, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m4, [t0+msac.rng]
+ movu m2, [t1]
+ movu m3, [t1+16]
+ movp m5, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ neg t2
%if WIN64
sub rsp, 48 ; need 36 bytes, shadow space is only 32
%endif
@@ -238,8 +282,8 @@
psllw m3, 7
pmulhuw m2, m4
pmulhuw m3, m4
- movu m4, [rax+nsq*2]
- pshuflw m5, m5, q3333
+ movu m4, [rax+t2*2]
+ pshuflw m5, m5, c_shuf
paddw m2, m4
psubw m4, [rax-pw_0xff00+pw_32]
punpcklqdq m5, m5
@@ -253,20 +297,20 @@
pcmpeqw m3, m4
packsswb m5, m2, m3
pmovmskb eax, m5
- test r3d, r3d
+ test t3d, t3d
jz .renorm
- movzx r3d, word [cdfq+r4*2]
+ movzx t3d, word [t1+t4*2]
pcmpeqw m4, m4
mova m5, m4
- lea r2d, [r3+80] ; only support n_symbols >= 4
- shr r2d, 4
- cmp r3d, 32
- adc r3d, 0
+ lea t2d, [t3+80] ; only support n_symbols >= 4
+ shr t2d, 4
+ cmp t3d, 32
+ adc t3d, 0
pavgw m4, m2
pavgw m5, m3
psubw m4, m0
psubw m0, m2
- movd m2, r2d
+ movd m2, t2d
psubw m5, m1
psubw m1, m3
psraw m4, m2
@@ -273,105 +317,127 @@
psraw m5, m2
paddw m0, m4
paddw m1, m5
- movu [cdfq], m0
- movu [cdfq+16], m1
- mov [cdfq+r4*2], r3w
+ movu [t1], m0
+ movu [t1+16], m1
+ mov [t1+t4*2], t3w
.renorm:
tzcnt eax, eax
- mov r4, [sq+msac.dif]
- movzx r1d, word [buf+rax*2]
- movzx r2d, word [buf+rax*2-2]
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax*2]
+ movzx t2d, word [buf+rax*2-2]
%if WIN64
add rsp, 48
%endif
jmp m(msac_decode_symbol_adapt4).renorm2
-cglobal msac_decode_bool_adapt, 2, 7, 0, s, cdf
- movzx eax, word [cdfq]
- movzx r3d, byte [sq+msac.rng+1]
- mov r4, [sq+msac.dif]
- mov r2d, [sq+msac.rng]
- mov r5d, eax
+cglobal msac_decode_bool_adapt, 0, 6, 0 ; (MsacContext *s, uint16_t *cdf): decodes one bool from cdf[0] and adapts cdf when update_cdf is set; args are fetched by hand below
+ movifnidn t1, r1mp ; t1 = cdf
+ movifnidn t0, r0mp ; t0 = s
+ movzx eax, word [t1] ; cdf[0]
+ movzx t3d, byte [t0+msac.rng+1] ; second byte of rng, i.e. (rng >> 8) & 0xff
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+%if ARCH_X86_64
+ mov t5d, eax ; keep cdf[0] for the update below; x86-32 reloads it from memory after saving t5
+%endif
and eax, ~63
- imul eax, r3d
+ imul eax, t3d
%if UNIX64
- mov r7, r4
+ mov t6, t4 ; register copy of dif for the cmovb below
%endif
shr eax, 7
- add eax, 4 ; v
- mov r3d, eax
- shl rax, 48 ; vw
- sub r2d, r3d ; r - v
- sub r4, rax ; dif - vw
- cmovb r2d, r3d
- mov r3d, [sq+msac.update_cdf]
+ add eax, 4 ; v
+ mov t3d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t3d ; r - v
+ sub t4, rax ; dif - vw
+ setb al ; bit = (dif < vw), taken from the borrow of the sub above; the mov/cmov that follow do not modify flags
+ cmovb t2d, t3d ; rng = bit ? v : r - v
+ mov t3d, [t0+msac.update_cdf]
%if UNIX64
- cmovb r4, r7
+ cmovb t4, t6 ; on borrow keep the original dif
%else
- cmovb r4, [sq+msac.dif]
+ cmovb t4, [t0+msac.dif] ; same, restoring from memory instead of a register copy
%endif
- setb al
- not r4
- test r3d, r3d
+%if ARCH_X86_64 == 0
+ movzx eax, al ; with a 16-bit shift the vw computation leaves v in the upper half of eax, so clear it
+%endif
+ not t4 ; the shared tail from .renorm3 on works with ~dif
+ test t3d, t3d
jz m(msac_decode_symbol_adapt4).renorm3
-%if WIN64
- push r7
+%if UNIX64 == 0
+ push t6 ; preserve t6 across the update; on x86-32 t6 is the same register as t4, so this saves the live ~dif
%endif
- movzx r7d, word [cdfq+2]
- movifnidn t0, sq
- lea ecx, [r7+64]
- cmp r7d, 32
- adc r7d, 0
- mov [cdfq+2], r7w
- imul r7d, eax, -32769
+ movzx t6d, word [t1+2] ; count, stored right after cdf[0]
+%if ARCH_X86_64 == 0
+ push t5 ; NOTE(review): t5 is r6 here, presumably not saved by the cglobal prologue (6 regs requested) - confirm
+ movzx t5d, word [t1] ; reload cdf[0]
+%endif
+ movifnidn t7, t0 ; copy s to t7 where they are different registers. NOTE(review): presumably needed because t0 is rcx on WIN64 and ecx is overwritten next - confirm against x86inc
+ lea ecx, [t6+64]
+ cmp t6d, 32
+ adc t6d, 0 ; count + (count < 32)
+ mov [t1+2], t6w
+ imul t6d, eax, -32769 ; bit ? -32769 : 0
shr ecx, 4 ; rate
- add r7d, r5d ; if (bit)
- sub r5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
- sar r7d, cl ; else
- sub r5d, r7d ; cdf[0] -= cdf[0] >> rate;
- mov [cdfq], r5w
+ add t6d, t5d ; if (bit)
+ sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
+ sar t6d, cl ; else
+ sub t5d, t6d ; cdf[0] -= cdf[0] >> rate;
+ mov [t1], t5w
%if WIN64
- mov r1d, [t0+msac.cnt]
- pop r7
+ mov t1d, [t7+msac.cnt] ; perform .renorm3's cnt load here through t7, then enter the shared tail at .renorm4
+ pop t6
jmp m(msac_decode_symbol_adapt4).renorm4
%else
+%if ARCH_X86_64 == 0
+ pop t5
+ pop t6 ; also puts ~dif back into t4 (same register on x86-32)
+%endif
jmp m(msac_decode_symbol_adapt4).renorm3
%endif
-cglobal msac_decode_bool_equi, 1, 7, 0, s
- mov r1d, [sq+msac.rng]
- mov r4, [sq+msac.dif]
- mov r2d, r1d
- mov r1b, 8
- mov r3, r4
- mov eax, r1d
- shr r1d, 1 ; v
- shl rax, 47 ; vw
- sub r2d, r1d ; r - v
- sub r4, rax ; dif - vw
- cmovb r2d, r1d
- cmovb r4, r3
+cglobal msac_decode_bool_equi, 0, 6, 0 ; (MsacContext *s): decodes one bool without a cdf; the argument is fetched by hand below
+ movifnidn t0, r0mp ; t0 = s
+ mov t1d, [t0+msac.rng]
+ mov t4, [t0+msac.dif]
+ mov t2d, t1d
+ mov t1b, 8 ; t1 = (rng & ~0xff) | 8
+ mov t3, t4 ; copy of dif, restored by the cmovb below when dif < vw
+ mov eax, t1d
+ shr t1d, 1 ; v
+ shl rax, gprsize*8-17 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d ; rng = (dif < vw) ? v : r - v
+ cmovb t4, t3 ; dif < vw: keep the original dif
setb al ; the upper 32 bits contains garbage but that's OK
- not r4
+ not t4 ; the shared tail from .renorm3 on works with ~dif
+%if ARCH_X86_64 == 0
+ movzx eax, al ; with a 32-bit rax the shifted vw bits sit above al in eax, so zero-extend the result
+%endif
jmp m(msac_decode_symbol_adapt4).renorm3
-cglobal msac_decode_bool, 2, 7, 0, s, f
- movzx eax, byte [sq+msac.rng+1] ; r >> 8
- mov r4, [sq+msac.dif]
- mov r2d, [sq+msac.rng]
- and r1d, ~63
- imul eax, r1d
- mov r3, r4
+cglobal msac_decode_bool, 0, 6, 0 ; (MsacContext *s, unsigned f): decodes one bool using f in place of a cdf entry; args are fetched by hand below
+ movifnidn t0, r0mp ; t0 = s
+ movifnidn t1d, r1m ; t1 = f
+ movzx eax, byte [t0+msac.rng+1] ; r >> 8
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+ and t1d, ~63
+ imul eax, t1d
+ mov t3, t4 ; copy of dif, restored by the cmovb below when dif < vw
shr eax, 7
- add eax, 4 ; v
- mov r1d, eax
- shl rax, 48 ; vw
- sub r2d, r1d ; r - v
- sub r4, rax ; dif - vw
- cmovb r2d, r1d
- cmovb r4, r3
+ add eax, 4 ; v
+ mov t1d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d ; rng = (dif < vw) ? v : r - v
+ cmovb t4, t3 ; dif < vw: keep the original dif
setb al
- not r4
- jmp m(msac_decode_symbol_adapt4).renorm3
-
+ not t4 ; the shared tail from .renorm3 on works with ~dif
+%if ARCH_X86_64 == 0
+ movzx eax, al ; with a 16-bit shift the vw computation leaves v in the upper half of eax, so clear it
%endif
+ jmp m(msac_decode_symbol_adapt4).renorm3 ; shared tail: loads cnt, renormalizes, stores rng/cnt/dif and returns
--- /dev/null
+++ b/src/x86/msac.h
@@ -1,0 +1,51 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_MSAC_H
+#define DAV1D_SRC_X86_MSAC_H
+/* Prototypes of the *_sse2 msac decoder functions. No types are declared here: MsacContext, uint16_t and size_t must already be visible (src/msac.h includes this file right after the MsacContext typedef). */
+unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols); /* supported n_symbols per the note in src/msac.h: 1-5 */
+unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols); /* supported n_symbols per the note in src/msac.h: 1-8 */
+unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols); /* supported n_symbols per the note in src/msac.h: 4-16 */
+unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
+unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
+/* The symbol_adapt decoders use xmm registers, so they are remapped only when SSE2 is guaranteed at compile time: always on x86-64, otherwise via __SSE2__ or _M_IX86_FP >= 2. Without that guarantee src/msac.h falls back to the C version. */
+#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
+#endif
+/* The bool decoders are remapped unconditionally. NOTE(review): presumably safe because their bodies in src/x86/msac.asm use only general-purpose instructions despite the _sse2 suffix; they do rely on cmov - confirm that is acceptable for the oldest supported x86-32 target. */
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2
+
+#endif /* DAV1D_SRC_X86_MSAC_H */
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -175,7 +175,7 @@
c.bool_equi = dav1d_msac_decode_bool_equi_neon;
c.bool = dav1d_msac_decode_bool_neon;
}
-#elif ARCH_X86_64 && HAVE_ASM
+#elif ARCH_X86 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2;
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2;