ref: b8399319939f8721640e3835e9b9e0f5f9bb913d
parent: d21dc801529a4aeeaad0d7da4bd1f8e675cba269
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Feb 3 18:56:12 EST 2020
x86: Avoid cmov instructions that depend on multiple flags On many AMD CPUs, cmov instructions that depend on multiple flags require an additional µop, so prefer using cmov variants that only depend on a single flag where possible.
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -374,9 +374,9 @@
sub dampingd, 31
xor zerod, zerod
add pridmpd, dampingd
- cmovl pridmpd, zerod
+ cmovs pridmpd, zerod
add secdmpd, dampingd
- cmovl secdmpd, zerod
+ cmovs secdmpd, zerod
mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -623,9 +623,9 @@
sub secdmpd, dampingd
xor dampingd, dampingd
neg pridmpd
- cmovl pridmpd, dampingd
+ cmovs pridmpd, dampingd
neg secdmpd
- cmovl secdmpd, dampingd
+ cmovs secdmpd, dampingd
%if ARCH_X86_64
mov [rsp+ 0], pridmpq ; pri_shift
mov [rsp+16], secdmpq ; sec_shift
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -171,9 +171,9 @@
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
- cmovg val3d, maxd
+ cmovns val3d, maxd
cmp val3d, mind
- cmovl val3d, mind
+ cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
@@ -585,9 +585,9 @@
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
- cmovg val3d, maxd
+ cmovns val3d, maxd
cmp val3d, mind
- cmovl val3d, mind
+ cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -222,9 +222,9 @@
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
- cmovg val3d, maxd
+ cmovns val3d, maxd
cmp val3d, mind
- cmovl val3d, mind
+ cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
@@ -778,9 +778,9 @@
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
- cmovg val3d, maxd
+ cmovns val3d, maxd
cmp val3d, mind
- cmovl val3d, mind
+ cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -308,7 +308,7 @@
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
- cmovz r6d, r2d
+ cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:
@@ -1441,7 +1441,7 @@
mov r3d, 9
mov tlq, rsp
cmp hd, 4
- cmova maxbased, r3d
+ cmovne maxbased, r3d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [tlq], xm0
@@ -1628,8 +1628,8 @@
sar r5d, 1
mov tlq, rsp
add r5d, 17 ; w*2 + (filter_strength == 3)
- cmp hd, 8
- cmova maxbased, r5d
+ cmp hd, 16
+ cmovns maxbased, r5d
mov [tlq+r5], r3b
vextracti128 xm0, m1, 1
packuswb xm0, xm1
@@ -1745,8 +1745,8 @@
sar r5d, 1
mov tlq, rsp
add r5d, 33
- cmp hd, 16
- cmova maxbased, r5d
+ cmp hd, 32
+ cmovns maxbased, r5d
mov [tlq+r5], r3b
packuswb m0, m1
vpermq m0, m0, q3120
@@ -1812,7 +1812,7 @@
lea r3d, [hq+31]
mov maxbased, 63
cmp hd, 32
- cmovb maxbased, r3d
+ cmovs maxbased, r3d
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
vbroadcasti128 m0, [pb_0to15]
@@ -1889,8 +1889,8 @@
mov tlq, rsp
mov [tlq+65], r3b
mov r3d, 65
- cmp hd, 32
- cmova maxbased, r3d
+ cmp hd, 64
+ cmove maxbased, r3d
packuswb m0, m2
packuswb m1, m6
mova [tlq+ 0], m0
@@ -2294,7 +2294,7 @@
cmp hd, 16
movu xm2, [rsp+49]
vinserti128 m2, [rsp+43], 1
- cmovl r5d, hd
+ cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vbroadcasti128 m1, [base+z_filter_s+12]
@@ -2501,7 +2501,7 @@
.w8_filter_left_h16:
mov r5d, 10
cmp hd, 16
- cmovl r5d, hd
+ cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0
@@ -2742,7 +2742,7 @@
.w16_filter_left_h16:
mov r5d, 10
cmp hd, 16
- cmovl r5d, hd
+ cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0
@@ -3115,7 +3115,7 @@
mov r4d, 9
lea tlq, [rsp+15]
cmp wd, 4
- cmova maxbased, r4d
+ cmovne maxbased, r4d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [rsp], xm0
@@ -3321,8 +3321,8 @@
sar r5d, 1
lea tlq, [rsp+31]
add r5d, 17
- cmp wd, 8
- cmova maxbased, r5d
+ cmp wd, 16
+ cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
vextracti128 xm1, m0, 1
@@ -3385,7 +3385,7 @@
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
- cmovg dstq, r6
+ cmovns dstq, r6
punpcklwd xm1, xm2, xm0
punpckhwd xm2, xm0
lea r6, [dstq+strideq*4]
@@ -3493,8 +3493,8 @@
sar r5d, 1
lea tlq, [rsp+63]
add r5d, 33
- cmp wd, 16
- cmova maxbased, r5d
+ cmp wd, 32
+ cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
packuswb m0, m1
@@ -3563,7 +3563,7 @@
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
- cmovg dstq, r6
+ cmovns dstq, r6
punpcklbw m1, m2, m0
punpckhbw m2, m0
lea r3, [strideq*5]
@@ -3652,7 +3652,7 @@
movu xm11, [tlq-66] ; 56-63
vinserti128 m11, [tlq-52], 1 ; 40-47
sub r4d, wd ; 21-w
- cmovg r5d, r4d
+ cmovns r5d, r4d
movu xm12, [tlq-58] ; 48-55
vinserti128 m12, [tlq-44], 1 ; 32-39
sub r4d, 8 ; 13-w
@@ -3721,8 +3721,8 @@
lea tlq, [rsp+95]
mov [tlq-65], r4b
mov r4d, 65
- cmp wd, 32
- cmova maxbased, r4d
+ cmp wd, 64
+ cmove maxbased, r4d
packuswb m0, m2
packuswb m1, m6
mova [tlq-63], m0
@@ -4553,7 +4553,7 @@
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
- cmovz r6d, r2d
+ cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -4587,8 +4587,8 @@
cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
mov r5d, 4
mov tx2d, 2
- cmp eobd, 106
- cmovg tx2d, r5d
+ cmp eobd, 107
+ cmovns tx2d, r5d
mov r3d, tx2d
%if ARCH_X86_32
LEA r5, $$
@@ -4617,8 +4617,8 @@
cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
mov r5d, 4
mov tx2d, 2
- cmp eobd, 106
- cmovg tx2d, r5d
+ cmp eobd, 107
+ cmovns tx2d, r5d
mov r3d, tx2d
%if ARCH_X86_32
LEA r5, $$
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -2072,8 +2072,8 @@
WIN64_SPILL_XMM 16
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
tzcnt r6d, wd
movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
vpbroadcastd m7, [pw_512]
@@ -2293,8 +2293,8 @@
vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
lea ss3q, [ssq*3]
sub srcq, ss3q
@@ -2434,8 +2434,8 @@
vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
lea ss3q, [ssq*3]
sub srcq, ss3q
@@ -5136,9 +5136,9 @@
xor r12d, r12d
lea r10, [ihq-1]
cmp yq, ihq
- cmovl r10, yq
+ cmovs r10, yq
test yq, yq
- cmovl r10, r12
+ cmovs r10, r12
imul r10, sstrideq
add srcq, r10
@@ -5145,9 +5145,9 @@
; ref += iclip(x, 0, iw - 1)
lea r10, [iwq-1]
cmp xq, iwq
- cmovl r10, xq
+ cmovs r10, xq
test xq, xq
- cmovl r10, r12
+ cmovs r10, r12
add srcq, r10
; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
@@ -5154,7 +5154,7 @@
lea bottomextq, [yq+bhq]
sub bottomextq, ihq
lea r3, [bhq-1]
- cmovl bottomextq, r12
+ cmovs bottomextq, r12
DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
bottomext, rightext
@@ -5161,9 +5161,9 @@
; top_ext = iclip(-y, 0, bh - 1)
neg topextq
- cmovl topextq, r12
+ cmovs topextq, r12
cmp bottomextq, bhq
- cmovge bottomextq, r3
+ cmovns bottomextq, r3
cmp topextq, bhq
cmovg topextq, r3
@@ -5171,7 +5171,7 @@
lea rightextq, [xq+bwq]
sub rightextq, iwq
lea r2, [bwq-1]
- cmovl rightextq, r12
+ cmovs rightextq, r12
DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
bottomext, rightext
@@ -5178,11 +5178,11 @@
; left_ext = iclip(-x, 0, bw - 1)
neg leftextq
- cmovl leftextq, r12
+ cmovs leftextq, r12
cmp rightextq, bwq
- cmovge rightextq, r2
+ cmovns rightextq, r2
cmp leftextq, bwq
- cmovge leftextq, r2
+ cmovns leftextq, r2
DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
dst, dstride, src, sstride, bottomext, rightext
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -1564,8 +1564,8 @@
%if ARCH_X86_32
movzx mxd, ssb
shr ssd, 16
- cmp hd, 4
- cmovle ssd, mxd
+ cmp hd, 6
+ cmovs ssd, mxd
lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
%assign stack_offset org_stack_offset
@@ -1572,8 +1572,8 @@
WIN64_SPILL_XMM 16
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
tzcnt r6d, wd
@@ -1860,8 +1860,8 @@
%if ARCH_X86_32
movzx mxd, ssb
shr ssd, 16
- cmp hd, 4
- cmovle ssd, mxd
+ cmp hd, 6
+ cmovs ssd, mxd
movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
W32_RESTORE_SSQ
lea r6, [ssq*3]
@@ -1890,8 +1890,8 @@
%else
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
ALLOC_STACK mmsize*14, 14
lea ss3q, [ssq*3]
@@ -2206,8 +2206,8 @@
movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
movzx mxd, ssb
shr ssd, 16
- cmp hd, 4
- cmovle ssd, mxd
+ cmp hd, 6
+ cmovs ssd, mxd
movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
mov ssq, ssmp
ALLOC_STACK -mmsize*13
@@ -2247,8 +2247,8 @@
movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
@@ -2646,8 +2646,8 @@
movzx mxd, myb
%endif
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
mova m2, [base+pw_512]
psrlw m2, m2, 1 ; 0x0100
@@ -2859,8 +2859,8 @@
mov mxd, myd
shr myd, 16
and mxd, 0x7f
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
mov r5, r2; use as new base
%define base_reg r5
@@ -2889,8 +2889,8 @@
%else
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
ALLOC_STACK mmsize*14, 14
lea stride3q, [strideq*3]
@@ -3108,8 +3108,8 @@
mov mxd, myd
shr myd, 16
and mxd, 0x7f
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < mmsize
@@ -3154,8 +3154,8 @@
movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
@@ -4747,9 +4747,9 @@
xor reg_zero, reg_zero
lea reg_tmp, [ihq-1]
cmp yq, ihq
- cmovl reg_tmp, yq
+ cmovs reg_tmp, yq
test yq, yq
- cmovl reg_tmp, reg_zero
+ cmovs reg_tmp, reg_zero
%if ARCH_X86_64
imul reg_tmp, sstrideq
add srcq, reg_tmp
@@ -4762,9 +4762,9 @@
; ref += iclip(x, 0, iw - 1)
lea reg_tmp, [iwq-1]
cmp xq, iwq
- cmovl reg_tmp, xq
+ cmovs reg_tmp, xq
test xq, xq
- cmovl reg_tmp, reg_zero
+ cmovs reg_tmp, reg_zero
add reg_src, reg_tmp
%if ARCH_X86_32
mov srcm, reg_src
@@ -4777,7 +4777,7 @@
lea reg_bottomext, [yq+bhq]
sub reg_bottomext, ihq
lea r3, [bhq-1]
- cmovl reg_bottomext, reg_zero
+ cmovs reg_bottomext, reg_zero
;
DEFINE_ARGS bw, bh, iw, ih, x, \
@@ -4786,9 +4786,9 @@
; top_ext = iclip(-y, 0, bh - 1)
neg topextq
- cmovl topextq, reg_zero
+ cmovs topextq, reg_zero
cmp reg_bottomext, bhq
- cmovge reg_bottomext, r3
+ cmovns reg_bottomext, r3
cmp topextq, bhq
cmovg topextq, r3
%if ARCH_X86_32
@@ -4800,7 +4800,7 @@
lea reg_rightext, [xq+bwq]
sub reg_rightext, iwq
lea r2, [bwq-1]
- cmovl reg_rightext, reg_zero
+ cmovs reg_rightext, reg_zero
DEFINE_ARGS bw, bh, iw, ih, leftext, \
topext, dst, dstride, src, sstride, \
@@ -4808,14 +4808,14 @@
; left_ext = iclip(-x, 0, bw - 1)
neg leftextq
- cmovl leftextq, reg_zero
+ cmovs leftextq, reg_zero
cmp reg_rightext, bwq
- cmovge reg_rightext, r2
+ cmovns reg_rightext, r2
%if ARCH_X86_32
mov r3m, r1
%endif
cmp leftextq, bwq
- cmovge leftextq, r2
+ cmovns leftextq, r2
%undef reg_zero
%undef reg_tmp