shithub: dav1d

Download patch

ref: b8399319939f8721640e3835e9b9e0f5f9bb913d
parent: d21dc801529a4aeeaad0d7da4bd1f8e675cba269
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Feb 3 18:56:12 EST 2020

x86: Avoid cmov instructions that depend on multiple flags

On many AMD CPUs, cmov instructions that depend on multiple flags
require an additional µop, so prefer cmov variants that depend on
only a single flag where possible.

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -374,9 +374,9 @@
     sub       dampingd, 31
     xor          zerod, zerod
     add        pridmpd, dampingd
-    cmovl      pridmpd, zerod
+    cmovs      pridmpd, zerod
     add        secdmpd, dampingd
-    cmovl      secdmpd, zerod
+    cmovs      secdmpd, zerod
     mov        [rsp+0], pridmpq                 ; pri_shift
     mov        [rsp+8], secdmpq                 ; sec_shift
 
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -623,9 +623,9 @@
     sub        secdmpd, dampingd
     xor       dampingd, dampingd
     neg        pridmpd
-    cmovl      pridmpd, dampingd
+    cmovs      pridmpd, dampingd
     neg        secdmpd
-    cmovl      secdmpd, dampingd
+    cmovs      secdmpd, dampingd
  %if ARCH_X86_64
     mov       [rsp+ 0], pridmpq                 ; pri_shift
     mov       [rsp+16], secdmpq                 ; sec_shift
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -171,9 +171,9 @@
     movsx         val0d, byte [bufq+xq]
     add           val3d, val0d
     cmp           val3d, maxd
-    cmovg         val3d, maxd
+    cmovns        val3d, maxd
     cmp           val3d, mind
-    cmovl         val3d, mind
+    cmovs         val3d, mind
     mov  byte [bufq+xq], val3b
     ; keep val3d in-place as left for next x iteration
     inc              xq
@@ -585,9 +585,9 @@
     movsx         val0d, byte [bufq+xq]
     add           val3d, val0d
     cmp           val3d, maxd
-    cmovg         val3d, maxd
+    cmovns        val3d, maxd
     cmp           val3d, mind
-    cmovl         val3d, mind
+    cmovs         val3d, mind
     mov  byte [bufq+xq], val3b
     ; keep val3d in-place as left for next x iteration
     inc              xq
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -222,9 +222,9 @@
     movsx         val0d, byte [bufq+xq]
     add           val3d, val0d
     cmp           val3d, maxd
-    cmovg         val3d, maxd
+    cmovns        val3d, maxd
     cmp           val3d, mind
-    cmovl         val3d, mind
+    cmovs         val3d, mind
     mov  byte [bufq+xq], val3b
     ; keep val3d in-place as left for next x iteration
     inc              xq
@@ -778,9 +778,9 @@
     movsx         val0d, byte [bufq+xq]
     add           val3d, val0d
     cmp           val3d, maxd
-    cmovg         val3d, maxd
+    cmovns        val3d, maxd
     cmp           val3d, mind
-    cmovl         val3d, mind
+    cmovs         val3d, mind
     mov  byte [bufq+xq], val3b
     ; keep val3d in-place as left for next x iteration
     inc              xq
--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -308,7 +308,7 @@
     mov                 r6d, 0x5556
     mov                 r2d, 0x3334
     cmp                  hd, 32
-    cmovz               r6d, r2d
+    cmove               r6d, r2d
     movd                xm1, r6d
     pmulhuw             xm0, xm1
 .w8_end:
@@ -1441,7 +1441,7 @@
     mov                 r3d, 9
     mov                 tlq, rsp
     cmp                  hd, 4
-    cmova          maxbased, r3d
+    cmovne         maxbased, r3d
     vextracti128        xm1, m0, 1
     packuswb            xm0, xm1
     mova              [tlq], xm0
@@ -1628,8 +1628,8 @@
     sar                 r5d, 1
     mov                 tlq, rsp
     add                 r5d, 17 ; w*2 + (filter_strength == 3)
-    cmp                  hd, 8
-    cmova          maxbased, r5d
+    cmp                  hd, 16
+    cmovns         maxbased, r5d
     mov            [tlq+r5], r3b
     vextracti128        xm0, m1, 1
     packuswb            xm0, xm1
@@ -1745,8 +1745,8 @@
     sar                 r5d, 1
     mov                 tlq, rsp
     add                 r5d, 33
-    cmp                  hd, 16
-    cmova          maxbased, r5d
+    cmp                  hd, 32
+    cmovns         maxbased, r5d
     mov            [tlq+r5], r3b
     packuswb             m0, m1
     vpermq               m0, m0, q3120
@@ -1812,7 +1812,7 @@
     lea                 r3d, [hq+31]
     mov            maxbased, 63
     cmp                  hd, 32
-    cmovb          maxbased, r3d
+    cmovs          maxbased, r3d
     test             angled, 0x400 ; !enable_intra_edge_filter
     jnz .w32_main
     vbroadcasti128       m0, [pb_0to15]
@@ -1889,8 +1889,8 @@
     mov                 tlq, rsp
     mov            [tlq+65], r3b
     mov                 r3d, 65
-    cmp                  hd, 32
-    cmova          maxbased, r3d
+    cmp                  hd, 64
+    cmove          maxbased, r3d
     packuswb             m0, m2
     packuswb             m1, m6
     mova           [tlq+ 0], m0
@@ -2294,7 +2294,7 @@
     cmp                  hd, 16
     movu                xm2, [rsp+49]
     vinserti128          m2, [rsp+43], 1
-    cmovl               r5d, hd
+    cmovs               r5d, hd
     xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
     movd                xm0, r5d
     vbroadcasti128       m1, [base+z_filter_s+12]
@@ -2501,7 +2501,7 @@
 .w8_filter_left_h16:
     mov                 r5d, 10
     cmp                  hd, 16
-    cmovl               r5d, hd
+    cmovs               r5d, hd
     xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
     movd                xm0, r5d
     vpbroadcastb         m0, xm0
@@ -2742,7 +2742,7 @@
 .w16_filter_left_h16:
     mov                 r5d, 10
     cmp                  hd, 16
-    cmovl               r5d, hd
+    cmovs               r5d, hd
     xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
     movd                xm0, r5d
     vpbroadcastb         m0, xm0
@@ -3115,7 +3115,7 @@
     mov                 r4d, 9
     lea                 tlq, [rsp+15]
     cmp                  wd, 4
-    cmova          maxbased, r4d
+    cmovne         maxbased, r4d
     vextracti128        xm1, m0, 1
     packuswb            xm0, xm1
     mova              [rsp], xm0
@@ -3321,8 +3321,8 @@
     sar                 r5d, 1
     lea                 tlq, [rsp+31]
     add                 r5d, 17
-    cmp                  wd, 8
-    cmova          maxbased, r5d
+    cmp                  wd, 16
+    cmovns         maxbased, r5d
     neg                  r5
     mov            [tlq+r5], r4b
     vextracti128        xm1, m0, 1
@@ -3385,7 +3385,7 @@
     sub              org_wd, 8
     lea                  r2, [strideq*3]
     lea                  r6, [dstq+org_wq]
-    cmovg              dstq, r6
+    cmovns             dstq, r6
     punpcklwd           xm1, xm2, xm0
     punpckhwd           xm2, xm0
     lea                  r6, [dstq+strideq*4]
@@ -3493,8 +3493,8 @@
     sar                 r5d, 1
     lea                 tlq, [rsp+63]
     add                 r5d, 33
-    cmp                  wd, 16
-    cmova          maxbased, r5d
+    cmp                  wd, 32
+    cmovns         maxbased, r5d
     neg                  r5
     mov            [tlq+r5], r4b
     packuswb             m0, m1
@@ -3563,7 +3563,7 @@
     sub              org_wd, 8
     lea                  r2, [strideq*3]
     lea                  r6, [dstq+org_wq]
-    cmovg              dstq, r6
+    cmovns             dstq, r6
     punpcklbw            m1, m2, m0
     punpckhbw            m2, m0
     lea                  r3, [strideq*5]
@@ -3652,7 +3652,7 @@
     movu               xm11, [tlq-66]    ; 56-63
     vinserti128         m11, [tlq-52], 1 ; 40-47
     sub                 r4d, wd ; 21-w
-    cmovg               r5d, r4d
+    cmovns              r5d, r4d
     movu               xm12, [tlq-58]    ; 48-55
     vinserti128         m12, [tlq-44], 1 ; 32-39
     sub                 r4d, 8 ; 13-w
@@ -3721,8 +3721,8 @@
     lea                 tlq, [rsp+95]
     mov            [tlq-65], r4b
     mov                 r4d, 65
-    cmp                  wd, 32
-    cmova          maxbased, r4d
+    cmp                  wd, 64
+    cmove          maxbased, r4d
     packuswb             m0, m2
     packuswb             m1, m6
     mova           [tlq-63], m0
@@ -4553,7 +4553,7 @@
     mov                 r6d, 0x5556
     mov                 r2d, 0x3334
     cmp                  hd, 32
-    cmovz               r6d, r2d
+    cmove               r6d, r2d
     movd                xm1, r6d
     pmulhuw             xm0, xm1
 .w8_end:
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -4587,8 +4587,8 @@
 cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
     mov                    r5d, 4
     mov                   tx2d, 2
-    cmp                   eobd, 106
-    cmovg                 tx2d, r5d
+    cmp                   eobd, 107
+    cmovns                tx2d, r5d
     mov                    r3d, tx2d
 %if ARCH_X86_32
     LEA                     r5, $$
@@ -4617,8 +4617,8 @@
 cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
     mov                    r5d, 4
     mov                   tx2d, 2
-    cmp                   eobd, 106
-    cmovg                 tx2d, r5d
+    cmp                   eobd, 107
+    cmovns                tx2d, r5d
     mov                    r3d, tx2d
 %if ARCH_X86_32
     LEA                     r5, $$
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -2072,8 +2072,8 @@
     WIN64_SPILL_XMM      16
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     tzcnt               r6d, wd
     movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
     vpbroadcastd         m7, [pw_512]
@@ -2293,8 +2293,8 @@
     vpbroadcastd         m7, [r8+mxq*8+subpel_filters-put_avx2+2]
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
     lea                ss3q, [ssq*3]
     sub                srcq, ss3q
@@ -2434,8 +2434,8 @@
     vpbroadcastd        m11, [r8+mxq*8+subpel_filters-put_avx2+4]
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
     lea                ss3q, [ssq*3]
     sub                srcq, ss3q
@@ -5136,9 +5136,9 @@
     xor                r12d, r12d
     lea                 r10, [ihq-1]
     cmp                  yq, ihq
-    cmovl               r10, yq
+    cmovs               r10, yq
     test                 yq, yq
-    cmovl               r10, r12
+    cmovs               r10, r12
     imul                r10, sstrideq
     add                srcq, r10
 
@@ -5145,9 +5145,9 @@
     ; ref += iclip(x, 0, iw - 1)
     lea                 r10, [iwq-1]
     cmp                  xq, iwq
-    cmovl               r10, xq
+    cmovs               r10, xq
     test                 xq, xq
-    cmovl               r10, r12
+    cmovs               r10, r12
     add                srcq, r10
 
     ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
@@ -5154,7 +5154,7 @@
     lea          bottomextq, [yq+bhq]
     sub          bottomextq, ihq
     lea                  r3, [bhq-1]
-    cmovl        bottomextq, r12
+    cmovs        bottomextq, r12
 
     DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                 bottomext, rightext
@@ -5161,9 +5161,9 @@
 
     ; top_ext = iclip(-y, 0, bh - 1)
     neg             topextq
-    cmovl           topextq, r12
+    cmovs           topextq, r12
     cmp          bottomextq, bhq
-    cmovge       bottomextq, r3
+    cmovns       bottomextq, r3
     cmp             topextq, bhq
     cmovg           topextq, r3
 
@@ -5171,7 +5171,7 @@
     lea           rightextq, [xq+bwq]
     sub           rightextq, iwq
     lea                  r2, [bwq-1]
-    cmovl         rightextq, r12
+    cmovs         rightextq, r12
 
     DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                 bottomext, rightext
@@ -5178,11 +5178,11 @@
 
     ; left_ext = iclip(-x, 0, bw - 1)
     neg            leftextq
-    cmovl          leftextq, r12
+    cmovs          leftextq, r12
     cmp           rightextq, bwq
-    cmovge        rightextq, r2
+    cmovns        rightextq, r2
     cmp            leftextq, bwq
-    cmovge         leftextq, r2
+    cmovns         leftextq, r2
 
     DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                 dst, dstride, src, sstride, bottomext, rightext
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -1564,8 +1564,8 @@
 %if ARCH_X86_32
     movzx               mxd, ssb
     shr                 ssd, 16
-    cmp                  hd, 4
-    cmovle              ssd, mxd
+    cmp                  hd, 6
+    cmovs               ssd, mxd
     lea                 ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
 %else
  %assign stack_offset org_stack_offset
@@ -1572,8 +1572,8 @@
     WIN64_SPILL_XMM      16
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     lea                 myq, [base_reg+myq*8+subpel_filters-put_ssse3]
 %endif
     tzcnt               r6d, wd
@@ -1860,8 +1860,8 @@
 %if ARCH_X86_32
     movzx               mxd, ssb
     shr                 ssd, 16
-    cmp                  hd, 4
-    cmovle              ssd, mxd
+    cmp                  hd, 6
+    cmovs               ssd, mxd
     movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
     W32_RESTORE_SSQ
     lea                  r6, [ssq*3]
@@ -1890,8 +1890,8 @@
 %else
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
     ALLOC_STACK   mmsize*14, 14
     lea                ss3q, [ssq*3]
@@ -2206,8 +2206,8 @@
     movq                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
     movzx               mxd, ssb
     shr                 ssd, 16
-    cmp                  hd, 4
-    cmovle              ssd, mxd
+    cmp                  hd, 6
+    cmovs               ssd, mxd
     movq                 m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
     mov                 ssq, ssmp
     ALLOC_STACK  -mmsize*13
@@ -2247,8 +2247,8 @@
     movq                 m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
@@ -2646,8 +2646,8 @@
     movzx               mxd, myb
 %endif
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     lea                 myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
     mova                 m2, [base+pw_512]
     psrlw                m2, m2, 1 ; 0x0100
@@ -2859,8 +2859,8 @@
     mov                 mxd, myd
     shr                 myd, 16
     and                 mxd, 0x7f
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
     mov                  r5, r2; use as new base
  %define           base_reg  r5
@@ -2889,8 +2889,8 @@
 %else
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
     ALLOC_STACK   mmsize*14, 14
     lea            stride3q, [strideq*3]
@@ -3108,8 +3108,8 @@
     mov                 mxd, myd
     shr                 myd, 16
     and                 mxd, 0x7f
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
     ALLOC_STACK  -mmsize*13
 %if STACK_ALIGNMENT < mmsize
@@ -3154,8 +3154,8 @@
     movq                 m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
     movzx               mxd, myb
     shr                 myd, 16
-    cmp                  hd, 4
-    cmovle              myd, mxd
+    cmp                  hd, 6
+    cmovs               myd, mxd
     movq                 m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
@@ -4747,9 +4747,9 @@
     xor            reg_zero, reg_zero
     lea             reg_tmp, [ihq-1]
     cmp                  yq, ihq
-    cmovl           reg_tmp, yq
+    cmovs           reg_tmp, yq
     test                 yq, yq
-    cmovl           reg_tmp, reg_zero
+    cmovs           reg_tmp, reg_zero
 %if ARCH_X86_64
     imul            reg_tmp, sstrideq
     add                srcq, reg_tmp
@@ -4762,9 +4762,9 @@
     ; ref += iclip(x, 0, iw - 1)
     lea             reg_tmp, [iwq-1]
     cmp                  xq, iwq
-    cmovl           reg_tmp, xq
+    cmovs           reg_tmp, xq
     test                 xq, xq
-    cmovl           reg_tmp, reg_zero
+    cmovs           reg_tmp, reg_zero
     add             reg_src, reg_tmp
 %if ARCH_X86_32
     mov                srcm, reg_src
@@ -4777,7 +4777,7 @@
     lea       reg_bottomext, [yq+bhq]
     sub       reg_bottomext, ihq
     lea                  r3, [bhq-1]
-    cmovl     reg_bottomext, reg_zero
+    cmovs     reg_bottomext, reg_zero
     ;
 
     DEFINE_ARGS bw, bh, iw, ih, x, \
@@ -4786,9 +4786,9 @@
 
     ; top_ext = iclip(-y, 0, bh - 1)
     neg             topextq
-    cmovl           topextq, reg_zero
+    cmovs           topextq, reg_zero
     cmp       reg_bottomext, bhq
-    cmovge    reg_bottomext, r3
+    cmovns    reg_bottomext, r3
     cmp             topextq, bhq
     cmovg           topextq, r3
  %if ARCH_X86_32
@@ -4800,7 +4800,7 @@
     lea        reg_rightext, [xq+bwq]
     sub        reg_rightext, iwq
     lea                  r2, [bwq-1]
-    cmovl      reg_rightext, reg_zero
+    cmovs      reg_rightext, reg_zero
 
     DEFINE_ARGS bw, bh, iw, ih, leftext, \
                 topext, dst, dstride, src, sstride, \
@@ -4808,14 +4808,14 @@
 
     ; left_ext = iclip(-x, 0, bw - 1)
     neg            leftextq
-    cmovl          leftextq, reg_zero
+    cmovs          leftextq, reg_zero
     cmp        reg_rightext, bwq
-    cmovge     reg_rightext, r2
+    cmovns     reg_rightext, r2
  %if ARCH_X86_32
     mov                 r3m, r1
  %endif
     cmp            leftextq, bwq
-    cmovge         leftextq, r2
+    cmovns         leftextq, r2
 
 %undef reg_zero
 %undef reg_tmp