shithub: dav1d

Download patch

ref: 5895809ebbe1db20f753342eabaa9af0bb07dd47
parent: be1fe18e2f2e6802bf696bc5cbddfd24fb2bc748
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Jun 29 11:48:46 EDT 2020

x86: Add minor mc 8-tap optimizations

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -1985,8 +1985,12 @@
 %macro FN 4 ; fn, type, type_h, type_v
 cglobal %1_%2
     mov                 t0d, FILTER_%3
+%ifidn %3, %4 ; type_h and type_v name the same filter
+    mov                 t1d, t0d ; copy t0d rather than loading the identical constant a second time
+%else
     mov                 t1d, FILTER_%4
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+%endif
+%ifnidn %2, regular ; no jmp for regular: it is instantiated last, so execution simply continues into the code assembled next
     jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
 %endif
 %endmacro
@@ -1999,15 +2003,15 @@
 
 %define PUT_8TAP_FN FN put_8tap,
 
-PUT_8TAP_FN regular,        REGULAR, REGULAR
-PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
 PUT_8TAP_FN sharp,          SHARP,   SHARP
 PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular,        REGULAR, REGULAR ; must stay last: FN emits no jmp for regular, so it runs straight into put_8tap below
 
 cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
     imul                mxd, mxm, 0x010101
@@ -2730,30 +2734,24 @@
     mova             [tmpq], ym3
 %endmacro
 
-%macro PREP_8TAP_FN 3 ; type, type_h, type_v
-cglobal prep_8tap_%1
-    mov                 t0d, FILTER_%2
-    mov                 t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
-%endif
-%endmacro
-
 %macro PREP_8TAP 0
  %if WIN64
   DECLARE_REG_TMP 6, 4
  %else
   DECLARE_REG_TMP 6, 7
- %endif
-PREP_8TAP_FN regular,        REGULAR, REGULAR
-PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
 PREP_8TAP_FN sharp,          SHARP,   SHARP
 PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular,        REGULAR, REGULAR ; must stay last: FN emits no jmp for regular, so it runs straight into prep_8tap below
 
 cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
     imul                mxd, mxm, 0x010101
@@ -5722,11 +5720,9 @@
 %macro BILIN_SCALED_FN 1
 cglobal %1_bilin_scaled
     mov                 t0d, (5*15 << 16) | 5*15
-    mov                 t1d, (5*15 << 16) | 5*15
+    mov                 t1d, t0d ; same value as t0d, so copy the register instead of repeating the immediate
     jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
 %endmacro
-%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
-%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
 
 %if WIN64
 DECLARE_REG_TMP 6, 5
@@ -5733,16 +5729,20 @@
 %else
 DECLARE_REG_TMP 6, 8
 %endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
 BILIN_SCALED_FN put
-PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
-PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
-PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
 PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP
 PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR ; must stay last: FN emits no jmp for regular; presumably MC_8TAP_SCALED put begins with the shared put_8tap_scaled entry (confirm)
 MC_8TAP_SCALED put
 
 %if WIN64
@@ -5750,16 +5750,17 @@
 %else
 DECLARE_REG_TMP 6, 7
 %endif
+
 BILIN_SCALED_FN prep
-PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
-PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
-PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
-PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
-PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
-PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
 PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP
 PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR ; must stay last: FN emits no jmp for regular; presumably MC_8TAP_SCALED prep begins with the shared prep_8tap_scaled entry (confirm)
 MC_8TAP_SCALED prep
 
 %macro WARP_V 5 ; dst, 02, 46, 13, 57
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -1464,6 +1464,19 @@
 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
 %assign FILTER_SHARP   (2*15 << 16) | 3*15
 
+%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v -- emits the per-filter entry stub named prefix_8tap_type
+cglobal %1_8tap_%2
+    mov                 t0d, FILTER_%3 ; t0d = FILTER_ constant selected by type_h
+%ifidn %3, %4 ; type_h and type_v name the same filter
+    mov                 t1d, t0d ; copy t0d rather than loading the identical constant a second time
+%else
+    mov                 t1d, FILTER_%4 ; t1d = FILTER_ constant selected by type_v
+%endif
+%ifnidn %2, regular ; no jmp for regular: it is instantiated last, so execution simply continues into the code assembled next
+    jmp mangle(private_prefix %+ _%1_8tap %+ SUFFIX) ; every other stub jumps to the shared prefix_8tap function
+%endif
+%endmacro
+
 %if ARCH_X86_32
 DECLARE_REG_TMP 1, 2
 %elif WIN64
@@ -1472,25 +1485,16 @@
 DECLARE_REG_TMP 7, 8
 %endif
 
-%macro PUT_8TAP_FN 3 ; type, type_h, type_v
-cglobal put_8tap_%1
-    mov                 t0d, FILTER_%2
-    mov                 t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
-%endif
-%endmacro
+MC_8TAP_FN put, sharp,          SHARP,   SHARP
+MC_8TAP_FN put, sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_FN put, smooth_sharp,   SMOOTH,  SHARP
+MC_8TAP_FN put, smooth,         SMOOTH,  SMOOTH
+MC_8TAP_FN put, sharp_regular,  SHARP,   REGULAR
+MC_8TAP_FN put, regular_sharp,  REGULAR, SHARP
+MC_8TAP_FN put, smooth_regular, SMOOTH,  REGULAR
+MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN put, regular,        REGULAR, REGULAR ; must stay last: MC_8TAP_FN emits no jmp for regular; the shared put_8tap code is presumably assembled next (not shown in this hunk)
 
-PUT_8TAP_FN regular,        REGULAR, REGULAR
-PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
-PUT_8TAP_FN sharp,          SHARP,   SHARP
-PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
-
 %if ARCH_X86_32
  %define base_reg r1
  %define base base_reg-put_ssse3
@@ -2764,15 +2768,6 @@
     PHADDW               %1, m1, %3, 1
 %endmacro
 
-%macro PREP_8TAP_FN 3 ; type, type_h, type_v
-cglobal prep_8tap_%1
-    mov                 t0d, FILTER_%2
-    mov                 t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
-%endif
-%endmacro
-
 %macro PREP_8TAP 0
 %if ARCH_X86_32
  DECLARE_REG_TMP 1, 2
@@ -2781,15 +2776,16 @@
 %else
  DECLARE_REG_TMP 6, 7
 %endif
-PREP_8TAP_FN regular,        REGULAR, REGULAR
-PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
-PREP_8TAP_FN sharp,          SHARP,   SHARP
-PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+MC_8TAP_FN prep, sharp,          SHARP,   SHARP
+MC_8TAP_FN prep, sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_FN prep, smooth_sharp,   SMOOTH,  SHARP
+MC_8TAP_FN prep, smooth,         SMOOTH,  SMOOTH
+MC_8TAP_FN prep, sharp_regular,  SHARP,   REGULAR
+MC_8TAP_FN prep, regular_sharp,  REGULAR, SHARP
+MC_8TAP_FN prep, smooth_regular, SMOOTH,  REGULAR
+MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN prep, regular,        REGULAR, REGULAR ; must stay last: MC_8TAP_FN emits no jmp for regular; the shared prep_8tap code is presumably assembled next (not shown in this hunk)
 
 %if ARCH_X86_32
  %define base_reg r2