ref: 5895809ebbe1db20f753342eabaa9af0bb07dd47
parent: be1fe18e2f2e6802bf696bc5cbddfd24fb2bc748
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Jun 29 11:48:46 EDT 2020
x86: Add minor mc 8-tap optimizations
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -1985,8 +1985,12 @@
%macro FN 4 ; fn, type, type_h, type_v
cglobal %1_%2
mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d ; type_h == type_v: reuse the value already loaded into t0d
+%else
mov t1d, FILTER_%4
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter - the regular type must be instantiated last
jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
%endif
%endmacro
@@ -1999,15 +2003,15 @@
%define PUT_8TAP_FN FN put_8tap,
-PUT_8TAP_FN regular, REGULAR, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
PUT_8TAP_FN sharp, SHARP, SHARP
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR ; keep last: FN emits no jmp for regular, so it runs straight into put_8tap below
cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
imul mxd, mxm, 0x010101
@@ -2730,30 +2734,24 @@
mova [tmpq], ym3
%endmacro
-%macro PREP_8TAP_FN 3 ; type, type_h, type_v
-cglobal prep_8tap_%1
- mov t0d, FILTER_%2
- mov t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
-%endif
-%endmacro
-
%macro PREP_8TAP 0
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
- %endif
-PREP_8TAP_FN regular, REGULAR, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
PREP_8TAP_FN sharp, SHARP, SHARP
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR ; keep last: FN emits no jmp for regular, so it runs straight into prep_8tap below
cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
imul mxd, mxm, 0x010101
@@ -5722,11 +5720,9 @@
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled
mov t0d, (5*15 << 16) | 5*15
- mov t1d, (5*15 << 16) | 5*15
+ mov t1d, t0d ; t1d takes the same constant as t0d, so copy the register
jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
%endmacro
-%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
-%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
%if WIN64
DECLARE_REG_TMP 6, 5
@@ -5733,16 +5729,20 @@
%else
DECLARE_REG_TMP 6, 8
%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
BILIN_SCALED_FN put
-PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR ; keep last: no jmp is emitted for regular; presumably MC_8TAP_SCALED put starts with the put_8tap_scaled body - confirm
MC_8TAP_SCALED put
%if WIN64
@@ -5750,16 +5750,17 @@
%else
DECLARE_REG_TMP 6, 7
%endif
+
BILIN_SCALED_FN prep
-PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
-PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR ; keep last: no jmp is emitted for regular; presumably MC_8TAP_SCALED prep starts with the prep_8tap_scaled body - confirm
MC_8TAP_SCALED prep
%macro WARP_V 5 ; dst, 02, 46, 13, 57
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -1464,6 +1464,19 @@
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
+%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
+cglobal %1_8tap_%2
+ mov t0d, FILTER_%3 ; t0d = constant for type_h
+%ifidn %3, %4
+ mov t1d, t0d ; type_h == type_v: reuse the value already loaded into t0d
+%else
+ mov t1d, FILTER_%4 ; t1d = constant for type_v
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter - the regular type must be instantiated last
+ jmp mangle(private_prefix %+ _%1_8tap %+ SUFFIX) ; every other type jumps to the shared <prefix>_8tap entry
+%endif
+%endmacro
+
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
%elif WIN64
@@ -1472,25 +1485,16 @@
DECLARE_REG_TMP 7, 8
%endif
-%macro PUT_8TAP_FN 3 ; type, type_h, type_v
-cglobal put_8tap_%1
- mov t0d, FILTER_%2
- mov t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
-%endif
-%endmacro
+MC_8TAP_FN put, sharp, SHARP, SHARP
+MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN put, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN put, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN put, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN put, regular, REGULAR, REGULAR ; keep last: no jmp is emitted for regular; the shared put_8tap body is expected to follow - confirm
-PUT_8TAP_FN regular, REGULAR, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-
%if ARCH_X86_32
%define base_reg r1
%define base base_reg-put_ssse3
@@ -2764,15 +2768,6 @@
PHADDW %1, m1, %3, 1
%endmacro
-%macro PREP_8TAP_FN 3 ; type, type_h, type_v
-cglobal prep_8tap_%1
- mov t0d, FILTER_%2
- mov t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
-%endif
-%endmacro
-
%macro PREP_8TAP 0
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
@@ -2781,15 +2776,16 @@
%else
DECLARE_REG_TMP 6, 7
%endif
-PREP_8TAP_FN regular, REGULAR, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+
+MC_8TAP_FN prep, sharp, SHARP, SHARP
+MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN prep, regular, REGULAR, REGULAR ; keep last: no jmp is emitted for regular; the shared prep_8tap body is expected to follow - confirm
%if ARCH_X86_32
%define base_reg r2