ref: a63e13eecdf495b6b4880a4dc4b03715e6640a55
parent: 4dd64f06beec5301c02efce225e9d8ffe66c47ff
author: Sindre Aamås <saamas@cisco.com>
date: Tue Mar 7 09:15:23 EST 2017
[Common/x86] Simplify mc_luma X86_32_PICASM handling. Utilize program counter-relative offsets to simplify X86_32_PICASM code. In order for this to work with nasm, data constants are placed in the text segment.
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -44,7 +44,11 @@
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
SECTION .rodata align=32
+%endif
;*******************************************************************************
; Various memory constants (trigonometric values or rounding values)
@@ -120,12 +124,6 @@
psllw %1, 4
%endmacro
-%macro MOVEIMM_DW32 1
- pcmpeqw %1, %1
- psrlw %1, 15
- psllw %1, 5
-%endmacro
-
%endif
;*******************************************************************************
@@ -197,12 +195,7 @@
%macro FILTER_HV_W8 9
paddw %1, %6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %8
- paddw %1, %8
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
movdqa %8, %3
movdqa %7, %2
paddw %8, %4
@@ -221,12 +214,7 @@
%macro FILTER_HV_W4 9
paddw %1, %6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 %8
-paddw %1, %8
-%else
-paddw %1, [h264_w0x10_1]
-%endif
+paddw %1, [pic(h264_w0x10_1)]
movdqa %8, %3
movdqa %7, %2
paddw %8, %4
@@ -457,6 +445,7 @@
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -530,6 +519,7 @@
.xx_exit:
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -550,6 +540,7 @@
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -671,6 +662,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -684,6 +676,7 @@
;***********************************************************************
WELS_EXTERN McHorVer02Height5_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -805,6 +798,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -819,6 +813,7 @@
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -855,12 +850,7 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 xmm6
- paddw xmm0, xmm6
-%else
- paddw xmm0, [h264_w0x10_1]
-%endif
+ paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
@@ -877,11 +867,7 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
-%ifdef X86_32_PICASM
- paddw xmm2, xmm6
-%else
- paddw xmm2, [h264_w0x10_1]
-%endif
+ paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+1], xmm2
@@ -892,6 +878,7 @@
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
@@ -918,12 +905,7 @@
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 xmm6
- paddw xmm0, xmm6
-%else
- paddw xmm0, [h264_w0x10_1]
-%endif
+ paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movq [r2], xmm0
@@ -951,12 +933,7 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 xmm6
- paddw xmm0, xmm6
-%else
- paddw xmm0, [h264_w0x10_1]
-%endif
+ paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2+8], xmm0
@@ -974,11 +951,7 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
-%ifdef X86_32_PICASM
- paddw xmm2, xmm6
-%else
- paddw xmm2, [h264_w0x10_1]
-%endif
+ paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+9], xmm2
@@ -988,6 +961,7 @@
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -1002,6 +976,7 @@
;***********************************************************************
WELS_EXTERN McHorVer20Width5_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1035,12 +1010,7 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 xmm6
-paddw xmm0, xmm6
-%else
-paddw xmm0, [h264_w0x10_1]
-%endif
+paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
@@ -1057,11 +1027,7 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-paddw xmm2, xmm6
-%else
-paddw xmm2, [h264_w0x10_1]
-%endif
+paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movd [r2+1], xmm2
@@ -1072,6 +1038,7 @@
jnz .yloop_width_5
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -1238,12 +1205,7 @@
psubw %1, %7
psraw %1, 2
paddw %8, %1
-%ifdef X86_32_PICASM
- MOVEIMM_DW32 %7
- paddw %8, %7
-%else
- paddw %8, [h264_mc_hc_32]
-%endif
+ paddw %8, [pic(h264_mc_hc_32)]
psraw %8, 6
packuswb %8, %8
movq %9, %8
@@ -1260,6 +1222,7 @@
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1377,6 +1340,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -1391,6 +1355,7 @@
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1507,6 +1472,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -1595,12 +1561,7 @@
psubw %1, %7
psraw %1, 2
paddw %8, %1
-%ifdef X86_32_PICASM
-MOVEIMM_DW32 %7
-paddw %8, %7
-%else
-paddw %8, [h264_mc_hc_32]
-%endif
+paddw %8, [pic(h264_mc_hc_32)]
psraw %8, 6
packuswb %8, %8
movd %9, %8
@@ -1619,6 +1580,7 @@
WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1736,6 +1698,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -1751,6 +1714,7 @@
WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1867,6 +1831,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -1879,12 +1844,7 @@
movdqa %7, %3
pmaddubsw %7, %6
paddw %1, %7
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %7
- paddw %1, %7
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
@@ -1901,12 +1861,7 @@
movdqa %7, %4
pmaddubsw %7, %6
paddw %1, %7
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %7
- paddw %1, %7
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
@@ -1916,20 +1871,7 @@
pshufb %1, %2
pshufb %5, %3
pshufd %6, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- pmaddubsw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmaddubsw %1, [db20_128]
-%endif
+ pmaddubsw %1, [pic(db20_128)]
pmaddubsw %5, %4
pmaddubsw %6, %4
paddw %1, %5
@@ -1939,12 +1881,7 @@
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
%macro SSSE3_FilterHorizontal_8px 6
SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %5
- paddw %1, %5
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
@@ -1959,20 +1896,7 @@
pshufb %7, %4
punpcklqdq %6, %7
pshufd %7, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- pmaddubsw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmaddubsw %1, [db20_128]
-%endif
+ pmaddubsw %1, [pic(db20_128)]
pmaddubsw %6, %5
pmaddubsw %7, %5
paddw %1, %6
@@ -1982,31 +1906,13 @@
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
%macro SSSE3_FilterHorizontal_2x4px 7
SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %6
- paddw %1, %6
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
; pixels=%1 -32768>>scale=%2 tmp=%3
%macro SSSE3_FilterHorizontalbw_2px 3
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- pmaddubsw %1, [esp]
- mov esp, r1
- pop r1
-%else
- pmaddubsw %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]
-%endif
+ pmaddubsw %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)]
pmaddwd %1, %2
pshufd %3, %1, 10110001b
paddd %1, %3
@@ -2014,33 +1920,8 @@
; pixels=%1 tmp=%2
%macro SSSE3_FilterHorizontal_2px 2
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- pmaddubsw %1, [esp]
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- pmaddwd %1, [esp]
- pshufd %2, %1, 10110001b
- paddd %1, %2
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- paddd %1, [esp]
- mov esp, r1
- pop r1
-%else
- SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2
- paddd %1, [dd32768_128]
-%endif
+ SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2
+ paddd %1, [pic(dd32768_128)]
%endmacro
; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
@@ -2055,14 +1936,8 @@
paddw %7, %4
paddw %1, %7
psraw %1, 2
-%ifdef X86_32_PICASM
+ paddw %7, [pic(h264_mc_hc_32)]
paddw %1, %7
- MOVEIMM_DW32 %7
- paddw %1, %7
-%else
- paddw %7, [h264_mc_hc_32]
- paddw %1, %7
-%endif
psraw %1, 6
%endmacro
@@ -2080,7 +1955,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -2094,28 +1973,14 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
+ %assign push_num_begin push_num
cmp i_width, 4
jg .width8or16
-%ifdef X86_32_PICASM
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- movdqu xmm6, [esp]
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- movdqu xmm7, [esp]
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
-%endif
movd xmm0, [p_src]
movd xmm4, [p_src + i_srcstride]
punpcklbw xmm0, xmm4
@@ -2134,14 +1999,8 @@
movd xmm3, [p_src]
punpcklbw xmm4, xmm3
punpcklqdq xmm2, xmm4
-%ifdef X86_32_PICASM
- movdqu xmm5, [esp]
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
- add esp, 48
-%else
- movdqa xmm5, [db20_128]
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ movdqa xmm5, [pic(db20_128)]
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm0, xmm0
movd [p_dst], xmm0
psrlq xmm0, 32
@@ -2152,11 +2011,7 @@
movd xmm0, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm0
punpcklqdq xmm3, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm1, xmm1
movd [p_dst], xmm1
psrlq xmm1, 32
@@ -2167,14 +2022,11 @@
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm0, xmm4
jg .width4_height_ge8
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm2, xmm2
movd [p_dst], xmm2
.width4_height_le5_done:
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2186,11 +2038,7 @@
movd xmm1, [p_src]
punpcklbw xmm4, xmm1
punpcklqdq xmm0, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm2, xmm2
movd [p_dst], xmm2
psrlq xmm2, 32
@@ -2201,11 +2049,7 @@
movd xmm2, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm2
punpcklqdq xmm1, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm3, xmm3
movd [p_dst], xmm3
psrlq xmm3, 32
@@ -2215,14 +2059,11 @@
lea p_dst, [p_dst + 2 * i_dststride]
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm2, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm0, xmm0
movd [p_dst], xmm0
.width4_height_ge8_done:
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2231,38 +2072,16 @@
ret
.width8or16:
+ %assign push_num push_num_begin
sub i_height, 1
push i_height
+ %assign push_num push_num + 1
%xdefine i_ycnt i_height
%define i_height [r7]
.xloop:
push p_src
push p_dst
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xfffffff0
- push 0xfb01fb01 ;[esp+64]maddubsw_p1m5_128
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x14141414 ;[esp+48]db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x01fb01fb ;[esp+32]maddubsw_m5p1_128
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x14fb14fb ;[esp+16]maddubsw_m5p20_128
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0xfb14fb14 ;[esp] maddubsw_p20m5_128
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
-%endif
+ %assign push_num push_num + 2
test i_ycnt, 1
jnz .yloop_begin_even
movq xmm0, [p_src]
@@ -2276,11 +2095,7 @@
movq xmm5, [p_src + i_srcstride]
lea p_src, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm5
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm7
-%else
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
-%endif
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7
packuswb xmm0, xmm0
movlps [p_dst], xmm0
add p_dst, i_dststride
@@ -2297,36 +2112,20 @@
punpcklbw xmm4, xmm5
.yloop:
movq xmm6, [p_src]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [esp+16], [esp], xmm0, xmm7
-%else
- SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
-%endif
+ SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7
movq xmm7, [p_src + i_srcstride]
punpcklbw xmm6, xmm7
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [esp+64], [esp+48], [esp+32], xmm0
-%else
- SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
-%endif
+ SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0
packuswb xmm1, xmm2
movlps [p_dst], xmm1
movhps [p_dst + i_dststride], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm0, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [esp+16], [esp], xmm2, xmm1
-%else
- SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
-%endif
+ SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1
movq xmm1, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm0, xmm1
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [esp+64], [esp+48], [esp+32], xmm2
-%else
- SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
-%endif
+ SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2
packuswb xmm3, xmm4
movlps [p_dst], xmm3
movhps [p_dst + i_dststride], xmm3
@@ -2334,36 +2133,20 @@
jle .yloop_exit
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm2, [p_src]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [esp+16], [esp], xmm4, xmm3
-%else
- SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
-%endif
+ SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3
movq xmm3, [p_src + i_srcstride]
punpcklbw xmm2, xmm3
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [esp+64], [esp+48], [esp+32], xmm4
-%else
- SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm5, xmm6
movlps [p_dst], xmm5
movhps [p_dst + i_dststride], xmm5
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm4, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [esp+16], [esp], xmm6, xmm5
-%else
- SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
-%endif
+ SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5
movq xmm5, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm4, xmm5
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm6
-%else
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
-%endif
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6
packuswb xmm7, xmm0
movlps [p_dst], xmm7
movhps [p_dst + i_dststride], xmm7
@@ -2371,12 +2154,9 @@
sub i_ycnt, 8
jg .yloop
.yloop_exit:
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
pop p_dst
pop p_src
+ %assign push_num push_num - 2
sub i_width, 8
jle .width8or16_done
add p_src, 8
@@ -2385,6 +2165,8 @@
jmp .xloop
.width8or16_done:
pop i_ycnt
+ %assign push_num push_num - 1
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2418,6 +2200,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -2424,28 +2207,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm6, [esp]
- add esp, 48
-%else
- movdqa xmm4, [shufb_32435465768798A9]
- movdqa xmm5, [shufb_011267784556ABBC]
- movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm4, [pic(shufb_32435465768798A9)]
+ movdqa xmm5, [pic(shufb_011267784556ABBC)]
+ movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 8
je .width8_yloop
jg .width16_yloop
@@ -2463,6 +2227,7 @@
jg .width4_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width8_yloop:
movdqu xmm0, [p_src - 2]
@@ -2478,6 +2243,7 @@
jg .width8_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width16_yloop:
movdqu xmm0, [p_src - 2]
@@ -2492,6 +2258,7 @@
jg .width16_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2518,6 +2285,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -2524,28 +2292,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm7, [esp]
- add esp, 48
-%else
- movdqa xmm5, [shufb_32435465768798A9]
- movdqa xmm6, [shufb_011267784556ABBC]
- movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm5, [pic(shufb_32435465768798A9)]
+ movdqa xmm6, [pic(shufb_011267784556ABBC)]
+ movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
je .width9_yloop
jg .width17_yloop
@@ -2563,6 +2312,7 @@
jg .width5_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width9_yloop:
movdqu xmm0, [p_src - 2]
@@ -2586,6 +2336,7 @@
jg .width9_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width17_yloop:
movdqu xmm0, [p_src - 2]
@@ -2615,6 +2366,7 @@
jg .width17_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2637,6 +2389,7 @@
%define p_dst r2
%define i_height r3
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -2643,28 +2396,9 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm6, [esp]
- add esp, 48
-%else
- movdqa xmm4, [shufb_32435465768798A9]
- movdqa xmm5, [shufb_011267784556ABBC]
- movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm4, [pic(shufb_32435465768798A9)]
+ movdqa xmm5, [pic(shufb_011267784556ABBC)]
+ movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
@@ -2681,6 +2415,7 @@
movlps [p_dst], xmm0
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2702,6 +2437,7 @@
%define i_height r3
%define i_srcstride 8
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -2746,6 +2482,7 @@
.done:
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef p_dst
@@ -2769,6 +2506,7 @@
%define i_dststride r3
%define i_height r4
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -2776,28 +2514,9 @@
SIGN_EXTENSION r4, r4d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm6, [esp]
- add esp, 48
-%else
- movdqa xmm4, [shufb_32435465768798A9]
- movdqa xmm5, [shufb_011267784556ABBC]
- movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm4, [pic(shufb_32435465768798A9)]
+ movdqa xmm5, [pic(shufb_011267784556ABBC)]
+ movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
@@ -2818,6 +2537,7 @@
.done:
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2846,6 +2566,7 @@
push r5
%assign push_num 1
%endif
+ INIT_X86_32_PIC r6
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -2936,6 +2657,7 @@
.done:
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r5
%endif
@@ -2965,6 +2687,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -2975,28 +2698,9 @@
sub p_src, i_srcstride
pcmpeqw xmm4, xmm4
psllw xmm4, 15 ; dw -32768
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm7, [esp]
- add esp, 48
-%else
- movdqa xmm5, [shufb_32435465768798A9]
- movdqa xmm6, [shufb_011267784556ABBC]
- movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm5, [pic(shufb_32435465768798A9)]
+ movdqa xmm6, [pic(shufb_011267784556ABBC)]
+ movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
jne .width17_yloop
@@ -3019,6 +2723,7 @@
jg .width9_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width17_yloop:
@@ -3047,6 +2752,7 @@
jg .width17_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -3070,7 +2776,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -3084,14 +2794,23 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub i_height, 1
push i_height
+ %assign push_num push_num + 1
lea i_srcstride3, [3 * i_srcstride]
test i_width, 1
jz .width_loop
push p_src
push p_dst
+ %assign push_num push_num + 2
+%ifdef X86_32_PICASM
+ add p_src, i_width
+ add p_src, i_width
+ sub p_src, 2
+%else
lea p_src, [p_src + 2 * i_width - 2]
+%endif
add p_dst, i_width
movd xmm0, [p_src]
punpcklwd xmm0, [p_src + i_srcstride]
@@ -3186,11 +2905,13 @@
.unalign_done:
pop p_dst
pop p_src
+ %assign push_num push_num - 2
mov i_height, [r7]
sub i_width, 1
.width_loop:
push p_src
push p_dst
+ %assign push_num push_num + 2
movdqa xmm0, [p_src]
movdqa xmm1, [p_src + i_srcstride]
movdqa xmm2, [p_src + 2 * i_srcstride]
@@ -3245,6 +2966,7 @@
.x_loop_dec:
pop p_dst
pop p_src
+ %assign push_num push_num - 2
sub i_width, 8
jle .done
mov i_height, [r7]
@@ -3258,6 +2980,8 @@
pop p_src
.done:
pop i_height
+ %assign push_num push_num - 1
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -3280,24 +3004,7 @@
vpshufb %5, %1, %3
vpshufb %1, %1, %2
vpshufd %6, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- vpmaddubsw %1, %1, [esp]
- mov esp, r0
- pop r0
-%else
- vpmaddubsw %1, %1, [db20_256]
-%endif
+ vpmaddubsw %1, %1, [pic(db20_256)]
vpmaddubsw %5, %5, %4
vpmaddubsw %6, %6, %4
vpaddw %1, %1, %5
@@ -3307,14 +3014,7 @@
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
%macro AVX2_FilterHorizontal_16px 6
AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
- vpcmpeqw %6, %6, %6
- vpsrlw %6, %6, 15
- vpsllw %6, %6, 4
- vpaddw %1, %1, %6
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
@@ -3327,24 +3027,7 @@
vpunpcklqdq %1, %1, %2
vpunpcklqdq %6, %6, %7
vpshufd %7, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- vpmaddubsw %1, %1, [esp]
- mov esp, r0
- pop r0
-%else
- vpmaddubsw %1, %1, [db20_256]
-%endif
+ vpmaddubsw %1, %1, [pic(db20_256)]
vpmaddubsw %6, %6, %5
vpmaddubsw %7, %7, %5
vpaddw %1, %1, %6
@@ -3354,20 +3037,13 @@
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
%macro AVX2_FilterHorizontal_4x4px 7
AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
- vpcmpeqw %7, %7, %7
- vpsrlw %7, %7, 15
- vpsllw %7, %7, 4
- vpaddw %1, %1, %7
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
; pixels=%1 -32768>>scale=%2 tmp=%3
%macro AVX2_FilterHorizontalbw_4px 3
- vpmaddubsw %1, %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_256]
+ vpmaddubsw %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
vpmaddwd %1, %1, %2
vpshufd %3, %1, 10110001b
vpaddd %1, %1, %3
@@ -3375,45 +3051,8 @@
; pixels=%1 tmp=%2
%macro AVX2_FilterHorizontal_4px 2
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x0000fe0a ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0xfc00fc00 ;dwm1024_256
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0x00008000 ;dd32768_256
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- vpmaddubsw %1, %1, [esp+64]
- vpmaddwd %1, %1, [esp+32]
- vpshufd %2, %1, 10110001b
- vpaddd %1, %1, %2
- vpaddd %1, %1, [esp]
- mov esp, r0
- pop r0
-%else
- AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2
- vpaddd %1, %1, [dd32768_256]
-%endif
+ AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2
+ vpaddd %1, %1, [pic(dd32768_256)]
%endmacro
; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
@@ -3423,14 +3062,7 @@
vpaddw %1, %1, %7
vpmaddubsw %7, %3, %6
vpaddw %1, %1, %7
-%ifdef X86_32_PICASM
- vpcmpeqw %7, %7, %7
- vpsrlw %7, %7, 15
- vpsllw %7, %7, 4
- vpaddw %1, %1, %7
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
@@ -3444,14 +3076,7 @@
vpaddw %1, %1, %7
vpmaddubsw %7, %4, %6
vpaddw %1, %1, %7
-%ifdef X86_32_PICASM
- vpcmpeqw %7, %7, %7
- vpsrlw %7, %7, 15
- vpsllw %7, %7, 4
- vpaddw %1, %1, %7
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
@@ -3465,24 +3090,7 @@
vpaddw %7, %3, %4
vpaddw %1, %1, %7
vpsraw %1, %1, 2
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- vpaddw %7, %7, [esp]
- mov esp, r0
- pop r0
-%else
- vpaddw %7, %7, [dw32_256]
-%endif
+ vpaddw %7, %7, [pic(dw32_256)]
vpaddw %1, %1, %7
vpsraw %1, %1, 6
%endmacro
@@ -3501,7 +3109,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -3515,6 +3127,7 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
@@ -3522,32 +3135,6 @@
je .width8
jg .width16
; .width4:
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xffffffe0
- sub esp, 16
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0xfb01fb01 ;maddubsw_p1m5_256
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x01fb01fb ;maddubsw_m5p1_256
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
-%endif
vmovd xmm0, [p_src]
vpbroadcastd xmm5, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm0, xmm5
@@ -3574,13 +3161,8 @@
vpunpcklbw ymm5, ymm5, ymm4
vpblendd ymm3, ymm3, ymm5, 11001100b
vpblendd ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm6, [esp+64]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm6, [esp], ymm5
-%else
- vbroadcasti128 ymm6, [db20_128]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+ vbroadcasti128 ymm6, [pic(db20_128)]
+ AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
vpackuswb ymm0, ymm0, ymm0
vmovd [p_dst], xmm0
vpsrlq xmm5, xmm0, 32
@@ -3596,11 +3178,7 @@
vpbroadcastd ymm5, [p_src + i_srcstride3]
vpunpcklbw ymm4, ymm4, ymm5
jg .width4_height_ge8
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [esp+32], xmm6, [esp], xmm5
-%else
- AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+ AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
vpackuswb xmm2, xmm2, xmm2
vmovd [p_dst], xmm2
jmp .width4_done
@@ -3616,11 +3194,7 @@
vpunpcklbw ymm5, ymm5, ymm0
vpblendd ymm1, ymm1, ymm5, 11001100b
vpblendd ymm4, ymm4, ymm1, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [esp+32], ymm6, [esp], ymm5
-%else
- AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+ AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
vpackuswb ymm2, ymm2, ymm2
vmovd [p_dst], xmm2
vpsrlq xmm5, xmm2, 32
@@ -3635,19 +3209,12 @@
lea p_dst, [p_dst + 2 * i_dststride]
vmovd xmm5, [p_src + i_srcstride3]
vpunpcklbw xmm0, xmm0, xmm5
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [esp+32], xmm6, [esp], xmm5
-%else
- AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+ AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
vpackuswb xmm4, xmm4, xmm4
vmovd [p_dst], xmm4
.width4_done:
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
vzeroupper
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -3656,32 +3223,6 @@
ret
.width8:
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xffffffe0
- sub esp, 16
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0xfb01fb01 ;maddubsw_p1m5_256
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x01fb01fb ;maddubsw_m5p1_256
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
-%endif
sub i_height, 1
vmovq xmm0, [p_src]
vmovq xmm4, [p_src + i_srcstride]
@@ -3701,13 +3242,8 @@
vmovq xmm3, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm3
vinserti128 ymm2, ymm2, xmm4, 1
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm5, [esp+64]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm5, [esp], ymm4
-%else
- vbroadcasti128 ymm5, [db20_128]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ vbroadcasti128 ymm5, [pic(db20_128)]
+ AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm3, xmm3, xmm4
@@ -3714,11 +3250,7 @@
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm3, ymm3, xmm4, 1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [esp+32], ymm5, [esp], ymm4
-%else
- AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm0, ymm0, ymm1
vmovlps [p_dst], xmm0
vextracti128 xmm1, ymm0, 1
@@ -3732,11 +3264,7 @@
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm6, xmm4
jg .width8_height_ge8
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [esp+32], xmm5, [esp], xmm4
-%else
- AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+ AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
vpackuswb xmm2, xmm2, xmm2
vmovlps [p_dst], xmm2
jmp .width8_done
@@ -3744,11 +3272,7 @@
vmovq xmm1, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm1
vinserti128 ymm0, ymm0, xmm4, 1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [esp+32], ymm5, [esp], ymm4
-%else
- AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm1, xmm1, xmm4
@@ -3755,11 +3279,7 @@
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm1, ymm1, xmm4, 1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [esp+32], ymm5, [esp], ymm4
-%else
- AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm2, ymm2, ymm3
vmovlps [p_dst], xmm2
vextracti128 xmm3, ymm2, 1
@@ -3773,19 +3293,12 @@
jl .width8_done
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm2, xmm6, xmm4
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [esp+32], xmm5, [esp], xmm4
-%else
- AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+ AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
vpackuswb xmm0, xmm0, xmm0
vmovlps [p_dst], xmm0
.width8_done:
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
vzeroupper
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -3794,51 +3307,6 @@
ret
.width16:
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xffffffe0
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0xfb01fb01 ;maddubsw_p1m5_256
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x01fb01fb ;maddubsw_m5p1_256
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x14fb14fb ;maddubsw_m5p20_256
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0xfb14fb14 ;maddubsw_p20m5_256
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
-%endif
sub i_height, 1
test i_height, 1
jnz .width16_yloop_begin_even
@@ -3865,11 +3333,7 @@
lea p_src, [p_src + 2 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm7
-%else
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7
-%endif
+ AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 1000b
vmovdqa [p_dst], xmm0
@@ -3899,20 +3363,12 @@
vmovq xmm6, [p_src]
vpbroadcastq ymm7, [p_src + 8]
vpblendd ymm6, ymm6, ymm7, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [esp+32], [esp], ymm0, ymm7
-%else
- AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7
-%endif
+ AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
vmovq xmm7, [p_src + i_srcstride]
vpbroadcastq ymm0, [p_src + i_srcstride + 8]
vpblendd ymm7, ymm7, ymm0, 11110000b
vpunpcklbw ymm6, ymm6, ymm7
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [esp+96], [esp+128], [esp+64], ymm0
-%else
- AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0
-%endif
+ AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
vpackuswb ymm1, ymm1, ymm2
vpermq ymm1, ymm1, 11011000b
vmovdqa [p_dst], xmm1
@@ -3921,21 +3377,13 @@
vmovq xmm0, [p_src + 2 * i_srcstride]
vpbroadcastq ymm1, [p_src + 2 * i_srcstride + 8]
vpblendd ymm0, ymm0, ymm1, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [esp+32], [esp], ymm2, ymm1
-%else
- AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1
-%endif
+ AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
vmovq xmm1, [p_src + i_srcstride3]
vpbroadcastq ymm2, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm1, ymm1, ymm2, 11110000b
vpunpcklbw ymm0, ymm0, ymm1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [esp+96], [esp+128], [esp+64], ymm2
-%else
- AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2
-%endif
+ AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
vpackuswb ymm3, ymm3, ymm4
vpermq ymm3, ymm3, 11011000b
vmovdqa [p_dst], xmm3
@@ -3944,20 +3392,12 @@
vmovq xmm2, [p_src]
vpbroadcastq ymm3, [p_src + 8]
vpblendd ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [esp+32], [esp], ymm4, ymm3
-%else
- AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3
-%endif
+ AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
vmovq xmm3, [p_src + i_srcstride]
vpbroadcastq ymm4, [p_src + i_srcstride + 8]
vpblendd ymm3, ymm3, ymm4, 11110000b
vpunpcklbw ymm2, ymm2, ymm3
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [esp+96], [esp+128], [esp+64], ymm4
-%else
- AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm5, ymm5, ymm6
vpermq ymm5, ymm5, 11011000b
vmovdqa [p_dst], xmm5
@@ -3966,21 +3406,13 @@
vmovq xmm4, [p_src + 2 * i_srcstride]
vpbroadcastq ymm5, [p_src + 2 * i_srcstride + 8]
vpblendd ymm4, ymm4, ymm5, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [esp+32], [esp], ymm6, ymm5
-%else
- AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5
-%endif
+ AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
vmovq xmm5, [p_src + i_srcstride3]
vpbroadcastq ymm6, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm6
-%else
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6
-%endif
+ AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
vpackuswb ymm7, ymm7, ymm0
vpermq ymm7, ymm7, 11011000b
vmovdqa [p_dst], xmm7
@@ -3988,11 +3420,8 @@
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jg .width16_yloop
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
vzeroupper
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -4026,6 +3455,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -4032,32 +3462,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm6, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm4, [shufb_32435465768798A9]
- vbroadcasti128 ymm5, [shufb_011267784556ABBC]
- vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 8
je .width8
jg .width16_yloop
@@ -4086,6 +3493,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width8:
lea i_srcstride3, [3 * i_srcstride]
@@ -4110,6 +3518,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
%undef i_srcstride3
.width16_yloop:
@@ -4129,6 +3538,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4155,6 +3565,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -4161,32 +3572,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm7, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm5, [shufb_32435465768798A9]
- vbroadcasti128 ymm6, [shufb_011267784556ABBC]
- vbroadcasti128 ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
je .width9
jg .width17
@@ -4210,6 +3598,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width9:
%xdefine i_srcstride3 i_width
@@ -4248,6 +3637,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width17:
lea i_srcstride3, [3 * i_srcstride]
@@ -4291,6 +3681,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef i_srcstride3
%undef p_src
@@ -4320,6 +3711,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -4327,32 +3719,9 @@
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm6, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm4, [shufb_32435465768798A9]
- vbroadcasti128 ymm5, [shufb_011267784556ABBC]
- vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4372,6 +3741,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -4403,6 +3773,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4443,6 +3814,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -4469,6 +3841,7 @@
%define i_height r3
%define i_dststride 16
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 6
SIGN_EXTENSION r1, r1d
@@ -4475,32 +3848,9 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm3, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm4, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm5, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm3, [shufb_32435465768798A9]
- vbroadcasti128 ymm4, [shufb_011267784556ABBC]
- vbroadcasti128 ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm3, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm4, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4519,6 +3869,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4541,6 +3892,7 @@
%define i_height r3
%define i_srcstride 16
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4614,6 +3966,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef p_dst
@@ -4641,6 +3994,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4687,6 +4041,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -4713,6 +4068,7 @@
%define i_height r3
%define i_dststride 32
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -4719,32 +4075,9 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm6, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm4, [shufb_32435465768798A9]
- vbroadcasti128 ymm5, [shufb_011267784556ABBC]
- vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4768,6 +4101,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4790,6 +4124,7 @@
%define i_height r3
%define i_srcstride 32
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4869,6 +4204,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4896,6 +4232,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -4903,47 +4240,9 @@
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm7, [esp]
- sub esp, 16
- push 0x0000fe0a ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x80008000 ;dwm32768_256
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
-%else
- vbroadcasti128 ymm5, [shufb_32435465768798A9]
- vbroadcasti128 ymm6, [shufb_011267784556ABBC]
- vbroadcasti128 ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4961,14 +4260,7 @@
vinserti128 ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
lea p_src, [p_src + 4 * i_srcstride]
vpunpckhqdq ymm4, ymm4, ymm0
-%ifdef X86_32_PICASM
- vpmaddubsw ymm4, ymm4, [esp+32]
- vpmaddwd ymm4, ymm4, [esp]
- vpshufd ymm2, ymm4, 10110001b
- vpaddd ymm4, ymm4, ymm2
-%else
- AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+ AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
vmovlps [p_dst + 26], xmm4
vmovdqa [p_dst + 16], xmm3
vextracti128 xmm2, ymm4, 1
@@ -4991,16 +4283,7 @@
vmovdqu xmm3, [p_src + i_srcstride - 2]
vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
vpunpckhqdq ymm4, ymm0, ymm3
-%ifdef X86_32_PICASM
- vpmaddubsw ymm4, ymm4, [esp+32]
- vpmaddwd ymm4, ymm4, [esp]
- vpshufd ymm2, ymm4, 10110001b
- vpaddd ymm4, ymm4, ymm2
- mov esp, r5
- pop r5
-%else
- AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+ AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
vextracti128 xmm4, ymm4, 1
@@ -5011,6 +4294,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -5037,7 +4321,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -5051,6 +4339,7 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub i_height, 1
lea i_srcstride3, [3 * i_srcstride]
test i_width, 1
@@ -5058,7 +4347,14 @@
push i_height
push p_src
push p_dst
+ %assign push_num push_num + 3
+%ifdef X86_32_PICASM
+ add p_src, i_width
+ add p_src, i_width
+ sub p_src, 2
+%else
lea p_src, [p_src + 2 * i_width - 2]
+%endif
add p_dst, i_width
vmovd xmm0, [p_src]
vpunpcklwd xmm0, xmm0, [p_src + i_srcstride]
@@ -5119,6 +4415,7 @@
pop p_dst
pop p_src
pop i_height
+ %assign push_num push_num - 3
.align_begin:
vmovdqa ymm0, [p_src]
vmovdqa ymm1, [p_src + i_srcstride]
@@ -5175,6 +4472,7 @@
vmovdqa [p_dst], xmm0
.done:
vzeroupper
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32