shithub: openh264

Download patch

ref: a63e13eecdf495b6b4880a4dc4b03715e6640a55
parent: 4dd64f06beec5301c02efce225e9d8ffe66c47ff
author: Sindre Aamås <saamas@cisco.com>
date: Tue Mar 7 09:15:23 EST 2017

[Common/x86] Simplify mc_luma X86_32_PICASM handling

Utilize program counter-relative offsets to simplify X86_32_PICASM
code.

In order for this to work with nasm, data constants are placed in
the text segment.

--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -44,7 +44,11 @@
 ;*******************************************************************************
 ; Local Data (Read Only)
 ;*******************************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
 SECTION .rodata align=32
+%endif
 
 ;*******************************************************************************
 ; Various memory constants (trigonometric values or rounding values)
@@ -120,12 +124,6 @@
     psllw        %1,  4
 %endmacro
 
-%macro MOVEIMM_DW32 1
-    pcmpeqw      %1,  %1
-    psrlw        %1,  15
-    psllw        %1,  5
-%endmacro
-
 %endif
 
 ;*******************************************************************************
@@ -197,12 +195,7 @@
 
 %macro FILTER_HV_W8 9
     paddw   %1, %6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 %8
-    paddw   %1, %8
-%else
-    paddw   %1, [h264_w0x10_1]
-%endif
+    paddw   %1, [pic(h264_w0x10_1)]
     movdqa  %8, %3
     movdqa  %7, %2
     paddw   %8, %4
@@ -221,12 +214,7 @@
 
 %macro FILTER_HV_W4 9
 paddw   %1, %6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 %8
-paddw   %1, %8
-%else
-paddw   %1, [h264_w0x10_1]
-%endif
+paddw   %1, [pic(h264_w0x10_1)]
 movdqa  %8, %3
 movdqa  %7, %2
 paddw   %8, %4
@@ -457,6 +445,7 @@
 ;*******************************************************************************
 WELS_EXTERN McHorVer02WidthEq8_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -530,6 +519,7 @@
 .xx_exit:
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -550,6 +540,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer02Height9Or17_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -671,6 +662,7 @@
 %endif
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -684,6 +676,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer02Height5_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -805,6 +798,7 @@
 %endif
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -819,6 +813,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer20Width9Or17_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -855,12 +850,7 @@
     paddw xmm0, xmm6
     psllw xmm6, 2
     paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 xmm6
-    paddw xmm0, xmm6
-%else
-    paddw xmm0, [h264_w0x10_1]
-%endif
+    paddw xmm0, [pic(h264_w0x10_1)]
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movd [r2], xmm0
@@ -877,11 +867,7 @@
     paddw xmm2, xmm5
     psllw xmm5, 2
     paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-    paddw xmm2, xmm6
-%else
-    paddw xmm2, [h264_w0x10_1]
-%endif
+    paddw xmm2, [pic(h264_w0x10_1)]
     psraw  xmm2, 5
     packuswb xmm2, xmm2
     movq [r2+1], xmm2
@@ -892,6 +878,7 @@
     jnz .yloop_width_9
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 
 
@@ -918,12 +905,7 @@
     paddw xmm0, xmm4
     psllw xmm4, 2
     paddw xmm0, xmm4
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 xmm6
-    paddw xmm0, xmm6
-%else
-    paddw xmm0, [h264_w0x10_1]
-%endif
+    paddw xmm0, [pic(h264_w0x10_1)]
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movq [r2], xmm0
@@ -951,12 +933,7 @@
     paddw xmm0, xmm6
     psllw xmm6, 2
     paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 xmm6
-    paddw xmm0, xmm6
-%else
-    paddw xmm0, [h264_w0x10_1]
-%endif
+    paddw xmm0, [pic(h264_w0x10_1)]
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movd [r2+8], xmm0
@@ -974,11 +951,7 @@
     paddw xmm2, xmm5
     psllw xmm5, 2
     paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-    paddw xmm2, xmm6
-%else
-    paddw xmm2, [h264_w0x10_1]
-%endif
+    paddw xmm2, [pic(h264_w0x10_1)]
     psraw  xmm2, 5
     packuswb xmm2, xmm2
     movq [r2+9], xmm2
@@ -988,6 +961,7 @@
     jnz .yloop_width_17
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -1002,6 +976,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer20Width5_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -1035,12 +1010,7 @@
 paddw xmm0, xmm6
 psllw xmm6, 2
 paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 xmm6
-paddw xmm0, xmm6
-%else
-paddw xmm0, [h264_w0x10_1]
-%endif
+paddw xmm0, [pic(h264_w0x10_1)]
 psraw  xmm0, 5
 packuswb xmm0, xmm0
 movd [r2], xmm0
@@ -1057,11 +1027,7 @@
 paddw xmm2, xmm5
 psllw xmm5, 2
 paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-paddw xmm2, xmm6
-%else
-paddw xmm2, [h264_w0x10_1]
-%endif
+paddw xmm2, [pic(h264_w0x10_1)]
 psraw  xmm2, 5
 packuswb xmm2, xmm2
 movd [r2+1], xmm2
@@ -1072,6 +1038,7 @@
 jnz .yloop_width_5
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -1238,12 +1205,7 @@
     psubw  %1, %7
     psraw   %1, 2
     paddw  %8, %1
-%ifdef X86_32_PICASM
-    MOVEIMM_DW32 %7
-    paddw  %8, %7
-%else
-    paddw  %8, [h264_mc_hc_32]
-%endif
+    paddw  %8, [pic(h264_mc_hc_32)]
     psraw   %8, 6
     packuswb %8, %8
     movq %9, %8
@@ -1260,6 +1222,7 @@
 
 WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -1377,6 +1340,7 @@
 %endif
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -1391,6 +1355,7 @@
 
 WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -1507,6 +1472,7 @@
 %endif
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -1595,12 +1561,7 @@
 psubw  %1, %7
 psraw   %1, 2
 paddw  %8, %1
-%ifdef X86_32_PICASM
-MOVEIMM_DW32 %7
-paddw  %8, %7
-%else
-paddw  %8, [h264_mc_hc_32]
-%endif
+paddw  %8, [pic(h264_mc_hc_32)]
 psraw   %8, 6
 packuswb %8, %8
 movd %9, %8
@@ -1619,6 +1580,7 @@
 
 WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -1736,6 +1698,7 @@
 %endif
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -1751,6 +1714,7 @@
 
 WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -1867,6 +1831,7 @@
 %endif
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -1879,12 +1844,7 @@
     movdqa          %7, %3
     pmaddubsw       %7, %6
     paddw           %1, %7
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %7
-    paddw            %1, %7
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
@@ -1901,12 +1861,7 @@
     movdqa          %7, %4
     pmaddubsw       %7, %6
     paddw           %1, %7
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %7
-    paddw            %1, %7
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
@@ -1916,20 +1871,7 @@
     pshufb          %1, %2
     pshufb          %5, %3
     pshufd          %6, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    pmaddubsw       %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pmaddubsw       %1, [db20_128]
-%endif
+    pmaddubsw       %1, [pic(db20_128)]
     pmaddubsw       %5, %4
     pmaddubsw       %6, %4
     paddw           %1, %5
@@ -1939,12 +1881,7 @@
 ; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
 %macro SSSE3_FilterHorizontal_8px 6
     SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %5
-    paddw           %1, %5
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
@@ -1959,20 +1896,7 @@
     pshufb          %7, %4
     punpcklqdq      %6, %7
     pshufd          %7, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    pmaddubsw       %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pmaddubsw       %1, [db20_128]
-%endif
+    pmaddubsw       %1, [pic(db20_128)]
     pmaddubsw       %6, %5
     pmaddubsw       %7, %5
     paddw           %1, %6
@@ -1982,31 +1906,13 @@
 ; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
 %macro SSSE3_FilterHorizontal_2x4px 7
     SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %6
-    paddw           %1, %6
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
 ; pixels=%1 -32768>>scale=%2 tmp=%3
 %macro SSSE3_FilterHorizontalbw_2px 3
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    pmaddubsw       %1, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    pmaddubsw       %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]
-%endif
+    pmaddubsw       %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)]
     pmaddwd         %1, %2
     pshufd          %3, %1, 10110001b
     paddd           %1, %3
@@ -2014,33 +1920,8 @@
 
 ; pixels=%1 tmp=%2
 %macro SSSE3_FilterHorizontal_2px 2
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    pmaddubsw       %1, [esp]
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    pmaddwd         %1, [esp]
-    pshufd          %2, %1, 10110001b
-    paddd           %1, %2
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    paddd           %1, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2
-    paddd           %1, [dd32768_128]
-%endif
+    SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2
+    paddd           %1, [pic(dd32768_128)]
 %endmacro
 
 ; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
@@ -2055,14 +1936,8 @@
     paddw           %7, %4
     paddw           %1, %7
     psraw           %1, 2
-%ifdef X86_32_PICASM
+    paddw           %7, [pic(h264_mc_hc_32)]
     paddw           %1, %7
-    MOVEIMM_DW32    %7
-    paddw           %1, %7
-%else
-    paddw           %7, [h264_mc_hc_32]
-    paddw           %1, %7
-%endif
     psraw           %1, 6
 %endmacro
 
@@ -2080,7 +1955,11 @@
 %define i_srcstride   r1
 %define p_dst         r2
 %define i_dststride   r3
+%ifdef X86_32_PICASM
+%define i_width       dword arg5
+%else
 %define i_width       r4
+%endif
 %define i_height      r5
 %define i_srcstride3  r6
     %assign push_num 0
@@ -2094,28 +1973,14 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
+    %assign push_num_begin push_num
     cmp             i_width, 4
     jg              .width8or16
 
-%ifdef X86_32_PICASM
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    movdqu          xmm7, [esp]
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-%endif
     movd            xmm0, [p_src]
     movd            xmm4, [p_src + i_srcstride]
     punpcklbw       xmm0, xmm4
@@ -2134,14 +1999,8 @@
     movd            xmm3, [p_src]
     punpcklbw       xmm4, xmm3
     punpcklqdq      xmm2, xmm4
-%ifdef X86_32_PICASM
-    movdqu          xmm5, [esp]
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
-    add             esp, 48
-%else
-    movdqa          xmm5, [db20_128]
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    movdqa          xmm5, [pic(db20_128)]
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm0, xmm0
     movd            [p_dst], xmm0
     psrlq           xmm0, 32
@@ -2152,11 +2011,7 @@
     movd            xmm0, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm0
     punpcklqdq      xmm3, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm1, xmm1
     movd            [p_dst], xmm1
     psrlq           xmm1, 32
@@ -2167,14 +2022,11 @@
     movd            xmm4, [p_src + i_srcstride3]
     punpcklbw       xmm0, xmm4
     jg              .width4_height_ge8
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm2, xmm2
     movd            [p_dst], xmm2
 .width4_height_le5_done:
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2186,11 +2038,7 @@
     movd            xmm1, [p_src]
     punpcklbw       xmm4, xmm1
     punpcklqdq      xmm0, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm2, xmm2
     movd            [p_dst], xmm2
     psrlq           xmm2, 32
@@ -2201,11 +2049,7 @@
     movd            xmm2, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm2
     punpcklqdq      xmm1, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm3, xmm3
     movd            [p_dst], xmm3
     psrlq           xmm3, 32
@@ -2215,14 +2059,11 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     movd            xmm4, [p_src + i_srcstride3]
     punpcklbw       xmm2, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm0, xmm0
     movd            [p_dst], xmm0
 .width4_height_ge8_done:
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2231,38 +2072,16 @@
     ret
 
 .width8or16:
+    %assign push_num push_num_begin
     sub             i_height, 1
     push            i_height
+    %assign push_num push_num + 1
 %xdefine i_ycnt i_height
 %define i_height [r7]
 .xloop:
     push            p_src
     push            p_dst
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xfffffff0
-    push            0xfb01fb01    ;[esp+64]maddubsw_p1m5_128
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x14141414    ;[esp+48]db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x01fb01fb    ;[esp+32]maddubsw_m5p1_128
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x14fb14fb    ;[esp+16]maddubsw_m5p20_128
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0xfb14fb14    ;[esp] maddubsw_p20m5_128
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-%endif
+    %assign push_num push_num + 2
     test            i_ycnt, 1
     jnz             .yloop_begin_even
     movq            xmm0, [p_src]
@@ -2276,11 +2095,7 @@
     movq            xmm5, [p_src + i_srcstride]
     lea             p_src, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm5
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm7
-%else
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
-%endif
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7
     packuswb        xmm0, xmm0
     movlps          [p_dst], xmm0
     add             p_dst, i_dststride
@@ -2297,36 +2112,20 @@
     punpcklbw       xmm4, xmm5
 .yloop:
     movq            xmm6, [p_src]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [esp+16], [esp], xmm0, xmm7
-%else
-    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
-%endif
+    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7
     movq            xmm7, [p_src + i_srcstride]
     punpcklbw       xmm6, xmm7
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [esp+64], [esp+48], [esp+32], xmm0
-%else
-    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
-%endif
+    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0
     packuswb        xmm1, xmm2
     movlps          [p_dst], xmm1
     movhps          [p_dst + i_dststride], xmm1
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm0, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [esp+16], [esp], xmm2, xmm1
-%else
-    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
-%endif
+    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1
     movq            xmm1, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     punpcklbw       xmm0, xmm1
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [esp+64], [esp+48], [esp+32], xmm2
-%else
-    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
-%endif
+    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2
     packuswb        xmm3, xmm4
     movlps          [p_dst], xmm3
     movhps          [p_dst + i_dststride], xmm3
@@ -2334,36 +2133,20 @@
     jle             .yloop_exit
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm2, [p_src]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [esp+16], [esp], xmm4, xmm3
-%else
-    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
-%endif
+    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3
     movq            xmm3, [p_src + i_srcstride]
     punpcklbw       xmm2, xmm3
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [esp+64], [esp+48], [esp+32], xmm4
-%else
-    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm5, xmm6
     movlps          [p_dst], xmm5
     movhps          [p_dst + i_dststride], xmm5
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm4, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [esp+16], [esp], xmm6, xmm5
-%else
-    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
-%endif
+    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5
     movq            xmm5, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     punpcklbw       xmm4, xmm5
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm6
-%else
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
-%endif
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6
     packuswb        xmm7, xmm0
     movlps          [p_dst], xmm7
     movhps          [p_dst + i_dststride], xmm7
@@ -2371,12 +2154,9 @@
     sub             i_ycnt, 8
     jg              .yloop
 .yloop_exit:
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     pop             p_dst
     pop             p_src
+    %assign push_num push_num - 2
     sub             i_width, 8
     jle             .width8or16_done
     add             p_src, 8
@@ -2385,6 +2165,8 @@
     jmp             .xloop
 .width8or16_done:
     pop             i_ycnt
+    %assign push_num push_num - 1
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2418,6 +2200,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -2424,28 +2207,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm4, [shufb_32435465768798A9]
-    movdqa          xmm5, [shufb_011267784556ABBC]
-    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm4, [pic(shufb_32435465768798A9)]
+    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 8
     je              .width8_yloop
     jg              .width16_yloop
@@ -2463,6 +2227,7 @@
     jg              .width4_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width8_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2478,6 +2243,7 @@
     jg              .width8_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width16_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2492,6 +2258,7 @@
     jg              .width16_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2518,6 +2285,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -2524,28 +2292,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm7, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm5, [shufb_32435465768798A9]
-    movdqa          xmm6, [shufb_011267784556ABBC]
-    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm5, [pic(shufb_32435465768798A9)]
+    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 9
     je              .width9_yloop
     jg              .width17_yloop
@@ -2563,6 +2312,7 @@
     jg              .width5_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width9_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2586,6 +2336,7 @@
     jg              .width9_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width17_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2615,6 +2366,7 @@
     jg              .width17_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2637,6 +2389,7 @@
 %define p_dst        r2
 %define i_height     r3
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -2643,28 +2396,9 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm4, [shufb_32435465768798A9]
-    movdqa          xmm5, [shufb_011267784556ABBC]
-    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm4, [pic(shufb_32435465768798A9)]
+    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2681,6 +2415,7 @@
     movlps          [p_dst], xmm0
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2702,6 +2437,7 @@
 %define i_height     r3
 %define i_srcstride  8
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -2746,6 +2482,7 @@
 .done:
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef p_dst
@@ -2769,6 +2506,7 @@
 %define i_dststride  r3
 %define i_height     r4
     %assign  push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -2776,28 +2514,9 @@
     SIGN_EXTENSION  r4, r4d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm4, [shufb_32435465768798A9]
-    movdqa          xmm5, [shufb_011267784556ABBC]
-    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm4, [pic(shufb_32435465768798A9)]
+    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2818,6 +2537,7 @@
 .done:
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2846,6 +2566,7 @@
     push            r5
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r6
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -2936,6 +2657,7 @@
 .done:
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r5
 %endif
@@ -2965,6 +2687,7 @@
 %define i_width     r4
 %define i_height    r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -2975,28 +2698,9 @@
     sub             p_src, i_srcstride
     pcmpeqw         xmm4, xmm4
     psllw           xmm4, 15                                ; dw -32768
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm7, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm5, [shufb_32435465768798A9]
-    movdqa          xmm6, [shufb_011267784556ABBC]
-    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm5, [pic(shufb_32435465768798A9)]
+    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 9
     jne             .width17_yloop
 
@@ -3019,6 +2723,7 @@
     jg              .width9_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 
 .width17_yloop:
@@ -3047,6 +2752,7 @@
     jg              .width17_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -3070,7 +2776,11 @@
 %define i_srcstride  r1
 %define p_dst        r2
 %define i_dststride  r3
+%ifdef X86_32_PICASM
+%define i_width      dword arg5
+%else
 %define i_width      r4
+%endif
 %define i_height     r5
 %define i_srcstride3 r6
     %assign  push_num 0
@@ -3084,14 +2794,23 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             i_height, 1
     push            i_height
+    %assign push_num push_num + 1
     lea             i_srcstride3, [3 * i_srcstride]
     test            i_width, 1
     jz              .width_loop
     push            p_src
     push            p_dst
+    %assign push_num push_num + 2
+%ifdef X86_32_PICASM
+    add             p_src, i_width
+    add             p_src, i_width
+    sub             p_src, 2
+%else
     lea             p_src, [p_src + 2 * i_width - 2]
+%endif
     add             p_dst, i_width
     movd            xmm0, [p_src]
     punpcklwd       xmm0, [p_src + i_srcstride]
@@ -3186,11 +2905,13 @@
 .unalign_done:
     pop             p_dst
     pop             p_src
+    %assign push_num push_num - 2
     mov             i_height, [r7]
     sub             i_width, 1
 .width_loop:
     push            p_src
     push            p_dst
+    %assign push_num push_num + 2
     movdqa          xmm0, [p_src]
     movdqa          xmm1, [p_src + i_srcstride]
     movdqa          xmm2, [p_src + 2 * i_srcstride]
@@ -3245,6 +2966,7 @@
 .x_loop_dec:
     pop             p_dst
     pop             p_src
+    %assign push_num push_num - 2
     sub             i_width, 8
     jle             .done
     mov             i_height, [r7]
@@ -3258,6 +2980,8 @@
     pop             p_src
 .done:
     pop             i_height
+    %assign push_num push_num - 1
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -3280,24 +3004,7 @@
     vpshufb         %5, %1, %3
     vpshufb         %1, %1, %2
     vpshufd         %6, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    vpmaddubsw      %1, %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vpmaddubsw      %1, %1, [db20_256]
-%endif
+    vpmaddubsw      %1, %1, [pic(db20_256)]
     vpmaddubsw      %5, %5, %4
     vpmaddubsw      %6, %6, %4
     vpaddw          %1, %1, %5
@@ -3307,14 +3014,7 @@
 ; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 db20=%4 tmp=%5,%6
 %macro AVX2_FilterHorizontal_16px 6
     AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
-    vpcmpeqw        %6, %6, %6
-    vpsrlw          %6, %6, 15
-    vpsllw          %6, %6, 4
-    vpaddw          %1, %1, %6
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)] ; +16 per word ahead of the >>5 below (same 16 the removed PICASM branch synthesized in %6)
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -3327,24 +3027,7 @@
     vpunpcklqdq     %1, %1, %2
     vpunpcklqdq     %6, %6, %7
     vpshufd         %7, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    vpmaddubsw      %1, %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vpmaddubsw      %1, %1, [db20_256]
-%endif
+    vpmaddubsw      %1, %1, [pic(db20_256)]
     vpmaddubsw      %6, %6, %5
     vpmaddubsw      %7, %7, %5
     vpaddw          %1, %1, %6
@@ -3354,20 +3037,13 @@
 ; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 db20=%5 tmp=%6,%7
 %macro AVX2_FilterHorizontal_4x4px 7
     AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
-    vpcmpeqw        %7, %7, %7
-    vpsrlw          %7, %7, 15
-    vpsllw          %7, %7, 4
-    vpaddw          %1, %1, %7
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)] ; +16 per word ahead of the >>5 below (same 16 the removed PICASM branch synthesized in %7)
     vpsraw          %1, %1, 5
 %endmacro
 
 ; pixels=%1 -32768>>scale=%2 tmp=%3
 %macro AVX2_FilterHorizontalbw_4px 3
-    vpmaddubsw      %1, %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_256]
+    vpmaddubsw      %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
     vpmaddwd        %1, %1, %2
     vpshufd         %3, %1, 10110001b
     vpaddd          %1, %1, %3
@@ -3375,45 +3051,8 @@
 
 ; pixels=%1 tmp=%2
 %macro AVX2_FilterHorizontal_4px 2
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x0000fe0a    ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0xfc00fc00    ;dwm1024_256
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0x00008000    ;dd32768_256
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    vpmaddubsw      %1, %1, [esp+64]
-    vpmaddwd        %1, %1, [esp+32]
-    vpshufd         %2, %1, 10110001b
-    vpaddd          %1, %1, %2
-    vpaddd          %1, %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2
-    vpaddd          %1, %1, [dd32768_256]
-%endif
+    AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2 ; 2nd arg = words of 0xfc00 (-1024), the value the removed PICASM branch pushed inline
+    vpaddd          %1, %1, [pic(dd32768_256)] ; add 0x00008000 (32768) to every dword
 %endmacro
 
 ; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
@@ -3423,14 +3062,7 @@
     vpaddw          %1, %1, %7
     vpmaddubsw      %7, %3, %6
     vpaddw          %1, %1, %7
-%ifdef X86_32_PICASM
-    vpcmpeqw        %7, %7, %7
-    vpsrlw          %7, %7, 15
-    vpsllw          %7, %7, 4
-    vpaddw          %1, %1, %7
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)]
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -3444,14 +3076,7 @@
     vpaddw          %1, %1, %7
     vpmaddubsw      %7, %4, %6
     vpaddw          %1, %1, %7
-%ifdef X86_32_PICASM
-    vpcmpeqw        %7, %7, %7
-    vpsrlw          %7, %7, 15
-    vpsllw          %7, %7, 4
-    vpaddw          %1, %1, %7
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)]
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -3465,24 +3090,7 @@
     vpaddw          %7, %3, %4
     vpaddw          %1, %1, %7
     vpsraw          %1, %1, 2
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    vpaddw          %7, %7, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vpaddw          %7, %7, [dw32_256]
-%endif
+    vpaddw          %7, %7, [pic(dw32_256)]
     vpaddw          %1, %1, %7
     vpsraw          %1, %1, 6
 %endmacro
@@ -3501,7 +3109,11 @@
 %define i_srcstride   r1
 %define p_dst         r2
 %define i_dststride   r3
+%ifdef X86_32_PICASM
+%define i_width       dword arg5
+%else
 %define i_width       r4
+%endif
 %define i_height      r5
 %define i_srcstride3  r6
     %assign push_num 0
@@ -3515,6 +3127,7 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
@@ -3522,32 +3135,6 @@
     je              .width8
     jg              .width16
 ; .width4:
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xffffffe0
-    sub             esp, 16
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0xfb01fb01    ;maddubsw_p1m5_256
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x01fb01fb    ;maddubsw_m5p1_256
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-%endif
     vmovd           xmm0, [p_src]
     vpbroadcastd    xmm5, [p_src + i_srcstride]
     vpunpcklbw      xmm0, xmm0, xmm5
@@ -3574,13 +3161,8 @@
     vpunpcklbw      ymm5, ymm5, ymm4
     vpblendd        ymm3, ymm3, ymm5, 11001100b
     vpblendd        ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
-    vbroadcasti128  ymm6, [esp+64]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm6, [esp], ymm5
-%else
-    vbroadcasti128  ymm6, [db20_128]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+    vbroadcasti128  ymm6, [pic(db20_128)]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
     vpackuswb       ymm0, ymm0, ymm0
     vmovd           [p_dst], xmm0
     vpsrlq          xmm5, xmm0, 32
@@ -3596,11 +3178,7 @@
     vpbroadcastd    ymm5, [p_src + i_srcstride3]
     vpunpcklbw      ymm4, ymm4, ymm5
     jg              .width4_height_ge8
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [esp+32], xmm6, [esp], xmm5
-%else
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
     vpackuswb       xmm2, xmm2, xmm2
     vmovd           [p_dst], xmm2
     jmp             .width4_done
@@ -3616,11 +3194,7 @@
     vpunpcklbw      ymm5, ymm5, ymm0
     vpblendd        ymm1, ymm1, ymm5, 11001100b
     vpblendd        ymm4, ymm4, ymm1, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [esp+32], ymm6, [esp], ymm5
-%else
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
     vpackuswb       ymm2, ymm2, ymm2
     vmovd           [p_dst], xmm2
     vpsrlq          xmm5, xmm2, 32
@@ -3635,19 +3209,12 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     vmovd           xmm5, [p_src + i_srcstride3]
     vpunpcklbw      xmm0, xmm0, xmm5
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [esp+32], xmm6, [esp], xmm5
-%else
-    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
     vpackuswb       xmm4, xmm4, xmm4
     vmovd           [p_dst], xmm4
 .width4_done:
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     vzeroupper
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -3656,32 +3223,6 @@
     ret
 
 .width8:
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xffffffe0
-    sub             esp, 16
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0xfb01fb01    ;maddubsw_p1m5_256
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x01fb01fb    ;maddubsw_m5p1_256
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-%endif
     sub             i_height, 1
     vmovq           xmm0, [p_src]
     vmovq           xmm4, [p_src + i_srcstride]
@@ -3701,13 +3242,8 @@
     vmovq           xmm3, [p_src + 2 * i_srcstride]
     vpunpcklbw      xmm4, xmm4, xmm3
     vinserti128     ymm2, ymm2, xmm4, 1
-%ifdef X86_32_PICASM
-    vbroadcasti128  ymm5, [esp+64]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm5, [esp], ymm4
-%else
-    vbroadcasti128  ymm5, [db20_128]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    vbroadcasti128  ymm5, [pic(db20_128)]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vmovq           xmm4, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpcklbw      xmm3, xmm3, xmm4
@@ -3714,11 +3250,7 @@
     vmovq           xmm6, [p_src]
     vpunpcklbw      xmm4, xmm4, xmm6
     vinserti128     ymm3, ymm3, xmm4, 1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [esp+32], ymm5, [esp], ymm4
-%else
-    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vpackuswb       ymm0, ymm0, ymm1
     vmovlps         [p_dst], xmm0
     vextracti128    xmm1, ymm0, 1
@@ -3732,11 +3264,7 @@
     vmovq           xmm4, [p_src + i_srcstride]
     vpunpcklbw      xmm0, xmm6, xmm4
     jg              .width8_height_ge8
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [esp+32], xmm5, [esp], xmm4
-%else
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
     vpackuswb       xmm2, xmm2, xmm2
     vmovlps         [p_dst], xmm2
     jmp             .width8_done
@@ -3744,11 +3272,7 @@
     vmovq           xmm1, [p_src + 2 * i_srcstride]
     vpunpcklbw      xmm4, xmm4, xmm1
     vinserti128     ymm0, ymm0, xmm4, 1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [esp+32], ymm5, [esp], ymm4
-%else
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vmovq           xmm4, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpcklbw      xmm1, xmm1, xmm4
@@ -3755,11 +3279,7 @@
     vmovq           xmm6, [p_src]
     vpunpcklbw      xmm4, xmm4, xmm6
     vinserti128     ymm1, ymm1, xmm4, 1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [esp+32], ymm5, [esp], ymm4
-%else
-    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vpackuswb       ymm2, ymm2, ymm3
     vmovlps         [p_dst], xmm2
     vextracti128    xmm3, ymm2, 1
@@ -3773,19 +3293,12 @@
     jl              .width8_done
     vmovq           xmm4, [p_src + i_srcstride]
     vpunpcklbw      xmm2, xmm6, xmm4
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [esp+32], xmm5, [esp], xmm4
-%else
-    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
     vpackuswb       xmm0, xmm0, xmm0
     vmovlps         [p_dst], xmm0
 .width8_done:
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     vzeroupper
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -3794,51 +3307,6 @@
     ret
 
 .width16:
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xffffffe0
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0xfb01fb01    ;maddubsw_p1m5_256
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x01fb01fb    ;maddubsw_m5p1_256
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x14fb14fb    ;maddubsw_m5p20_256
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0xfb14fb14    ;maddubsw_p20m5_256
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-%endif
     sub             i_height, 1
     test            i_height, 1
     jnz             .width16_yloop_begin_even
@@ -3865,11 +3333,7 @@
     lea             p_src, [p_src + 2 * i_srcstride]
     vpblendd        ymm5, ymm5, ymm6, 11110000b
     vpunpcklbw      ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm7
-%else
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7
-%endif
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
     vpackuswb       ymm0, ymm0, ymm0
     vpermq          ymm0, ymm0, 1000b
     vmovdqa         [p_dst], xmm0
@@ -3899,20 +3363,12 @@
     vmovq           xmm6, [p_src]
     vpbroadcastq    ymm7, [p_src + 8]
     vpblendd        ymm6, ymm6, ymm7, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [esp+32], [esp], ymm0, ymm7
-%else
-    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7
-%endif
+    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
     vmovq           xmm7, [p_src + i_srcstride]
     vpbroadcastq    ymm0, [p_src + i_srcstride + 8]
     vpblendd        ymm7, ymm7, ymm0, 11110000b
     vpunpcklbw      ymm6, ymm6, ymm7
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [esp+96], [esp+128], [esp+64], ymm0
-%else
-    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0
-%endif
+    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
     vpackuswb       ymm1, ymm1, ymm2
     vpermq          ymm1, ymm1, 11011000b
     vmovdqa         [p_dst], xmm1
@@ -3921,21 +3377,13 @@
     vmovq           xmm0, [p_src + 2 * i_srcstride]
     vpbroadcastq    ymm1, [p_src + 2 * i_srcstride + 8]
     vpblendd        ymm0, ymm0, ymm1, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [esp+32], [esp], ymm2, ymm1
-%else
-    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1
-%endif
+    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
     vmovq           xmm1, [p_src + i_srcstride3]
     vpbroadcastq    ymm2, [p_src + i_srcstride3 + 8]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpblendd        ymm1, ymm1, ymm2, 11110000b
     vpunpcklbw      ymm0, ymm0, ymm1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [esp+96], [esp+128], [esp+64], ymm2
-%else
-    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2
-%endif
+    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
     vpackuswb       ymm3, ymm3, ymm4
     vpermq          ymm3, ymm3, 11011000b
     vmovdqa         [p_dst], xmm3
@@ -3944,20 +3392,12 @@
     vmovq           xmm2, [p_src]
     vpbroadcastq    ymm3, [p_src + 8]
     vpblendd        ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [esp+32], [esp], ymm4, ymm3
-%else
-    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3
-%endif
+    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
     vmovq           xmm3, [p_src + i_srcstride]
     vpbroadcastq    ymm4, [p_src + i_srcstride + 8]
     vpblendd        ymm3, ymm3, ymm4, 11110000b
     vpunpcklbw      ymm2, ymm2, ymm3
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [esp+96], [esp+128], [esp+64], ymm4
-%else
-    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
     vpackuswb       ymm5, ymm5, ymm6
     vpermq          ymm5, ymm5, 11011000b
     vmovdqa         [p_dst], xmm5
@@ -3966,21 +3406,13 @@
     vmovq           xmm4, [p_src + 2 * i_srcstride]
     vpbroadcastq    ymm5, [p_src + 2 * i_srcstride + 8]
     vpblendd        ymm4, ymm4, ymm5, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [esp+32], [esp], ymm6, ymm5
-%else
-    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5
-%endif
+    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
     vmovq           xmm5, [p_src + i_srcstride3]
     vpbroadcastq    ymm6, [p_src + i_srcstride3 + 8]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpblendd        ymm5, ymm5, ymm6, 11110000b
     vpunpcklbw      ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm6
-%else
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6
-%endif
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
     vpackuswb       ymm7, ymm7, ymm0
     vpermq          ymm7, ymm7, 11011000b
     vmovdqa         [p_dst], xmm7
@@ -3988,11 +3420,8 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     sub             i_height, 8
     jg              .width16_yloop
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     vzeroupper
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -4026,6 +3455,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -4032,32 +3462,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm6, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm4, [shufb_32435465768798A9]
-    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 8
     je              .width8
     jg              .width16_yloop
@@ -4086,6 +3493,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width8:
     lea             i_srcstride3, [3 * i_srcstride]
@@ -4110,6 +3518,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 %undef i_srcstride3
 .width16_yloop:
@@ -4129,6 +3538,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4155,6 +3565,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -4161,32 +3572,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm7, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm5, [shufb_32435465768798A9]
-    vbroadcasti128  ymm6, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 9
     je              .width9
     jg              .width17
@@ -4210,6 +3598,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width9:
 %xdefine i_srcstride3 i_width
@@ -4248,6 +3637,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width17:
     lea             i_srcstride3, [3 * i_srcstride]
@@ -4291,6 +3681,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef i_srcstride3
 %undef p_src
@@ -4320,6 +3711,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -4327,32 +3719,9 @@
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm6, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm4, [shufb_32435465768798A9]
-    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 3
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4372,6 +3741,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -4403,6 +3773,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4443,6 +3814,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -4469,6 +3841,7 @@
 %define i_height     r3
 %define i_dststride  16
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 6
     SIGN_EXTENSION  r1, r1d
@@ -4475,32 +3848,9 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm3, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm4, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm5, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm3, [shufb_32435465768798A9]
-    vbroadcasti128  ymm4, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm3, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm4, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4519,6 +3869,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4541,6 +3892,7 @@
 %define i_height     r3
 %define i_srcstride  16
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4614,6 +3966,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef p_dst
@@ -4641,6 +3994,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4687,6 +4041,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -4713,6 +4068,7 @@
 %define i_height     r3
 %define i_dststride  32
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -4719,32 +4075,9 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm6, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm4, [shufb_32435465768798A9]
-    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4768,6 +4101,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4790,6 +4124,7 @@
 %define i_height     r3
 %define i_srcstride  32
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4869,6 +4204,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4896,6 +4232,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -4903,47 +4240,9 @@
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm7, [esp]
-    sub             esp, 16
-    push            0x0000fe0a    ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x80008000    ;dwm32768_256
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-%else
-    vbroadcasti128  ymm5, [shufb_32435465768798A9]
-    vbroadcasti128  ymm6, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 3
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4961,14 +4260,7 @@
     vinserti128     ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpckhqdq     ymm4, ymm4, ymm0
-%ifdef X86_32_PICASM
-    vpmaddubsw      ymm4, ymm4, [esp+32]
-    vpmaddwd        ymm4, ymm4, [esp]
-    vpshufd         ymm2, ymm4, 10110001b
-    vpaddd          ymm4, ymm4, ymm2
-%else
-    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
     vmovlps         [p_dst + 26], xmm4
     vmovdqa         [p_dst + 16], xmm3
     vextracti128    xmm2, ymm4, 1
@@ -4991,16 +4283,7 @@
     vmovdqu         xmm3, [p_src + i_srcstride - 2]
     vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
     vpunpckhqdq     ymm4, ymm0, ymm3
-%ifdef X86_32_PICASM
-    vpmaddubsw      ymm4, ymm4, [esp+32]
-    vpmaddwd        ymm4, ymm4, [esp]
-    vpshufd         ymm2, ymm4, 10110001b
-    vpaddd          ymm4, ymm4, ymm2
-    mov             esp, r5
-    pop             r5
-%else
-    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
     AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
     AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
     vextracti128    xmm4, ymm4, 1
@@ -5011,6 +4294,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -5037,7 +4321,11 @@
 %define i_srcstride  r1
 %define p_dst        r2
 %define i_dststride  r3
+%ifdef X86_32_PICASM
+%define i_width      dword arg5
+%else
 %define i_width      r4
+%endif
 %define i_height     r5
 %define i_srcstride3 r6
     %assign  push_num 0
@@ -5051,6 +4339,7 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             i_height, 1
     lea             i_srcstride3, [3 * i_srcstride]
     test            i_width, 1
@@ -5058,7 +4347,14 @@
     push            i_height
     push            p_src
     push            p_dst
+    %assign push_num push_num + 3
+%ifdef X86_32_PICASM
+    add             p_src, i_width
+    add             p_src, i_width
+    sub             p_src, 2
+%else
     lea             p_src, [p_src + 2 * i_width - 2]
+%endif
     add             p_dst, i_width
     vmovd           xmm0, [p_src]
     vpunpcklwd      xmm0, xmm0, [p_src + i_srcstride]
@@ -5119,6 +4415,7 @@
     pop             p_dst
     pop             p_src
     pop             i_height
+    %assign push_num push_num - 3
 .align_begin:
     vmovdqa         ymm0, [p_src]
     vmovdqa         ymm1, [p_src + i_srcstride]
@@ -5175,6 +4472,7 @@
     vmovdqa         [p_dst], xmm0
 .done:
     vzeroupper
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32