shithub: openh264

Download patch

ref: c4636c3f4e1043684089a2cbade3af085d55da77
parent: 3b372806c06f10f634f6e8aa48aaab311422fde0
parent: f711d0a2a838691f3695595149de982f9b8a56c7
author: ruil2 <ruil2@cisco.com>
date: Mon Mar 20 12:32:39 EDT 2017

Merge pull request #2677 from saamas/x86-32-picasm-improvements

X86_32_PICASM improvements

--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -668,3 +668,68 @@
     vpcmpeqw %1, %1, %1
     vpsrlw   %1, %1,  1
 %endmacro
+
+
+;***********************************************************************
+; Utility macros for X86_32 PIC support
+;***********************************************************************
+
+; Internal helper behind INIT_X86_32_PIC*. %1=register for PC/EIP, %2=nonzero to push %1 first.
+%macro INIT_X86_32_PIC_ 2
+%ifdef X86_32_PICASM
+    %xdefine pic_ptr %1
+    %xdefine pic_ptr_preserve %2
+  %if pic_ptr_preserve
+    %assign push_num push_num+1
+    push            pic_ptr                 ; saved value is popped again by DEINIT_X86_32_PIC*
+  %endif
+    call            %%get_pc                ; pushes the address of %%pic_refpoint as return address
+%%pic_refpoint:
+    jmp             %%pic_init_done         ; resume here after the thunk returns; skip over it
+%%get_pc:
+    mov             pic_ptr, [esp]          ; pic_ptr = return address = run-time %%pic_refpoint
+    ret
+%%pic_init_done:
+    %define pic(data_addr) (pic_ptr+(data_addr)-%%pic_refpoint)
+%else
+    %define pic(data_addr) (data_addr)
+%endif
+%endmacro
+
+; Get the program counter and define a helper macro "pic(addr)" that converts an
+; absolute address to a program counter-relative one if X86_32_PICASM is defined
+; (the register's previous value is pushed first); otherwise "pic(addr)" is the identity.
+; %1=register to store PC/EIP in. Pair with DEINIT_X86_32_PIC or DEINIT_X86_32_PIC_KEEPDEF.
+%macro INIT_X86_32_PIC 1
+    INIT_X86_32_PIC_ %1, 1
+%endmacro
+
+; Same as INIT_X86_32_PIC, but the previous value of the register argument is not saved.
+%macro INIT_X86_32_PIC_NOPRESERVE 1
+    INIT_X86_32_PIC_ %1, 0
+%endmacro
+
+; Clean up after INIT_X86_32_PIC / INIT_X86_32_PIC_NOPRESERVE.
+; Pops the PC/EIP register if it was pushed, then undefines pic and its helper defines.
+%macro DEINIT_X86_32_PIC 0
+%ifdef X86_32_PICASM
+  %if pic_ptr_preserve
+    pop             pic_ptr                 ; restore the value pushed by INIT_X86_32_PIC
+    %assign push_num push_num-1
+  %endif
+    %undef pic_ptr
+    %undef pic_ptr_preserve
+%endif
+    %undef pic
+%endmacro
+
+; Same as DEINIT_X86_32_PIC, but keeps pic, pic_ptr and pic_ptr_preserve defined. Useful for
+; functions with multiple epilogues. NOTE(review): push_num is still decremented here - confirm.
+%macro DEINIT_X86_32_PIC_KEEPDEF 0
+%ifdef X86_32_PICASM
+  %if pic_ptr_preserve
+    pop             pic_ptr                 ; restore the value pushed by INIT_X86_32_PIC
+    %assign push_num push_num-1
+  %endif
+%endif
+%endmacro
--- a/codec/common/x86/dct.asm
+++ b/codec/common/x86/dct.asm
@@ -60,7 +60,11 @@
     %define prefixed(a) a
 %endif
 
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
 SECTION .rodata align=32
+%endif
 
 ;***********************************************************************
 ; Constant
@@ -392,40 +396,14 @@
 ; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register.
 ; out=%1 in=%1 clobber=%2
 %macro SSE2_DCT_HORIZONTAL 2
-    pshuflw       %2, %1, 1bh               ; [x[3],x[2],x[1],x[0]] low qw
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xfffffff0
-    push          0xffff0001    ;wels_p1m1p1m1w_128
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0x0001ffff    ;wels_p1m1m1p1w_128
-    push          0xffff0001
-    push          0x0001ffff
-    push          0xffff0001
-    push          0x00020001    ;wels_p1p2p1p2w_128
-    push          0x00020001
-    push          0x00020001
-    push          0x00020001
-    pmullw        %1, [esp+32]  ; [x[0],-x[1],x[2],-x[3], ...]
-%else
-    pmullw        %1, [wels_p1m1p1m1w_128]  ; [x[0],-x[1],x[2],-x[3], ...]
-%endif
-    pshufhw       %2, %2, 1bh               ; [x[3],x[2],x[1],x[0]] high qw
-    paddw         %1, %2                    ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
-    pshufd        %2, %1, 0b1h              ; [s[2],s[3],s[0],s[1], ...]
-%ifdef X86_32_PICASM
-    pmullw        %1, [esp+16]  ; [s[0],-s[1],-s[2],s[3], ...]
-    pmullw        %2, [esp]  ; [s[2],2*s[3],s[0],2*s[1], ...]]
-    mov           esp, r0
-    pop           r0
-%else
-    pmullw        %1, [wels_p1m1m1p1w_128]  ; [s[0],-s[1],-s[2],s[3], ...]
-    pmullw        %2, [wels_p1p2p1p2w_128]  ; [s[2],2*s[3],s[0],2*s[1], ...]]
-%endif
-    paddw         %1, %2                    ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
+    pshuflw       %2, %1, 1bh                    ; [x[3],x[2],x[1],x[0]] low qw
+    pmullw        %1, [pic(wels_p1m1p1m1w_128)]  ; [x[0],-x[1],x[2],-x[3], ...]
+    pshufhw       %2, %2, 1bh                    ; [x[3],x[2],x[1],x[0]] high qw
+    paddw         %1, %2                         ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
+    pshufd        %2, %1, 0b1h                   ; [s[2],s[3],s[0],s[1], ...]
+    pmullw        %1, [pic(wels_p1m1m1p1w_128)]  ; [s[0],-s[1],-s[2],s[3], ...]
+    pmullw        %2, [pic(wels_p1p2p1p2w_128)]  ; [s[2],2*s[3],s[0],2*s[1], ...]]
+    paddw         %1, %2                         ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
 %endmacro
 
 ; Do 2 horizontal 4-pt IDCTs in parallel packed as 8 words in an xmm register.
@@ -436,22 +414,7 @@
 ;
 ; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
 %macro SSE2_IDCT_HORIZONTAL 4
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xfffffff0
-    push          0x80000000    ;wels_p0m8000p0m8000w_128
-    push          0x80000000
-    push          0x80000000
-    push          0x80000000
-    push          0xffffffff    ;wels_p1p1m1m1w_128
-    push          0x00010001
-    push          0xffffffff
-    push          0x00010001
-    movdqa        %3, [esp+16]
-%else
-    movdqa        %3, [wels_p0m8000p0m8000w_128]
-%endif
+    movdqa        %3, [pic(wels_p0m8000p0m8000w_128)]
     pmulhw        %3, %1                    ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
     pshufd        %4, %1, 0b1h              ; [x[2],x[3],x[0],x[1], ...]
     pmullw        %4, %2                    ; [x[2],-x[3],-x[0],x[1], ...]
@@ -458,13 +421,7 @@
     paddw         %1, %3                    ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
     paddw         %1, %4                    ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
     pshuflw       %3, %1, 1bh               ; [s[3],s[2],s[1],s[0]] low qw
-%ifdef X86_32_PICASM
-    pmullw        %1, [esp]  ; [s[0],s[1],-s[2],-s[3], ...]
-    mov           esp, r0
-    pop           r0
-%else
-    pmullw        %1, [wels_p1p1m1m1w_128]  ; [s[0],s[1],-s[2],-s[3], ...]
-%endif
+    pmullw        %1, [pic(wels_p1p1m1m1w_128)]  ; [s[0],s[1],-s[2],-s[3], ...]
     pshufhw       %3, %3, 1bh               ; [s[3],s[2],s[1],s[0]] high qw
     pmullw        %3, %2                    ; [s[3],-s[2],-s[1],s[0], ...]
     paddw         %1, %3                    ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
@@ -481,24 +438,9 @@
     punpckhqdq    %2, %1                    ; s03 = [x0+x3,x0-x3]
     punpcklqdq    %3, %1                    ; s12 = [x1+x2,x1-x2]
     movdqa        %1, %2
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xfffffff0
-    push          0x00020002    ;wels_4xp1w_4xp2w
-    push          0x00020002
-    push          0x00010001
-    push          0x00010001
-    pmullw        %1, [esp]    ; [s03[0],2*s03[1]]
-    paddw         %1, %3                    ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
-    pmullw        %3, [esp]    ; [s12[0],2*s12[1]]
-    mov           esp, r0
-    pop           r0
-%else
-    pmullw        %1, [wels_4xp1w_4xp2w]    ; [s03[0],2*s03[1]]
-    paddw         %1, %3                    ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
-    pmullw        %3, [wels_4xp1w_4xp2w]    ; [s12[0],2*s12[1]]
-%endif
+    pmullw        %1, [pic(wels_4xp1w_4xp2w)] ; [s03[0],2*s03[1]]
+    paddw         %1, %3                      ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
+    pmullw        %3, [pic(wels_4xp1w_4xp2w)] ; [s12[0],2*s12[1]]
     psubw         %2, %3                    ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
 %endmacro
 
@@ -506,20 +448,7 @@
 ; Output is scrambled to save a negation.
 ; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
 %macro SSE2_IDCT_4x4P 4
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xfffffff0
-    push          0x80008000    ;wels_4xp0w_4xm8000w
-    push          0x80008000
-    push          0x00000000
-    push          0x00000000
-    movdqa        %4, [esp]
-    mov           esp, r0
-    pop           r0
-%else
-    movdqa        %4, [wels_4xp0w_4xm8000w]
-%endif
+    movdqa        %4, [pic(wels_4xp0w_4xm8000w)]
     movdqa        %3, %1
     pmulhw        %3, %4                    ; x[0:1] * [0,-8000h] >> 16
     pmulhw        %4, %2                    ; x[2:3] * [0,-8000h] >> 16
@@ -540,6 +469,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsDctFourT4_sse2
     %assign push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
@@ -582,6 +512,7 @@
 
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -589,6 +520,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsIDctFourT4Rec_sse2
     %assign push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
@@ -596,18 +528,7 @@
     ;Load 4x8
     SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
-%ifdef X86_32_PICASM
-    push          r5
-    mov           r5, esp
-    and           esp, 0xffffffe0
-    push          0x0001ffff    ;wels_p1m1m1p1w_128
-    push          0xffff0001
-    push          0x0001ffff
-    push          0xffff0001
-    movdqa xmm7, [esp]
-%else
-    movdqa xmm7, [wels_p1m1m1p1w_128]
-%endif
+    movdqa xmm7, [pic(wels_p1m1m1p1w_128)]
     SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -626,13 +547,7 @@
     lea     r2, [r2 + 2 * r3]
     SSE2_Load4x8p  r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
 
-%ifdef X86_32_PICASM
-    movdqa xmm7, [esp]
-    mov    esp, r5
-    pop    r5
-%else
-    movdqa xmm7, [wels_p1m1m1p1w_128]
-%endif
+    movdqa xmm7, [pic(wels_p1m1m1p1w_128)]
     SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -648,6 +563,7 @@
     SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -655,6 +571,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsDctT4_sse2
     %assign push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 5
     SIGN_EXTENSION r2, r2d
@@ -673,6 +590,7 @@
 
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -690,6 +608,7 @@
     %assign push_num 0
     LOAD_5_PARA
 .begin:
+    INIT_X86_32_PIC r5
     PUSH_XMM 6
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
@@ -696,20 +615,7 @@
 
     SSE2_Load2x4P xmm0, r4
     SSE2_Load2x4P xmm1, r4+16
-%ifdef X86_32_PICASM
-    push          r5
-    mov           r5, esp
-    and           esp, 0xfffffff0
-    push          0x0001ffff    ;wels_p1m1m1p1w_128
-    push          0xffff0001
-    push          0x0001ffff
-    push          0xffff0001
-    movdqa xmm4, [esp]
-    mov           esp, r5
-    pop           r5
-%else
-    movdqa xmm4, [wels_p1m1m1p1w_128]
-%endif
+    movdqa xmm4, [pic(wels_p1m1m1p1w_128)]
     SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
     SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
     SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
@@ -721,6 +627,7 @@
     SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3
 
     POP_XMM
+    DEINIT_X86_32_PIC
     LOAD_5_PARA_POP
     ret
 
@@ -815,20 +722,7 @@
     vpshufb       y%9, y%9, y%8
     vpaddsw       y%4, y%4, y%9
     vpackuswb     y%3, y%3, y%4
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xffffffe0
-    push          0x0d0f0e0c    ;wels_shufb0231_128
-    push          0x090b0a08
-    push          0x05070604
-    push          0x01030200
-    vbroadcasti128 y%4, [esp]
-    mov           esp, r0
-    pop           r0
-%else
-    vbroadcasti128 y%4, [wels_shufb0231_128]
-%endif
+    vbroadcasti128 y%4, [pic(wels_shufb0231_128)]
     vpshufb       y%3, y%3, y%4
     vextracti128  x%4, y%3, 1
     vmovlps       [%1         ], x%3
@@ -906,20 +800,7 @@
     AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
     vpaddsw        y%3, y%3, y%8
     vpackuswb      y%3, y%3, y%3
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xffffffe0
-    push          0x0d0f0e0c    ;wels_shufb0231_128
-    push          0x090b0a08
-    push          0x05070604
-    push          0x01030200
-    vbroadcasti128 y%8, [esp]
-    mov           esp, r0
-    pop           r0
-%else
-    vbroadcasti128 y%8, [wels_shufb0231_128]
-%endif
+    vbroadcasti128 y%8, [pic(wels_shufb0231_128)]
     vpshufb        y%3, y%3, y%8
     vextracti128   x%8, y%3, 1
     vmovd          [%1         ], x%3
@@ -965,39 +846,10 @@
 ; Uses scrambled input to save a negation.
 ; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3
 %macro AVX2_DCT_HORIZONTAL 3
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xffffffe0
-    push          0xffff0001    ;wels_p1m1p1m1w_256
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xfffeffff    ;wels_p1m2m1m2w_256
-    push          0x00020001
-    push          0xfffeffff
-    push          0x00020001
-    push          0xfffeffff
-    push          0x00020001
-    push          0xfffeffff
-    push          0x00020001
-    vpsignw       %3, %1, [esp+32]  ; [x0,-x3,x1,-x2]
-%else
-    vpsignw       %3, %1, [wels_p1m1p1m1w_256]  ; [x0,-x3,x1,-x2]
-%endif
+    vpsignw       %3, %1, [pic(wels_p1m1p1m1w_256)]  ; [x0,-x3,x1,-x2]
     vpshufb       %1, %1, %2                    ; [x3,x0,x2,x1]
     vpaddw        %1, %1, %3                    ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
-%ifdef X86_32_PICASM
-    vpmullw       %3, %1, [esp]  ; [s[0],2*s[1],-s[2],-2*s[3], ...]
-    mov           esp, r0
-    pop           r0
-%else
-    vpmullw       %3, %1, [wels_p1p2m1m2w_256]  ; [s[0],2*s[1],-s[2],-2*s[3], ...]
-%endif
+    vpmullw       %3, %1, [pic(wels_p1p2m1m2w_256)]  ; [s[0],2*s[1],-s[2],-2*s[3], ...]
     vpshufd       %1, %1, 0b1h                  ; [s[2],s[3],s[0],s[1], ...]
     vpaddw        %1, %1, %3                    ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
 %endmacro
@@ -1008,40 +860,11 @@
 %macro AVX2_IDCT_HORIZONTAL 3
     vpsraw        %3, %1, 1                     ; [x0>>1,x1>>1,x2>>1,x3>>1]
     vpblendw      %3, %1, %3, 10101010b         ; [x0,x1>>1,x2,x3>>1]
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xffffffe0
-    push          0xffffffff    ;wels_p1p1m1m1w_256
-    push          0x00010001
-    push          0xffffffff
-    push          0x00010001
-    push          0xffffffff
-    push          0x00010001
-    push          0xffffffff
-    push          0x00010001
-    push          0xffff0001    ;wels_p1m1p1m1w_256
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    push          0xffff0001
-    vpsignw       %1, %1, [esp+32]  ; [x0,x1,-x2,-x3]
-%else
-    vpsignw       %1, %1, [wels_p1p1m1m1w_256]  ; [x0,x1,-x2,-x3]
-%endif
+    vpsignw       %1, %1, [pic(wels_p1p1m1m1w_256)]  ; [x0,x1,-x2,-x3]
     vpshufd       %3, %3, 0b1h                  ; [x2,x3>>1,x0,x1>>1]
     vpaddw        %1, %3, %1                    ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
     vpshufb       %3, %1, %2                    ; [s[1],s[0],s[3],s[2], ...]
-%ifdef X86_32_PICASM
-    vpsignw       %1, %1, [esp]  ; [s[0],-s[1],s[2],-s[3], ...]
-    mov           esp, r0
-    pop           r0
-%else
-    vpsignw       %1, %1, [wels_p1m1p1m1w_256]  ; [s[0],-s[1],s[2],-s[3], ...]
-%endif
+    vpsignw       %1, %1, [pic(wels_p1m1p1m1w_256)]  ; [s[0],-s[1],s[2],-s[3], ...]
     vpaddw        %1, %1, %3                    ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
 %endmacro
 
@@ -1049,39 +872,10 @@
 ; Uses scrambled input to save a negation.
 ; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
 %macro AVX2_DCT_4x4P 2
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xffffffe0
-    push          0xffffffff    ;wels_4xp1w_4xm1w_256
-    push          0xffffffff
-    push          0x00010001
-    push          0x00010001
-    push          0xffffffff
-    push          0xffffffff
-    push          0x00010001
-    push          0x00010001
-    push          0xfffefffe    ;wels_4xp1w_4xp2w_4xm1w_4xm2w
-    push          0xfffefffe
-    push          0xffffffff
-    push          0xffffffff
-    push          0x00020002
-    push          0x00020002
-    push          0x00010001
-    push          0x00010001
-    vpsignw       %2, %1, [esp+32]         ; [x0,-x3,x1,-x2]
-%else
-    vpsignw       %2, %1, [wels_4xp1w_4xm1w_256]         ; [x0,-x3,x1,-x2]
-%endif
+    vpsignw       %2, %1, [pic(wels_4xp1w_4xm1w_256)]    ; [x0,-x3,x1,-x2]
     vpshufd       %1, %1, 4eh                            ; [x3,x0,x2,x1]
     vpaddw        %1, %1, %2                             ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
-%ifdef X86_32_PICASM
-    vpmullw       %2, %1, [esp] ; [s[0],2*s[1],-s[2],-2*s[3]]
-    mov           esp, r0
-    pop           r0
-%else
-    vpmullw       %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]
-%endif
+    vpmullw       %2, %1, [pic(wels_4xp1w_4xp2w_4xm1w_4xm2w)] ; [s[0],2*s[1],-s[2],-2*s[3]]
     vpermq        %1, %1, 4eh                            ; [s[2],s[3],s[0],s[1]]
     vpaddw        %1, %1, %2                             ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
 %endmacro
@@ -1092,40 +886,11 @@
 %macro AVX2_IDCT_4x4P 2
     vpsraw        %2, %1, 1                              ; [x0>>1,x1>>1,x2>>1,x3>>1]
     vpblendw      %2, %1, %2, 11110000b                  ; [x0,x1>>1,x2,x3>>1]
-%ifdef X86_32_PICASM
-    push          r0
-    mov           r0, esp
-    and           esp, 0xffffffe0
-    push          0xffffffff    ;wels_8xp1w_8xm1w
-    push          0xffffffff
-    push          0xffffffff
-    push          0xffffffff
-    push          0x00010001
-    push          0x00010001
-    push          0x00010001
-    push          0x00010001
-    push          0xffffffff    ;wels_4xp1w_4xm1w_256
-    push          0xffffffff
-    push          0x00010001
-    push          0x00010001
-    push          0xffffffff
-    push          0xffffffff
-    push          0x00010001
-    push          0x00010001
-    vpsignw       %1, %1, [esp+32]             ; [x0,x1,-x2,-x3]
-%else
-    vpsignw       %1, %1, [wels_8xp1w_8xm1w]             ; [x0,x1,-x2,-x3]
-%endif
+    vpsignw       %1, %1, [pic(wels_8xp1w_8xm1w)]        ; [x0,x1,-x2,-x3]
     vpermq        %2, %2, 4eh                            ; [x2,x3>>1,x0,x1>>1]
     vpaddw        %1, %2, %1                             ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
     vpshufd       %2, %1, 4eh                            ; [s[1],s[0],s[3],s[2]]
-%ifdef X86_32_PICASM
-    vpmullw       %1, %1, [esp]         ; [s[0],-s[1],s[2],-s[3], ...]
-    mov           esp, r0
-    pop           r0
-%else
-    vpmullw       %1, %1, [wels_4xp1w_4xm1w_256]         ; [s[0],-s[1],s[2],-s[3], ...]
-%endif
+    vpmullw       %1, %1, [pic(wels_4xp1w_4xm1w_256)]    ; [s[0],-s[1],s[2],-s[3], ...]
     vpaddw        %1, %1, %2                             ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
 %endmacro
 
@@ -1134,27 +899,13 @@
 ;***********************************************************************
 WELS_EXTERN WelsDctFourT4_avx2
     %assign push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 7
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
 
-%ifdef X86_32_PICASM
-    push     r5
-    mov      r5, esp
-    and      esp, 0xffffffe0
-    push     0x80068005    ;wels_shufb0312_movzxw_128
-    push     0x80078004
-    push     0x80028001
-    push     0x80038000
-    push     0x0d0c0f0e   ;wels_shufb2301_128
-    push     0x09080b0a
-    push     0x05040706
-    push     0x01000302
-    vbroadcasti128 ymm6, [esp+16]
-%else
-    vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
-%endif
+    vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)]
 
     ;Load 4x16
     AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
@@ -1169,13 +920,7 @@
     AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
 
     AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
-%ifdef X86_32_PICASM
-    vbroadcasti128 ymm6, [esp]
-    mov      esp, r5
-    pop      r5
-%else
-    vbroadcasti128 ymm6, [wels_shufb2301_128]
-%endif
+    vbroadcasti128 ymm6, [pic(wels_shufb2301_128)]
     AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
     AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
     AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -1186,6 +931,7 @@
 
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -1203,31 +949,13 @@
     %assign push_num 0
     LOAD_5_PARA
 .begin:
+    INIT_X86_32_PIC r5
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
     AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
-%ifdef X86_32_PICASM
-    push     r5
-    mov      r5, esp
-    and      esp, 0xffffffe0
-    push     0x0d0c0f0e    ;wels_shufb2301_128
-    push     0x09080b0a
-    push     0x05040706
-    push     0x01000302
-    push     0x80068005    ;wels_shufb0312_movzxw_128
-    push     0x80078004
-    push     0x80028001
-    push     0x80038000
-    push     0x00200020    ;wels_dw32_128
-    push     0x00200020
-    push     0x00200020
-    push     0x00200020
-    vbroadcasti128 ymm6, [esp+32]
-%else
-    vbroadcasti128 ymm6, [wels_shufb2301_128]
-%endif
+    vbroadcasti128 ymm6, [pic(wels_shufb2301_128)]
     AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
     AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
     AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -1234,15 +962,8 @@
     AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
     AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
 
-%ifdef X86_32_PICASM
-    vbroadcasti128 ymm6, [esp+16]
-    vbroadcasti128 ymm7, [esp]
-    mov     esp, r5
-    pop     r5
-%else
-    vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
-    vbroadcasti128 ymm7, [wels_dw32_128]
-%endif
+    vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)]
+    vbroadcasti128 ymm7, [pic(wels_dw32_128)]
     AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
     add r2, r3
     add r0, r1
@@ -1250,6 +971,7 @@
     vzeroupper
 
     POP_XMM
+    DEINIT_X86_32_PIC
     LOAD_5_PARA_POP
     ret
 
@@ -1258,36 +980,16 @@
 ;***********************************************************************
 WELS_EXTERN WelsDctT4_avx2
     %assign push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 5
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
 
-%ifdef X86_32_PICASM
-    push     r5
-    mov      r5, esp
-    and      esp, 0xffffffe0
-    push     0x80068005    ;wels_shufb0312_movzxw_128
-    push     0x80078004
-    push     0x80028001
-    push     0x80038000
-    push     0x0d0c0f0e   ;wels_shufb2301_128
-    push     0x09080b0a
-    push     0x05040706
-    push     0x01000302
-    vbroadcasti128 ymm1, [esp+16]
-%else
-    vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128]
-%endif
+    vbroadcasti128 ymm1, [pic(wels_shufb0312_movzxw_128)]
     AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4
     AVX2_DCT_4x4P ymm0, ymm2
-%ifdef X86_32_PICASM
-    vbroadcasti128 ymm1, [esp]
-    mov     esp, r5
-    pop     r5
-%else
-    vbroadcasti128 ymm1, [wels_shufb2301_128]
-%endif
+    vbroadcasti128 ymm1, [pic(wels_shufb2301_128)]
     AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2
     AVX2_Store4x4P r0, mm0
     vzeroupper
@@ -1294,6 +996,7 @@
 
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -1311,46 +1014,22 @@
     %assign push_num 0
     LOAD_5_PARA
 .begin:
+    INIT_X86_32_PIC r5
     PUSH_XMM 6
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
     AVX2_Load4x4P mm0, r4
-%ifdef X86_32_PICASM
-    push     r5
-    mov      r5, esp
-    and      esp, 0xffffffe0
-    push     0x0d0c0f0e   ;wels_shufb2301_128
-    push     0x09080b0a
-    push     0x05040706
-    push     0x01000302
-    push     0x80068005    ;wels_shufb0312_movzxw_128
-    push     0x80078004
-    push     0x80028001
-    push     0x80038000
-    push     0x00200020    ;wels_dw32_128
-    push     0x00200020
-    push     0x00200020
-    push     0x00200020
-    vbroadcasti128 ymm4, [esp+32]
-%else
-    vbroadcasti128 ymm4, [wels_shufb2301_128]
-%endif
+    vbroadcasti128 ymm4, [pic(wels_shufb2301_128)]
     AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
     AVX2_IDCT_4x4P ymm0, ymm1
-%ifdef X86_32_PICASM
-    vbroadcasti128 ymm4, [esp+16]
-    vbroadcasti128 ymm5, [esp]
-    mov     esp, r5
-    pop     r5
-%else
-    vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]
-    vbroadcasti128 ymm5, [wels_dw32_128]
-%endif
+    vbroadcasti128 ymm4, [pic(wels_shufb0312_movzxw_128)]
+    vbroadcasti128 ymm5, [pic(wels_dw32_128)]
     AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
     vzeroupper
 
     POP_XMM
+    DEINIT_X86_32_PIC
     LOAD_5_PARA_POP
     ret
 %endif
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -45,7 +45,11 @@
 ; Macros and other preprocessor constants
 ;*******************************************************************************
 
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 ALIGN   16
 FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4
@@ -157,25 +161,9 @@
     ; Unbias and split into a non-negative and a non-positive part.
     ; Clip each part to iTc via minub.
     ; Add/subtract each part to/from p0/q0 and clip.
-%ifdef X86_32_PICASM
-    push       r0
-    mov        r0, esp
-    sub        esp, 16
-    and        esp, -16
-    push       0x60606060    ;WELS_DB96_16
-    push       0x60606060
-    push       0x60606060
-    push       0x60606060
-    movdqa     %6, [esp]
+    movdqa     %6, [pic(WELS_DB96_16)]
     psubusb    %6, %8
-    psubusb    %8, [esp]
-    mov        esp, r0
-    pop        r0
-%else
-    movdqa     %6, [WELS_DB96_16]
-    psubusb    %6, %8
-    psubusb    %8, [WELS_DB96_16]
-%endif
+    psubusb    %8, [pic(WELS_DB96_16)]
     pminub     %6, %5
     pminub     %8, %5
     psubusb    %2, %6
@@ -192,6 +180,7 @@
 
 WELS_EXTERN DeblockLumaLt4V_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
@@ -198,21 +187,8 @@
     movd     xmm1, arg3d
     movd     xmm2, arg4d
     pxor     xmm3, xmm3
-%ifdef X86_32_PICASM
-    push     r4
-    mov      r4, esp
-    sub      esp, 16
-    and      esp, -16
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    pxor     xmm1, [esp]
-    pxor     xmm2, [esp]
-%else
-    pxor     xmm1, [WELS_DB127_16]
-    pxor     xmm2, [WELS_DB127_16]
-%endif
+    pxor     xmm1, [pic(WELS_DB127_16)]
+    pxor     xmm2, [pic(WELS_DB127_16)]
     pshufb   xmm1, xmm3                       ; iAlpha ^ 0x7f
     pshufb   xmm2, xmm3                       ; iBeta  ^ 0x7f
     mov      r2, r1                           ; iStride
@@ -225,40 +201,22 @@
     MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
     movdqa   xmm4, xmm6
     SSE2_AbsDiffUB xmm6, xmm0, xmm3           ; |p0 - q0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%else
-    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%endif
+    SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
     MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
     SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p1 - p0|
     SSE2_AbsDiffUB xmm0, xmm1, xmm3           ; |q1 - q0|
     pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
     pand     xmm6, xmm7                       ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
     MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
     movdqa   xmm0, xmm7
     SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p2 - p0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP2P0 = |p2 - p0| < iBeta
     MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
     MOVDQ    xmm3, [r0 + 0 * r2]              ; q0
     movdqa   xmm1, xmm5
     SSE2_AbsDiffUB xmm5, xmm3, xmm4           ; |q2 - q0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-    mov      esp, r4
-    pop      r4
-%else
-    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-%endif
+    SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
 
     pavgb    xmm3, [r3 + 0 * r1]
     pcmpeqw  xmm2, xmm2  ; FFh
@@ -273,21 +231,7 @@
     pxor     xmm1, xmm2
 
     movd     xmm3, [r4]
-%ifdef X86_32_PICASM
-    push     r0
-    mov      r0, esp
-    sub      esp, 16
-    and      esp, -16
-    push     0x03030303    ;WELS_SHUFB0000111122223333
-    push     0x02020202
-    push     0x01010101
-    push     0x00000000
-    pshufb   xmm3, [esp] ; iTc
-    mov      esp, r0
-    pop      r0
-%else
-    pshufb   xmm3, [WELS_SHUFB0000111122223333] ; iTc
-%endif
+    pshufb   xmm3, [pic(WELS_SHUFB0000111122223333)] ; iTc
     movdqa   xmm4, xmm3  ; iTc0 = iTc
     pcmpgtb  xmm3, xmm2  ; iTc > -1 ? 0xff : 0x00
     pand     xmm6, xmm3  ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
@@ -315,6 +259,7 @@
 
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -380,6 +325,7 @@
 
 WELS_EXTERN DeblockLumaEq4V_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 10
     SIGN_EXTENSION r1, r1d
@@ -389,21 +335,8 @@
     add      r2, 1
     movd     xmm3, r2d
     pxor     xmm4, xmm4
-%ifdef X86_32_PICASM
-    push     r4
-    mov      r4, esp
-    sub      esp, 16
-    and      esp, -16
-    push     0x7f7f7f7f    ;WELS_DB127_16
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    pxor     xmm1, [esp]
-    pxor     xmm2, [esp]
-%else
-    pxor     xmm1, [WELS_DB127_16]
-    pxor     xmm2, [WELS_DB127_16]
-%endif
+    pxor     xmm1, [pic(WELS_DB127_16)]
+    pxor     xmm2, [pic(WELS_DB127_16)]
     pshufb   xmm1, xmm4                       ; iAlpha ^ 0x7f
     pshufb   xmm2, xmm4                       ; iBeta  ^ 0x7f
     pshufb   xmm3, xmm4                       ; (iAlpha >> 2) + 1
@@ -418,41 +351,23 @@
     movdqa   xmm4, xmm6
     SSE2_AbsDiffUB xmm6, xmm0, xmm5           ; |p0 - q0|
     SSE2_CmpgeUB xmm3, xmm6                   ; |p0 - q0| < (iAlpha >> 2) + 2
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%else
-    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%endif
+    SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
     MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
     SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p1 - p0|
     SSE2_AbsDiffUB xmm0, xmm1, xmm5           ; |q1 - q0|
     pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
     pand     xmm6, xmm7                       ; & bDeltaP0Q0
 
     MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
     SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p2 - p0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP2P0 = |p2 - p0| < iBeta
     pand     xmm7, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
 
     MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
     MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
     SSE2_AbsDiffUB xmm5, xmm0, xmm4           ; |q2 - q0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-    mov      esp, r4
-    pop      r4
-%else
-    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-%endif
+    SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
     pand     xmm5, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
 
 %ifdef X86_32
@@ -461,26 +376,12 @@
     mov      r2, esp
     sub      esp,  16
     and      esp, -16
-%ifdef X86_32_PICASM
-    push     0x01010101
-    push     0x01010101
-    push     0x01010101
-    push     0x01010101
-    sub      esp, 16
     movdqa   [esp], xmm5
-    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [esp+16]
+    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [pic(WELS_DB1_16)]
     movdqa   xmm5, [esp]
-    neg      r1
-    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [esp+16]
     mov      esp, r2
-%else
-    movdqa   [esp], xmm5
-    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
-    movdqa   xmm5, [esp]
-    mov      esp, r2
     neg      r1
-    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
-%endif
+    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [pic(WELS_DB1_16)]
 %else
     movdqa   xmm9, [WELS_DB1_16]
     SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
@@ -489,6 +390,7 @@
 
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -649,6 +551,7 @@
 
 WELS_EXTERN DeblockChromaLt4V_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
@@ -681,6 +584,7 @@
 
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -737,7 +641,9 @@
     lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1
 
     SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
+    INIT_X86_32_PIC r1
     SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
+    DEINIT_X86_32_PIC
     SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
 
     POP_XMM
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -44,7 +44,11 @@
 ;*******************************************************************************
 ; Local Data (Read Only)
 ;*******************************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
 SECTION .rodata align=32
+%endif
 
 ;*******************************************************************************
 ; Various memory constants (trigonometric values or rounding values)
@@ -120,12 +124,6 @@
     psllw        %1,  4
 %endmacro
 
-%macro MOVEIMM_DW32 1
-    pcmpeqw      %1,  %1
-    psrlw        %1,  15
-    psllw        %1,  5
-%endmacro
-
 %endif
 
 ;*******************************************************************************
@@ -197,12 +195,7 @@
 
 %macro FILTER_HV_W8 9
     paddw   %1, %6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 %8
-    paddw   %1, %8
-%else
-    paddw   %1, [h264_w0x10_1]
-%endif
+    paddw   %1, [pic(h264_w0x10_1)]
     movdqa  %8, %3
     movdqa  %7, %2
     paddw   %8, %4
@@ -221,12 +214,7 @@
 
 %macro FILTER_HV_W4 9
 paddw   %1, %6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 %8
-paddw   %1, %8
-%else
-paddw   %1, [h264_w0x10_1]
-%endif
+paddw   %1, [pic(h264_w0x10_1)]
 movdqa  %8, %3
 movdqa  %7, %2
 paddw   %8, %4
@@ -457,6 +445,7 @@
 ;*******************************************************************************
 WELS_EXTERN McHorVer02WidthEq8_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -530,6 +519,7 @@
 .xx_exit:
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -550,6 +540,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer02Height9Or17_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -671,6 +662,7 @@
 %endif
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -684,6 +676,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer02Height5_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -805,6 +798,7 @@
 %endif
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -819,6 +813,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer20Width9Or17_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -855,12 +850,7 @@
     paddw xmm0, xmm6
     psllw xmm6, 2
     paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 xmm6
-    paddw xmm0, xmm6
-%else
-    paddw xmm0, [h264_w0x10_1]
-%endif
+    paddw xmm0, [pic(h264_w0x10_1)]
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movd [r2], xmm0
@@ -877,11 +867,7 @@
     paddw xmm2, xmm5
     psllw xmm5, 2
     paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-    paddw xmm2, xmm6
-%else
-    paddw xmm2, [h264_w0x10_1]
-%endif
+    paddw xmm2, [pic(h264_w0x10_1)]
     psraw  xmm2, 5
     packuswb xmm2, xmm2
     movq [r2+1], xmm2
@@ -892,6 +878,7 @@
     jnz .yloop_width_9
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 
 
@@ -918,12 +905,7 @@
     paddw xmm0, xmm4
     psllw xmm4, 2
     paddw xmm0, xmm4
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 xmm6
-    paddw xmm0, xmm6
-%else
-    paddw xmm0, [h264_w0x10_1]
-%endif
+    paddw xmm0, [pic(h264_w0x10_1)]
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movq [r2], xmm0
@@ -951,12 +933,7 @@
     paddw xmm0, xmm6
     psllw xmm6, 2
     paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16 xmm6
-    paddw xmm0, xmm6
-%else
-    paddw xmm0, [h264_w0x10_1]
-%endif
+    paddw xmm0, [pic(h264_w0x10_1)]
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movd [r2+8], xmm0
@@ -974,11 +951,7 @@
     paddw xmm2, xmm5
     psllw xmm5, 2
     paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-    paddw xmm2, xmm6
-%else
-    paddw xmm2, [h264_w0x10_1]
-%endif
+    paddw xmm2, [pic(h264_w0x10_1)]
     psraw  xmm2, 5
     packuswb xmm2, xmm2
     movq [r2+9], xmm2
@@ -988,6 +961,7 @@
     jnz .yloop_width_17
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -1002,6 +976,7 @@
 ;***********************************************************************
 WELS_EXTERN McHorVer20Width5_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -1035,12 +1010,7 @@
 paddw xmm0, xmm6
 psllw xmm6, 2
 paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 xmm6
-paddw xmm0, xmm6
-%else
-paddw xmm0, [h264_w0x10_1]
-%endif
+paddw xmm0, [pic(h264_w0x10_1)]
 psraw  xmm0, 5
 packuswb xmm0, xmm0
 movd [r2], xmm0
@@ -1057,11 +1027,7 @@
 paddw xmm2, xmm5
 psllw xmm5, 2
 paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-paddw xmm2, xmm6
-%else
-paddw xmm2, [h264_w0x10_1]
-%endif
+paddw xmm2, [pic(h264_w0x10_1)]
 psraw  xmm2, 5
 packuswb xmm2, xmm2
 movd [r2+1], xmm2
@@ -1072,6 +1038,7 @@
 jnz .yloop_width_5
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -1238,12 +1205,7 @@
     psubw  %1, %7
     psraw   %1, 2
     paddw  %8, %1
-%ifdef X86_32_PICASM
-    MOVEIMM_DW32 %7
-    paddw  %8, %7
-%else
-    paddw  %8, [h264_mc_hc_32]
-%endif
+    paddw  %8, [pic(h264_mc_hc_32)]
     psraw   %8, 6
     packuswb %8, %8
     movq %9, %8
@@ -1260,6 +1222,7 @@
 
 WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -1377,6 +1340,7 @@
 %endif
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -1391,6 +1355,7 @@
 
 WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -1507,6 +1472,7 @@
 %endif
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -1595,12 +1561,7 @@
 psubw  %1, %7
 psraw   %1, 2
 paddw  %8, %1
-%ifdef X86_32_PICASM
-MOVEIMM_DW32 %7
-paddw  %8, %7
-%else
-paddw  %8, [h264_mc_hc_32]
-%endif
+paddw  %8, [pic(h264_mc_hc_32)]
 psraw   %8, 6
 packuswb %8, %8
 movd %9, %8
@@ -1619,6 +1580,7 @@
 
 WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -1736,6 +1698,7 @@
 %endif
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -1751,6 +1714,7 @@
 
 WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
 %assign  push_num 0
+INIT_X86_32_PIC r6
 LOAD_6_PARA
 PUSH_XMM 8
 SIGN_EXTENSION  r1, r1d
@@ -1867,6 +1831,7 @@
 %endif
 POP_XMM
 LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
 ret
 
 
@@ -1879,12 +1844,7 @@
     movdqa          %7, %3
     pmaddubsw       %7, %6
     paddw           %1, %7
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %7
-    paddw            %1, %7
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
@@ -1901,12 +1861,7 @@
     movdqa          %7, %4
     pmaddubsw       %7, %6
     paddw           %1, %7
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %7
-    paddw            %1, %7
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
@@ -1916,20 +1871,7 @@
     pshufb          %1, %2
     pshufb          %5, %3
     pshufd          %6, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    pmaddubsw       %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pmaddubsw       %1, [db20_128]
-%endif
+    pmaddubsw       %1, [pic(db20_128)]
     pmaddubsw       %5, %4
     pmaddubsw       %6, %4
     paddw           %1, %5
@@ -1939,12 +1881,7 @@
 ; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
 %macro SSSE3_FilterHorizontal_8px 6
     SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %5
-    paddw           %1, %5
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
@@ -1959,20 +1896,7 @@
     pshufb          %7, %4
     punpcklqdq      %6, %7
     pshufd          %7, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    pmaddubsw       %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pmaddubsw       %1, [db20_128]
-%endif
+    pmaddubsw       %1, [pic(db20_128)]
     pmaddubsw       %6, %5
     pmaddubsw       %7, %5
     paddw           %1, %6
@@ -1982,31 +1906,13 @@
 ; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
 %macro SSSE3_FilterHorizontal_2x4px 7
     SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
-    MOVEIMM_DW16    %6
-    paddw           %1, %6
-%else
-    paddw           %1, [h264_w0x10_1]
-%endif
+    paddw           %1, [pic(h264_w0x10_1)]
     psraw           %1, 5
 %endmacro
 
 ; pixels=%1 -32768>>scale=%2 tmp=%3
 %macro SSSE3_FilterHorizontalbw_2px 3
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    pmaddubsw       %1, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    pmaddubsw       %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]
-%endif
+    pmaddubsw       %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)]
     pmaddwd         %1, %2
     pshufd          %3, %1, 10110001b
     paddd           %1, %3
@@ -2014,33 +1920,8 @@
 
 ; pixels=%1 tmp=%2
 %macro SSSE3_FilterHorizontal_2px 2
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    pmaddubsw       %1, [esp]
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    pmaddwd         %1, [esp]
-    pshufd          %2, %1, 10110001b
-    paddd           %1, %2
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    paddd           %1, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2
-    paddd           %1, [dd32768_128]
-%endif
+    SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2
+    paddd           %1, [pic(dd32768_128)]
 %endmacro
 
 ; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
@@ -2055,14 +1936,8 @@
     paddw           %7, %4
     paddw           %1, %7
     psraw           %1, 2
-%ifdef X86_32_PICASM
+    paddw           %7, [pic(h264_mc_hc_32)]
     paddw           %1, %7
-    MOVEIMM_DW32    %7
-    paddw           %1, %7
-%else
-    paddw           %7, [h264_mc_hc_32]
-    paddw           %1, %7
-%endif
     psraw           %1, 6
 %endmacro
 
@@ -2080,7 +1955,11 @@
 %define i_srcstride   r1
 %define p_dst         r2
 %define i_dststride   r3
+%ifdef X86_32_PICASM
+%define i_width       dword arg5
+%else
 %define i_width       r4
+%endif
 %define i_height      r5
 %define i_srcstride3  r6
     %assign push_num 0
@@ -2094,28 +1973,14 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
+    %assign push_num_begin push_num
     cmp             i_width, 4
     jg              .width8or16
 
-%ifdef X86_32_PICASM
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    movdqu          xmm7, [esp]
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-%endif
     movd            xmm0, [p_src]
     movd            xmm4, [p_src + i_srcstride]
     punpcklbw       xmm0, xmm4
@@ -2134,14 +1999,8 @@
     movd            xmm3, [p_src]
     punpcklbw       xmm4, xmm3
     punpcklqdq      xmm2, xmm4
-%ifdef X86_32_PICASM
-    movdqu          xmm5, [esp]
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
-    add             esp, 48
-%else
-    movdqa          xmm5, [db20_128]
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    movdqa          xmm5, [pic(db20_128)]
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm0, xmm0
     movd            [p_dst], xmm0
     psrlq           xmm0, 32
@@ -2152,11 +2011,7 @@
     movd            xmm0, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm0
     punpcklqdq      xmm3, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm1, xmm1
     movd            [p_dst], xmm1
     psrlq           xmm1, 32
@@ -2167,14 +2022,11 @@
     movd            xmm4, [p_src + i_srcstride3]
     punpcklbw       xmm0, xmm4
     jg              .width4_height_ge8
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm2, xmm2
     movd            [p_dst], xmm2
 .width4_height_le5_done:
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2186,11 +2038,7 @@
     movd            xmm1, [p_src]
     punpcklbw       xmm4, xmm1
     punpcklqdq      xmm0, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm2, xmm2
     movd            [p_dst], xmm2
     psrlq           xmm2, 32
@@ -2201,11 +2049,7 @@
     movd            xmm2, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm2
     punpcklqdq      xmm1, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm3, xmm3
     movd            [p_dst], xmm3
     psrlq           xmm3, 32
@@ -2215,14 +2059,11 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     movd            xmm4, [p_src + i_srcstride3]
     punpcklbw       xmm2, xmm4
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
-%else
-    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm0, xmm0
     movd            [p_dst], xmm0
 .width4_height_ge8_done:
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2231,38 +2072,16 @@
     ret
 
 .width8or16:
+    %assign push_num push_num_begin
     sub             i_height, 1
     push            i_height
+    %assign push_num push_num + 1
 %xdefine i_ycnt i_height
 %define i_height [r7]
 .xloop:
     push            p_src
     push            p_dst
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xfffffff0
-    push            0xfb01fb01    ;[esp+64]maddubsw_p1m5_128
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x14141414    ;[esp+48]db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x01fb01fb    ;[esp+32]maddubsw_m5p1_128
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x14fb14fb    ;[esp+16]maddubsw_m5p20_128
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0xfb14fb14    ;[esp] maddubsw_p20m5_128
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-%endif
+    %assign push_num push_num + 2
     test            i_ycnt, 1
     jnz             .yloop_begin_even
     movq            xmm0, [p_src]
@@ -2276,11 +2095,7 @@
     movq            xmm5, [p_src + i_srcstride]
     lea             p_src, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm5
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm7
-%else
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
-%endif
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7
     packuswb        xmm0, xmm0
     movlps          [p_dst], xmm0
     add             p_dst, i_dststride
@@ -2297,36 +2112,20 @@
     punpcklbw       xmm4, xmm5
 .yloop:
     movq            xmm6, [p_src]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [esp+16], [esp], xmm0, xmm7
-%else
-    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
-%endif
+    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7
     movq            xmm7, [p_src + i_srcstride]
     punpcklbw       xmm6, xmm7
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [esp+64], [esp+48], [esp+32], xmm0
-%else
-    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
-%endif
+    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0
     packuswb        xmm1, xmm2
     movlps          [p_dst], xmm1
     movhps          [p_dst + i_dststride], xmm1
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm0, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [esp+16], [esp], xmm2, xmm1
-%else
-    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
-%endif
+    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1
     movq            xmm1, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     punpcklbw       xmm0, xmm1
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [esp+64], [esp+48], [esp+32], xmm2
-%else
-    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
-%endif
+    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2
     packuswb        xmm3, xmm4
     movlps          [p_dst], xmm3
     movhps          [p_dst + i_dststride], xmm3
@@ -2334,36 +2133,20 @@
     jle             .yloop_exit
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm2, [p_src]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [esp+16], [esp], xmm4, xmm3
-%else
-    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
-%endif
+    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3
     movq            xmm3, [p_src + i_srcstride]
     punpcklbw       xmm2, xmm3
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [esp+64], [esp+48], [esp+32], xmm4
-%else
-    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
-%endif
+    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4
     packuswb        xmm5, xmm6
     movlps          [p_dst], xmm5
     movhps          [p_dst + i_dststride], xmm5
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm4, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [esp+16], [esp], xmm6, xmm5
-%else
-    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
-%endif
+    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5
     movq            xmm5, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     punpcklbw       xmm4, xmm5
-%ifdef X86_32_PICASM
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm6
-%else
-    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
-%endif
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6
     packuswb        xmm7, xmm0
     movlps          [p_dst], xmm7
     movhps          [p_dst + i_dststride], xmm7
@@ -2371,12 +2154,9 @@
     sub             i_ycnt, 8
     jg              .yloop
 .yloop_exit:
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     pop             p_dst
     pop             p_src
+    %assign push_num push_num - 2
     sub             i_width, 8
     jle             .width8or16_done
     add             p_src, 8
@@ -2385,6 +2165,8 @@
     jmp             .xloop
 .width8or16_done:
     pop             i_ycnt
+    %assign push_num push_num - 1
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2418,6 +2200,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -2424,28 +2207,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm4, [shufb_32435465768798A9]
-    movdqa          xmm5, [shufb_011267784556ABBC]
-    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm4, [pic(shufb_32435465768798A9)]
+    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 8
     je              .width8_yloop
     jg              .width16_yloop
@@ -2463,6 +2227,7 @@
     jg              .width4_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width8_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2478,6 +2243,7 @@
     jg              .width8_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width16_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2492,6 +2258,7 @@
     jg              .width16_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2518,6 +2285,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -2524,28 +2292,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm7, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm5, [shufb_32435465768798A9]
-    movdqa          xmm6, [shufb_011267784556ABBC]
-    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm5, [pic(shufb_32435465768798A9)]
+    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 9
     je              .width9_yloop
     jg              .width17_yloop
@@ -2563,6 +2312,7 @@
     jg              .width5_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width9_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2586,6 +2336,7 @@
     jg              .width9_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width17_yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2615,6 +2366,7 @@
     jg              .width17_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2637,6 +2389,7 @@
 %define p_dst        r2
 %define i_height     r3
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -2643,28 +2396,9 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm4, [shufb_32435465768798A9]
-    movdqa          xmm5, [shufb_011267784556ABBC]
-    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm4, [pic(shufb_32435465768798A9)]
+    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2681,6 +2415,7 @@
     movlps          [p_dst], xmm0
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2702,6 +2437,7 @@
 %define i_height     r3
 %define i_srcstride  8
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -2746,6 +2482,7 @@
 .done:
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef p_dst
@@ -2769,6 +2506,7 @@
 %define i_dststride  r3
 %define i_height     r4
     %assign  push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -2776,28 +2514,9 @@
     SIGN_EXTENSION  r4, r4d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm6, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm4, [shufb_32435465768798A9]
-    movdqa          xmm5, [shufb_011267784556ABBC]
-    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm4, [pic(shufb_32435465768798A9)]
+    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2818,6 +2537,7 @@
 .done:
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -2846,6 +2566,7 @@
     push            r5
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r6
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -2936,6 +2657,7 @@
 .done:
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r5
 %endif
@@ -2965,6 +2687,7 @@
 %define i_width     r4
 %define i_height    r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -2975,28 +2698,9 @@
     sub             p_src, i_srcstride
     pcmpeqw         xmm4, xmm4
     psllw           xmm4, 15                                ; dw -32768
-%ifdef X86_32_PICASM
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    movdqu          xmm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    movdqu          xmm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    movdqu          xmm7, [esp]
-    add             esp, 48
-%else
-    movdqa          xmm5, [shufb_32435465768798A9]
-    movdqa          xmm6, [shufb_011267784556ABBC]
-    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    movdqa          xmm5, [pic(shufb_32435465768798A9)]
+    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
+    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 9
     jne             .width17_yloop
 
@@ -3019,6 +2723,7 @@
     jg              .width9_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 
 .width17_yloop:
@@ -3047,6 +2752,7 @@
     jg              .width17_yloop
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -3070,7 +2776,11 @@
 %define i_srcstride  r1
 %define p_dst        r2
 %define i_dststride  r3
+%ifdef X86_32_PICASM
+%define i_width      dword arg5
+%else
 %define i_width      r4
+%endif
 %define i_height     r5
 %define i_srcstride3 r6
     %assign  push_num 0
@@ -3084,14 +2794,23 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             i_height, 1
     push            i_height
+    %assign push_num push_num + 1
     lea             i_srcstride3, [3 * i_srcstride]
     test            i_width, 1
     jz              .width_loop
     push            p_src
     push            p_dst
+    %assign push_num push_num + 2
+%ifdef X86_32_PICASM
+    add             p_src, i_width
+    add             p_src, i_width
+    sub             p_src, 2
+%else
     lea             p_src, [p_src + 2 * i_width - 2]
+%endif
     add             p_dst, i_width
     movd            xmm0, [p_src]
     punpcklwd       xmm0, [p_src + i_srcstride]
@@ -3186,11 +2905,13 @@
 .unalign_done:
     pop             p_dst
     pop             p_src
+    %assign push_num push_num - 2
     mov             i_height, [r7]
     sub             i_width, 1
 .width_loop:
     push            p_src
     push            p_dst
+    %assign push_num push_num + 2
     movdqa          xmm0, [p_src]
     movdqa          xmm1, [p_src + i_srcstride]
     movdqa          xmm2, [p_src + 2 * i_srcstride]
@@ -3245,6 +2966,7 @@
 .x_loop_dec:
     pop             p_dst
     pop             p_src
+    %assign push_num push_num - 2
     sub             i_width, 8
     jle             .done
     mov             i_height, [r7]
@@ -3258,6 +2980,8 @@
     pop             p_src
 .done:
     pop             i_height
+    %assign push_num push_num - 1
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -3280,24 +3004,7 @@
     vpshufb         %5, %1, %3
     vpshufb         %1, %1, %2
     vpshufd         %6, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    vpmaddubsw      %1, %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vpmaddubsw      %1, %1, [db20_256]
-%endif
+    vpmaddubsw      %1, %1, [pic(db20_256)]
     vpmaddubsw      %5, %5, %4
     vpmaddubsw      %6, %6, %4
     vpaddw          %1, %1, %5
@@ -3307,14 +3014,7 @@
 ; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 db20=%4 tmp=%5,%6
 %macro AVX2_FilterHorizontal_16px 6
     AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
-    vpcmpeqw        %6, %6, %6
-    vpsrlw          %6, %6, 15
-    vpsllw          %6, %6, 4
-    vpaddw          %1, %1, %6
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)]
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -3327,24 +3027,7 @@
     vpunpcklqdq     %1, %1, %2
     vpunpcklqdq     %6, %6, %7
     vpshufd         %7, %1, 10110001b
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    vpmaddubsw      %1, %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vpmaddubsw      %1, %1, [db20_256]
-%endif
+    vpmaddubsw      %1, %1, [pic(db20_256)]
     vpmaddubsw      %6, %6, %5
     vpmaddubsw      %7, %7, %5
     vpaddw          %1, %1, %6
@@ -3354,20 +3037,13 @@
 ; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 db20=%5 tmp=%6,%7
 %macro AVX2_FilterHorizontal_4x4px 7
     AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
-    vpcmpeqw        %7, %7, %7
-    vpsrlw          %7, %7, 15
-    vpsllw          %7, %7, 4
-    vpaddw          %1, %1, %7
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)]
     vpsraw          %1, %1, 5
 %endmacro
 
 ; pixels=%1 -32768>>scale=%2 tmp=%3
 %macro AVX2_FilterHorizontalbw_4px 3
-    vpmaddubsw      %1, %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_256]
+    vpmaddubsw      %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
     vpmaddwd        %1, %1, %2
     vpshufd         %3, %1, 10110001b
     vpaddd          %1, %1, %3
@@ -3375,45 +3051,8 @@
 
 ; pixels=%1 tmp=%2
 %macro AVX2_FilterHorizontal_4px 2
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x0000fe0a    ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0xfc00fc00    ;dwm1024_256
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0xfc00fc00
-    push            0x00008000    ;dd32768_256
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    push            0x00008000
-    vpmaddubsw      %1, %1, [esp+64]
-    vpmaddwd        %1, %1, [esp+32]
-    vpshufd         %2, %1, 10110001b
-    vpaddd          %1, %1, %2
-    vpaddd          %1, %1, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2
-    vpaddd          %1, %1, [dd32768_256]
-%endif
+    AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2
+    vpaddd          %1, %1, [pic(dd32768_256)]
 %endmacro
 
 ; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
@@ -3423,14 +3062,7 @@
     vpaddw          %1, %1, %7
     vpmaddubsw      %7, %3, %6
     vpaddw          %1, %1, %7
-%ifdef X86_32_PICASM
-    vpcmpeqw        %7, %7, %7
-    vpsrlw          %7, %7, 15
-    vpsllw          %7, %7, 4
-    vpaddw          %1, %1, %7
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)]
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -3444,14 +3076,7 @@
     vpaddw          %1, %1, %7
     vpmaddubsw      %7, %4, %6
     vpaddw          %1, %1, %7
-%ifdef X86_32_PICASM
-    vpcmpeqw        %7, %7, %7
-    vpsrlw          %7, %7, 15
-    vpsllw          %7, %7, 4
-    vpaddw          %1, %1, %7
-%else
-    vpaddw          %1, %1, [h264_w0x10_256]
-%endif
+    vpaddw          %1, %1, [pic(h264_w0x10_256)]
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -3465,24 +3090,7 @@
     vpaddw          %7, %3, %4
     vpaddw          %1, %1, %7
     vpsraw          %1, %1, 2
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    push            0x00200020
-    vpaddw          %7, %7, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vpaddw          %7, %7, [dw32_256]
-%endif
+    vpaddw          %7, %7, [pic(dw32_256)]
     vpaddw          %1, %1, %7
     vpsraw          %1, %1, 6
 %endmacro
@@ -3501,7 +3109,11 @@
 %define i_srcstride   r1
 %define p_dst         r2
 %define i_dststride   r3
+%ifdef X86_32_PICASM
+%define i_width       dword arg5
+%else
 %define i_width       r4
+%endif
 %define i_height      r5
 %define i_srcstride3  r6
     %assign push_num 0
@@ -3515,6 +3127,7 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
@@ -3522,32 +3135,6 @@
     je              .width8
     jg              .width16
 ; .width4:
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xffffffe0
-    sub             esp, 16
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0xfb01fb01    ;maddubsw_p1m5_256
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x01fb01fb    ;maddubsw_m5p1_256
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-%endif
     vmovd           xmm0, [p_src]
     vpbroadcastd    xmm5, [p_src + i_srcstride]
     vpunpcklbw      xmm0, xmm0, xmm5
@@ -3574,13 +3161,8 @@
     vpunpcklbw      ymm5, ymm5, ymm4
     vpblendd        ymm3, ymm3, ymm5, 11001100b
     vpblendd        ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
-    vbroadcasti128  ymm6, [esp+64]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm6, [esp], ymm5
-%else
-    vbroadcasti128  ymm6, [db20_128]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+    vbroadcasti128  ymm6, [pic(db20_128)]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
     vpackuswb       ymm0, ymm0, ymm0
     vmovd           [p_dst], xmm0
     vpsrlq          xmm5, xmm0, 32
@@ -3596,11 +3178,7 @@
     vpbroadcastd    ymm5, [p_src + i_srcstride3]
     vpunpcklbw      ymm4, ymm4, ymm5
     jg              .width4_height_ge8
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [esp+32], xmm6, [esp], xmm5
-%else
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
     vpackuswb       xmm2, xmm2, xmm2
     vmovd           [p_dst], xmm2
     jmp             .width4_done
@@ -3616,11 +3194,7 @@
     vpunpcklbw      ymm5, ymm5, ymm0
     vpblendd        ymm1, ymm1, ymm5, 11001100b
     vpblendd        ymm4, ymm4, ymm1, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [esp+32], ymm6, [esp], ymm5
-%else
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
     vpackuswb       ymm2, ymm2, ymm2
     vmovd           [p_dst], xmm2
     vpsrlq          xmm5, xmm2, 32
@@ -3635,19 +3209,12 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     vmovd           xmm5, [p_src + i_srcstride3]
     vpunpcklbw      xmm0, xmm0, xmm5
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [esp+32], xmm6, [esp], xmm5
-%else
-    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
     vpackuswb       xmm4, xmm4, xmm4
     vmovd           [p_dst], xmm4
 .width4_done:
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     vzeroupper
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -3656,32 +3223,6 @@
     ret
 
 .width8:
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xffffffe0
-    sub             esp, 16
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0xfb01fb01    ;maddubsw_p1m5_256
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x01fb01fb    ;maddubsw_m5p1_256
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-%endif
     sub             i_height, 1
     vmovq           xmm0, [p_src]
     vmovq           xmm4, [p_src + i_srcstride]
@@ -3701,13 +3242,8 @@
     vmovq           xmm3, [p_src + 2 * i_srcstride]
     vpunpcklbw      xmm4, xmm4, xmm3
     vinserti128     ymm2, ymm2, xmm4, 1
-%ifdef X86_32_PICASM
-    vbroadcasti128  ymm5, [esp+64]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm5, [esp], ymm4
-%else
-    vbroadcasti128  ymm5, [db20_128]
-    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    vbroadcasti128  ymm5, [pic(db20_128)]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vmovq           xmm4, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpcklbw      xmm3, xmm3, xmm4
@@ -3714,11 +3250,7 @@
     vmovq           xmm6, [p_src]
     vpunpcklbw      xmm4, xmm4, xmm6
     vinserti128     ymm3, ymm3, xmm4, 1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [esp+32], ymm5, [esp], ymm4
-%else
-    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vpackuswb       ymm0, ymm0, ymm1
     vmovlps         [p_dst], xmm0
     vextracti128    xmm1, ymm0, 1
@@ -3732,11 +3264,7 @@
     vmovq           xmm4, [p_src + i_srcstride]
     vpunpcklbw      xmm0, xmm6, xmm4
     jg              .width8_height_ge8
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [esp+32], xmm5, [esp], xmm4
-%else
-    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
     vpackuswb       xmm2, xmm2, xmm2
     vmovlps         [p_dst], xmm2
     jmp             .width8_done
@@ -3744,11 +3272,7 @@
     vmovq           xmm1, [p_src + 2 * i_srcstride]
     vpunpcklbw      xmm4, xmm4, xmm1
     vinserti128     ymm0, ymm0, xmm4, 1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [esp+32], ymm5, [esp], ymm4
-%else
-    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vmovq           xmm4, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpcklbw      xmm1, xmm1, xmm4
@@ -3755,11 +3279,7 @@
     vmovq           xmm6, [p_src]
     vpunpcklbw      xmm4, xmm4, xmm6
     vinserti128     ymm1, ymm1, xmm4, 1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [esp+32], ymm5, [esp], ymm4
-%else
-    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
     vpackuswb       ymm2, ymm2, ymm3
     vmovlps         [p_dst], xmm2
     vextracti128    xmm3, ymm2, 1
@@ -3773,19 +3293,12 @@
     jl              .width8_done
     vmovq           xmm4, [p_src + i_srcstride]
     vpunpcklbw      xmm2, xmm6, xmm4
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [esp+32], xmm5, [esp], xmm4
-%else
-    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
     vpackuswb       xmm0, xmm0, xmm0
     vmovlps         [p_dst], xmm0
 .width8_done:
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     vzeroupper
+    DEINIT_X86_32_PIC_KEEPDEF
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -3794,51 +3307,6 @@
     ret
 
 .width16:
-%ifdef X86_32_PICASM
-    push            i_width
-    mov             i_width, esp
-    and             esp, 0xffffffe0
-    push            0x14141414    ;db20_128
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0x14141414
-    push            0xfb01fb01    ;maddubsw_p1m5_256
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0xfb01fb01
-    push            0x01fb01fb    ;maddubsw_m5p1_256
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x01fb01fb
-    push            0x14fb14fb    ;maddubsw_m5p20_256
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0x14fb14fb
-    push            0xfb14fb14    ;maddubsw_p20m5_256
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-    push            0xfb14fb14
-%endif
     sub             i_height, 1
     test            i_height, 1
     jnz             .width16_yloop_begin_even
@@ -3865,11 +3333,7 @@
     lea             p_src, [p_src + 2 * i_srcstride]
     vpblendd        ymm5, ymm5, ymm6, 11110000b
     vpunpcklbw      ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm7
-%else
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7
-%endif
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
     vpackuswb       ymm0, ymm0, ymm0
     vpermq          ymm0, ymm0, 1000b
     vmovdqa         [p_dst], xmm0
@@ -3899,20 +3363,12 @@
     vmovq           xmm6, [p_src]
     vpbroadcastq    ymm7, [p_src + 8]
     vpblendd        ymm6, ymm6, ymm7, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [esp+32], [esp], ymm0, ymm7
-%else
-    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7
-%endif
+    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
     vmovq           xmm7, [p_src + i_srcstride]
     vpbroadcastq    ymm0, [p_src + i_srcstride + 8]
     vpblendd        ymm7, ymm7, ymm0, 11110000b
     vpunpcklbw      ymm6, ymm6, ymm7
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [esp+96], [esp+128], [esp+64], ymm0
-%else
-    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0
-%endif
+    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
     vpackuswb       ymm1, ymm1, ymm2
     vpermq          ymm1, ymm1, 11011000b
     vmovdqa         [p_dst], xmm1
@@ -3921,21 +3377,13 @@
     vmovq           xmm0, [p_src + 2 * i_srcstride]
     vpbroadcastq    ymm1, [p_src + 2 * i_srcstride + 8]
     vpblendd        ymm0, ymm0, ymm1, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [esp+32], [esp], ymm2, ymm1
-%else
-    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1
-%endif
+    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
     vmovq           xmm1, [p_src + i_srcstride3]
     vpbroadcastq    ymm2, [p_src + i_srcstride3 + 8]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpblendd        ymm1, ymm1, ymm2, 11110000b
     vpunpcklbw      ymm0, ymm0, ymm1
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [esp+96], [esp+128], [esp+64], ymm2
-%else
-    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2
-%endif
+    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
     vpackuswb       ymm3, ymm3, ymm4
     vpermq          ymm3, ymm3, 11011000b
     vmovdqa         [p_dst], xmm3
@@ -3944,20 +3392,12 @@
     vmovq           xmm2, [p_src]
     vpbroadcastq    ymm3, [p_src + 8]
     vpblendd        ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [esp+32], [esp], ymm4, ymm3
-%else
-    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3
-%endif
+    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
     vmovq           xmm3, [p_src + i_srcstride]
     vpbroadcastq    ymm4, [p_src + i_srcstride + 8]
     vpblendd        ymm3, ymm3, ymm4, 11110000b
     vpunpcklbw      ymm2, ymm2, ymm3
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [esp+96], [esp+128], [esp+64], ymm4
-%else
-    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4
-%endif
+    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
     vpackuswb       ymm5, ymm5, ymm6
     vpermq          ymm5, ymm5, 11011000b
     vmovdqa         [p_dst], xmm5
@@ -3966,21 +3406,13 @@
     vmovq           xmm4, [p_src + 2 * i_srcstride]
     vpbroadcastq    ymm5, [p_src + 2 * i_srcstride + 8]
     vpblendd        ymm4, ymm4, ymm5, 11110000b
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [esp+32], [esp], ymm6, ymm5
-%else
-    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5
-%endif
+    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
     vmovq           xmm5, [p_src + i_srcstride3]
     vpbroadcastq    ymm6, [p_src + i_srcstride3 + 8]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpblendd        ymm5, ymm5, ymm6, 11110000b
     vpunpcklbw      ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm6
-%else
-    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6
-%endif
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
     vpackuswb       ymm7, ymm7, ymm0
     vpermq          ymm7, ymm7, 11011000b
     vmovdqa         [p_dst], xmm7
@@ -3988,11 +3420,8 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     sub             i_height, 8
     jg              .width16_yloop
-%ifdef X86_32_PICASM
-    mov             esp, i_width
-    pop             i_width
-%endif
     vzeroupper
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -4026,6 +3455,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -4032,32 +3462,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm6, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm4, [shufb_32435465768798A9]
-    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 8
     je              .width8
     jg              .width16_yloop
@@ -4086,6 +3493,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width8:
     lea             i_srcstride3, [3 * i_srcstride]
@@ -4110,6 +3518,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 %undef i_srcstride3
 .width16_yloop:
@@ -4129,6 +3538,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4155,6 +3565,7 @@
 %define i_width      r4
 %define i_height     r5
     %assign  push_num 0
+    INIT_X86_32_PIC r6
     LOAD_6_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -4161,32 +3572,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm7, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm5, [shufb_32435465768798A9]
-    vbroadcasti128  ymm6, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     cmp             i_width, 9
     je              .width9
     jg              .width17
@@ -4210,6 +3598,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width9:
 %xdefine i_srcstride3 i_width
@@ -4248,6 +3637,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC_KEEPDEF
     ret
 .width17:
     lea             i_srcstride3, [3 * i_srcstride]
@@ -4291,6 +3681,7 @@
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef i_srcstride3
 %undef p_src
@@ -4320,6 +3711,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -4327,32 +3719,9 @@
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm6, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm4, [shufb_32435465768798A9]
-    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 3
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4372,6 +3741,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -4403,6 +3773,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4443,6 +3814,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -4469,6 +3841,7 @@
 %define i_height     r3
 %define i_dststride  16
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 6
     SIGN_EXTENSION  r1, r1d
@@ -4475,32 +3848,9 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm3, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm4, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm5, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm3, [shufb_32435465768798A9]
-    vbroadcasti128  ymm4, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm3, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm4, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4519,6 +3869,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4541,6 +3892,7 @@
 %define i_height     r3
 %define i_srcstride  16
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4614,6 +3966,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef p_dst
@@ -4641,6 +3994,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4687,6 +4041,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -4713,6 +4068,7 @@
 %define i_height     r3
 %define i_dststride  32
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 7
     SIGN_EXTENSION  r1, r1d
@@ -4719,32 +4075,9 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
-%ifdef X86_32_PICASM
-    push            r1
-    mov             r1, esp
-    and             esp, 0xfffffff0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm4, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm5, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm6, [esp]
-    mov             esp, r1
-    pop             r1
-%else
-    vbroadcasti128  ymm4, [shufb_32435465768798A9]
-    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 1
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4768,6 +4101,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4790,6 +4124,7 @@
 %define i_height     r3
 %define i_srcstride  32
     %assign  push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r2, r2d
@@ -4869,6 +4204,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 %undef p_src
 %undef i_srcstride
@@ -4896,6 +4232,7 @@
     push            r4
     %assign  push_num 1
 %endif
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION  r1, r1d
@@ -4903,47 +4240,9 @@
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x090a0809        ;shufb_32435465768798A9
-    push            0x07080607
-    push            0x05060405
-    push            0x03040203
-    vbroadcasti128  ymm5, [esp]
-    push            0x0c0b0b0a
-    push            0x06050504
-    push            0x08070706
-    push            0x02010100
-    vbroadcasti128  ymm6, [esp]
-    push            0x01fb01fb
-    push            0xfb01fb01
-    push            0x01fb01fb
-    push            0xfb01fb01
-    vbroadcasti128  ymm7, [esp]
-    sub             esp, 16
-    push            0x0000fe0a    ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x0000fe0a
-    push            0xd8d80afe
-    push            0x80008000    ;dwm32768_256
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-    push            0x80008000
-%else
-    vbroadcasti128  ymm5, [shufb_32435465768798A9]
-    vbroadcasti128  ymm6, [shufb_011267784556ABBC]
-    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
+    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
+    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
     sub             i_height, 3
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4961,14 +4260,7 @@
     vinserti128     ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpckhqdq     ymm4, ymm4, ymm0
-%ifdef X86_32_PICASM
-    vpmaddubsw      ymm4, ymm4, [esp+32]
-    vpmaddwd        ymm4, ymm4, [esp]
-    vpshufd         ymm2, ymm4, 10110001b
-    vpaddd          ymm4, ymm4, ymm2
-%else
-    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
     vmovlps         [p_dst + 26], xmm4
     vmovdqa         [p_dst + 16], xmm3
     vextracti128    xmm2, ymm4, 1
@@ -4991,16 +4283,7 @@
     vmovdqu         xmm3, [p_src + i_srcstride - 2]
     vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
     vpunpckhqdq     ymm4, ymm0, ymm3
-%ifdef X86_32_PICASM
-    vpmaddubsw      ymm4, ymm4, [esp+32]
-    vpmaddwd        ymm4, ymm4, [esp]
-    vpshufd         ymm2, ymm4, 10110001b
-    vpaddd          ymm4, ymm4, ymm2
-    mov             esp, r5
-    pop             r5
-%else
-    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
     AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
     AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
     vextracti128    xmm4, ymm4, 1
@@ -5011,6 +4294,7 @@
     vzeroupper
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r4
 %endif
@@ -5037,7 +4321,11 @@
 %define i_srcstride  r1
 %define p_dst        r2
 %define i_dststride  r3
+%ifdef X86_32_PICASM
+%define i_width      dword arg5
+%else
 %define i_width      r4
+%endif
 %define i_height     r5
 %define i_srcstride3 r6
     %assign  push_num 0
@@ -5051,6 +4339,7 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+    INIT_X86_32_PIC_NOPRESERVE r4
     sub             i_height, 1
     lea             i_srcstride3, [3 * i_srcstride]
     test            i_width, 1
@@ -5058,7 +4347,14 @@
     push            i_height
     push            p_src
     push            p_dst
+    %assign push_num push_num + 3
+%ifdef X86_32_PICASM
+    add             p_src, i_width
+    add             p_src, i_width
+    sub             p_src, 2
+%else
     lea             p_src, [p_src + 2 * i_width - 2]
+%endif
     add             p_dst, i_width
     vmovd           xmm0, [p_src]
     vpunpcklwd      xmm0, xmm0, [p_src + i_srcstride]
@@ -5119,6 +4415,7 @@
     pop             p_dst
     pop             p_src
     pop             i_height
+    %assign push_num push_num - 3
 .align_begin:
     vmovdqa         ymm0, [p_src]
     vmovdqa         ymm1, [p_src + i_srcstride]
@@ -5175,6 +4472,7 @@
     vmovdqa         [p_dst], xmm0
 .done:
     vzeroupper
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -53,7 +53,11 @@
 ;***********************************************************************
 ; Data
 ;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 align 16
 HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
@@ -772,29 +776,12 @@
     mov  r12, r2
 %endif
 
+    INIT_X86_32_PIC r2
     pxor        xmm4,   xmm4
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm5,   [esp]
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    movdqu      xmm6,   [esp]
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    movdqu      xmm7,   [esp]
-    add         esp, 48
-%else
-    movdqa      xmm5,   [HSumSubDB1]
-    movdqa      xmm6,   [HSumSubDW1]
-    movdqa      xmm7,   [PDW1]
-%endif
+    movdqa      xmm5,   [pic(HSumSubDB1)]
+    movdqa      xmm6,   [pic(HSumSubDW1)]
+    movdqa      xmm7,   [pic(PDW1)]
+    DEINIT_X86_32_PIC
     sub         r0,    r1
     movdqu      xmm0,   [r0]
     movhlps     xmm1,   xmm0
@@ -916,9 +903,9 @@
 ret
 
 %macro SSE41_ChromaGetX38x8Satd 0
-    movdqa      xmm5,   [HSumSubDB1]
-    movdqa      xmm6,   [HSumSubDW1]
-    movdqa      xmm7,   [PDW1]
+    movdqa      xmm5,   [pic(HSumSubDB1)]
+    movdqa      xmm6,   [pic(HSumSubDW1)]
+    movdqa      xmm7,   [pic(PDW1)]
     sub         r0,    r1
     movq        xmm0,   [r0]
     punpcklqdq  xmm0,   xmm0
@@ -940,7 +927,7 @@
     SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
     movdqa      [r6+16], xmm0 ;H
 ;(sum+2)>>2
-    movdqa      xmm6,   [PDQ2]
+    movdqa      xmm6,   [pic(PDQ2)]
     movdqa      xmm5,   xmm4
     punpckhqdq  xmm5,   xmm1
     paddd       xmm5,   xmm6
@@ -993,88 +980,8 @@
     SIGN_EXTENSION r3, r3d
     SIGN_EXTENSION r5, r5d
 loop_chroma_satdx3:
-%ifdef X86_32_PICASM
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqa      xmm5, [esp]
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    movdqa      xmm6, [esp]
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    movdqa      xmm7, [esp]
-    mov         esp, r0
-    mov         r0, [esp + push_num*4 + 4]
-
-    sub         r0,    r1
-    movq        xmm0,  [r0]
-    punpcklqdq  xmm0,  xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-    movdqa      [r6],  xmm0 ;V
-    add         r0,     r1
-    pinsrb      xmm0,   byte[r0-1], 0
-    pinsrb      xmm0,   byte[r0+r1-1], 1
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     2
-    pinsrb      xmm0,   byte[r0+r1-1], 3
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     4
-    pinsrb      xmm0,   byte[r0+r1-1], 5
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     6
-    pinsrb      xmm0,   byte[r0+r1-1], 7
-    punpcklqdq  xmm0,   xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-;movdqa      [r6+16], xmm0 ;H
-;(sum+2)>>2
-    mov        DWORD [r6+16], 0x0002
-    mov        DWORD [r6+20], 0x0000
-    mov        DWORD [r6+24], 0x0002
-    mov        DWORD [r6+28], 0x0000
-    movdqa      xmm6,   [r6+16]
-    movdqa      [r6+16], xmm0 ;H
-
-    movdqa      xmm5,   xmm4
-    punpckhqdq  xmm5,   xmm1
-    paddd       xmm5,   xmm6
-    psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-    paddd       xmm6,   xmm6
-    paddd       xmm4,   xmm1
-    paddd       xmm4,   xmm6
-    psrld       xmm4,   3
-;satd *16
-    pslld       xmm5,   4
-    pslld       xmm4,   4
-;temp satd
-    movdqa      xmm6,   xmm4
-    punpcklqdq  xmm4,   xmm5
-    psllq       xmm4,   32
-    psrlq       xmm4,   32
-    movdqa      [r6+32], xmm4
-    punpckhqdq  xmm5,   xmm6
-    psllq       xmm5,   32
-    psrlq       xmm5,   32
-    movdqa      [r6+48], xmm5
-
-    pxor        xmm4,   xmm4 ;V
-    pxor        xmm5,   xmm5 ;H
-    pxor        xmm6,   xmm6 ;DC
-    mov         r0,    0
-    SSE41_ChromaGetX38x4Satd r0, 0
-    inc         r0
-    SSE41_ChromaGetX38x4Satd r0, 0
-%else
+    INIT_X86_32_PIC r4
     SSE41_ChromaGetX38x8Satd
-%endif
     SSEReg2MMX  xmm4, mm0,mm1
     SSEReg2MMX  xmm5, mm2,mm3
     SSEReg2MMX  xmm6, mm5,mm6
@@ -1081,89 +988,8 @@
     mov r0,     arg8
     mov r2,     arg9
 
-%ifdef X86_32_PICASM
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqa      xmm5,   [esp]
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    movdqa      xmm6,   [esp]
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    movdqa      xmm7,   [esp]
-    mov         esp,    r0
-    mov r0,     arg8
-
-    sub         r0,    r1
-    movq        xmm0,   [r0]
-    punpcklqdq  xmm0,   xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-    movdqa      [r6],  xmm0 ;V
-    add         r0,    r1
-    pinsrb      xmm0,   byte[r0-1], 0
-    pinsrb      xmm0,   byte[r0+r1-1], 1
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     2
-    pinsrb      xmm0,   byte[r0+r1-1], 3
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     4
-    pinsrb      xmm0,   byte[r0+r1-1], 5
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     6
-    pinsrb      xmm0,   byte[r0+r1-1], 7
-    punpcklqdq  xmm0,   xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-    ;movdqa      [r6+16], xmm0 ;H
-;(sum+2)>>2
-
-    mov        DWORD [r6+16], 0x0002
-    mov        DWORD [r6+20], 0x0000
-    mov        DWORD [r6+24], 0x0002
-    mov        DWORD [r6+28], 0x0000
-    movdqa      xmm6,   [r6+16]
-    movdqa      [r6+16], xmm0 ;H
-
-    movdqa      xmm5,   xmm4
-    punpckhqdq  xmm5,   xmm1
-    paddd       xmm5,   xmm6
-    psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-    paddd       xmm6,   xmm6
-    paddd       xmm4,   xmm1
-    paddd       xmm4,   xmm6
-    psrld       xmm4,   3
-;satd *16
-    pslld       xmm5,   4
-    pslld       xmm4,   4
-;temp satd
-    movdqa      xmm6,   xmm4
-    punpcklqdq  xmm4,   xmm5
-    psllq       xmm4,   32
-    psrlq       xmm4,   32
-    movdqa      [r6+32], xmm4
-    punpckhqdq  xmm5,   xmm6
-    psllq       xmm5,   32
-    psrlq       xmm5,   32
-    movdqa      [r6+48], xmm5
-
-    pxor        xmm4,   xmm4 ;V
-    pxor        xmm5,   xmm5 ;H
-    pxor        xmm6,   xmm6 ;DC
-    mov         r0,    0
-    SSE41_ChromaGetX38x4Satd r0, 0
-    inc         r0
-    SSE41_ChromaGetX38x4Satd r0, 0
-%else
     SSE41_ChromaGetX38x8Satd
-%endif
+    DEINIT_X86_32_PIC
 
     MMXReg2SSE  xmm0, xmm3, mm0, mm1
     MMXReg2SSE  xmm1, xmm3, mm2, mm3
@@ -1457,20 +1283,12 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
     %assign  push_num 0
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0x01010101
-    push        0xff01ff01
-    push        0x01010101
-    movdqu      xmm4,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm4,[HSwapSumSubDB1]
-%endif
+    movdqa      xmm4,[pic(HSwapSumSubDB1)]
     movd        xmm2,[r2]
     movd        xmm5,[r2+r3]
     shufps      xmm2,xmm5,0
@@ -1511,6 +1329,7 @@
     SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -1524,21 +1343,13 @@
     push  r5
 %endif
     %assign  push_num 2
+    INIT_X86_32_PIC r6
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    movdqa      xmm7, [pic(HSumSubDB1)]
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6, xmm6
@@ -1549,6 +1360,7 @@
     SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop  r5
     pop  r4
@@ -1572,16 +1384,9 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4
+    movdqa      xmm7, [pic(HSumSubDB1)]
+    DEINIT_X86_32_PIC
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6, xmm6
@@ -1614,6 +1419,7 @@
     push  r5
 %endif
     %assign  push_num 2
+    INIT_X86_32_PIC r6
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
@@ -1621,16 +1427,7 @@
     push  r0
     push  r2
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    movdqa      xmm7, [pic(HSumSubDB1)]
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6,   xmm6
@@ -1650,6 +1447,7 @@
     SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop  r5
     pop  r4
@@ -1677,16 +1475,9 @@
     push  r0
     push  r2
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4
+    movdqa      xmm7, [pic(HSumSubDB1)]
+    DEINIT_X86_32_PIC
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6,   xmm6
@@ -1863,19 +1654,9 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    mov         r1, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    vbroadcasti128 ymm7, [esp]
-    mov            esp, r1
-    mov            r1, [esp + push_num*4 + 8]
-%else
-    vbroadcasti128 ymm7, [HSumSubDB1]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r5
+    vbroadcasti128 ymm7, [pic(HSumSubDB1)]
+    DEINIT_X86_32_PIC
     lea            r5, [3 * r1]
     lea            r6, [3 * r3]
     vpxor          ymm6, ymm6, ymm6
@@ -1941,22 +1722,11 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    vpbroadcastq xmm0, [esp]
-    vpbroadcastq ymm6, [esp + 8]
-    mov            esp, r0
-    mov            r0, [esp + push_num*4 + 4]
-%else
-    vpbroadcastq xmm0, [HSumSubDB1]
-    vpbroadcastq ymm6, [HSumSubDB1 + 8]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r5
+    vpbroadcastq xmm0, [pic(HSumSubDB1)]
+    vpbroadcastq ymm6, [pic(HSumSubDB1 + 8)]
     vpblendd     ymm6, ymm0, ymm6, 11110000b
+    DEINIT_X86_32_PIC
     lea          r5, [3 * r1]
     lea          r6, [3 * r3]
     vpxor        ymm5, ymm5, ymm5
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -49,7 +49,11 @@
 ; Local Data (Read Only)
 ;*******************************************************************************
 
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 align 16
 sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -132,20 +136,7 @@
 %macro COPY_16_TIMES 2
     movdqa      %2, [%1-16]
     psrldq      %2, 15
-%ifdef X86_32_PICASM
-    push        r5
-    mov         r5, esp
-    and         esp, 0xfffffff0
-    push        0x01010101    ;mmx_01bytes
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     %2, [esp]
-    mov         esp, r5
-    pop         r5
-%else
-    pmuludq     %2, [mmx_01bytes]
-%endif
+    pmuludq     %2, [pic(mmx_01bytes)]
     pshufd      %2, %2, 0
 %endmacro
 
@@ -152,20 +143,7 @@
 %macro COPY_16_TIMESS 3
     movdqa      %2, [%1+%3-16]
     psrldq      %2, 15
-%ifdef X86_32_PICASM
-    push        r5
-    mov         r5, esp
-    and         esp, 0xfffffff0
-    push        0x01010101    ;mmx_01bytes
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     %2, [esp]
-    mov         esp, r5
-    pop         r5
-%else
-    pmuludq     %2, [mmx_01bytes]
-%endif
+    pmuludq     %2, [pic(mmx_01bytes)]
     pshufd      %2, %2, 0
 %endmacro
 
@@ -203,52 +181,26 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
-%ifdef X86_32_PICASM
-    push        r3
-    mov         r3, esp
-    and         esp, 0xfffffff0
-    push        0x01010101    ;mmx_01bytes
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-%endif
 
     movzx       r2, byte [r0-1]
     movd        xmm0,   r2d
-%ifdef X86_32_PICASM
-    pmuludq     xmm0,   [esp]
-%else
-    pmuludq     xmm0,   [mmx_01bytes]
-%endif
+    pmuludq     xmm0,   [pic(mmx_01bytes)]
 
     movzx       r2, byte [r0+r1-1]
     movd        xmm1,   r2d
-%ifdef X86_32_PICASM
-    pmuludq     xmm1,   [esp]
-%else
-    pmuludq     xmm1,   [mmx_01bytes]
-%endif
+    pmuludq     xmm1,   [pic(mmx_01bytes)]
 
     lea         r0, [r0+r1]
     movzx       r2, byte [r0+r1-1]
     movd        xmm2,   r2d
-%ifdef X86_32_PICASM
-    pmuludq     xmm2,   [esp]
-%else
-    pmuludq     xmm2,   [mmx_01bytes]
-%endif
+    pmuludq     xmm2,   [pic(mmx_01bytes)]
 
     movzx       r2, byte [r0+2*r1-1]
     movd        xmm3,   r2d
-%ifdef X86_32_PICASM
-    pmuludq     xmm3,   [esp]
-    mov         esp, r3
-    pop         r3
-%else
-    pmuludq     xmm3,   [mmx_01bytes]
-%endif
+    pmuludq     xmm3,   [pic(mmx_01bytes)]
 
     sub         r0,    r1
     movd        [r0], xmm0
@@ -257,6 +209,7 @@
     movd        [r0], xmm2
     movd        [r0+r1], xmm3
 
+    DEINIT_X86_32_PIC
     ret
 
 ;*******************************************************************************
@@ -266,6 +219,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_2_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
@@ -276,37 +230,11 @@
     ;for H
     pxor    xmm7,   xmm7
     movq    xmm0,   [r0]
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xfffffff0
-    push    0x00010002
-    push    0x00030004
-    push    0x00050006
-    push    0x00070008
-    movdqa  xmm5,   [esp]
-    mov     esp, r0
-    pop     r0
-%else
-    movdqa  xmm5,   [sse2_plane_dec]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_dec)]
     punpcklbw xmm0, xmm7
     pmullw  xmm0,   xmm5
     movq    xmm1,   [r0 + 9]
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xfffffff0
-    push    0x00080007    ;sse2_plane_inc
-    push    0x00060005
-    push    0x00040003
-    push    0x00020001
-    movdqa  xmm6,   [esp]
-    mov     esp, r0
-    pop     r0
-%else
-    movdqa  xmm6,   [sse2_plane_inc]
-%endif
+    movdqa  xmm6,   [pic(sse2_plane_inc)]
     punpcklbw xmm1, xmm7
     pmullw  xmm1,   xmm6
     psubw   xmm1,   xmm0
@@ -361,19 +289,7 @@
     SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r2, r2
-%ifdef X86_32_PICASM
-    mov     r2, esp
-    and     esp, 0xfffffff0
-    push    0x0000ffff    ;sse2_plane_inc_minus
-    push    0xfffefffd
-    push    0xfffcfffb
-    push    0xfffafff9
-    movdqa  xmm5,   [esp]
-    mov     esp, r2
-    xor     r2, r2
-%else
-    movdqa  xmm5,   [sse2_plane_inc_minus]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_inc_minus)]
 
 get_i16x16_luma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -393,6 +309,7 @@
     jnz get_i16x16_luma_pred_plane_sse2_1
 
     POP_XMM
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     ret
@@ -414,6 +331,7 @@
 
 WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
     %assign push_num 0
+    INIT_X86_32_PIC_NOPRESERVE r2
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
 
@@ -430,6 +348,7 @@
     SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
     SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
 
+    DEINIT_X86_32_PIC
     ret
 
 ;*******************************************************************************
@@ -477,6 +396,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_2_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
@@ -486,30 +406,11 @@
 
     pxor    mm7,    mm7
     movq    mm0,    [r0]
-%ifdef X86_32_PICASM
-    push    r5
-    mov     r5, esp
-    and     esp, 0xfffffff0
-    push    0x00010002    ;sse2_plane_dec_c
-    push    0x00030004
-    push    0x00040003    ;sse2_plane_inc_c
-    push    0x00020001
-    push    0x00040003    ;
-    push    0x00020001
-    push    0x0000ffff
-    push    0xfffefffd
-    movq    mm5,    [esp+24]
-%else
-    movq    mm5,    [sse2_plane_dec_c]
-%endif
+    movq    mm5,    [pic(sse2_plane_dec_c)]
     punpcklbw mm0,  mm7
     pmullw  mm0,    mm5
     movq    mm1,    [r0 + 5]
-%ifdef X86_32_PICASM
-    movq    mm6,    [esp+16]
-%else
-    movq    mm6,    [sse2_plane_inc_c]
-%endif
+    movq    mm6,    [pic(sse2_plane_inc_c)]
     punpcklbw mm1,  mm7
     pmullw  mm1,    mm6
     psubw   mm1,    mm0
@@ -561,13 +462,7 @@
     SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r2, r2
-%ifdef X86_32_PICASM
-    movdqa  xmm5,   [esp]
-    mov     esp, r5
-    pop     r5
-%else
-    movdqa  xmm5,   [sse2_plane_mul_b_c]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_mul_b_c)]
 
 get_i_chroma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -583,6 +478,7 @@
     jnz get_i_chroma_pred_plane_sse2_1
 
     POP_XMM
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     WELSEMMS
@@ -602,6 +498,7 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r2, r0
@@ -629,20 +526,7 @@
     movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
     pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
     pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pand        mm1,[esp]   ;set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm1,[mmx_01bytes]   ;set the odd bit
-%endif
+    pand        mm1,[pic(mmx_01bytes)]   ;set the odd bit
     psubusb     mm3,mm1             ;decrease 1 from odd bytes
     pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
@@ -655,6 +539,7 @@
     movd        [r0+r1],mm2
     psrlq       mm2,8
     movd        [r0],mm2
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -667,20 +552,7 @@
     movq        %1,     [%3-8]
     psrlq       %1,     38h
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      %1,     [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      %1,     [mmx_01bytes]
-%endif
+    pmullw      %1,     [pic(mmx_01bytes)]
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -689,20 +561,7 @@
     movq        %1,     [%3+r1-8]
     psrlq       %1,     38h
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      %1,     [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      %1,     [mmx_01bytes]
-%endif
+    pmullw      %1,     [pic(mmx_01bytes)]
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -709,6 +568,7 @@
 
 WELS_EXTERN WelsDecoderIChromaPredH_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r2, r0
@@ -716,20 +576,7 @@
     movq        mm0,    [r2-8]
     psrlq       mm0,    38h
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      mm0,        [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      mm0,        [mmx_01bytes]
-%endif
+    pmullw      mm0,        [pic(mmx_01bytes)]
     pshufw      mm0,    mm0,    0
     movq        [r0],   mm0
 
@@ -753,6 +600,7 @@
     lea         r0, [r0+2*r1]
     MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -816,6 +664,7 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r2, r0
@@ -841,18 +690,7 @@
     pavgb       mm1, mm0
 
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm4, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm4, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
@@ -876,6 +714,7 @@
     movd        [r0+2*r1], mm3
     psrlq       mm3, 10h
     movd        [r0+r1], mm3
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -909,6 +748,7 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r2, r0
@@ -937,18 +777,7 @@
     pavgb       mm2, mm0
 
     pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm5, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm5, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm5, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
@@ -970,6 +799,7 @@
     movd        [r0+r1], mm1
     psrlq       mm1, 10h
     movd        [r0+2*r1], mm1
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1005,6 +835,7 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r2, r0
@@ -1030,18 +861,7 @@
     pavgb       mm2, mm0
 
     pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm3, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm3, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
     movq        mm3, mm0
@@ -1071,6 +891,7 @@
     pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
     lea         r0, [r0+2*r1]
     movd        [r0+r1], mm5
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1102,6 +923,7 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r2, r0
@@ -1121,18 +943,7 @@
     movq        mm3, mm1
     pavgb       mm1, mm2
     pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm3, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm3, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
     pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
@@ -1146,6 +957,7 @@
     psrlq       mm0, 8h
     lea         r0, [r0+2*r1]
     movd        [r0+r1], mm0
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1181,6 +993,7 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r2, r0
@@ -1199,18 +1012,7 @@
     movq        mm4, mm2
     pavgb       mm2, mm0
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm4, [esp]              ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm4, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
@@ -1223,6 +1025,7 @@
     psrlq       mm2, 8h
     lea         r0, [r0+2*r1]
     movd        [r0+r1], mm2
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1234,6 +1037,7 @@
     push    r3
     push    r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r4, r0
@@ -1275,18 +1079,7 @@
     movq        mm1, mm2
     paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x00000000
-    push        0x00000002
-    movq        mm4, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    movq        mm4, [mmx_0x02]
-%endif
+    movq        mm4, [pic(mmx_0x02)]
 
     paddq       mm0, mm4
     psrlq       mm0, 0x02
@@ -1302,30 +1095,13 @@
     paddq       mm1, mm4
     psrlq       mm1, 0x03
 
-%ifdef X86_32_PICASM
-    push        r5
-    mov         r5, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pmuludq     mm0, [esp]
-    pmuludq     mm3, [esp]
-%else
-    pmuludq     mm0, [mmx_01bytes]
-    pmuludq     mm3, [mmx_01bytes]
-%endif
+    pmuludq     mm0, [pic(mmx_01bytes)]
+    pmuludq     mm3, [pic(mmx_01bytes)]
     psllq       mm0, 0x20
     pxor        mm0, mm3                 ; mm0 = m_up
 
-%ifdef X86_32_PICASM
-    pmuludq     mm2, [esp]
-    pmuludq     mm1, [esp]
-    mov         esp, r5
-    pop         r5
-%else
-    pmuludq     mm2, [mmx_01bytes]
-    pmuludq     mm1, [mmx_01bytes]
-%endif
+    pmuludq     mm2, [pic(mmx_01bytes)]
+    pmuludq     mm1, [pic(mmx_01bytes)]
     psllq       mm1, 0x20
     pxor        mm1, mm2                 ; mm2 = m_down
 
@@ -1342,6 +1118,7 @@
     lea         r4, [r4+2*r1]
     movq        [r4+r1],   mm1
 
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     WELSEMMS
@@ -1357,6 +1134,7 @@
     push    r3
     push    r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     mov r4, r0
@@ -1385,20 +1163,7 @@
     movd        xmm1, r2d
     paddw       xmm0, xmm1
     psrld       xmm0, 0x05
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     xmm0, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     xmm0, [mmx_01bytes]
-%endif
+    pmuludq     xmm0, [pic(mmx_01bytes)]
     pshufd      xmm0, xmm0, 0
 
     movdqa      [r4],       xmm0
@@ -1432,6 +1197,7 @@
 
     movdqa      [r4+r1],   xmm0
 
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
 
@@ -1518,24 +1284,12 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     lea r2, [2*r1+r1]       ; 3*kiStride
 
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xfffffff0
-    push    0x80808080
-    push    0x80808080
-    push    0x80808080
-    push    0x80808080
-    movdqa xmm0, [esp]
-    mov     esp, r0
-    pop     r0
-%else
-    movdqa xmm0, [sse2_dc_0x80]
-%endif
+    movdqa xmm0, [pic(sse2_dc_0x80)]
     movdqa xmm1, xmm0
     movdqa [r0], xmm0
     movdqa [r0+r1], xmm1
@@ -1557,6 +1311,7 @@
     movdqa [r0+2*r1], xmm0
     movdqa [r0+r2], xmm1
 
+    DEINIT_X86_32_PIC
     ret
 
 ;*******************************************************************************
@@ -1680,21 +1435,11 @@
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     lea r2, [2*r1+r1]
-%ifdef X86_32_PICASM
-    push r0
-    mov  r0, esp
-    and  esp, 0xfffffff0
-    push 0x80808080
-    push 0x80808080
-    movq mm0, [esp]
-    mov  esp, r0
-    pop  r0
-%else
-    movq mm0, [sse2_dc_0x80]
-%endif
+    movq mm0, [pic(sse2_dc_0x80)]
     movq mm1, mm0
     movq [r0], mm0
     movq [r0+r1], mm1
@@ -1705,6 +1450,7 @@
     movq [r0+r1], mm1
     movq [r0+2*r1], mm0
     movq [r0+r2], mm1
+    DEINIT_X86_32_PIC
     emms
     ret
 
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -75,9 +75,7 @@
 
 #ifdef X86_ASM
 
-#ifndef X86_32_PICASM
 int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
-#endif
 int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
 
 /****************************************************************************
@@ -86,9 +84,7 @@
 void WelsScan4x4Ac_sse2 (int16_t* zig_value, int16_t* pDct);
 void WelsScan4x4DcAc_ssse3 (int16_t* pLevel, int16_t* pDct);
 void WelsScan4x4DcAc_sse2 (int16_t* pLevel, int16_t* pDct);
-#ifndef X86_32_PICASM
 int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
-#endif
 
 /****************************************************************************
  * DCT functions
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -78,12 +78,10 @@
 int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                          int32_t iEndIdx);
 #ifdef  X86_ASM
-#ifndef  X86_32_PICASM
 int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                             int32_t iEndIdx);
 int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                              int32_t iEndIdx);
-#endif
 #endif
 
 #if defined(__cplusplus)
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -500,9 +500,7 @@
     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmx;
   }
   if (uiCpuFlag & WELS_CPU_SSE2) {
-#ifndef X86_32_PICASM
     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse2;
-#endif
     pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
 
     pFuncList->pfQuantization4x4        = WelsQuant4x4_sse2;
@@ -516,9 +514,7 @@
 
     pFuncList->pfScan4x4                = WelsScan4x4DcAc_sse2;
     pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_sse2;
-#ifndef X86_32_PICASM
     pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_sse2;
-#endif
 
     pFuncList->pfDctT4                  = WelsDctT4_sse2;
     pFuncList->pfDctFourT4              = WelsDctFourT4_sse2;
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -291,19 +291,15 @@
   pFuncList->pfCavlcParamCal = CavlcParamCal_c;
 
 #if defined(X86_32_ASM)
-#ifndef X86_32_PICASM
   if (uiCpuFlag & WELS_CPU_SSE2) {
     pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
   }
 #endif
-#endif
 
 #ifdef X86_ASM
-#ifndef X86_32_PICASM
   if (uiCpuFlag & WELS_CPU_SSE42) {
     pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
   }
-#endif
 #endif
   if (iEntropyCodingModeFlag) {
     pFuncList->pfStashMBStatus = StashMBStatusCabac;
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -42,7 +42,11 @@
 
 %include "asm_inc.asm"
 
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 align 16
 
@@ -369,7 +373,6 @@
 
 %ifdef X86_32
 
-%ifndef X86_32_PICASM
 ;***********************************************************************
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
@@ -377,10 +380,12 @@
     push ebx
     push edi
     push esi
+    %assign push_num 3
+    INIT_X86_32_PIC ebp
 
-    mov         eax,    [esp+16]    ;coffLevel
-    mov         edi,    [esp+24]    ;Level
-    mov         ebx,    [esp+32]    ;endIdx
+    mov         eax,    arg1    ;coffLevel
+    mov         edi,    arg3    ;Level
+    mov         ebx,    arg5    ;endIdx
     cmp         ebx,    3
     jne         .Level16
     pxor        xmm1,   xmm1
@@ -400,7 +405,7 @@
     pmovmskb    edx,    xmm0
     cmp         edx,    0
     je near   .return
-    movdqa      xmm6,   [sse2_b_1]
+    movdqa      xmm6,   [pic(sse2_b_1)]
     pcmpeqw     xmm7,   xmm7    ;generate -1
     mov         ebx,    0xff
     ;pinsrw     xmm6,   ebx,    3
@@ -407,7 +412,7 @@
 
     mov       bl,   dh
 
-    lea       ebx,  [byte_1pos_table+8*ebx]
+    lea       ebx,  [pic(byte_1pos_table+8*ebx)]
     movq      xmm0, [ebx]
     pextrw    ecx,  xmm0, 3
     shr       ecx,  8
@@ -438,7 +443,7 @@
     add       edi,   2
 .LowByteFind0:
     and       edx,  0xff
-    lea       ebx,  [byte_1pos_table+8*edx]
+    lea       ebx,  [pic(byte_1pos_table+8*edx)]
     movq      xmm1, [ebx]
     pextrw    esi,  xmm1, 3
     or        esi,  0xff
@@ -466,7 +471,7 @@
     mov       edx, [eax]
     mov       [edi], dx
 .getLevelEnd:
-    mov      edx, [esp+28]  ;total_coeffs
+    mov      edx, arg4  ;total_coeffs
     ;mov      ebx,   ecx
     ;and      ebx,   0xff
     movzx    ebx,   byte cl
@@ -473,7 +478,7 @@
     add      cl,    ch
     mov      [edx], cl
 ;getRun
-    movq     xmm5, [sse2_b8]
+    movq     xmm5, [pic(sse2_b8)]
     paddb    xmm0, xmm5
     pxor     xmm2, xmm2
     pxor     xmm3, xmm3
@@ -499,18 +504,17 @@
     paddb    xmm1,  xmm7
     psrldq   xmm0,  1
     psubb    xmm1,  xmm0
-    mov      ecx,   [esp+20] ;run
+    mov      ecx,   arg2 ;run
     movdqa   [ecx], xmm1
 ;getRunEnd
 .return:
+    DEINIT_X86_32_PIC
     pop esi
     pop edi
     pop ebx
     ret
-%endif ;%ifndef X86_32_PICASM
 %endif ;%ifdef X86_32
 
-%ifndef X86_32_PICASM
 ;***********************************************************************
 ;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
@@ -524,17 +528,21 @@
     push            r5
     push            r6
     %assign push_num 4
+%ifdef X86_32_PICASM
+    %define p_total_coeffs r1
+%else
     %define p_total_coeffs r0
+%endif
     %define r_tmp r1
     %define r_tmpd r1d
     %define r_tmpb r1b
     %define p_level r2
     %define p_coeff_level r3
+    %define p_run r6
     %define r_mask  r5
     %define r_maskd r5d
-    %define p_run r6
-    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
-    %define p_run_lut   wels_cavlc_param_cal_run_lut
+    %define p_shufb_lut pic(wels_cavlc_param_cal_shufb_lut)
+    %define p_run_lut   pic(wels_cavlc_param_cal_run_lut)
     mov             p_coeff_level, arg1
     mov             p_run, arg2
     mov             p_level, arg3
@@ -571,6 +579,7 @@
     %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
     lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
 %endif
+    INIT_X86_32_PIC_NOPRESERVE r0
 
     ; Acquire a bitmask indicating which words are non-zero.
     ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
@@ -588,7 +597,7 @@
 .load_done:
     movdqa          [p_run], xmm1                           ; Zero-initialize because we may read back implied zeros.
     pcmpeqb         xmm0, xmm1
-    pshufb          xmm0, [wels_shufb_rev]
+    pshufb          xmm0, [pic(wels_shufb_rev)]
     pmovmskb        r_maskd, xmm0
     xor             r_maskd, 0FFFFh
 %undef i_endidxd
@@ -605,12 +614,18 @@
     %xdefine i_total_zeros p_total_coeffs
 %endif
 %undef p_total_coeffs
+%ifdef X86_32_PICASM
+    push            r_tmp2
+    %undef i_total_zeros
+    %define i_total_zeros dword [esp]
+%else
     mov             i_total_zeros, r_tmp2
+%endif
     jz              .done
-    mov             i_total_zeros, 16
-    sub             i_total_zeros, r_tmp2
     bsf             r_tmpd, r_maskd                         ; Find first set bit.
-    sub             i_total_zeros, r_tmp
+    lea             r_tmp2, [r_tmp2 + r_tmp - 16]
+    neg             r_tmp2
+    mov             i_total_zeros, r_tmp2
     ; Skip trailing zeros.
     ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
     and             r_tmpd, -4
@@ -649,8 +664,13 @@
     jnz             .loop
 .done:
 %ifnidni retrq, i_total_zeros
+  %ifdef X86_32_PICASM
+    pop             retrq
+  %else
     mov             retrq, i_total_zeros
+  %endif
 %endif
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop             r6
     pop             r5
@@ -673,5 +693,3 @@
 %undef r_tmp2d
 %undef p_shufb_lut
 %undef p_run_lut
-
-%endif  ;ifndef X86_32_PICASM
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -45,7 +45,11 @@
 ; Local Data (Read Only)
 ;***********************************************************************
 
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 align 16
 sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -144,20 +148,7 @@
 %macro COPY_16_TIMES 2
     movdqa      %2, [%1-16]
     psrldq      %2, 15
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     %2, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     %2, [mmx_01bytes]
-%endif
+    pmuludq     %2, [pic(mmx_01bytes)]  ; byte [%1-1] (isolated by the shift above) * 0x01010101 -> repeated in the low dword; pic() is only defined by INIT_X86_32_PIC*, so the expanding function must invoke that first
     pshufd      %2, %2, 0
 %endmacro
 
@@ -164,20 +155,7 @@
 %macro COPY_16_TIMESS 3
     movdqa      %2, [%1+%3-16]
     psrldq      %2, 15
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     %2, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     %2, [mmx_01bytes]
-%endif
+    pmuludq     %2, [pic(mmx_01bytes)]  ; byte [%1+%3-1] (isolated by the shift above) * 0x01010101 -> repeated in the low dword; pic() is only defined by INIT_X86_32_PIC*, so the expanding function must invoke that first
     pshufd      %2, %2, 0
 %endmacro
 
@@ -215,30 +193,16 @@
 WELS_EXTERN WelsI4x4LumaPredH_sse2
     push r3
     %assign push_num 1
+    INIT_X86_32_PIC r4                  ; X86_32_PICASM: push r4 (push_num becomes 2) and load the current EIP into it for pic(); otherwise only defines pic() as the identity
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movzx       r3, byte [r1-1]
     movd        xmm0,   r3d
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     xmm0,   [esp]
-%else
-    pmuludq     xmm0,   [mmx_01bytes]
-%endif
+    pmuludq     xmm0,   [pic(mmx_01bytes)]  ; low dword of xmm0 = byte [r1-1] repeated 4 times (value * 0x01010101)
 
     movzx       r3, byte [r1+r2-1]
     movd        xmm1,   r3d
-%ifdef X86_32_PICASM
-    pmuludq     xmm1,   [esp]
-%else
-    pmuludq     xmm1,   [mmx_01bytes]
-%endif
+    pmuludq     xmm1,   [pic(mmx_01bytes)]  ; low dword of xmm1 = byte [r1+r2-1] repeated 4 times
 
     unpcklps    xmm0,   xmm1
 
@@ -245,26 +209,17 @@
     lea         r1, [r1+r2*2]
     movzx       r3, byte [r1-1]
     movd        xmm2,   r3d
-%ifdef X86_32_PICASM
-    pmuludq     xmm2,   [esp]
-%else
-    pmuludq     xmm2,   [mmx_01bytes]
-%endif
+    pmuludq     xmm2,   [pic(mmx_01bytes)]  ; low dword of xmm2 = byte [r1-1] repeated 4 times (r1 already advanced by 2*r2)
 
     movzx       r3, byte [r1+r2-1]
     movd        xmm3,   r3d
-%ifdef X86_32_PICASM
-    pmuludq     xmm3,   [esp]
-    mov         esp,    r0
-    pop         r0
-%else
-    pmuludq     xmm3,   [mmx_01bytes]
-%endif
+    pmuludq     xmm3,   [pic(mmx_01bytes)]  ; low dword of xmm3 = byte [r1+r2-1] repeated 4 times (r1 already advanced by 2*r2)
 
     unpcklps    xmm2,   xmm3
     unpcklpd    xmm0,   xmm2
 
     movdqa      [r0],   xmm0
+    DEINIT_X86_32_PIC                   ; X86_32_PICASM: pop r4 saved by INIT_X86_32_PIC (must precede "pop r3" to mirror the push order) and undefine the pic helper symbols
     pop r3
     ret
 
@@ -275,6 +230,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
@@ -284,34 +240,11 @@
     ;for H
     pxor    xmm7,   xmm7
     movq    xmm0,   [r1]
-%ifdef X86_32_PICASM
-    push    r5
-    mov     r5, esp
-    and     esp, 0xfffffff0
-    push    0x00010002    ;sse2_plane_dec
-    push    0x00030004
-    push    0x00050006
-    push    0x00070008
-    push    0x00080007    ;sse_plane_inc
-    push    0x00060005
-    push    0x00040003
-    push    0x00020001
-    push    0x0000ffff    ;sse_plane_inc_minus
-    push    0xfffefffd
-    push    0xfffcfffb
-    push    0xfffafff9
-    movdqa  xmm5,   [esp+32]
-%else
-    movdqa  xmm5,   [sse2_plane_dec]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_dec)]
     punpcklbw xmm0, xmm7
     pmullw  xmm0,   xmm5
     movq    xmm1,   [r1 + 9]
-%ifdef X86_32_PICASM
-    movdqa  xmm6,   [esp+16]
-%else
-    movdqa  xmm6,   [sse2_plane_inc]
-%endif
+    movdqa  xmm6,   [pic(sse2_plane_inc)]
     punpcklbw xmm1, xmm7
     pmullw  xmm1,   xmm6
     psubw   xmm1,   xmm0
@@ -357,13 +290,7 @@
     SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r3, r3
-%ifdef X86_32_PICASM
-    movdqa  xmm5,   [esp]
-    mov     esp,    r5
-    pop     r5
-%else
-    movdqa  xmm5,   [sse2_plane_inc_minus]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_inc_minus)]
 
 get_i16x16_luma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -382,6 +309,7 @@
     cmp     r3, 16
     jnz get_i16x16_luma_pred_plane_sse2_1
     POP_XMM
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     ret
@@ -393,6 +321,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
@@ -401,30 +330,11 @@
 
     pxor    mm7,    mm7
     movq    mm0,    [r1]
-%ifdef X86_32_PICASM
-    push    r5
-    mov     r5, esp
-    and     esp, 0xfffffff0
-    push    0x00010002    ;sse2_plane_dec_c
-    push    0x00030004
-    push    0x00040003    ;sse2_plane_inc_c
-    push    0x00020001
-    push    0x00040003    ;sse2_plane_mul_b_c
-    push    0x00020001
-    push    0x0000ffff
-    push    0xfffefffd
-    movq    mm5,    [esp+24]
-%else
-    movq    mm5,    [sse2_plane_dec_c]
-%endif
+    movq    mm5,    [pic(sse2_plane_dec_c)]
     punpcklbw mm0,  mm7
     pmullw  mm0,    mm5
     movq    mm1,    [r1 + 5]
-%ifdef X86_32_PICASM
-    movq    mm6,    [esp+16]
-%else
-    movq    mm6,    [sse2_plane_inc_c]
-%endif
+    movq    mm6,    [pic(sse2_plane_inc_c)]
     punpcklbw mm1,  mm7
     pmullw  mm1,    mm6
     psubw   mm1,    mm0
@@ -474,13 +384,7 @@
     SSE2_Copy8Times xmm0, r3d   ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r3, r3
-%ifdef X86_32_PICASM
-    movdqa  xmm5,   [esp]
-    mov     esp,    r5
-    pop     r5
-%else
-    movdqa  xmm5,   [sse2_plane_mul_b_c]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_mul_b_c)]
 
 get_i_chroma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -495,6 +399,7 @@
     cmp     r3, 8
     jnz get_i_chroma_pred_plane_sse2_1
     POP_XMM
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     WELSEMMS
@@ -514,6 +419,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDR_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movq        mm1,[r1+r2-8]       ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
@@ -539,18 +445,7 @@
     movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
     pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
     pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm1,[esp]   ;set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm1,[mmx_01bytes]   ;set the odd bit
-%endif
+    pand        mm1,[pic(mmx_01bytes)]   ;set the odd bit
     psubusb     mm3,mm1             ;decrease 1 from odd bytes
     pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
@@ -561,6 +456,7 @@
     movd        [r0+4],mm2
     psrlq       mm2,8
     movd        [r0],mm2
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -619,20 +515,7 @@
     psrlq       %1,     38h
 
     ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      %1,     [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      %1,     [mmx_01bytes]
-%endif
+    pmullw      %1,     [pic(mmx_01bytes)]
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -642,20 +525,7 @@
     psrlq       %1,     38h
 
     ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      %1,     [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      %1,     [mmx_01bytes]
-%endif
+    pmullw      %1,     [pic(mmx_01bytes)]
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -662,6 +532,7 @@
 
 WELS_EXTERN WelsIChromaPredH_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movq        mm0,    [r1-8]
@@ -668,20 +539,7 @@
     psrlq       mm0,    38h
 
     ;pmuludq        mm0,    [mmx_01bytes]       ;extend to 4 bytes
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      mm0,        [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      mm0,        [mmx_01bytes]
-%endif
+    pmullw      mm0,        [pic(mmx_01bytes)]
     pshufw      mm0,    mm0,    0
     movq        [r0],   mm0
 
@@ -701,6 +559,7 @@
     MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
 
     MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+56
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -767,6 +626,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -791,18 +651,7 @@
     pavgb       mm1, mm0
 
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm4, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm4, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
@@ -824,6 +673,7 @@
     movd        [r0+8], mm3
     psrlq       mm3, 10h
     movd        [r0+4], mm3
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -855,6 +705,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movd        mm0, [r1-4]            ; mm0[3] = l0
@@ -881,18 +732,7 @@
     pavgb       mm2, mm0
 
     pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm5, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm5, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm5, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
@@ -912,6 +752,7 @@
     movd        [r0+4], mm1
     psrlq       mm1, 10h
     movd        [r0+8], mm1
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -947,6 +788,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -971,18 +813,7 @@
     pavgb       mm2, mm0
 
     pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm3, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm3, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
     movq        mm3, mm0
@@ -1011,6 +842,7 @@
     psllq       mm2, 8h
     pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
     movd        [r0+12], mm5
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1042,6 +874,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1060,18 +893,7 @@
     movq        mm3, mm1
     pavgb       mm1, mm2
     pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm3, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm3, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
     pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
@@ -1084,6 +906,7 @@
     movd        [r0+8], mm0
     psrlq       mm0, 8h
     movd        [r0+12], mm0
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1119,6 +942,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1135,18 +959,7 @@
     movq        mm4, mm2
     pavgb       mm2, mm0
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm4, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm4, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
@@ -1158,6 +971,7 @@
     movd        [r0+4], mm2
     psrlq       mm2, 8h
     movd        [r0+12], mm2
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1169,6 +983,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1208,18 +1023,7 @@
     movq        mm1, mm2
     paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x00000000
-    push        0x00000002
-    movq        mm4, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    movq        mm4, [mmx_0x02]
-%endif
+    movq        mm4, [pic(mmx_0x02)]
 
     paddq       mm0, mm4
     psrlq       mm0, 0x02
@@ -1235,32 +1039,13 @@
     paddq       mm1, mm4
     psrlq       mm1, 0x03
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     mm0, [esp]
-    pmuludq     mm3, [esp]
-%else
-    pmuludq     mm0, [mmx_01bytes]
-    pmuludq     mm3, [mmx_01bytes]
-%endif
+    pmuludq     mm0, [pic(mmx_01bytes)]
+    pmuludq     mm3, [pic(mmx_01bytes)]
     psllq       mm0, 0x20
     pxor        mm0, mm3                 ; mm0 = m_up
 
-%ifdef X86_32_PICASM
-    pmuludq     mm2, [esp]
-    pmuludq     mm1, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     mm2, [mmx_01bytes]
-    pmuludq     mm1, [mmx_01bytes]
-%endif
+    pmuludq     mm2, [pic(mmx_01bytes)]
+    pmuludq     mm1, [pic(mmx_01bytes)]
     psllq       mm1, 0x20
     pxor        mm1, mm2                 ; mm2 = m_down
 
@@ -1274,6 +1059,7 @@
     movq        [r0+0x30], mm1
     movq        [r0+0x38], mm1
 
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     WELSEMMS
@@ -1289,6 +1075,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1316,20 +1103,7 @@
     movd        xmm1, r3d
     paddw       xmm0, xmm1
     psrld       xmm0, 0x05
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     xmm0, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     xmm0, [mmx_01bytes]
-%endif
+    pmuludq     xmm0, [pic(mmx_01bytes)]
     pshufd      xmm0, xmm0, 0
 
     movdqa      [r0], xmm0
@@ -1349,6 +1123,7 @@
     movdqa      [r0+0xe0], xmm0
     movdqa      [r0+0xf0], xmm0
 
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     ret
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -34,7 +34,11 @@
 ;***********************************************************************
 ; Local Data (Read Only)
 ;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 ALIGN 16
 mv_x_inc_x4     dw  0x10, 0x10, 0x10, 0x10
@@ -696,26 +700,12 @@
     mov     ebx,    [height]
     mov     [i_height], ebx
 
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xfffffff0
-    push    0x00100010                  ;mv_x_inc_x4
-    push    0x00100010
-    push    0x00040004                  ;mv_y_inc_x4
-    push    0x00040004
-    push    0x000c0008                  ;mx_x_offset_x4
-    push    0x00040000
-    movq    xmm7,   [esp+16]            ; x_qpel inc
-    movq    xmm6,   [esp+8]             ; y_qpel inc
-    movq    xmm5,   [esp]               ; x_qpel vector
-    mov     esp,    r0
-    pop     r0
-%else
-    movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
-    movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
-    movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
-%endif
+    %assign push_num 5
+    INIT_X86_32_PIC_NOPRESERVE ecx         ; with X86_32_PICASM: ecx = PIC base, overwritten and not restored
+    movq    xmm7,   [pic(mv_x_inc_x4)]     ; x_qpel inc
+    movq    xmm6,   [pic(mv_y_inc_x4)]     ; y_qpel inc
+    movq    xmm5,   [pic(mx_x_offset_x4)]  ; x_qpel vector
+    DEINIT_X86_32_PIC                      ; constants now live in xmm5-xmm7; no pop needed in NOPRESERVE mode
     pxor    xmm4,   xmm4
     pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
@@ -1415,24 +1405,9 @@
     push r13
     mov     r12,    r2
 
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xfffffff0
-    push    0x00100010                  ;mv_x_inc_x4
-    push    0x00100010
-    push    0x00040004                  ;mv_y_inc_x4
-    push    0x00040004
-    push    0x000c0008                  ;mx_x_offset_x4
-    push    0x00040000
-    movq    xmm7,   [esp+16]            ; x_qpel inc
-    movq    xmm6,   [esp+8]             ; y_qpel inc
-    movq    xmm5,   [esp]               ; x_qpel vector
-%else
     movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
     movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
     movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
-%endif
     pxor    xmm4,   xmm4
     pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -49,7 +49,11 @@
 ;***********************************************************************
 ; Local Data (Read Only)
 ;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 ;align 16
 ;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
@@ -200,6 +204,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4DcAc_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     movdqa     xmm0, [r1]
     movdqa     xmm1, [r1+16]
@@ -207,29 +212,12 @@
     pextrw      r1d,  xmm1, 0           ; eax = [8]
     pinsrw      xmm0, r1d, 7            ; xmm0[7]   =   [8]
     pinsrw      xmm1, r2d, 0            ; xmm1[0]   =   [7]
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x0d0c0706              ;pb_scanacdc_maska
-    push        0x05040b0a
-    push        0x0f0e0908
-    push        0x03020100
-    push        0x0f0e0d0c              ;pb_scanacdc_maskb
-    push        0x07060100
-    push        0x05040b0a
-    push        0x09080302
-    pshufb      xmm1, [esp]
-    pshufb      xmm0, [esp+16]
-    mov         esp, r0
-    pop         r0
-%else
-    pshufb      xmm1, [pb_scanacdc_maskb]
-    pshufb      xmm0, [pb_scanacdc_maska]
-%endif
+    pshufb      xmm1, [pic(pb_scanacdc_maskb)]
+    pshufb      xmm0, [pic(pb_scanacdc_maska)]
 
     movdqa     [r0],xmm0
     movdqa     [r0+16], xmm1
+    DEINIT_X86_32_PIC
     ret
 ;***********************************************************************
 ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
@@ -268,7 +256,6 @@
     ret
 
 
-%ifndef X86_32_PICASM
 ;***********************************************************************
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
@@ -279,6 +266,7 @@
     %else
     %assign push_num 0
     %endif
+    INIT_X86_32_PIC r4
     LOAD_1_PARA
     movdqa    xmm0, [r0]
     movdqa    xmm1, [r0+16]
@@ -309,16 +297,17 @@
 .find1end:
     sub       r1, r2
     sub       r1, 1
-    lea   r2,  [i_ds_table]
+    lea   r2,  [pic(i_ds_table)]
     add       r0b,  [r2+r1]
     mov       r1, r3
     and       r3, 0xff
     shr       r1, 8
     and       r1, 0xff
-    lea   r2 , [low_mask_table]
+    lea   r2 , [pic(low_mask_table)]
     add       r0b,  [r2 +r3]
-    lea   r2, [high_mask_table]
+    lea   r2, [pic(high_mask_table)]
     add       r0b,  [r2+r1]
+    DEINIT_X86_32_PIC
     %ifdef X86_32
     pop r3
     %else
@@ -325,15 +314,14 @@
     mov retrd, r0d
     %endif
     ret
-%endif ;ifndef X86_32_PICASM
 
 
-%ifndef X86_32_PICASM
 ;***********************************************************************
 ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
 ;***********************************************************************
 WELS_EXTERN WelsGetNoneZeroCount_sse2
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_1_PARA
     movdqa    xmm0, [r0]
     movdqa    xmm1, [r0+16]
@@ -350,14 +338,14 @@
 ;   and       ecx,  0xff    ; we do not need this due to high 16bits equal to 0 yet
 ;   xor       retr,  retr
     ;add       al,  [nozero_count_table+r2]
-    lea       r0 , [nozero_count_table]
+    lea       r0 , [pic(nozero_count_table)]
     movzx     r2, byte [r0+r2]
     movzx     r1,   byte [r0+r1]
     mov   retrq, r2
     add   retrq, r1
     ;add       al,  [nozero_count_table+r1]
+    DEINIT_X86_32_PIC
     ret
-%endif ;%ifndef X86_32_PICASM
 
 ;***********************************************************************
 ; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -44,7 +44,11 @@
 ;***********************************************************************
 ; Constant
 ;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 sse2_32 times 8 dw 32
 sse2_20 times 8 dw 20
@@ -147,20 +151,7 @@
     movdqa      %2, %1
     psrldq      %2, 2
     punpcklbw   %2, %4
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x00140014
-    push        0x00140014
-    push        0x00140014
-    push        0x00140014
-    pmullw      %2, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      %2, [sse2_20]
-%endif
+    pmullw      %2, [pic(sse2_20)]
     paddw       %3, %2
 
     movdqa      %2, %1
@@ -254,6 +245,7 @@
 
     %assign push_num 1
 
+    INIT_X86_32_PIC r4
     LOAD_2_PARA
 
     mov     r3, r1
@@ -285,6 +277,7 @@
     movq        [r0 + 2],       xmm3
 
 
+    DEINIT_X86_32_PIC
     pop r3
 
     %assign push_num 0
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -57,7 +57,11 @@
 ; Local Data (Read Only)
 ;***********************************************************************
 
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
 SECTION .rodata align=32
+%endif
 
 ;***********************************************************************
 ; Various memory constants (trigonometric values or rounding values)
@@ -64,6 +68,7 @@
 ;***********************************************************************
 
 ALIGN 32
+%ifndef X86_32_PICASM
 db80h_256:
     times 32 db 80h
 shufb_0000000088888888:
@@ -74,6 +79,7 @@
     times 4 db 4
     times 4 db 8
     times 4 db 12
+%endif
 shufb_mask_low:
     db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
 shufb_mask_high:
@@ -1253,20 +1259,7 @@
     pmaddwd     xmm2,   xmm1
     pshufd  xmm1,   xmm2,   00000001b
     paddd   xmm2,   xmm1
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xffffffe0
-    push    0x00000000
-    push    0x00000000
-    push    0x00000000
-    push    0x00004000
-    movdqa  xmm1,   [esp]
-    mov     esp, r0
-    pop     r0
-%else
     movdqa  xmm1,   [add_extra_half]
-%endif
     paddd   xmm2,   xmm1
     psrld   xmm2,   15
 
@@ -1567,20 +1560,7 @@
     pmaddwd     xmm2,   xmm1
     pshufd  xmm1,   xmm2,   00000001b
     paddd   xmm2,   xmm1
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xffffffe0
-    push    0x00000000
-    push    0x00000000
-    push    0x00000000
-    push    0x00004000
-    movdqa  xmm1,   [esp]
-    mov     esp, r0
-    pop     r0
-%else
     movdqa  xmm1,   [add_extra_half]
-%endif
     paddd   xmm2,   xmm1
     psrld   xmm2,   15
 
@@ -1657,6 +1637,12 @@
     SIGN_EXTENSION r3, r3d
     SIGN_EXTENSION r4, r4d
     SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+    %define i_height dword arg6            ; r5 is taken as the PIC base below, so the height is accessed through arg6 instead
+%else
+    %define i_height r5                    ; no PIC base needed: the height stays in r5
+%endif
+    INIT_X86_32_PIC_NOPRESERVE r5          ; with X86_32_PICASM: r5 = PIC base, overwritten and not restored
 
 %ifndef X86_32
     push r12
@@ -1664,7 +1650,7 @@
 %endif
 
     mov r6, r1             ;Save the tailer for the unasigned size
-    imul r6, r5
+    imul r6, i_height
     add r6, r0
     movdqa xmm7, [r6]
 
@@ -1697,52 +1683,15 @@
     ;1st line
     movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
     movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080    ;shufb_mask_onethird_low_1
-    push   0x80808080
-    push   0x80800f0c
-    push   0x09060300
-    push   0x80808080    ;shufb_mask_onethird_high_1
-    push   0x80808080
-    push   0x8080800d
-    push   0x0a070401
-    push   0x80808080    ;shufb_mask_onethird_low_2
-    push   0x800e0b08
-    push   0x05028080
-    push   0x80808080
-    push   0x80808080    ;shufb_mask_onethird_high_2
-    push   0x800f0c09
-    push   0x06030080
-    push   0x80808080
-    push   0x0d0a0704    ;shufb_mask_onethird_low_3
-    push   0x01808080
-    push   0x80808080
-    push   0x80808080
-    push   0x0e0b0805    ;shufb_mask_onethird_high_3
-    push   0x02808080
-    push   0x80808080
-    push   0x80808080
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
     pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
 
     movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
 
@@ -1751,13 +1700,8 @@
 
     movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
 
@@ -1768,25 +1712,15 @@
     ;2nd line
     movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
 
     movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
 
@@ -1795,15 +1729,8 @@
 
     movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
 
@@ -1832,7 +1759,7 @@
     lea r0, [r0+r1]
     lea r0, [r0+r6]                             ;current dst lien + 1 line
 
-    dec r5
+    dec i_height
     jg near .yloops_onethird_sse3
 
     movdqa [r0], xmm7                           ;restore the tailer for the unasigned size
@@ -1841,6 +1768,7 @@
     pop r12
 %endif
 
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -1847,6 +1775,7 @@
     pop r6
 %endif
     ret
+%undef i_height
 
 ;***********************************************************************
 ;   void DyadicBilinearOneThirdDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
@@ -1866,6 +1795,12 @@
     SIGN_EXTENSION r3, r3d
     SIGN_EXTENSION r4, r4d
     SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+    %define i_height dword arg6            ; r5 is taken as the PIC base below, so the height is accessed through arg6 instead
+%else
+    %define i_height r5                    ; no PIC base needed: the height stays in r5
+%endif
+    INIT_X86_32_PIC_NOPRESERVE r5          ; with X86_32_PICASM: r5 = PIC base, overwritten and not restored
 
 %ifndef X86_32
     push r12
@@ -1873,7 +1808,7 @@
 %endif
 
     mov r6, r1             ;Save the tailer for the unasigned size
-    imul r6, r5
+    imul r6, i_height
     add r6, r0
     movdqa xmm7, [r6]
 
@@ -1906,52 +1841,15 @@
     ;1st line
     movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
     movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080    ;shufb_mask_onethird_low_1
-    push   0x80808080
-    push   0x80800f0c
-    push   0x09060300
-    push   0x80808080    ;shufb_mask_onethird_high_1
-    push   0x80808080
-    push   0x8080800d
-    push   0x0a070401
-    push   0x80808080    ;shufb_mask_onethird_low_2
-    push   0x800e0b08
-    push   0x05028080
-    push   0x80808080
-    push   0x80808080    ;shufb_mask_onethird_high_2
-    push   0x800f0c09
-    push   0x06030080
-    push   0x80808080
-    push   0x0d0a0704    ;shufb_mask_onethird_low_3
-    push   0x01808080
-    push   0x80808080
-    push   0x80808080
-    push   0x0e0b0805    ;shufb_mask_onethird_high_3
-    push   0x02808080
-    push   0x80808080
-    push   0x80808080
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
     pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
 
     movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
 
@@ -1960,13 +1858,8 @@
 
     movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
 
@@ -1977,25 +1870,15 @@
     ;2nd line
     movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
 
     movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
 
@@ -2004,15 +1887,8 @@
 
     movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
 
@@ -2041,7 +1917,7 @@
     lea r0, [r0+r1]
     lea r0, [r0+r6]                             ;current dst lien + 1 line
 
-    dec r5
+    dec i_height
     jg near .yloops_onethird_sse4
 
     movdqa [r0], xmm7                           ;restore the tailer for the unasigned size
@@ -2050,6 +1926,7 @@
     pop r12
 %endif
 
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2056,6 +1933,7 @@
     pop r6
 %endif
     ret
+%undef i_height
 
 ;***********************************************************************
 ;   void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
@@ -2256,20 +2134,10 @@
     add r6, r0
     movq xmm7, [r6]
 
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080
-    push   0x0d090501
-    push   0x80808080
-    push   0x0c080400
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm6, [shufb_mask_quarter]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4          ; with X86_32_PICASM: r4 = PIC base, overwritten and not restored
+    movdqa xmm6, [pic(shufb_mask_quarter)] ; load the shuffle mask once, before the row loop label below
+    DEINIT_X86_32_PIC                      ; PIC base is only needed for the single load above
+
 .yloops_quarter_sse3:
     ;mov eax, [esp+40]   ; iSrcWidth
     ;sar eax, $02            ; iSrcWidth >> 2
@@ -2378,20 +2246,9 @@
     add r6, r0
     movq xmm7, [r6]
 
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080
-    push   0x0d090501
-    push   0x80808080
-    push   0x0c080400
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm6, [shufb_mask_quarter]    ;mask
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4             ; with X86_32_PICASM: r4 = PIC base, overwritten and not restored
+    movdqa xmm6, [pic(shufb_mask_quarter)]    ;mask
+    DEINIT_X86_32_PIC                         ; PIC base is only needed for the single load above
 
 .yloops_quarter_sse4:
 %ifdef X86_32
@@ -2534,20 +2391,7 @@
 
 %macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
     movdqa          xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x08080808
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    pshufb          xmm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pshufb          xmm_tmp0, [shufb_0000000088888888]
-%endif
+    pshufb          xmm_tmp0, xmm_shufb_0000000088888888
     psubb           xmm_xpos_int, xmm_tmp0
     SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
     mov             r_tmp0, i_xpos
@@ -2555,24 +2399,7 @@
     lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
     movdqa          xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpcklbw       xmm_tmp2, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpcklbw       xmm_tmp2, [db80h_256]
-%endif
+    punpcklbw       xmm_tmp2, xmm_db80h
     pshufb          xmm_tmp3, xmm_tmp2
     pshufb          xmm_tmp4, xmm_tmp2
     SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2585,24 +2412,7 @@
     lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
     movdqa          xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpckhbw       xmm_tmp2, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpckhbw       xmm_tmp2, [db80h_256]
-%endif
+    punpckhbw       xmm_tmp2, xmm_db80h
     pshufb          xmm_tmp3, xmm_tmp2
     pshufb          xmm_tmp4, xmm_tmp2
     SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2741,43 +2551,13 @@
 
 %macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
     movdqa          xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x08080808
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    pshufb          xmm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pshufb          xmm_tmp0, [shufb_0000000088888888]
-%endif
+    pshufb          xmm_tmp0, xmm_shufb_0000000088888888
     psubb           xmm_xpos_int, xmm_tmp0
     SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
     mov             r_tmp0, i_xpos
     shr             r_tmp0, 16
     movdqa          xmm_tmp3, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpcklbw       xmm_tmp3, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpcklbw       xmm_tmp3, [db80h_256]
-%endif
+    punpcklbw       xmm_tmp3, xmm_db80h
     lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp2, [p_src_row1 + r_tmp0]
     lea             r_tmp0, [i_xpos + 4 * i_scalex]
@@ -2789,24 +2569,7 @@
     pmaddwd         xmm_tmp2, xmm_tmp0
     SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
     movdqa          xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpckhbw       xmm_tmp2, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpckhbw       xmm_tmp2, [db80h_256]
-%endif
+    punpckhbw       xmm_tmp2, xmm_db80h
     lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp3, [p_src_row1 + r_tmp0]
     pshufb          xmm_tmp4, xmm_tmp2
@@ -2987,7 +2750,11 @@
     movd            xmm0, arg8
     movd            xmm1, esp
     and             esp, -16
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 9 * 16        ; two extra 16-byte slots (7 and 8) hold xmm_db80h and xmm_shufb_0000000088888888
+%else
     sub             esp, 8 * 4 + 7 * 16
+%endif
     movd            [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -3021,6 +2788,22 @@
     %define xmm_0                   [esp + 8 * 4 + 4 * 16]
     %define xmm_xpos_int_begin      [esp + 8 * 4 + 5 * 16]
     %define xmm_xpos_frac_begin     [esp + 8 * 4 + 6 * 16]
+%ifdef X86_32_PICASM                                           ; PIC build: synthesize both constants into stack slots instead of referencing data labels
+    %define xmm_db80h                  [esp + 8 * 4 + 7 * 16]
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 16]
+    pxor            xmm_tmp4, xmm_tmp4                          ; tmp4 = 0
+    pcmpeqb         xmm_tmp5, xmm_tmp5                          ; tmp5 = 0xff in every byte
+    psubb           xmm_tmp4, xmm_tmp5                          ; tmp4 = 0 - (-1) = 0x01 in every byte
+    movdqa          xmm_tmp3, xmm_tmp4
+    psllw           xmm_tmp3, 3                                 ; each word 0x0101 << 3 = 0x0808, i.e. 0x08 in every byte
+    pslldq          xmm_tmp3, 8                                 ; byte shift: low 8 bytes = 0x00, high 8 bytes = 0x08
+    movdqa          xmm_shufb_0000000088888888, xmm_tmp3        ; store the mask {0 x8, 8 x8} to its stack slot
+    psllw           xmm_tmp4, 7                                 ; each word 0x0101 << 7 = 0x8080, i.e. 0x80 in every byte
+    movdqa          xmm_db80h, xmm_tmp4                         ; store 16 bytes of 0x80 to its stack slot
+%else
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -3067,6 +2850,8 @@
     %define xmm_tmp5                xmm6
     %define xmm_xpos_int_begin      xmm14
     %define xmm_xpos_frac_begin     xmm15
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
     pxor            xmm_0, xmm_0
 %endif
 
@@ -3230,6 +3015,8 @@
 %undef xmm_xfrac0_begin
 %undef xmm_xfrac1_begin
 %undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
 
 ;**************************************************************************************************************
 ;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -3265,7 +3052,11 @@
     movd            xmm0, arg8
     movd            xmm1, esp
     and             esp, -16
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 10 * 16    ; two extra 16-byte slots vs. non-PIC (presumably xmm_db80h and xmm_shufb_0000000088888888)
+%else
     sub             esp, 8 * 4 + 8 * 16
+%endif
     movd            [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -3300,6 +3091,22 @@
     %define xmm_7fff                [esp + 8 * 4 + 5 * 16]
     %define xmm_xpos_int_begin      [esp + 8 * 4 + 6 * 16]
     %define xmm_xpos_frac_begin     [esp + 8 * 4 + 7 * 16]
+%ifdef X86_32_PICASM
+    %define xmm_db80h                  [esp + 8 * 4 + 8 * 16]
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 16]
+    pxor            xmm_tmp4, xmm_tmp4    ; all bytes 0x00
+    pcmpeqb         xmm_tmp5, xmm_tmp5    ; all bytes 0xff
+    psubb           xmm_tmp4, xmm_tmp5    ; 0x00 - 0xff: all bytes 0x01
+    movdqa          xmm_tmp3, xmm_tmp4
+    psllw           xmm_tmp3, 3    ; words 0x0101 << 3 = 0x0808: all bytes 0x08
+    pslldq          xmm_tmp3, 8    ; low 8 bytes 0x00, high 8 bytes 0x08
+    movdqa          xmm_shufb_0000000088888888, xmm_tmp3    ; keep the mask in its stack slot
+    psllw           xmm_tmp4, 7    ; words 0x0101 << 7 = 0x8080: all bytes 0x80
+    movdqa          xmm_db80h, xmm_tmp4    ; keep the 0x80 byte pattern in its stack slot
+%else
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -3350,6 +3157,8 @@
     %define xmm_7fff                xmm13
     %define xmm_xpos_int_begin      xmm14
     %define xmm_xpos_frac_begin     xmm15
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
     pxor            xmm_0, xmm_0
     pcmpeqw         xmm_7fff, xmm_7fff
     psrlw           xmm_7fff, 1
@@ -3517,6 +3326,8 @@
 %undef xmm_xfrac0_begin
 %undef xmm_xfrac1_begin
 %undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
 
 %ifdef HAVE_AVX2
 ; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
@@ -3585,20 +3396,7 @@
 %endmacro
 
 %macro AVX2_BilinearFastDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x08080808
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_0000000088888888
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
@@ -3642,20 +3440,7 @@
 %endmacro
 
 %macro AVX2_BilinearFastDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x0c0c0c0c
-    push            0x08080808
-    push            0x04040404
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_000044448888CCCC
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     mov             r_tmp0, i_xpos
@@ -3894,20 +3679,7 @@
 %endmacro
 
 %macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x08080808    ;shufb_0000000088888888
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_0000000088888888
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
@@ -3922,24 +3694,7 @@
     lea             r_tmp0, [i_xpos + 2 * i_scalex2]
     lea             i_xpos, [r_tmp0 + 4 * i_scalex2]
     shr             r_tmp0, 16
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
     vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0
@@ -3952,24 +3707,7 @@
     shr             r_tmp0, 16
     vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
     vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+    vpunpckhbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
     vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
@@ -3985,20 +3723,7 @@
 %endmacro
 
 %macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x0c0c0c0c    ;shufb_000044448888cccc
-    push            0x08080808
-    push            0x04040404
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_000044448888CCCC
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     mov             r_tmp0, i_xpos
@@ -4019,24 +3744,7 @@
     shr             r_tmp0, 16
     vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
     vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
     vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
@@ -4313,7 +4021,11 @@
     vmovd           xmm0, arg8
     vmovd           xmm1, esp
     and             esp, -32
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 9 * 32    ; one extra 32-byte slot vs. non-PIC (presumably shared by the two 16-byte shuffle masks)
+%else
     sub             esp, 8 * 4 + 8 * 32
+%endif
     vmovd           [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -4354,6 +4066,22 @@
     %define ymm_ffff                [esp + 8 * 4 + 5 * 32]
     %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
     %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 32]
+    %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 8 * 32 + 16]
+    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4    ; all bytes 0x00
+    vpcmpeqb        ymm_tmp5, ymm_tmp5, ymm_tmp5    ; all bytes 0xff
+    vpsubb          ymm_tmp4, ymm_tmp4, ymm_tmp5    ; 0x00 - 0xff: all bytes 0x01
+    vpsllw          ymm_tmp3, ymm_tmp4, 3    ; words 0x0101 << 3 = 0x0808: all bytes 0x08
+    vpslldq         ymm_tmp3, ymm_tmp3, 8    ; per 128-bit lane: low 8 bytes 0x00, high 8 bytes 0x08
+    vmovdqa         xmm_shufb_0000000088888888, xmm_tmp3    ; keep the 16-byte mask in its stack slot
+    vpsllq          ymm_tmp5, ymm_tmp4, 34    ; per qword: low 4 bytes 0x00, high 4 bytes 0x04
+    vpaddb          ymm_tmp5, ymm_tmp5, ymm_tmp3    ; per lane: bytes 0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12
+    vmovdqa         xmm_shufb_000044448888CCCC, xmm_tmp5    ; keep the 16-byte mask in its stack slot
+%else
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -4409,6 +4137,8 @@
     %define ymm_ffff                ymm13
     %define ymm_xpos_int_begin      ymm14
     %define ymm_xpos_frac_begin     ymm15
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
     vpxor           ymm_0, ymm_0, ymm_0
     vpcmpeqw        ymm_ffff, ymm_ffff, ymm_ffff
 %endif
@@ -4597,6 +4327,8 @@
 %undef ymm_xfrac0_begin
 %undef ymm_xfrac1_begin
 %undef ymm_xfrac_inc
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
 
 ;**************************************************************************************************************
 ;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -4632,7 +4364,11 @@
     vmovd           xmm0, arg8
     vmovd           xmm1, esp
     and             esp, -32
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 10 * 32    ; two extra 32-byte slots vs. non-PIC (presumably ymm_db80h plus the two 16-byte shuffle masks)
+%else
     sub             esp, 8 * 4 + 8 * 32
+%endif
     vmovd           [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -4673,6 +4409,26 @@
     %define ymm_7fff                [esp + 8 * 4 + 5 * 32]
     %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
     %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+    %define ymm_db80h                  [esp + 8 * 4 + 8 * 32]
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 32]
+    %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 9 * 32 + 16]
+    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4    ; all bytes 0x00
+    vpcmpeqb        ymm_tmp5, ymm_tmp5, ymm_tmp5    ; all bytes 0xff
+    vpsubb          ymm_tmp4, ymm_tmp4, ymm_tmp5    ; 0x00 - 0xff: all bytes 0x01
+    vpsllw          ymm_tmp3, ymm_tmp4, 3    ; words 0x0101 << 3 = 0x0808: all bytes 0x08
+    vpslldq         ymm_tmp3, ymm_tmp3, 8    ; per 128-bit lane: low 8 bytes 0x00, high 8 bytes 0x08
+    vmovdqa         xmm_shufb_0000000088888888, xmm_tmp3    ; keep the 16-byte mask in its stack slot
+    vpsllq          ymm_tmp5, ymm_tmp4, 34    ; per qword: low 4 bytes 0x00, high 4 bytes 0x04
+    vpaddb          ymm_tmp5, ymm_tmp5, ymm_tmp3    ; per lane: bytes 0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12
+    vmovdqa         xmm_shufb_000044448888CCCC, xmm_tmp5    ; keep the 16-byte mask in its stack slot
+    vpsllw          ymm_tmp4, ymm_tmp4, 7    ; words 0x0101 << 7 = 0x8080: all bytes 0x80
+    vmovdqa         ymm_db80h, ymm_tmp4    ; keep the 32-byte 0x80 pattern in its stack slot
+%else
+    %define ymm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -4729,6 +4485,9 @@
     %define ymm_7fff                ymm13
     %define ymm_xpos_int_begin      ymm14
     %define ymm_xpos_frac_begin     ymm15
+    %define ymm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
     vpxor           ymm_0, ymm_0, ymm_0
     vpcmpeqw        ymm_7fff, ymm_7fff, ymm_7fff
     vpsrlw          ymm_7fff, ymm_7fff, 1
@@ -4920,5 +4679,8 @@
 %undef ymm_xfrac0_begin
 %undef ymm_xfrac1_begin
 %undef ymm_xfrac_inc
-%endif
+%undef ymm_db80h
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
 
+%endif
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -77,18 +77,14 @@
 }
 
 #ifdef X86_32_ASM
-#ifndef X86_32_PICASM
 TEST (CavlcTest, CavlcParamCal_sse2) {
   TestCavlcParamCal (CavlcParamCal_sse2);
 }
 #endif
-#endif
 
 #ifdef X86_ASM
-#ifndef X86_32_PICASM
 TEST (CavlcTest, CavlcParamCal_sse42) {
   if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
     TestCavlcParamCal (CavlcParamCal_sse42);
 }
-#endif
 #endif
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -222,7 +222,6 @@
 }
 #endif //HAVE_AVX2
 
-#ifndef X86_32_PICASM
 TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
   CMemoryAlign cMemoryAlign (0);
   ALLOC_MEMORY (int16_t, iDctC, 16);
@@ -236,7 +235,6 @@
   FREE_MEMORY (iDctC);
   FREE_MEMORY (iDctS);
 }
-#endif //#ifndef X86_32_PICASM
 #endif
 
 void copy (uint8_t* pDst, int32_t iDStride, uint8_t* pSrc, int32_t iSStride, int32_t iWidth, int32_t iHeight) {
@@ -304,11 +302,9 @@
   TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
 }
 #ifdef X86_ASM
-#ifndef X86_32_PICASM
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
   TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
 }
-#endif
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
   if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
     TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);