shithub: openh264

Download patch

ref: 578535509a789530c6bb9de061419a34fc8a1447
parent: 4769d4d0b8917e16106747442e24336cf44f5ad8
author: Sindre Aamås <saamas@cisco.com>
date: Tue Mar 7 09:36:08 EST 2017

[Processing/x86] Simplify downsample_bilinear X86_32_PICASM handling

Utilize program counter-relative offsets to simplify X86_32_PICASM
code.

In order for this to work with nasm, data constants are placed in
the text segment.

Some routines avoid text-segment constants altogether: where it is
convenient and efficient, their constants are generated at run time
and kept on the stack instead.

--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -57,7 +57,11 @@
 ; Local Data (Read Only)
 ;***********************************************************************
 
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
 SECTION .rodata align=32
+%endif
 
 ;***********************************************************************
 ; Various memory constants (trigonometric values or rounding values)
@@ -64,6 +68,7 @@
 ;***********************************************************************
 
 ALIGN 32
+%ifndef X86_32_PICASM
 db80h_256:
     times 32 db 80h
 shufb_0000000088888888:
@@ -74,6 +79,7 @@
     times 4 db 4
     times 4 db 8
     times 4 db 12
+%endif
 shufb_mask_low:
     db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
 shufb_mask_high:
@@ -1253,20 +1259,7 @@
     pmaddwd     xmm2,   xmm1
     pshufd  xmm1,   xmm2,   00000001b
     paddd   xmm2,   xmm1
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xffffffe0
-    push    0x00000000
-    push    0x00000000
-    push    0x00000000
-    push    0x00004000
-    movdqa  xmm1,   [esp]
-    mov     esp, r0
-    pop     r0
-%else
     movdqa  xmm1,   [add_extra_half]
-%endif
     paddd   xmm2,   xmm1
     psrld   xmm2,   15
 
@@ -1567,20 +1560,7 @@
     pmaddwd     xmm2,   xmm1
     pshufd  xmm1,   xmm2,   00000001b
     paddd   xmm2,   xmm1
-%ifdef X86_32_PICASM
-    push    r0
-    mov     r0, esp
-    and     esp, 0xffffffe0
-    push    0x00000000
-    push    0x00000000
-    push    0x00000000
-    push    0x00004000
-    movdqa  xmm1,   [esp]
-    mov     esp, r0
-    pop     r0
-%else
     movdqa  xmm1,   [add_extra_half]
-%endif
     paddd   xmm2,   xmm1
     psrld   xmm2,   15
 
@@ -1657,6 +1637,12 @@
     SIGN_EXTENSION r3, r3d
     SIGN_EXTENSION r4, r4d
     SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+    %define i_height dword arg6
+%else
+    %define i_height r5
+%endif
+    INIT_X86_32_PIC_NOPRESERVE r5
 
 %ifndef X86_32
     push r12
@@ -1664,7 +1650,7 @@
 %endif
 
     mov r6, r1             ;Save the tailer for the unasigned size
-    imul r6, r5
+    imul r6, i_height
     add r6, r0
     movdqa xmm7, [r6]
 
@@ -1697,52 +1683,15 @@
     ;1st line
     movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
     movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080    ;shufb_mask_onethird_low_1
-    push   0x80808080
-    push   0x80800f0c
-    push   0x09060300
-    push   0x80808080    ;shufb_mask_onethird_high_1
-    push   0x80808080
-    push   0x8080800d
-    push   0x0a070401
-    push   0x80808080    ;shufb_mask_onethird_low_2
-    push   0x800e0b08
-    push   0x05028080
-    push   0x80808080
-    push   0x80808080    ;shufb_mask_onethird_high_2
-    push   0x800f0c09
-    push   0x06030080
-    push   0x80808080
-    push   0x0d0a0704    ;shufb_mask_onethird_low_3
-    push   0x01808080
-    push   0x80808080
-    push   0x80808080
-    push   0x0e0b0805    ;shufb_mask_onethird_high_3
-    push   0x02808080
-    push   0x80808080
-    push   0x80808080
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
     pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
 
     movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
 
@@ -1751,13 +1700,8 @@
 
     movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
 
@@ -1768,25 +1712,15 @@
     ;2nd line
     movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
 
     movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
 
@@ -1795,15 +1729,8 @@
 
     movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
 
@@ -1832,7 +1759,7 @@
     lea r0, [r0+r1]
     lea r0, [r0+r6]                             ;current dst lien + 1 line
 
-    dec r5
+    dec i_height
     jg near .yloops_onethird_sse3
 
     movdqa [r0], xmm7                           ;restore the tailer for the unasigned size
@@ -1841,6 +1768,7 @@
     pop r12
 %endif
 
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -1847,6 +1775,7 @@
     pop r6
 %endif
     ret
+%undef i_height
 
 ;***********************************************************************
 ;   void DyadicBilinearOneThirdDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
@@ -1866,6 +1795,12 @@
     SIGN_EXTENSION r3, r3d
     SIGN_EXTENSION r4, r4d
     SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+    %define i_height dword arg6
+%else
+    %define i_height r5
+%endif
+    INIT_X86_32_PIC_NOPRESERVE r5
 
 %ifndef X86_32
     push r12
@@ -1873,7 +1808,7 @@
 %endif
 
     mov r6, r1             ;Save the tailer for the unasigned size
-    imul r6, r5
+    imul r6, i_height
     add r6, r0
     movdqa xmm7, [r6]
 
@@ -1906,52 +1841,15 @@
     ;1st line
     movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
     movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080    ;shufb_mask_onethird_low_1
-    push   0x80808080
-    push   0x80800f0c
-    push   0x09060300
-    push   0x80808080    ;shufb_mask_onethird_high_1
-    push   0x80808080
-    push   0x8080800d
-    push   0x0a070401
-    push   0x80808080    ;shufb_mask_onethird_low_2
-    push   0x800e0b08
-    push   0x05028080
-    push   0x80808080
-    push   0x80808080    ;shufb_mask_onethird_high_2
-    push   0x800f0c09
-    push   0x06030080
-    push   0x80808080
-    push   0x0d0a0704    ;shufb_mask_onethird_low_3
-    push   0x01808080
-    push   0x80808080
-    push   0x80808080
-    push   0x0e0b0805    ;shufb_mask_onethird_high_3
-    push   0x02808080
-    push   0x80808080
-    push   0x80808080
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
     pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
 
     movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
 
@@ -1960,13 +1858,8 @@
 
     movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
 
@@ -1977,25 +1870,15 @@
     ;2nd line
     movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
     movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+80]
-    movdqa xmm6, [esp+64]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_1]
-    movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
     pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
 
     movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+48]
-    movdqa xmm6, [esp+32]
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_2]
-    movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
     pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
 
@@ -2004,15 +1887,8 @@
 
     movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
     movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
-    movdqa xmm5, [esp+16]
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm5, [shufb_mask_onethird_low_3]
-    movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
     pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
 
@@ -2041,7 +1917,7 @@
     lea r0, [r0+r1]
     lea r0, [r0+r6]                             ;current dst lien + 1 line
 
-    dec r5
+    dec i_height
     jg near .yloops_onethird_sse4
 
     movdqa [r0], xmm7                           ;restore the tailer for the unasigned size
@@ -2050,6 +1926,7 @@
     pop r12
 %endif
 
+    DEINIT_X86_32_PIC
     POP_XMM
     LOAD_6_PARA_POP
 %ifdef X86_32
@@ -2056,6 +1933,7 @@
     pop r6
 %endif
     ret
+%undef i_height
 
 ;***********************************************************************
 ;   void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
@@ -2256,20 +2134,10 @@
     add r6, r0
     movq xmm7, [r6]
 
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080
-    push   0x0d090501
-    push   0x80808080
-    push   0x0c080400
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm6, [shufb_mask_quarter]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4
+    movdqa xmm6, [pic(shufb_mask_quarter)]
+    DEINIT_X86_32_PIC
+
 .yloops_quarter_sse3:
     ;mov eax, [esp+40]   ; iSrcWidth
     ;sar eax, $02            ; iSrcWidth >> 2
@@ -2378,20 +2246,9 @@
     add r6, r0
     movq xmm7, [r6]
 
-%ifdef X86_32_PICASM
-    push   r0
-    mov    r0, esp
-    and    esp, 0xfffffff0
-    push   0x80808080
-    push   0x0d090501
-    push   0x80808080
-    push   0x0c080400
-    movdqa xmm6, [esp]
-    mov    esp, r0
-    pop    r0
-%else
-    movdqa xmm6, [shufb_mask_quarter]    ;mask
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4
+    movdqa xmm6, [pic(shufb_mask_quarter)]    ;mask
+    DEINIT_X86_32_PIC
 
 .yloops_quarter_sse4:
 %ifdef X86_32
@@ -2534,20 +2391,7 @@
 
 %macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
     movdqa          xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x08080808
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    pshufb          xmm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pshufb          xmm_tmp0, [shufb_0000000088888888]
-%endif
+    pshufb          xmm_tmp0, xmm_shufb_0000000088888888
     psubb           xmm_xpos_int, xmm_tmp0
     SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
     mov             r_tmp0, i_xpos
@@ -2555,24 +2399,7 @@
     lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
     movdqa          xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpcklbw       xmm_tmp2, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpcklbw       xmm_tmp2, [db80h_256]
-%endif
+    punpcklbw       xmm_tmp2, xmm_db80h
     pshufb          xmm_tmp3, xmm_tmp2
     pshufb          xmm_tmp4, xmm_tmp2
     SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2585,24 +2412,7 @@
     lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
     movdqa          xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpckhbw       xmm_tmp2, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpckhbw       xmm_tmp2, [db80h_256]
-%endif
+    punpckhbw       xmm_tmp2, xmm_db80h
     pshufb          xmm_tmp3, xmm_tmp2
     pshufb          xmm_tmp4, xmm_tmp2
     SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2741,43 +2551,13 @@
 
 %macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
     movdqa          xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x08080808
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    pshufb          xmm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    pshufb          xmm_tmp0, [shufb_0000000088888888]
-%endif
+    pshufb          xmm_tmp0, xmm_shufb_0000000088888888
     psubb           xmm_xpos_int, xmm_tmp0
     SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
     mov             r_tmp0, i_xpos
     shr             r_tmp0, 16
     movdqa          xmm_tmp3, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpcklbw       xmm_tmp3, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpcklbw       xmm_tmp3, [db80h_256]
-%endif
+    punpcklbw       xmm_tmp3, xmm_db80h
     lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp2, [p_src_row1 + r_tmp0]
     lea             r_tmp0, [i_xpos + 4 * i_scalex]
@@ -2789,24 +2569,7 @@
     pmaddwd         xmm_tmp2, xmm_tmp0
     SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
     movdqa          xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    punpckhbw       xmm_tmp2, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    punpckhbw       xmm_tmp2, [db80h_256]
-%endif
+    punpckhbw       xmm_tmp2, xmm_db80h
     lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp3, [p_src_row1 + r_tmp0]
     pshufb          xmm_tmp4, xmm_tmp2
@@ -2987,7 +2750,11 @@
     movd            xmm0, arg8
     movd            xmm1, esp
     and             esp, -16
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 9 * 16
+%else
     sub             esp, 8 * 4 + 7 * 16
+%endif
     movd            [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -3021,6 +2788,22 @@
     %define xmm_0                   [esp + 8 * 4 + 4 * 16]
     %define xmm_xpos_int_begin      [esp + 8 * 4 + 5 * 16]
     %define xmm_xpos_frac_begin     [esp + 8 * 4 + 6 * 16]
+%ifdef X86_32_PICASM
+    %define xmm_db80h                  [esp + 8 * 4 + 7 * 16]
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 16]
+    pxor            xmm_tmp4, xmm_tmp4
+    pcmpeqb         xmm_tmp5, xmm_tmp5
+    psubb           xmm_tmp4, xmm_tmp5
+    movdqa          xmm_tmp3, xmm_tmp4
+    psllw           xmm_tmp3, 3
+    pslldq          xmm_tmp3, 8
+    movdqa          xmm_shufb_0000000088888888, xmm_tmp3
+    psllw           xmm_tmp4, 7
+    movdqa          xmm_db80h, xmm_tmp4
+%else
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -3067,6 +2850,8 @@
     %define xmm_tmp5                xmm6
     %define xmm_xpos_int_begin      xmm14
     %define xmm_xpos_frac_begin     xmm15
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
     pxor            xmm_0, xmm_0
 %endif
 
@@ -3230,6 +3015,8 @@
 %undef xmm_xfrac0_begin
 %undef xmm_xfrac1_begin
 %undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
 
 ;**************************************************************************************************************
 ;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -3265,7 +3052,11 @@
     movd            xmm0, arg8
     movd            xmm1, esp
     and             esp, -16
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 10 * 16
+%else
     sub             esp, 8 * 4 + 8 * 16
+%endif
     movd            [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -3300,6 +3091,22 @@
     %define xmm_7fff                [esp + 8 * 4 + 5 * 16]
     %define xmm_xpos_int_begin      [esp + 8 * 4 + 6 * 16]
     %define xmm_xpos_frac_begin     [esp + 8 * 4 + 7 * 16]
+%ifdef X86_32_PICASM
+    %define xmm_db80h                  [esp + 8 * 4 + 8 * 16]
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 16]
+    pxor            xmm_tmp4, xmm_tmp4
+    pcmpeqb         xmm_tmp5, xmm_tmp5
+    psubb           xmm_tmp4, xmm_tmp5
+    movdqa          xmm_tmp3, xmm_tmp4
+    psllw           xmm_tmp3, 3
+    pslldq          xmm_tmp3, 8
+    movdqa          xmm_shufb_0000000088888888, xmm_tmp3
+    psllw           xmm_tmp4, 7
+    movdqa          xmm_db80h, xmm_tmp4
+%else
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -3350,6 +3157,8 @@
     %define xmm_7fff                xmm13
     %define xmm_xpos_int_begin      xmm14
     %define xmm_xpos_frac_begin     xmm15
+    %define xmm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
     pxor            xmm_0, xmm_0
     pcmpeqw         xmm_7fff, xmm_7fff
     psrlw           xmm_7fff, 1
@@ -3517,6 +3326,8 @@
 %undef xmm_xfrac0_begin
 %undef xmm_xfrac1_begin
 %undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
 
 %ifdef HAVE_AVX2
 ; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
@@ -3585,20 +3396,7 @@
 %endmacro
 
 %macro AVX2_BilinearFastDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xfffffff0
-    push            0x08080808
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_0000000088888888
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
@@ -3642,20 +3440,7 @@
 %endmacro
 
 %macro AVX2_BilinearFastDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r0
-    mov             r0, esp
-    and             esp, 0xffffffe0
-    push            0x0c0c0c0c
-    push            0x08080808
-    push            0x04040404
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r0
-    pop             r0
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_000044448888CCCC
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     mov             r_tmp0, i_xpos
@@ -3894,20 +3679,7 @@
 %endmacro
 
 %macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x08080808    ;shufb_0000000088888888
-    push            0x08080808
-    push            0x00000000
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_0000000088888888
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
@@ -3922,24 +3694,7 @@
     lea             r_tmp0, [i_xpos + 2 * i_scalex2]
     lea             i_xpos, [r_tmp0 + 4 * i_scalex2]
     shr             r_tmp0, 16
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
     vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0
@@ -3952,24 +3707,7 @@
     shr             r_tmp0, 16
     vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
     vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+    vpunpckhbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
     vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
@@ -3985,20 +3723,7 @@
 %endmacro
 
 %macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x0c0c0c0c    ;shufb_000044448888cccc
-    push            0x08080808
-    push            0x04040404
-    push            0x00000000
-    vbroadcasti128  ymm_tmp0, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+    vbroadcasti128  ymm_tmp0, xmm_shufb_000044448888CCCC
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     mov             r_tmp0, i_xpos
@@ -4019,24 +3744,7 @@
     shr             r_tmp0, 16
     vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
     vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
-    push            r5
-    mov             r5, esp
-    and             esp, 0xffffffe0
-    push            0x80808080    ;db80h_256
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    push            0x80808080
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]
-    mov             esp, r5
-    pop             r5
-%else
-    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
     vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
@@ -4313,7 +4021,11 @@
     vmovd           xmm0, arg8
     vmovd           xmm1, esp
     and             esp, -32
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 9 * 32
+%else
     sub             esp, 8 * 4 + 8 * 32
+%endif
     vmovd           [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -4354,6 +4066,22 @@
     %define ymm_ffff                [esp + 8 * 4 + 5 * 32]
     %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
     %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 32]
+    %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 8 * 32 + 16]
+    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
+    vpcmpeqb        ymm_tmp5, ymm_tmp5, ymm_tmp5
+    vpsubb          ymm_tmp4, ymm_tmp4, ymm_tmp5
+    vpsllw          ymm_tmp3, ymm_tmp4, 3
+    vpslldq         ymm_tmp3, ymm_tmp3, 8
+    vmovdqa         xmm_shufb_0000000088888888, xmm_tmp3
+    vpsllq          ymm_tmp5, ymm_tmp4, 34
+    vpaddb          ymm_tmp5, ymm_tmp5, ymm_tmp3
+    vmovdqa         xmm_shufb_000044448888CCCC, xmm_tmp5
+%else
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -4409,6 +4137,8 @@
     %define ymm_ffff                ymm13
     %define ymm_xpos_int_begin      ymm14
     %define ymm_xpos_frac_begin     ymm15
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
     vpxor           ymm_0, ymm_0, ymm_0
     vpcmpeqw        ymm_ffff, ymm_ffff, ymm_ffff
 %endif
@@ -4597,6 +4327,8 @@
 %undef ymm_xfrac0_begin
 %undef ymm_xfrac1_begin
 %undef ymm_xfrac_inc
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
 
 ;**************************************************************************************************************
 ;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -4632,7 +4364,11 @@
     vmovd           xmm0, arg8
     vmovd           xmm1, esp
     and             esp, -32
+%ifdef X86_32_PICASM
+    sub             esp, 8 * 4 + 10 * 32
+%else
     sub             esp, 8 * 4 + 8 * 32
+%endif
     vmovd           [esp], xmm1
     %define p_dst                   r0
     %define i_dst_stride_less_width [esp + 1 * 4]
@@ -4673,6 +4409,26 @@
     %define ymm_7fff                [esp + 8 * 4 + 5 * 32]
     %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
     %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+    %define ymm_db80h                  [esp + 8 * 4 + 8 * 32]
+    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 32]
+    %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 9 * 32 + 16]
+    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
+    vpcmpeqb        ymm_tmp5, ymm_tmp5, ymm_tmp5
+    vpsubb          ymm_tmp4, ymm_tmp4, ymm_tmp5
+    vpsllw          ymm_tmp3, ymm_tmp4, 3
+    vpslldq         ymm_tmp3, ymm_tmp3, 8
+    vmovdqa         xmm_shufb_0000000088888888, xmm_tmp3
+    vpsllq          ymm_tmp5, ymm_tmp4, 34
+    vpaddb          ymm_tmp5, ymm_tmp5, ymm_tmp3
+    vmovdqa         xmm_shufb_000044448888CCCC, xmm_tmp5
+    vpsllw          ymm_tmp4, ymm_tmp4, 7
+    vmovdqa         ymm_db80h, ymm_tmp4
+%else
+    %define ymm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
     mov             i_dst_stride_less_width, r1
     mov             i_dst_width, r2
     mov             i_dst_height, r3
@@ -4729,6 +4485,9 @@
     %define ymm_7fff                ymm13
     %define ymm_xpos_int_begin      ymm14
     %define ymm_xpos_frac_begin     ymm15
+    %define ymm_db80h               [db80h_256]
+    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
     vpxor           ymm_0, ymm_0, ymm_0
     vpcmpeqw        ymm_7fff, ymm_7fff, ymm_7fff
     vpsrlw          ymm_7fff, ymm_7fff, 1
@@ -4920,5 +4679,8 @@
 %undef ymm_xfrac0_begin
 %undef ymm_xfrac1_begin
 %undef ymm_xfrac_inc
-%endif
+%undef ymm_db80h
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
 
+%endif