ref: 659ff14af50bc184bac74dd058010143cf9e5734
parent: c82c19022b5a0bed5845a4e6dcb03a36991224b9
author: Sindre Aamås <saamas@cisco.com>
date: Tue Mar 7 09:21:17 EST 2017
[Encoder/x86] Simplify intra_pred X86_32_PICASM handling. Utilize program-counter-relative offsets to simplify the X86_32_PICASM code. In order for this to work with nasm, the data constants are placed in the text segment.
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -45,7 +45,11 @@
; Local Data (Read Only)
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
align 16
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -144,20 +148,7 @@
%macro COPY_16_TIMES 2
movdqa %2, [%1-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -164,20 +155,7 @@
%macro COPY_16_TIMESS 3
movdqa %2, [%1+%3-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -215,30 +193,16 @@
WELS_EXTERN WelsI4x4LumaPredH_sse2
push r3
%assign push_num 1
+ INIT_X86_32_PIC r4
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movzx r3, byte [r1-1]
movd xmm0, r3d
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq xmm0, [esp]
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
movzx r3, byte [r1+r2-1]
movd xmm1, r3d
-%ifdef X86_32_PICASM
- pmuludq xmm1, [esp]
-%else
- pmuludq xmm1, [mmx_01bytes]
-%endif
+ pmuludq xmm1, [pic(mmx_01bytes)]
unpcklps xmm0, xmm1
@@ -245,26 +209,17 @@
lea r1, [r1+r2*2]
movzx r3, byte [r1-1]
movd xmm2, r3d
-%ifdef X86_32_PICASM
- pmuludq xmm2, [esp]
-%else
- pmuludq xmm2, [mmx_01bytes]
-%endif
+ pmuludq xmm2, [pic(mmx_01bytes)]
movzx r3, byte [r1+r2-1]
movd xmm3, r3d
-%ifdef X86_32_PICASM
- pmuludq xmm3, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq xmm3, [mmx_01bytes]
-%endif
+ pmuludq xmm3, [pic(mmx_01bytes)]
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
movdqa [r0], xmm0
+ DEINIT_X86_32_PIC
pop r3
ret
@@ -275,6 +230,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -284,34 +240,11 @@
;for H
pxor xmm7, xmm7
movq xmm0, [r1]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x00010002 ;sse2_plane_dec
- push 0x00030004
- push 0x00050006
- push 0x00070008
- push 0x00080007 ;sse_plane_inc
- push 0x00060005
- push 0x00040003
- push 0x00020001
- push 0x0000ffff ;sse_plane_inc_minus
- push 0xfffefffd
- push 0xfffcfffb
- push 0xfffafff9
- movdqa xmm5, [esp+32]
-%else
- movdqa xmm5, [sse2_plane_dec]
-%endif
+ movdqa xmm5, [pic(sse2_plane_dec)]
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
movq xmm1, [r1 + 9]
-%ifdef X86_32_PICASM
- movdqa xmm6, [esp+16]
-%else
- movdqa xmm6, [sse2_plane_inc]
-%endif
+ movdqa xmm6, [pic(sse2_plane_inc)]
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
@@ -357,13 +290,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r3, r3
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm5, [sse2_plane_inc_minus]
-%endif
+ movdqa xmm5, [pic(sse2_plane_inc_minus)]
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -382,6 +309,7 @@
cmp r3, 16
jnz get_i16x16_luma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
ret
@@ -393,6 +321,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -401,30 +330,11 @@
pxor mm7, mm7
movq mm0, [r1]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x00010002 ;sse2_plane_dec_c
- push 0x00030004
- push 0x00040003 ;sse2_plane_inc_c
- push 0x00020001
- push 0x00040003 ;sse2_plane_mul_b_c
- push 0x00020001
- push 0x0000ffff
- push 0xfffefffd
- movq mm5, [esp+24]
-%else
- movq mm5, [sse2_plane_dec_c]
-%endif
+ movq mm5, [pic(sse2_plane_dec_c)]
punpcklbw mm0, mm7
pmullw mm0, mm5
movq mm1, [r1 + 5]
-%ifdef X86_32_PICASM
- movq mm6, [esp+16]
-%else
- movq mm6, [sse2_plane_inc_c]
-%endif
+ movq mm6, [pic(sse2_plane_inc_c)]
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
@@ -474,13 +384,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r3, r3
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm5, [sse2_plane_mul_b_c]
-%endif
+ movdqa xmm5, [pic(sse2_plane_mul_b_c)]
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -495,6 +399,7 @@
cmp r3, 8
jnz get_i_chroma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -514,6 +419,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
@@ -539,18 +445,7 @@
movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
pxor mm1,mm4 ;find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm1,[esp] ;set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm1,[mmx_01bytes] ;set the odd bit
-%endif
+ pand mm1,[pic(mmx_01bytes)] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
@@ -561,6 +456,7 @@
movd [r0+4],mm2
psrlq mm2,8
movd [r0],mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -619,20 +515,7 @@
psrlq %1, 38h
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -642,20 +525,7 @@
psrlq %1, 38h
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -662,6 +532,7 @@
WELS_EXTERN WelsIChromaPredH_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movq mm0, [r1-8]
@@ -668,20 +539,7 @@
psrlq mm0, 38h
;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw mm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw mm0, [mmx_01bytes]
-%endif
+ pmullw mm0, [pic(mmx_01bytes)]
pshufw mm0, mm0, 0
movq [r0], mm0
@@ -701,6 +559,7 @@
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -767,6 +626,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -791,18 +651,7 @@
pavgb mm1, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
@@ -824,6 +673,7 @@
movd [r0+8], mm3
psrlq mm3, 10h
movd [r0+4], mm3
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -855,6 +705,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movd mm0, [r1-4] ; mm0[3] = l0
@@ -881,18 +732,7 @@
pavgb mm2, mm0
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm5, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm5, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm5, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
@@ -912,6 +752,7 @@
movd [r0+4], mm1
psrlq mm1, 10h
movd [r0+8], mm1
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -947,6 +788,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -971,18 +813,7 @@
pavgb mm2, mm0
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
movq mm3, mm0
@@ -1011,6 +842,7 @@
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
movd [r0+12], mm5
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1042,6 +874,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1060,18 +893,7 @@
movq mm3, mm1
pavgb mm1, mm2
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
@@ -1084,6 +906,7 @@
movd [r0+8], mm0
psrlq mm0, 8h
movd [r0+12], mm0
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1119,6 +942,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1135,18 +959,7 @@
movq mm4, mm2
pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
@@ -1158,6 +971,7 @@
movd [r0+4], mm2
psrlq mm2, 8h
movd [r0+12], mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1169,6 +983,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1208,18 +1023,7 @@
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00000000
- push 0x00000002
- movq mm4, [esp]
- mov esp, r0
- pop r0
-%else
- movq mm4, [mmx_0x02]
-%endif
+ movq mm4, [pic(mmx_0x02)]
paddq mm0, mm4
psrlq mm0, 0x02
@@ -1235,32 +1039,13 @@
paddq mm1, mm4
psrlq mm1, 0x03
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq mm0, [esp]
- pmuludq mm3, [esp]
-%else
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
-%endif
+ pmuludq mm0, [pic(mmx_01bytes)]
+ pmuludq mm3, [pic(mmx_01bytes)]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-%ifdef X86_32_PICASM
- pmuludq mm2, [esp]
- pmuludq mm1, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
-%endif
+ pmuludq mm2, [pic(mmx_01bytes)]
+ pmuludq mm1, [pic(mmx_01bytes)]
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
@@ -1274,6 +1059,7 @@
movq [r0+0x30], mm1
movq [r0+0x38], mm1
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -1289,6 +1075,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1316,20 +1103,7 @@
movd xmm1, r3d
paddw xmm0, xmm1
psrld xmm0, 0x05
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq xmm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
pshufd xmm0, xmm0, 0
movdqa [r0], xmm0
@@ -1349,6 +1123,7 @@
movdqa [r0+0xe0], xmm0
movdqa [r0+0xf0], xmm0
+ DEINIT_X86_32_PIC
pop r4
pop r3
ret