shithub: openh264

Download patch

ref: eb238e654972292004294b2cc49093bdebb759d0
parent: 1449c83f57ad535cadc79a4f81eab95592c5858e
author: Martin Storsjö <martin@martin.st>
date: Fri Mar 14 10:32:41 EDT 2014

Use the SIGN_EXTENSION macro where possible

This shortens the x86 assembly by 134 lines in total.

--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -442,12 +442,10 @@
     %assign  push_num 0
     LOAD_7_PARA
 
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-	movsx	r6, r6d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+	SIGN_EXTENSION	r6, r6d
 
 ALIGN 4
 .height_loop:
@@ -491,12 +489,10 @@
     %assign  push_num 0
     LOAD_7_PARA
 
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-	movsx	r6, r6d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+	SIGN_EXTENSION	r6, r6d
 
 ALIGN 4
 .height_loop:
@@ -531,12 +527,10 @@
 
     %assign  push_num 0
     LOAD_7_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-	movsx	r6, r6d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+	SIGN_EXTENSION	r6, r6d
 ALIGN 4
 .height_loop:
 	movdqu      xmm0, [r2]
@@ -596,11 +590,9 @@
     %assign  push_num 1
     LOAD_5_PARA
 
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 
 ALIGN 4
 .height_loop:
@@ -633,11 +625,9 @@
     %assign  push_num 0
     LOAD_5_PARA
 
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 
 ALIGN 4
 .height_loop:
@@ -680,11 +670,9 @@
 
     %assign  push_num 0
     LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 ALIGN 4
 .height_loop:
     SSE_READ_UNA	xmm0, r0
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -82,11 +82,9 @@
 
 	%assign  push_num 0
 	LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
 
 	;mov eax, [esp +12 + 20]
 
@@ -174,11 +172,9 @@
 
 	%assign  push_num 0
 	LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
 
 	;mov eax, [esp +12 + 20]
 	movd xmm3, [r4]
@@ -268,11 +264,9 @@
 	;push edi
 	%assign  push_num 0
 	LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
 
 	;mov eax, [esp + 12 + 20]
 
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -94,11 +94,9 @@
 
     %assign  push_num 0
     LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 
 	sub r0, 2
 	WELS_Zero mm7
@@ -198,11 +196,9 @@
 
 	%assign  push_num 0
     LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 	pxor xmm7, xmm7
 
 	sub r0, r1				;;;;;;;;need more 5 lines.
@@ -260,11 +256,9 @@
 
 	%assign  push_num 0
     LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 	lea r0, [r0-2]            ;pSrc -= 2;
 
 	pxor xmm7, xmm7
@@ -325,11 +319,9 @@
 
 	%assign  push_num 0
     LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 	lea r0, [r0-2]            ;pSrc -= 2;
 
 	pxor xmm7, xmm7
@@ -416,11 +408,9 @@
 
 	%assign  push_num 0
     LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
 	sub r0, r1
 	sub r0, r1
 
@@ -526,12 +516,10 @@
 
 	%assign  push_num 0
     LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
 
 %ifndef X86_32
 	push r12
@@ -675,12 +663,10 @@
 
 	%assign  push_num 0
     LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
 	sub r0, 2
 	pxor xmm7, xmm7
 
@@ -845,12 +831,10 @@
 
 	%assign  push_num 0
     LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
 	pxor xmm7, xmm7
 	sub r0, r1				;;;;;;;;need more 5 lines.
 	sub r0, r1
@@ -1026,13 +1010,11 @@
 
 	%assign  push_num 0
     LOAD_6_PARA
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
 %ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
-%ifndef X86_32
 	push r12
 	push r13
 	push r14
@@ -1172,12 +1154,10 @@
 
 	%assign  push_num 0
     LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
 %ifndef X86_32
 	push r12
 	push r13
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -59,9 +59,7 @@
         ;push     r0
         %assign push_num 0
         LOAD_2_PARA
-		%ifndef X86_32
-		movsx r1, r1d
-		%endif
+	SIGN_EXTENSION r1, r1d
 	;mov      r0,        [esp+08h]
 	;mov      r1,        [esp+0ch]
 	;lea      r1,        [r1*2]
@@ -140,9 +138,7 @@
 	  ;push      r0
 	  %assign push_num 0
           LOAD_2_PARA
-		  %ifndef X86_32
-		  movsx r1, r1d
-		  %endif
+	  SIGN_EXTENSION r1, r1d
       	  ;mov       r0,     [esp+08h]
 	  ;mov       r1,     [esp+0ch]
 	  lea       r1,     [r1*2]
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -93,9 +93,7 @@
 IdctResAddPred_mmx:
     %assign push_num 0
     LOAD_3_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+    SIGN_EXTENSION r1, r1d
     movq    mm0, [r2+ 0]
     movq    mm1, [r2+ 8]
     movq    mm2, [r2+16]
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -191,9 +191,7 @@
 WelsDecoderI4x4LumaPredH_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	;mov			eax,	[esp+4]			;pPred
 	;mov			ecx,	[esp+8]			;kiStride
 
@@ -232,9 +230,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_2_PARA
-		%ifndef X86_32
-		movsx r1, r1d
-		%endif
+		SIGN_EXTENSION r1, r1d
 		mov r4, r0 ; save r0 in r4
 		;push	esi
 		;mov		esi,	[esp + pushsize + 4]
@@ -341,9 +337,7 @@
 WelsDecoderI16x16LumaPredH_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
     ;mov     eax, [esp+4]    ; pPred
     ;mov     ecx, [esp+8]    ; kiStride
 
@@ -369,9 +363,7 @@
 WelsDecoderI16x16LumaPredV_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
     ;mov     edx, [esp+4]    ; pPred
     ;mov     ecx, [esp+8]    ; kiStride
 
@@ -415,9 +407,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_2_PARA
-		%ifndef X86_32
-		movsx r1, r1d
-		%endif
+		SIGN_EXTENSION r1, r1d
 		mov r4, r0
 		;push	esi
 		;mov		esi,	[esp + pushsize + 4]	;pPred
@@ -521,9 +511,7 @@
 WelsDecoderI4x4LumaPredDDR_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	;mov			edx,[esp+4]			;pPred
 	;mov         eax,edx
@@ -596,9 +584,7 @@
 WelsDecoderIChromaPredH_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	;mov			edx,	[esp+4]			;pPred
 	;mov         eax,	edx
@@ -644,9 +630,7 @@
 WelsDecoderIChromaPredV_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	;mov			eax,		[esp+4]    ;pPred
 	;mov			ecx,		[esp+8]    ;kiStride
 
@@ -703,9 +687,7 @@
 WelsDecoderI4x4LumaPredHD_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	;mov			edx, [esp+4]			; pPred
 	;mov         eax, edx
@@ -792,9 +774,7 @@
 WelsDecoderI4x4LumaPredHU_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	;mov			edx, [esp+4]			; pPred
 	;mov         eax, edx
@@ -884,9 +864,7 @@
 WelsDecoderI4x4LumaPredVR_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	;mov			edx, [esp+4]			; pPred
 	;mov         eax, edx
@@ -977,9 +955,7 @@
 WelsDecoderI4x4LumaPredDDL_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	;mov			edx, [esp+4]			; pPred
 	;mov         eax, edx
@@ -1052,9 +1028,7 @@
 WelsDecoderI4x4LumaPredVL_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	;mov			edx, [esp+4]			; pPred
 	;mov         eax, edx
@@ -1101,9 +1075,7 @@
 	push 	r4
 	%assign push_num 2
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r4, r0
 	;push        ebx
 	;mov         eax, [esp+8]			; pPred
@@ -1209,9 +1181,7 @@
 	push 	r4
 	%assign push_num 2
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r4, r0
 	sub         r0, r1
 	movdqa      xmm0, [r0]             ; read one row
@@ -1296,9 +1266,7 @@
 	;mov ebx, [esp+PUSH_SIZE+8]	; kiStride
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	sub r2, r1
 	movdqa xmm0, [r2]		; pPred-kiStride, top line
@@ -1378,9 +1346,7 @@
 	;mov ebx, [esp+PUSH_SIZE+8]	; kiStride
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	lea r2, [2*r1+r1]		; 3*kiStride
 
 	movdqa xmm0, [sse2_dc_0x80]
@@ -1426,9 +1392,7 @@
 	push r4
 	%assign push_num 2
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r4, r0
 	; for left
 	dec r0
@@ -1507,9 +1471,7 @@
 	;neg ebx
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	sub r2, r1
 	movq xmm0, [r2]		; top: 8x1 pixels
@@ -1554,9 +1516,7 @@
 	;mov ebx, [esp+PUSH_SIZE+8]	; kiStride
 	%assign push_num 0
 	LOAD_2_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	%endif
+	SIGN_EXTENSION r1, r1d
 	lea r2, [2*r1+r1]
 	movq mm0, [sse2_dc_0x80]
 	movq mm1, mm0
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -143,10 +143,8 @@
     ;mov     edx, [esp+24]   ; i_pix2
     %assign push_num 0
     LOAD_5_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	movsx r4, r4d
-	%endif
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r4, r4d
     WELS_Zero    mm7
 
     MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
@@ -182,10 +180,8 @@
 ;%define     pDct        esp+pushsize+20
     %assign push_num 0
     LOAD_5_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	movsx r3, r3d
-	%endif
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
 ;	mov     eax, [pDct   ]
     movq    mm0, [r4+ 0]
     movq    mm1, [r4+ 8]
@@ -332,10 +328,8 @@
     ;mov     edx, [esp+28]   ; i_pix2
     %assign push_num 0
     LOAD_5_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	movsx r4, r4d
-	%endif
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r4, r4d
     pxor    xmm7, xmm7
 	;Load 4x8
 	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
@@ -399,10 +393,8 @@
 ;    mov			esi,		[rs]
 	%assign push_num 0
 	LOAD_5_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	movsx r3, r3d
-	%endif
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
 	;Load 4x8
 	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
@@ -462,10 +454,8 @@
 WelsIDctRecI16x16Dc_sse2:
 	%assign push_num 0
 	LOAD_5_PARA
-	%ifndef X86_32
-	movsx r1, r1d
-	movsx r3, r3d
-	%endif
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
    ; push		esi
    ; push		edi
 
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -203,9 +203,7 @@
 	push r3
 	%assign push_num 1
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	movzx		r3,	byte [r1-1]
 	movd		xmm0,	r3d
 	pmuludq		xmm0,	[mmx_01bytes]
@@ -244,9 +242,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_3_PARA
-		%ifndef X86_32
-		movsx r2, r2d
-		%endif
+		SIGN_EXTENSION r2, r2d
 		sub		r1,	1
 		sub		r1,	r2
 
@@ -343,9 +339,7 @@
 	push r3
 	%assign push_num 1
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	dec r1
 	movzx r3, byte [r1]
 	SSE2_Copy16Times xmm0, r3d
@@ -378,9 +372,7 @@
     ;mov     ecx, [esp+12]   ; stride
     %assign push_num 0
     LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+    SIGN_EXTENSION r2, r2d
     sub     r1, r2
     movdqa  xmm0, [r1]
 
@@ -416,9 +408,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_3_PARA
-		%ifndef X86_32
-		movsx r2, r2d
-		%endif
+		SIGN_EXTENSION r2, r2d
 		sub		r1,	1
 		sub		r1,	r2
 
@@ -517,9 +507,7 @@
 	;mov			ecx,[esp+12]		;stride
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	movq        mm1,[r1+r2-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
 	movq        mm2,[r1-8]			;get value of 6 mm2[8] = 6
 	sub		r1, r2			;mov eax to above line of current block(postion of 1)
@@ -575,9 +563,7 @@
 	push r4
 	%assign push_num 2
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	movzx		r4,	byte [r1-1h]
 	sub			r1,	r2
 	movd		xmm0,	[r1]
@@ -638,9 +624,7 @@
 	;mov			ecx,	[esp+12]		;stride
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	movq		mm0,	[r1-8]
 	psrlq		mm0,	38h
 
@@ -677,9 +661,7 @@
 WelsI4x4LumaPredV_sse2:
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub			r1,	r2
 	movd		xmm0,	[r1]
 	pshufd		xmm0,	xmm0,	0
@@ -695,9 +677,7 @@
 WelsIChromaPredV_sse2:
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub		r1,		r2
 	movq		xmm0,		[r1]
 	movdqa		xmm1,		xmm0
@@ -742,9 +722,7 @@
 WelsI4x4LumaPredHD_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub         r1, r2
 	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
@@ -823,9 +801,7 @@
 WelsI4x4LumaPredHU_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	movd        mm0, [r1-4]            ; mm0[3] = l0
 	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
 	lea         r1, [r1+2*r2]
@@ -908,9 +884,7 @@
 WelsI4x4LumaPredVR_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub         r1, r2
 	movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
@@ -996,9 +970,7 @@
 WelsI4x4LumaPredDDL_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub         r1, r2
 	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
@@ -1066,9 +1038,7 @@
 WelsI4x4LumaPredVL_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub         r1, r2
 	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
@@ -1109,9 +1079,7 @@
 	push r4
 	%assign push_num 2
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub         r1, r2
 	movq        mm0, [r1]
 
@@ -1203,9 +1171,7 @@
 	push r4
 	%assign push_num 2
 	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
+	SIGN_EXTENSION r2, r2d
 	sub         r1, r2
 	movdqa      xmm0, [r1]             ; read one row
 	pxor		xmm1, xmm1
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -106,10 +106,8 @@
 WelsQuant4x4Dc_sse2:
  		%assign push_num 0
 		LOAD_3_PARA
-		%ifndef X86_32
-		movsx r1, r1w
-		movsx r2, r2w
-		%endif
+		SIGN_EXTENSION r1, r1w
+		SIGN_EXTENSION r2, r2w
 		;mov		ax,		[mf]
 		SSE2_Copy8Times xmm3, r2d
 
@@ -216,10 +214,8 @@
 WelsHadamardQuant2x2_mmx:
 		%assign push_num 0
 		LOAD_5_PARA
-		%ifndef X86_32
-		movsx r1, r1w
-		movsx r2, r2w
-		%endif
+		SIGN_EXTENSION r1, r1w
+		SIGN_EXTENSION r2, r2w
 		;mov			eax,			[pDct]
 		movd		mm0,			[r0]
 		movd		mm1,			[r0 + 0x20]
@@ -281,10 +277,8 @@
 WelsHadamardQuant2x2Skip_mmx:
 		%assign push_num 0
 		LOAD_3_PARA
-		%ifndef X86_32
-		movsx r1, r1w
-		movsx r2, r2w
-		%endif
+		SIGN_EXTENSION r1, r1w
+		SIGN_EXTENSION r2, r2w
 		;mov			eax,			[pDct]
 		movd		mm0,			[r0]
 		movd		mm1,			[r0 + 0x20]