shithub: openh264

Download patch

ref: 5cf163449f646ec95ecceee4f254450c58e5ce82
parent: faf1bdde0f9ef9f5c86058521ac84c04ac406c54
parent: a00e2e722926c9f5ca7ea964a8258a888f520e03
author: zhilwang <zhilwang@cisco.com>
date: Mon Apr 27 13:54:38 EDT 2015

Merge pull request #1914 from mstorsjo/asm-cleanup

Clean up assembly source files

--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -205,7 +205,7 @@
     sqrshrun \arg0\().2s, \arg0\().2d, #10
     uqxtn \arg0\().4h, \arg0\().4s
     uqxtn \arg0\().8b, \arg0\().8h
-   //   }
+    //   }
 .endm
 
 //(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -114,8 +114,8 @@
     emms
     ret
 
-;void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
-WELS_EXTERN WelsBlockZero16x16_sse2
+;void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
+WELS_EXTERN WelsBlockZero16x16_sse2
     %assign  push_num 0
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
@@ -127,9 +127,9 @@
     add     r0, r1
 %endrep
     ret
-
-;void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
-WELS_EXTERN WelsBlockZero8x8_sse2
+
+;void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
+WELS_EXTERN WelsBlockZero8x8_sse2
     %assign  push_num 0
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
--- a/codec/encoder/core/arm/svc_motion_estimation.S
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -72,8 +72,8 @@
 WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
 //(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
     stmdb sp!, {r4-r12}
-    ldr	r5, [sp, #40] //pTimesOfFeatureValue
-    ldr	r4, [sp, #36] //pFeatureOfBlock
+    ldr r5, [sp, #40] //pTimesOfFeatureValue
+    ldr r4, [sp, #36] //pFeatureOfBlock
 
     mov r8, r0
     mov r6, r1
@@ -158,8 +158,8 @@
 WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
 //(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
     stmdb sp!, {r4-r12}
-    ldr	r5, [sp, #40] //pTimesOfFeatureValue
-    ldr	r4, [sp, #36] //pFeatureOfBlock
+    ldr r5, [sp, #40] //pTimesOfFeatureValue
+    ldr r4, [sp, #36] //pFeatureOfBlock
 
     mov r8, r0
     mov r6, r1
@@ -238,7 +238,7 @@
 WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
 // (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
     stmdb sp!, {r4-r7}
-    ldr	r4, [sp, #16] //pFeatureValuePointerList
+    ldr r4, [sp, #16] //pFeatureValuePointerList
     bic r5, r2, #3
 _hash_assign_loop_x4:
     vld1.64 {q0}, [r0]!
@@ -272,8 +272,8 @@
     vst1.64 {q2}, [r4]!
 
 _assign_next:
-	subs r5, r5, #4
-	bne _hash_assign_loop_x4
+    subs r5, r5, #4
+    bne _hash_assign_loop_x4
 
     and r5, r2, #3
     cmp r5, #0
@@ -299,7 +299,7 @@
 WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
 // void  (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
     stmdb sp!, {r4-r8}
-    vpush		{q4-q7}
+    vpush {q4-q7}
     adr r7, mv_x_inc_x4
     vld1.64 {q7}, [r7]
     adr r7, mv_y_inc_x4
@@ -360,7 +360,7 @@
     subs r2, #1
     bne _hash_height_loop
 
-    vpop		{q4-q7}
+    vpop {q4-q7}
     ldmia sp!, {r4-r8}
 WELS_ASM_FUNC_END
 #endif
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -251,8 +251,8 @@
     st1 {v2.16b, v3.16b}, [x4], #32
 
 _assign_next:
-	subs x5, x5, #4
-	cbnz x5, _hash_assign_loop_x4
+    subs x5, x5, #4
+    cbnz x5, _hash_assign_loop_x4
 
     and x5, x2, x9
     cbz x5, _hash_assign_end
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -37,9 +37,9 @@
 SECTION .rodata align=16
 
 ALIGN 16
-mv_x_inc_x4		dw	0x10, 0x10, 0x10, 0x10
-mv_y_inc_x4		dw	0x04, 0x04, 0x04, 0x04
-mx_x_offset_x4	dw	0x00, 0x04, 0x08, 0x0C
+mv_x_inc_x4     dw  0x10, 0x10, 0x10, 0x10
+mv_y_inc_x4     dw  0x04, 0x04, 0x04, 0x04
+mx_x_offset_x4  dw  0x00, 0x04, 0x08, 0x0C
 
 SECTION .text
 %ifdef X86_32
@@ -48,113 +48,113 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;*********************************************************************************************************************
 WELS_EXTERN SumOf8x8BlockOfFrame_sse2
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp + 0
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp + 0
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
-    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
+    lea     ecx,    [ebx+ebx*2] ; 3*linesize
 
-    mov		[tmp_width],	eax
-    lea		ebp,	[esi+ebx*4]
+    mov     [tmp_width],    eax
+    lea     ebp,    [esi+ebx*4]
 FIRST_ROW:
-    movq	xmm1,	[esi]
-    movq	xmm2,	[esi+ebx]
-    movq	xmm3,	[esi+ebx*2]
-    movq	xmm4,	[esi+ecx]
+    movq    xmm1,   [esi]
+    movq    xmm2,   [esi+ebx]
+    movq    xmm3,   [esi+ebx*2]
+    movq    xmm4,   [esi+ecx]
 
-    shufps	xmm1,	xmm2,	01000100b
-    shufps	xmm3,	xmm4,	01000100b
-    psadbw	xmm1,	xmm0
-    psadbw	xmm3,	xmm0
-    paddd	xmm1,	xmm3
+    shufps  xmm1,   xmm2,   01000100b
+    shufps  xmm3,   xmm4,   01000100b
+    psadbw  xmm1,   xmm0
+    psadbw  xmm3,   xmm0
+    paddd   xmm1,   xmm3
 
-    movq	xmm2,	[ebp]
-    movq	xmm3,	[ebp+ebx]
-    movq	xmm4,	[ebp+ebx*2]
-    movq	xmm5,	[ebp+ecx]
+    movq    xmm2,   [ebp]
+    movq    xmm3,   [ebp+ebx]
+    movq    xmm4,   [ebp+ebx*2]
+    movq    xmm5,   [ebp+ecx]
 
-    shufps	xmm2,	xmm3,	01000100b
-    shufps	xmm4,	xmm5,	01000100b
-    psadbw	xmm2,	xmm0
-    psadbw	xmm4,	xmm0
-    paddd	xmm2,	xmm4
+    shufps  xmm2,   xmm3,   01000100b
+    shufps  xmm4,   xmm5,   01000100b
+    psadbw  xmm2,   xmm0
+    psadbw  xmm4,   xmm0
+    paddd   xmm2,   xmm4
 
-    paddd	xmm1,	xmm2
-    pshufd	xmm2,	xmm1,	00001110b
-    paddd	xmm1,	xmm2
-    movd	eax,	xmm1
-    mov		[edi],	ax
-    inc		dword [edx+eax*4]
+    paddd   xmm1,   xmm2
+    pshufd  xmm2,   xmm1,   00001110b
+    paddd   xmm1,   xmm2
+    movd    eax,    xmm1
+    mov     [edi],  ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    inc		ebp
-    add		edi,	2
+    inc     esi
+    inc     ebp
+    add     edi,    2
 
-    dec		dword [tmp_width]
-    jg		FIRST_ROW
+    dec     dword [tmp_width]
+    jg      FIRST_ROW
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 HEIGHT_LOOP:
-    mov		[tmp_width],	ebp
+    mov     [tmp_width],    ebp
 WIDTH_LOOP:
-    movq	xmm1,	[esi+ebx*8]
-    movq	xmm2,	[esi]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubd	xmm1,	xmm2
-    movd	eax,	xmm1
-    mov		cx,		[edi]
-    add		eax,	ecx
+    movq    xmm1,   [esi+ebx*8]
+    movq    xmm2,   [esi]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubd   xmm1,   xmm2
+    movd    eax,    xmm1
+    mov     cx,     [edi]
+    add     eax,    ecx
 
-    mov		[edi+ebp*2],	ax
-    inc		dword [edx+eax*4]
+    mov     [edi+ebp*2],    ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    add		edi,	2
+    inc     esi
+    add     edi,    2
 
-    dec		dword [tmp_width]
-    jg		WIDTH_LOOP
+    dec     dword [tmp_width]
+    jg      WIDTH_LOOP
 
-    add		esi,	ebx
-    sub		esi,	ebp
+    add     esi,    ebx
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		HEIGHT_LOOP
+    dec     dword [height]
+    jg      HEIGHT_LOOP
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 
@@ -161,10 +161,10 @@
 %macro COUNT_SUM 3
 %define xmm_reg %1
 %define tmp_reg %2
-    movd	tmp_reg,	xmm_reg
-    inc		dword [edx+tmp_reg*4]
+    movd    tmp_reg,    xmm_reg
+    inc     dword [edx+tmp_reg*4]
 %if %3 == 1
-    psrldq	xmm_reg,	4
+    psrldq  xmm_reg,    4
 %endif
 %endmacro
 
@@ -178,177 +178,177 @@
 ; read extra (16 - (width % 8) ) mod 16 bytes of every line
 ; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
 WELS_EXTERN SumOf8x8BlockOfFrame_sse4
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp + 0
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp + 0
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
-    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
+    lea     ecx,    [ebx+ebx*2] ; 3*linesize
 
-    mov		[tmp_width],	eax
-    lea		ebp,	[esi+ebx*4]
+    mov     [tmp_width],    eax
+    lea     ebp,    [esi+ebx*4]
 FIRST_ROW_SSE4:
-    movdqu	xmm1,	[esi]
-    movdqu	xmm3,	[esi+ebx]
-    movdqu	xmm5,	[esi+ebx*2]
-    movdqu	xmm7,	[esi+ecx]
+    movdqu  xmm1,   [esi]
+    movdqu  xmm3,   [esi+ebx]
+    movdqu  xmm5,   [esi+ebx*2]
+    movdqu  xmm7,   [esi+ecx]
 
-    movdqa	xmm2,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm1,	xmm2			; 8 sums of line1
+    movdqa  xmm2,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm1,   xmm2            ; 8 sums of line1
 
-    movdqa	xmm4,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm3,	xmm4			; 8 sums of line2
+    movdqa  xmm4,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm3,   xmm4            ; 8 sums of line2
 
-    movdqa	xmm2,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm5,	xmm2			; 8 sums of line3
+    movdqa  xmm2,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm5,   xmm2            ; 8 sums of line3
 
-    movdqa	xmm4,	xmm7
-    mpsadbw	xmm7,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm7,	xmm4			; 8 sums of line4
+    movdqa  xmm4,   xmm7
+    mpsadbw xmm7,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm7,   xmm4            ; 8 sums of line4
 
-    paddw	xmm1,	xmm3
-    paddw	xmm5,	xmm7
-    paddw	xmm1,	xmm5			; sum the upper 4 lines first
+    paddw   xmm1,   xmm3
+    paddw   xmm5,   xmm7
+    paddw   xmm1,   xmm5            ; sum the upper 4 lines first
 
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    movdqa	xmm6,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm2,	xmm6
+    movdqa  xmm6,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm2,   xmm6
 
-    movdqa	xmm7,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm3,	xmm7
+    movdqa  xmm7,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm3,   xmm7
 
-    movdqa	xmm6,	xmm4
-    mpsadbw	xmm4,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm4,	xmm6
+    movdqa  xmm6,   xmm4
+    mpsadbw xmm4,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm4,   xmm6
 
-    movdqa	xmm7,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm5,	xmm7
+    movdqa  xmm7,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm5,   xmm7
 
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm1,	xmm2
-    paddw	xmm1,	xmm4			; sum of lines 1- 8
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm1,   xmm2
+    paddw   xmm1,   xmm4            ; sum of lines 1- 8
 
-    movdqu	[edi],	xmm1
+    movdqu  [edi],  xmm1
 
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	0
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	0
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    0
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		ebp,	[ebp+8]
-    lea		edi,	[edi+16]		; element size is 2
+    lea     esi,    [esi+8]
+    lea     ebp,    [ebp+8]
+    lea     edi,    [edi+16]        ; element size is 2
 
-    sub		dword [tmp_width], 8
-    jg		near FIRST_ROW_SSE4
+    sub     dword [tmp_width], 8
+    jg      near FIRST_ROW_SSE4
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 HEIGHT_LOOP_SSE4:
-    mov		ecx,	ebp
+    mov     ecx,    ebp
 WIDTH_LOOP_SSE4:
-    movdqu	xmm1,	[esi+ebx*8]
-    movdqu	xmm2,	[esi]
-    movdqu	xmm7,	[edi]
+    movdqu  xmm1,   [esi+ebx*8]
+    movdqu  xmm2,   [esi]
+    movdqu  xmm7,   [edi]
 
-    movdqa	xmm3,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm3,	xmm0,	100b
-    paddw	xmm1,	xmm3
+    movdqa  xmm3,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm3,   xmm0,   100b
+    paddw   xmm1,   xmm3
 
-    movdqa	xmm4,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm2,	xmm4
+    movdqa  xmm4,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm2,   xmm4
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqu	[edi+ebp*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqu  [edi+ebp*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	0
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	0
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    0
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		edi,	[edi+16]
+    lea     esi,    [esi+8]
+    lea     edi,    [edi+16]
 
-    sub		ecx,	8
-    jg		near WIDTH_LOOP_SSE4
+    sub     ecx,    8
+    jg      near WIDTH_LOOP_SSE4
 
-    lea		esi,	[esi+ebx]
-    sub		esi,	ebp
+    lea     esi,    [esi+ebx]
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		near HEIGHT_LOOP_SSE4
+    dec     dword [height]
+    jg      near HEIGHT_LOOP_SSE4
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 
@@ -357,153 +357,153 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;****************************************************************************************************************************************************
 WELS_EXTERN SumOf16x16BlockOfFrame_sse2
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
 
-    lea		ecx,	[ebx+ebx*2]
-    mov		[tmp_width],	eax
+    lea     ecx,    [ebx+ebx*2]
+    mov     [tmp_width],    eax
 FIRST_ROW_X16H:
-    movdqu	xmm1,	[esi]
-    movdqu	xmm2,	[esi+ebx]
-    movdqu	xmm3,	[esi+ebx*2]
-    movdqu	xmm4,	[esi+ecx]
+    movdqu  xmm1,   [esi]
+    movdqu  xmm2,   [esi+ebx]
+    movdqu  xmm3,   [esi+ebx*2]
+    movdqu  xmm4,   [esi+ecx]
 
-    psadbw  xmm1,	xmm0
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    paddw	xmm1,	xmm2
-    paddw	xmm3,	xmm4
-    paddw	xmm1,	xmm3
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    paddw   xmm1,   xmm2
+    paddw   xmm3,   xmm4
+    paddw   xmm1,   xmm3
 
-    lea		ebp,	[esi+ebx*4]
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    lea     ebp,    [esi+ebx*4]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    lea     ebp,    [ebp+ebx*4]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    lea     ebp,    [ebp+ebx*4]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    paddw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
     paddw xmm1, xmm2
-    movd	eax,	xmm1
-    mov		[edi],	ax
-    inc		dword [edx+eax*4]
+    movd    eax,    xmm1
+    mov     [edi],  ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    lea		edi,	[edi+2]
+    inc     esi
+    lea     edi,    [edi+2]
 
-    dec		dword [tmp_width]
-    jg		near FIRST_ROW_X16H
+    dec     dword [tmp_width]
+    jg      near FIRST_ROW_X16H
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 
-    mov		ecx,	ebx
-    sal		ecx,	4		; succeeded 16th line
+    mov     ecx,    ebx
+    sal     ecx,    4       ; succeeded 16th line
 HEIGHT_LOOP_X16:
-    mov		[tmp_width],	ebp
+    mov     [tmp_width],    ebp
 WIDTH_LOOP_X16:
-    movdqu	xmm1,	[esi+ecx]
-    movdqu	xmm2,	[esi]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    movdqu  xmm1,   [esi+ecx]
+    movdqu  xmm2,   [esi]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
-    paddw	xmm1,	xmm2
-    movd	eax,	xmm1
-    add		ax,	word [edi]
-    mov		[edi+ebp*2],	ax
-    inc		dword [edx+eax*4]
+    paddw   xmm1,   xmm2
+    movd    eax,    xmm1
+    add     ax, word [edi]
+    mov     [edi+ebp*2],    ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    add		edi,	2
+    inc     esi
+    add     edi,    2
 
-    dec		dword [tmp_width]
-    jg		near WIDTH_LOOP_X16
+    dec     dword [tmp_width]
+    jg      near WIDTH_LOOP_X16
 
-    add		esi,	ebx
-    sub		esi,	ebp
+    add     esi,    ebx
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		near HEIGHT_LOOP_X16
+    dec     dword [height]
+    jg      near HEIGHT_LOOP_X16
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 ; requires:  width % 16 == 0 && height > 1
@@ -512,163 +512,163 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; try 8 mv via offset
-%macro   SUM_LINE_X16_SSE41  5	; ref, dst0, dst1, tmp0, tmp1
-    movdqu	%2,	[%1]
-    movdqu	%3,	[%1+8h]
-    movdqa	%4,	%2
-    movdqa	%5,	%3
+%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1
+    movdqu  %2, [%1]
+    movdqu  %3, [%1+8h]
+    movdqa  %4, %2
+    movdqa  %5, %3
 
-    mpsadbw	%2,	xmm0,	0	; 000 B
-    mpsadbw	%4,	xmm0,	5	; 101 B
-    mpsadbw	%3,	xmm0,	2	; 010 B
-    mpsadbw	%5,	xmm0,	7	; 111 B
-    paddw	%2,	%4
-    paddw	%3, %5
-    paddw	%2,	%3	; accumulate cost
-%endmacro	; end of SAD_16x16_LINE_SSE41
+    mpsadbw %2, xmm0,   0   ; 000 B
+    mpsadbw %4, xmm0,   5   ; 101 B
+    mpsadbw %3, xmm0,   2   ; 010 B
+    mpsadbw %5, xmm0,   7   ; 111 B
+    paddw   %2, %4
+    paddw   %3, %5
+    paddw   %2, %3  ; accumulate cost
+%endmacro   ; end of SAD_16x16_LINE_SSE41
 
 WELS_EXTERN SumOf16x16BlockOfFrame_sse4
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
 
-    lea		ecx,	[ebx+ebx*2]
-    mov		[tmp_width],	eax
+    lea     ecx,    [ebx+ebx*2]
+    mov     [tmp_width],    eax
 FIRST_ROW_X16_SSE4:
-    SUM_LINE_X16_SSE41	esi,		xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	esi+ebx,	xmm2, xmm3, xmm4, xmm5
-    SUM_LINE_X16_SSE41	esi+ebx*2,	xmm3, xmm4, xmm5, xmm6
-    SUM_LINE_X16_SSE41	esi+ecx,	xmm4, xmm5, xmm6, xmm7
-    paddw	xmm1, xmm2
-    paddw	xmm3, xmm4
-    paddw	xmm1, xmm3
+    SUM_LINE_X16_SSE41  esi,        xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  esi+ebx,    xmm2, xmm3, xmm4, xmm5
+    SUM_LINE_X16_SSE41  esi+ebx*2,  xmm3, xmm4, xmm5, xmm6
+    SUM_LINE_X16_SSE41  esi+ecx,    xmm4, xmm5, xmm6, xmm7
+    paddw   xmm1, xmm2
+    paddw   xmm3, xmm4
+    paddw   xmm1, xmm3
 
-    lea		ebp,	[esi+ebx*4]
-    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     ebp,    [esi+ebx*4]
+    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     ebp,    [ebp+ebx*4]
+    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     ebp,    [ebp+ebx*4]
+    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    movdqa	[edi],	xmm1
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  [edi],  xmm1
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	0
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	0
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    0
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		edi,	[edi+16]	; element size is 2
+    lea     esi,    [esi+8]
+    lea     edi,    [edi+16]    ; element size is 2
 
-    sub		dword [tmp_width], 8
-    jg		near FIRST_ROW_X16_SSE4
+    sub     dword [tmp_width], 8
+    jg      near FIRST_ROW_X16_SSE4
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 
-    mov		ecx,	ebx
-    sal		ecx,	4		; succeeded 16th line
+    mov     ecx,    ebx
+    sal     ecx,    4       ; succeeded 16th line
 
 HEIGHT_LOOP_X16_SSE4:
-    mov		[tmp_width],	ebp
+    mov     [tmp_width],    ebp
 WIDTH_LOOP_X16_SSE4:
-    movdqa	xmm7,	[edi]
-    SUM_LINE_X16_SSE41	esi+ecx, xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	esi, xmm2, xmm3, xmm4, xmm5
+    movdqa  xmm7,   [edi]
+    SUM_LINE_X16_SSE41  esi+ecx, xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  esi, xmm2, xmm3, xmm4, xmm5
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqa	[edi+ebp*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqa  [edi+ebp*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	0
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	0
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    0
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		edi,	[edi+16]
+    lea     esi,    [esi+8]
+    lea     edi,    [edi+16]
 
-    sub		dword [tmp_width], 8
-    jg		near WIDTH_LOOP_X16_SSE4
+    sub     dword [tmp_width], 8
+    jg      near WIDTH_LOOP_X16_SSE4
 
-    add		esi,	ebx
-    sub		esi,	ebp
+    add     esi,    ebx
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		near HEIGHT_LOOP_X16_SSE4
+    dec     dword [height]
+    jg      near HEIGHT_LOOP_X16_SSE4
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 
@@ -676,78 +676,78 @@
 ; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
 ;-----------------------------------------------------------------------------------------------------------------------------
 WELS_EXTERN FillQpelLocationByFeatureValue_sse2
-    push	esi
-    push	edi
-    push	ebx
-    push	ebp
+    push    esi
+    push    edi
+    push    ebx
+    push    ebp                     ; four 4-byte pushes = the 16 bytes counted by _ps
 
-    %define _ps			16				; push size
-    %define	_ls			4				; local size
-    %define	sum_ref		esp+_ps+_ls+4
-    %define	pos_list	esp+_ps+_ls+16
-    %define width		esp+_ps+_ls+8
-    %define height		esp+_ps+_ls+12
-    %define	i_height	esp
-    sub		esp,	_ls
+    %define _ps         16              ; push size
+    %define _ls         4               ; local size
+    %define sum_ref     esp+_ps+_ls+4   ; 1st stack argument (pFeatureOfBlock)
+    %define pos_list    esp+_ps+_ls+16  ; 4th stack argument (pFeatureValuePointerList)
+    %define width       esp+_ps+_ls+8   ; 2nd stack argument (kiWidth)
+    %define height      esp+_ps+_ls+12  ; 3rd stack argument (kiHeight)
+    %define i_height    esp             ; the single 4-byte local: rows still to process
+    sub     esp,    _ls             ; reserve the local; the offsets above assume this has been done
 
-    mov		esi,	[sum_ref]
-    mov		edi,	[pos_list]
-    mov		ebp,	[width]
-    mov		ebx,	[height]
-    mov		[i_height],	ebx
+    mov     esi,    [sum_ref]
+    mov     edi,    [pos_list]
+    mov     ebp,    [width]
+    mov     ebx,    [height]
+    mov     [i_height], ebx         ; ebx is reused as scratch below, so the row count lives on the stack
 
-    movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
-    movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
-    movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector
-    pxor	xmm4,	xmm4
-    pxor	xmm3,	xmm3				; y_qpel vector
+    movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
+    movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
+    movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
+    pxor    xmm4,   xmm4                ; zero, used to widen the 16-bit sums
+    pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
-    movdqa	xmm2,	xmm5	; x_qpel vector
-    mov		ecx,	ebp
+    movdqa  xmm2,   xmm5    ; x_qpel vector
+    mov     ecx,    ebp     ; ecx = entries left in this row (starts at width)
 HASH_WIDTH_LOOP_SSE2:
-    movq	xmm0,	[esi]			; load x8 sum
-    punpcklwd	xmm0,	xmm4
-    movdqa		xmm1,	xmm2
-    punpcklwd	xmm1,	xmm3
-%rep	3
-    movd	edx,	xmm0
-    lea		ebx,	[edi+edx*4]
-    mov		eax,	[ebx]
-    movd	[eax],	xmm1
-    mov		edx,	[eax+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		eax,	[eax+4]
-    mov		[ebx],	eax
-    psrldq	xmm1,	4
-    psrldq	xmm0,	4
+    movq    xmm0,   [esi]           ; load x8 sum (8 bytes = four 16-bit values)
+    punpcklwd   xmm0,   xmm4        ; zero-extend the four words to dwords
+    movdqa      xmm1,   xmm2
+    punpcklwd   xmm1,   xmm3        ; interleave: each dword = (x word, y word)
+%rep    3
+    movd    edx,    xmm0            ; edx = lowest remaining sum
+    lea     ebx,    [edi+edx*4]     ; ebx = address of pos_list[edx] (4-byte entries)
+    mov     eax,    [ebx]           ; eax = pointer currently stored in that entry
+    movd    [eax],  xmm1            ; write the current (x, y) pair through it
+    mov     edx,    [eax+4] ; explicitly load eax+4 due to cache miss from vtune observation
+    lea     eax,    [eax+4]
+    mov     [ebx],  eax             ; store the pointer back, advanced by 4 bytes
+    psrldq  xmm1,   4               ; next (x, y) pair
+    psrldq  xmm0,   4               ; next sum
 %endrep
-    movd	edx,	xmm0
-    lea		ebx,	[edi+edx*4]
-    mov		eax,	[ebx]
-    movd	[eax],	xmm1
-    mov		edx,	[eax+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		eax,	[eax+4]
-    mov		[ebx],	eax
+    movd    edx,    xmm0            ; fourth and last sum of this group (no shift needed afterwards)
+    lea     ebx,    [edi+edx*4]
+    mov     eax,    [ebx]
+    movd    [eax],  xmm1
+    mov     edx,    [eax+4] ; explicitly load eax+4 due to cache miss from vtune observation
+    lea     eax,    [eax+4]
+    mov     [ebx],  eax
 
-    paddw	xmm2,	xmm7
-    lea		esi,	[esi+8]
-    sub		ecx,	4
+    paddw   xmm2,   xmm7            ; advance the x vector by the x increment
+    lea     esi,    [esi+8]         ; four 16-bit sums consumed
+    sub     ecx,    4               ; NOTE(review): the jnz below exits only at exactly 0, so width is presumably a multiple of 4 - confirm
     jnz near HASH_WIDTH_LOOP_SSE2
-    paddw	xmm3,	xmm6
-    dec	dword [i_height]
-    jnz	near HASH_HEIGHT_LOOP_SSE2
+    paddw   xmm3,   xmm6            ; advance the y vector by the y increment
+    dec dword [i_height]
+    jnz near HASH_HEIGHT_LOOP_SSE2
 
-    add		esp,	_ls
-    %undef	_ps
-    %undef	_ls
-    %undef	sum_ref
-    %undef	pos_list
-    %undef	width
-    %undef	height
-    %undef	i_height
-    pop		ebp
-    pop		ebx
-    pop		edi
-    pop		esi
+    add     esp,    _ls             ; release the local while _ls is still defined
+    %undef  _ps
+    %undef  _ls
+    %undef  sum_ref
+    %undef  pos_list
+    %undef  width
+    %undef  height
+    %undef  i_height
+    pop     ebp                     ; restore in reverse push order
+    pop     ebx
+    pop     edi
+    pop     esi
     ret
 
 ;---------------------------------------------------------------------------------------------------------------------------------------------------
@@ -755,74 +755,74 @@
 ;                        uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
 ;---------------------------------------------------------------------------------------------------------------------------------------------------
 WELS_EXTERN InitializeHashforFeature_sse2
-    push	ebx
-    push	esi
-    push	edi
-    push	ebp
-    %define	_ps	16	; push size
-    mov		edi,	[esp+_ps+16]	; pPositionOfSum
-    mov		ebp,	[esp+_ps+20]	; sum_idx_list
-    mov		esi,	[esp+_ps+4]     ; pTimesOfSum
-    mov		ebx,	[esp+_ps+8]     ; pBuf
-    mov		edx,	[esp+_ps+12]	; list_sz
-    sar		edx,	2
-    mov		ecx,	0
-    pxor	xmm7,	xmm7
+    push    ebx
+    push    esi
+    push    edi
+    push    ebp
+    %define _ps 16  ; push size
+    mov     edi,    [esp+_ps+16]    ; pPositionOfSum (4th stack argument)
+    mov     ebp,    [esp+_ps+20]    ; sum_idx_list (5th stack argument)
+    mov     esi,    [esp+_ps+4]     ; pTimesOfSum (1st stack argument)
+    mov     ebx,    [esp+_ps+8]     ; pBuf (2nd stack argument)
+    mov     edx,    [esp+_ps+12]    ; list_sz (3rd stack argument)
+    sar     edx,    2               ; groups of four; NOTE(review): the loop below is entered unconditionally, so list_sz >= 4 is presumably guaranteed - confirm
+    mov     ecx,    0               ; ecx = byte offset shared by all three tables
+    pxor    xmm7,   xmm7            ; zero, compared against the scaled counts
 hash_assign_loop_x4_sse2:
-    movdqa	xmm0,	[esi+ecx]
-    pslld	xmm0,	2
+    movdqa  xmm0,   [esi+ecx]       ; four 32-bit counts (aligned load)
+    pslld   xmm0,   2               ; count * 4 = amount the buffer pointer advances per entry
 
-    movdqa	xmm1,	xmm0
-    pcmpeqd	xmm1,	xmm7
-    movmskps	eax,	xmm1
+    movdqa  xmm1,   xmm0
+    pcmpeqd xmm1,   xmm7            ; all-ones in every lane whose scaled count is zero
+    movmskps    eax,    xmm1        ; eax = 4-bit mask of those lanes
     cmp eax, 0x0f
-    je	near hash_assign_with_copy_sse2
+    je  near hash_assign_with_copy_sse2 ; all four are zero: take the broadcast path
 
-%assign x	0
+%assign x   0
 %rep 4
-    lea		eax,	[edi+ecx+x]
-    mov		[eax],	ebx
-    lea		eax,	[ebp+ecx+x]
-    mov		[eax],	ebx
-    movd	eax,	xmm0
-    add		ebx,	eax
-    psrldq	xmm0,	4
-%assign	x	x+4
+    lea     eax,    [edi+ecx+x]
+    mov     [eax],  ebx             ; entry in the first table = current buffer pointer
+    lea     eax,    [ebp+ecx+x]
+    mov     [eax],  ebx             ; entry in the second table gets the same pointer
+    movd    eax,    xmm0            ; eax = this entry's count * 4
+    add     ebx,    eax             ; move the buffer pointer past this entry's space
+    psrldq  xmm0,   4               ; next count
+%assign x   x+4
 %endrep
     jmp near assign_next_sse2
 
 hash_assign_with_copy_sse2:
-    movd	xmm1,	ebx
-    pshufd	xmm2,	xmm1,	0
-    movdqa	[edi+ecx], xmm2
-    movdqa	[ebp+ecx], xmm2
+    movd    xmm1,   ebx
+    pshufd  xmm2,   xmm1,   0       ; broadcast the buffer pointer to all four lanes
+    movdqa  [edi+ecx], xmm2         ; nothing is consumed, so all four entries share the pointer
+    movdqa  [ebp+ecx], xmm2
 
 assign_next_sse2:
-    add		ecx,	16
-    dec		edx
-    jnz		near hash_assign_loop_x4_sse2
+    add     ecx,    16              ; four 4-byte entries handled
+    dec     edx
+    jnz     near hash_assign_loop_x4_sse2
 
-    mov		edx,	[esp+_ps+12]	; list_sz
-    and		edx,	3
-    jz		near hash_assign_no_rem_sse2
+    mov     edx,    [esp+_ps+12]    ; list_sz
+    and     edx,    3               ; entries left over after the groups of four
+    jz      near hash_assign_no_rem_sse2
 hash_assign_loop_x4_rem_sse2:
-    lea		eax,	[edi+ecx]
-    mov		[eax],	ebx
-    lea		eax,	[ebp+ecx]
-    mov		[eax],	ebx
-    mov		eax,	[esi+ecx]
-    sal		eax,	2
-    add		ebx,	eax
-    add		ecx,	4
-    dec		edx
-    jnz		near hash_assign_loop_x4_rem_sse2
+    lea     eax,    [edi+ecx]
+    mov     [eax],  ebx
+    lea     eax,    [ebp+ecx]
+    mov     [eax],  ebx
+    mov     eax,    [esi+ecx]       ; scalar form of the per-entry step: load one count
+    sal     eax,    2               ; count * 4
+    add     ebx,    eax
+    add     ecx,    4
+    dec     edx
+    jnz     near hash_assign_loop_x4_rem_sse2
 
 hash_assign_no_rem_sse2:
-    %undef	_ps
-    pop		ebp
-    pop		edi
-    pop		esi
-    pop		ebx
+    %undef  _ps
+    pop     ebp                     ; restore in reverse push order
+    pop     edi
+    pop     esi
+    pop     ebx
     ret
 %else
 
@@ -843,47 +843,47 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
-    lea		r13,	[r0+r3*4]       ;rbp:r13
+    mov     r12,    r1              ;r12:tmp_width
+    lea     r13,    [r0+r3*4]       ;rbp:r13
 FIRST_ROW:
-    movq	xmm1,	[r0]
-    movq	xmm2,	[r0+r3]
-    movq	xmm3,	[r0+r3*2]
-    movq	xmm4,	[r0+r6]
+    movq    xmm1,   [r0]
+    movq    xmm2,   [r0+r3]
+    movq    xmm3,   [r0+r3*2]
+    movq    xmm4,   [r0+r6]
 
-    shufps	xmm1,	xmm2,	01000100b
-    shufps	xmm3,	xmm4,	01000100b
-    psadbw	xmm1,	xmm0
-    psadbw	xmm3,	xmm0
-    paddd	xmm1,	xmm3
+    shufps  xmm1,   xmm2,   01000100b
+    shufps  xmm3,   xmm4,   01000100b
+    psadbw  xmm1,   xmm0
+    psadbw  xmm3,   xmm0
+    paddd   xmm1,   xmm3
 
-    movq	xmm2,	[r13]
-    movq	xmm3,	[r13+r3]
-    movq	xmm4,	[r13+r3*2]
-    movq	xmm5,	[r13+r6]
+    movq    xmm2,   [r13]
+    movq    xmm3,   [r13+r3]
+    movq    xmm4,   [r13+r3*2]
+    movq    xmm5,   [r13+r6]
 
-    shufps	xmm2,	xmm3,	01000100b
-    shufps	xmm4,	xmm5,	01000100b
-    psadbw	xmm2,	xmm0
-    psadbw	xmm4,	xmm0
-    paddd	xmm2,	xmm4
+    shufps  xmm2,   xmm3,   01000100b
+    shufps  xmm4,   xmm5,   01000100b
+    psadbw  xmm2,   xmm0
+    psadbw  xmm4,   xmm0
+    paddd   xmm2,   xmm4
 
-    paddd	xmm1,	xmm2
-    pshufd	xmm2,	xmm1,	00001110b
-    paddd	xmm1,	xmm2
-    movd	r2d,	xmm1
-    mov		[r4],	r2w
-    inc		dword [r5+r2*4]
+    paddd   xmm1,   xmm2
+    pshufd  xmm2,   xmm1,   00001110b
+    paddd   xmm1,   xmm2
+    movd    r2d,    xmm1
+    mov     [r4],   r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    inc		r13
-    add		r4,	2
+    inc     r0
+    inc     r13
+    add     r4, 2
 
-    dec		r12
-    jg		FIRST_ROW
+    dec     r12
+    jg      FIRST_ROW
 
     pop r4
     pop r2
@@ -891,34 +891,34 @@
     mov r13, r2
     dec r13
 HEIGHT_LOOP:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP:
-    movq	xmm1,	[r0+r3*8]
-    movq	xmm2,	[r0]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubd	xmm1,	xmm2
-    movd	r2d,	xmm1
-    mov		r6w,	[r4]
-    add		r2d,	r6d
-    mov		[r4+r1*2],	r2w
-    inc		dword [r5+r2*4]
+    movq    xmm1,   [r0+r3*8]
+    movq    xmm2,   [r0]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubd   xmm1,   xmm2
+    movd    r2d,    xmm1
+    mov     r6w,    [r4]
+    add     r2d,    r6d
+    mov     [r4+r1*2],  r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    add		r4,	2
+    inc     r0
+    add     r4, 2
 
-    dec		r12
-    jg		WIDTH_LOOP
+    dec     r12
+    jg      WIDTH_LOOP
 
-    add		r0,	r3
-    sub		r0,	r1
+    add     r0, r3
+    sub     r0, r1
 
 
-    dec		r13
-    jg		HEIGHT_LOOP
+    dec     r13
+    jg      HEIGHT_LOOP
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -928,10 +928,10 @@
 %define xmm_reg %1
 %define tmp_dreg %2
 %define tmp_qreg %3
-    movd	tmp_dreg,	xmm_reg
-    inc		dword [r5+tmp_qreg*4]
+    movd    tmp_dreg,   xmm_reg
+    inc     dword [r5+tmp_qreg*4]
 %if %4 == 1
-    psrldq	xmm_reg,	4
+    psrldq  xmm_reg,    4
 %endif
 %endmacro
 
@@ -957,92 +957,92 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
-    lea		r13,	[r0+r3*4]       ;rbp:r13
+    mov     r12,    r1              ;r12:tmp_width
+    lea     r13,    [r0+r3*4]       ;rbp:r13
 FIRST_ROW_SSE4:
-    movdqu	xmm1,	[r0]
-    movdqu	xmm3,	[r0+r3]
-    movdqu	xmm5,	[r0+r3*2]
-    movdqu	xmm7,	[r0+r6]
+    movdqu  xmm1,   [r0]
+    movdqu  xmm3,   [r0+r3]
+    movdqu  xmm5,   [r0+r3*2]
+    movdqu  xmm7,   [r0+r6]
 
-    movdqa	xmm2,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm1,	xmm2			; 8 sums of line1
+    movdqa  xmm2,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm1,   xmm2            ; 8 sums of line1
 
-    movdqa	xmm4,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm3,	xmm4			; 8 sums of line2
+    movdqa  xmm4,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm3,   xmm4            ; 8 sums of line2
 
-    movdqa	xmm2,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm5,	xmm2			; 8 sums of line3
+    movdqa  xmm2,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm5,   xmm2            ; 8 sums of line3
 
-    movdqa	xmm4,	xmm7
-    mpsadbw	xmm7,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm7,	xmm4			; 8 sums of line4
+    movdqa  xmm4,   xmm7
+    mpsadbw xmm7,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm7,   xmm4            ; 8 sums of line4
 
-    paddw	xmm1,	xmm3
-    paddw	xmm5,	xmm7
-    paddw	xmm1,	xmm5			; sum the upper 4 lines first
+    paddw   xmm1,   xmm3
+    paddw   xmm5,   xmm7
+    paddw   xmm1,   xmm5            ; sum the upper 4 lines first
 
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    movdqa	xmm6,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm2,	xmm6
+    movdqa  xmm6,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm2,   xmm6
 
-    movdqa	xmm7,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm3,	xmm7
+    movdqa  xmm7,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm3,   xmm7
 
-    movdqa	xmm6,	xmm4
-    mpsadbw	xmm4,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm4,	xmm6
+    movdqa  xmm6,   xmm4
+    mpsadbw xmm4,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm4,   xmm6
 
-    movdqa	xmm7,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm5,	xmm7
+    movdqa  xmm7,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm5,   xmm7
 
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm1,	xmm2
-    paddw	xmm1,	xmm4			; sum of lines 1- 8
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm1,   xmm2
+    paddw   xmm1,   xmm4            ; sum of lines 1- 8
 
-    movdqu	[r4],	xmm1
+    movdqu  [r4],   xmm1
 
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 0
-    COUNT_SUM	xmm2,	r2d, r2 ,1
-    COUNT_SUM	xmm2,	r2d, r2 ,1
-    COUNT_SUM	xmm2,	r2d, r2 ,1
-    COUNT_SUM	xmm2,	r2d, r2 ,0
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 0
+    COUNT_SUM   xmm2,   r2d, r2 ,1
+    COUNT_SUM   xmm2,   r2d, r2 ,1
+    COUNT_SUM   xmm2,   r2d, r2 ,1
+    COUNT_SUM   xmm2,   r2d, r2 ,0
 
-    lea		r0,     [r0+8]
-    lea		r13,	[r13+8]
-    lea		r4,     [r4+16]		; element size is 2
+    lea     r0,     [r0+8]
+    lea     r13,    [r13+8]
+    lea     r4,     [r4+16]     ; element size is 2
 
-    sub		r12, 8
-    jg		near FIRST_ROW_SSE4
+    sub     r12, 8
+    jg      near FIRST_ROW_SSE4
 
     pop r4
     pop r2
@@ -1050,53 +1050,53 @@
     mov r13, r2
     dec r13
 HEIGHT_LOOP_SSE4:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP_SSE4:
-    movdqu	xmm1,	[r0+r3*8]
-    movdqu	xmm2,	[r0]
-    movdqu	xmm7,	[r4]
+    movdqu  xmm1,   [r0+r3*8]
+    movdqu  xmm2,   [r0]
+    movdqu  xmm7,   [r4]
 
-    movdqa	xmm3,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm3,	xmm0,	100b
-    paddw	xmm1,	xmm3
+    movdqa  xmm3,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm3,   xmm0,   100b
+    paddw   xmm1,   xmm3
 
-    movdqa	xmm4,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm2,	xmm4
+    movdqa  xmm4,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm2,   xmm4
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqu	[r4+r1*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqu  [r4+r1*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 0
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 0
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 0
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 0
 
-    lea		r0,	[r0+8]
-    lea		r4,	[r4+16]
+    lea     r0, [r0+8]
+    lea     r4, [r4+16]
 
-    sub		r12,	8
-    jg		near WIDTH_LOOP_SSE4
+    sub     r12,    8
+    jg      near WIDTH_LOOP_SSE4
 
-    lea		r0,	[r0+r3]
-    sub		r0,	r1
+    lea     r0, [r0+r3]
+    sub     r0, r1
 
-    dec		r13
-    jg		near HEIGHT_LOOP_SSE4
+    dec     r13
+    jg      near HEIGHT_LOOP_SSE4
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -1119,83 +1119,83 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
+    mov     r12,    r1              ;r12:tmp_width
 FIRST_ROW_X16H:
-    movdqu	xmm1,	[r0]
-    movdqu	xmm2,	[r0+r3]
-    movdqu	xmm3,	[r0+r3*2]
-    movdqu	xmm4,	[r0+r6]
+    movdqu  xmm1,   [r0]
+    movdqu  xmm2,   [r0+r3]
+    movdqu  xmm3,   [r0+r3*2]
+    movdqu  xmm4,   [r0+r6]
 
-    psadbw  xmm1,	xmm0
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    paddw	xmm1,	xmm2
-    paddw	xmm3,	xmm4
-    paddw	xmm1,	xmm3
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    paddw   xmm1,   xmm2
+    paddw   xmm3,   xmm4
+    paddw   xmm1,   xmm3
 
-    lea		r13,	[r0+r3*4]       ;ebp:r13
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    lea     r13,    [r0+r3*4]       ;ebp:r13
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		r13,	[r13+r3*4]
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    lea     r13,    [r13+r3*4]
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		r13,	[r13+r3*4]
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    lea     r13,    [r13+r3*4]
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    paddw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
     paddw xmm1, xmm2
-    movd	r2d,	xmm1
-    mov		[r4],	r2w
-    inc		dword [r5+r2*4]
+    movd    r2d,    xmm1
+    mov     [r4],   r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    lea		r4,	[r4+2]
+    inc     r0
+    lea     r4, [r4+2]
 
-    dec		r12
-    jg		near FIRST_ROW_X16H
+    dec     r12
+    jg      near FIRST_ROW_X16H
 
     pop r4
     pop r2
@@ -1202,38 +1202,38 @@
     pop r0
     mov r13, r2
     dec r13
-    mov		r6,	r3
-    sal		r6,	4		; succeeded 16th line
+    mov     r6, r3
+    sal     r6, 4       ; succeeded 16th line
 HEIGHT_LOOP_X16:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP_X16:
-    movdqu	xmm1,	[r0+r6]
-    movdqu	xmm2,	[r0]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    movdqu  xmm1,   [r0+r6]
+    movdqu  xmm2,   [r0]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
-    paddw	xmm1,	xmm2
-    movd	r2d,	xmm1
-    add		r2w,	word [r4]
-    mov		[r4+r1*2],	r2w
-    inc		dword [r5+r2*4]
+    paddw   xmm1,   xmm2
+    movd    r2d,    xmm1
+    add     r2w,    word [r4]
+    mov     [r4+r1*2],  r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    add		r4,	2
+    inc     r0
+    add     r4, 2
 
-    dec		r12
-    jg		near WIDTH_LOOP_X16
+    dec     r12
+    jg      near WIDTH_LOOP_X16
 
-    add		r0,	r3
-    sub		r0,	r1
+    add     r0, r3
+    sub     r0, r1
 
-    dec		r13
-    jg		near HEIGHT_LOOP_X16
+    dec     r13
+    jg      near HEIGHT_LOOP_X16
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -1244,20 +1244,20 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; try 8 mv via offset
-%macro   SUM_LINE_X16_SSE41  5	; ref, dst0, dst1, tmp0, tmp1
-    movdqu	%2,	[%1]
-    movdqu	%3,	[%1+8h]
-    movdqa	%4,	%2
-    movdqa	%5,	%3
+%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1 (result in dst0; dst1/tmp0/tmp1 are overwritten; xmm0 is read)
+    movdqu  %2, [%1]        ; bytes 0..15 of the line
+    movdqu  %3, [%1+8h]     ; bytes 8..23 of the line
+    movdqa  %4, %2
+    movdqa  %5, %3
 
-    mpsadbw	%2,	xmm0,	0	; 000 B
-    mpsadbw	%4,	xmm0,	5	; 101 B
-    mpsadbw	%3,	xmm0,	2	; 010 B
-    mpsadbw	%5,	xmm0,	7	; 111 B
-    paddw	%2,	%4
-    paddw	%3, %5
-    paddw	%2,	%3	; accumulate cost
-%endmacro	; end of SAD_16x16_LINE_SSE41
+    mpsadbw %2, xmm0,   0   ; 000 B: 4-byte windows starting at byte offset 0 (plain sums when xmm0 is zero - presumably cleared by callers, confirm)
+    mpsadbw %4, xmm0,   5   ; 101 B: bit 2 set = windows starting at byte offset 4
+    mpsadbw %3, xmm0,   2   ; 010 B: offset 0 within the +8 load
+    mpsadbw %5, xmm0,   7   ; 111 B: offset 4 within the +8 load
+    paddw   %2, %4          ; eight sums over bytes i..i+7
+    paddw   %3, %5          ; eight sums over bytes i+8..i+15
+    paddw   %2, %3  ; accumulate cost
+%endmacro   ; end of SUM_LINE_X16_SSE41
 
 WELS_EXTERN SumOf16x16BlockOfFrame_sse4
     %assign  push_num 0
@@ -1272,68 +1272,68 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
+    mov     r12,    r1              ;r12:tmp_width
 FIRST_ROW_X16_SSE4:
-    SUM_LINE_X16_SSE41	r0,		xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	r0+r3,	xmm2, xmm3, xmm4, xmm5
-    SUM_LINE_X16_SSE41	r0+r3*2,xmm3, xmm4, xmm5, xmm6
-    SUM_LINE_X16_SSE41	r0+r6,	xmm4, xmm5, xmm6, xmm7
-    paddw	xmm1, xmm2
-    paddw	xmm3, xmm4
-    paddw	xmm1, xmm3
+    SUM_LINE_X16_SSE41  r0,     xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  r0+r3,  xmm2, xmm3, xmm4, xmm5
+    SUM_LINE_X16_SSE41  r0+r3*2,xmm3, xmm4, xmm5, xmm6
+    SUM_LINE_X16_SSE41  r0+r6,  xmm4, xmm5, xmm6, xmm7
+    paddw   xmm1, xmm2
+    paddw   xmm3, xmm4
+    paddw   xmm1, xmm3
 
-    lea		r13,	[r0+r3*4]
-    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     r13,    [r0+r3*4]
+    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		r13,	[r13+r3*4]
-    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     r13,    [r13+r3*4]
+    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		r13,	[r13+r3*4]
-    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     r13,    [r13+r3*4]
+    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    movdqa	[r4],	xmm1
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  [r4],   xmm1
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 0
-    COUNT_SUM	xmm2,	r2d, r2, 1
-    COUNT_SUM	xmm2,	r2d, r2, 1
-    COUNT_SUM	xmm2,	r2d, r2, 1
-    COUNT_SUM	xmm2,	r2d, r2, 0
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 0
+    COUNT_SUM   xmm2,   r2d, r2, 1
+    COUNT_SUM   xmm2,   r2d, r2, 1
+    COUNT_SUM   xmm2,   r2d, r2, 1
+    COUNT_SUM   xmm2,   r2d, r2, 0
 
-    lea		r0,	[r0+8]
-    lea		r4,	[r4+16]	; element size is 2
+    lea     r0, [r0+8]
+    lea     r4, [r4+16] ; element size is 2
 
-    sub		r12, 8
-    jg		near FIRST_ROW_X16_SSE4
+    sub     r12, 8
+    jg      near FIRST_ROW_X16_SSE4
 
     pop r4
     pop r2
@@ -1340,47 +1340,47 @@
     pop r0
     mov r13, r2
     dec r13
-    mov		r6,	r3
-    sal		r6,	4		; succeeded 16th line
+    mov     r6, r3
+    sal     r6, 4       ; succeeded 16th line
 
 HEIGHT_LOOP_X16_SSE4:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP_X16_SSE4:
-    movdqa	xmm7,	[r4]
-    SUM_LINE_X16_SSE41	r0+r6, xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	r0, xmm2, xmm3, xmm4, xmm5
+    movdqa  xmm7,   [r4]
+    SUM_LINE_X16_SSE41  r0+r6, xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  r0, xmm2, xmm3, xmm4, xmm5
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqa	[r4+r1*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqa  [r4+r1*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 0
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 0
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 0
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 0
 
-    lea		r0,	[r0+8]
-    lea		r4,	[r4+16]
+    lea     r0, [r0+8]
+    lea     r4, [r4+16]
 
-    sub		r12, 8
-    jg		near WIDTH_LOOP_X16_SSE4
+    sub     r12, 8
+    jg      near WIDTH_LOOP_X16_SSE4
 
-    add		r0,	r3
-    sub		r0,	r1
+    add     r0, r3
+    sub     r0, r1
 
-    dec		r13
-    jg		near HEIGHT_LOOP_X16_SSE4
+    dec     r13
+    jg      near HEIGHT_LOOP_X16_SSE4
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -1398,48 +1398,48 @@
     push r13
     mov     r12,    r2
 
-    movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
-    movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
-    movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector
-    pxor	xmm4,	xmm4
-    pxor	xmm3,	xmm3				; y_qpel vector
+    movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
+    movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
+    movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
+    pxor    xmm4,   xmm4
+    pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
-    movdqa	xmm2,	xmm5	; x_qpel vector
-    mov		r4,	r1
+    movdqa  xmm2,   xmm5    ; x_qpel vector
+    mov     r4, r1
 HASH_WIDTH_LOOP_SSE2:
-    movq	xmm0,	[r0]			; load x8 sum
-    punpcklwd	xmm0,	xmm4
-    movdqa		xmm1,	xmm2
-    punpcklwd	xmm1,	xmm3
-%rep	3
-    movd	r2d,	xmm0        ;edx:r3
-    lea		r5,     [r3+r2*8]   ;ebx:r5
-    mov		r6,     [r5]        ;eax:r6
-    movd	[r6],	xmm1
-    mov		r13,    [r6+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		r6,     [r6+4]
-    mov		[r5],	r6
-    psrldq	xmm1,	4
-    psrldq	xmm0,	4
+    movq    xmm0,   [r0]            ; load x8 sum
+    punpcklwd   xmm0,   xmm4
+    movdqa      xmm1,   xmm2
+    punpcklwd   xmm1,   xmm3
+%rep    3
+    movd    r2d,    xmm0        ;edx:r2
+    lea     r5,     [r3+r2*8]   ;ebx:r5
+    mov     r6,     [r5]        ;eax:r6
+    movd    [r6],   xmm1
+    mov     r13,    [r6+4]  ; explicitly load r6+4 due to cache miss from vtune observation
+    lea     r6,     [r6+4]
+    mov     [r5],   r6
+    psrldq  xmm1,   4
+    psrldq  xmm0,   4
 %endrep
-    movd	r2d,	xmm0
-    lea		r5,     [r3+r2*8]   ;ebx:r5
-    mov		r6,     [r5]        ;eax:r6
-    movd	[r6],	xmm1
-    mov		r13,    [r6+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		r6,     [r6+4]
-    mov		[r5],	r6
+    movd    r2d,    xmm0
+    lea     r5,     [r3+r2*8]   ;ebx:r5
+    mov     r6,     [r5]        ;eax:r6
+    movd    [r6],   xmm1
+    mov     r13,    [r6+4]  ; explicitly load r6+4 due to cache miss from vtune observation
+    lea     r6,     [r6+4]
+    mov     [r5],   r6
 
-    paddw	xmm2,	xmm7
-    lea		r0,     [r0+8]
-    sub		r4,     4
+    paddw   xmm2,   xmm7
+    lea     r0,     [r0+8]
+    sub     r4,     4
     jnz near HASH_WIDTH_LOOP_SSE2
-    paddw	xmm3,	xmm6
-    dec	r12
-    jnz	near HASH_HEIGHT_LOOP_SSE2
+    paddw   xmm3,   xmm6
+    dec r12
+    jnz near HASH_HEIGHT_LOOP_SSE2
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     ret
 
@@ -1455,69 +1455,69 @@
     push r12
     push r13
     mov     r12,    r2
-    sar		r2,     2
-    mov		r5,     0       ;r5:ecx
+    sar     r2,     2
+    mov     r5,     0       ;r5:ecx
     xor     r6,     r6
-    pxor	xmm3,	xmm3
+    pxor    xmm3,   xmm3
 hash_assign_loop_x4_sse2:
-    movdqa	xmm0,	[r0+r5]
-    pslld	xmm0,	2
+    movdqa  xmm0,   [r0+r5]
+    pslld   xmm0,   2
 
-    movdqa	xmm1,	xmm0
-    pcmpeqd	xmm1,	xmm3
-    movmskps	r6,	xmm1
+    movdqa  xmm1,   xmm0
+    pcmpeqd xmm1,   xmm3
+    movmskps    r6, xmm1
     cmp     r6,     0x0f
-    jz	near hash_assign_with_copy_sse2
+    jz  near hash_assign_with_copy_sse2
 
-%assign x	0
+%assign x   0
 %rep 4
-    lea		r13,	[r3+r5*2+x]
-    mov		[r13],	r1
-    lea		r13,	[r4+r5*2+x]
-    mov		[r13],	r1
-    movd	r6d,	xmm0
-    add		r1,     r6
-    psrldq	xmm0,	4
-%assign	x	x+8
+    lea     r13,    [r3+r5*2+x]
+    mov     [r13],  r1
+    lea     r13,    [r4+r5*2+x]
+    mov     [r13],  r1
+    movd    r6d,    xmm0
+    add     r1,     r6
+    psrldq  xmm0,   4
+%assign x   x+8
 %endrep
     jmp near assign_next_sse2
 
 hash_assign_with_copy_sse2:
-    movq	xmm1,	r1
-    pshufd	xmm2,	xmm1,	01000100b
-    movdqa	[r3+r5*2], xmm2
-    movdqa	[r4+r5*2], xmm2
-    movdqa	[r3+r5*2+16], xmm2
-    movdqa	[r4+r5*2+16], xmm2
+    movq    xmm1,   r1
+    pshufd  xmm2,   xmm1,   01000100b
+    movdqa  [r3+r5*2], xmm2
+    movdqa  [r4+r5*2], xmm2
+    movdqa  [r3+r5*2+16], xmm2
+    movdqa  [r4+r5*2+16], xmm2
 
 assign_next_sse2:
-    add		r5,	16
-    dec		r2
-    jnz		near hash_assign_loop_x4_sse2
+    add     r5, 16
+    dec     r2
+    jnz     near hash_assign_loop_x4_sse2
 
-    and		r12,	3
-    jz		near hash_assign_no_rem_sse2
+    and     r12,    3
+    jz      near hash_assign_no_rem_sse2
 hash_assign_loop_x4_rem_sse2:
-    lea		r13,	[r3+r5*2]
-    mov		[r13],	r1
-    lea		r13,	[r4+r5*2]
-    mov		[r13],	r1
-    mov		r6d,	[r0+r5]
-    sal		r6,     2
-    add		r1,     r6
-    add		r5,     4
-    dec		r12
-    jnz		near hash_assign_loop_x4_rem_sse2
+    lea     r13,    [r3+r5*2]
+    mov     [r13],  r1
+    lea     r13,    [r4+r5*2]
+    mov     [r13],  r1
+    mov     r6d,    [r0+r5]
+    sal     r6,     2
+    add     r1,     r6
+    add     r5,     4
+    dec     r12
+    jnz     near hash_assign_loop_x4_rem_sse2
 
 hash_assign_no_rem_sse2:
     pop     r13
-    pop	    r12
+    pop     r12
     ret
 
 %endif
 
 ;**********************************************************************************************************************************
-;	int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;   int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
 ;**********************************************************************************************************************************
 WELS_EXTERN SumOf8x8SingleBlock_sse2
     %assign  push_num 0
@@ -1553,7 +1553,7 @@
     ret
 
 ;**********************************************************************************************************************************
-;	int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;   int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
 ;**********************************************************************************************************************************
 WELS_EXTERN SumOf16x16SingleBlock_sse2
     %assign  push_num 0