shithub: openh264

Download patch

ref: a00e2e722926c9f5ca7ea964a8258a888f520e03
parent: 7d3bb19bede84d39b11e880de88f79a746d365f5
author: Martin Storsjö <martin@martin.st>
date: Mon Apr 27 09:47:07 EDT 2015

Convert tabs to spaces in sample_sc.asm

This makes the file's indentation consistent with the rest of the
assembly source files. Prior to f2314151e8, all the assembly files had
consistent indentation, but since then this file had diverged.

--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -37,9 +37,9 @@
 SECTION .rodata align=16
 
 ALIGN 16
-mv_x_inc_x4		dw	0x10, 0x10, 0x10, 0x10
-mv_y_inc_x4		dw	0x04, 0x04, 0x04, 0x04
-mx_x_offset_x4	dw	0x00, 0x04, 0x08, 0x0C
+mv_x_inc_x4     dw  0x10, 0x10, 0x10, 0x10
+mv_y_inc_x4     dw  0x04, 0x04, 0x04, 0x04
+mx_x_offset_x4  dw  0x00, 0x04, 0x08, 0x0C
 
 SECTION .text
 %ifdef X86_32
@@ -48,113 +48,113 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;*********************************************************************************************************************
 WELS_EXTERN SumOf8x8BlockOfFrame_sse2
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp + 0
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp + 0
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
-    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
+    lea     ecx,    [ebx+ebx*2] ; 3*linesize
 
-    mov		[tmp_width],	eax
-    lea		ebp,	[esi+ebx*4]
+    mov     [tmp_width],    eax
+    lea     ebp,    [esi+ebx*4]
 FIRST_ROW:
-    movq	xmm1,	[esi]
-    movq	xmm2,	[esi+ebx]
-    movq	xmm3,	[esi+ebx*2]
-    movq	xmm4,	[esi+ecx]
+    movq    xmm1,   [esi]
+    movq    xmm2,   [esi+ebx]
+    movq    xmm3,   [esi+ebx*2]
+    movq    xmm4,   [esi+ecx]
 
-    shufps	xmm1,	xmm2,	01000100b
-    shufps	xmm3,	xmm4,	01000100b
-    psadbw	xmm1,	xmm0
-    psadbw	xmm3,	xmm0
-    paddd	xmm1,	xmm3
+    shufps  xmm1,   xmm2,   01000100b
+    shufps  xmm3,   xmm4,   01000100b
+    psadbw  xmm1,   xmm0
+    psadbw  xmm3,   xmm0
+    paddd   xmm1,   xmm3
 
-    movq	xmm2,	[ebp]
-    movq	xmm3,	[ebp+ebx]
-    movq	xmm4,	[ebp+ebx*2]
-    movq	xmm5,	[ebp+ecx]
+    movq    xmm2,   [ebp]
+    movq    xmm3,   [ebp+ebx]
+    movq    xmm4,   [ebp+ebx*2]
+    movq    xmm5,   [ebp+ecx]
 
-    shufps	xmm2,	xmm3,	01000100b
-    shufps	xmm4,	xmm5,	01000100b
-    psadbw	xmm2,	xmm0
-    psadbw	xmm4,	xmm0
-    paddd	xmm2,	xmm4
+    shufps  xmm2,   xmm3,   01000100b
+    shufps  xmm4,   xmm5,   01000100b
+    psadbw  xmm2,   xmm0
+    psadbw  xmm4,   xmm0
+    paddd   xmm2,   xmm4
 
-    paddd	xmm1,	xmm2
-    pshufd	xmm2,	xmm1,	00001110b
-    paddd	xmm1,	xmm2
-    movd	eax,	xmm1
-    mov		[edi],	ax
-    inc		dword [edx+eax*4]
+    paddd   xmm1,   xmm2
+    pshufd  xmm2,   xmm1,   00001110b
+    paddd   xmm1,   xmm2
+    movd    eax,    xmm1
+    mov     [edi],  ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    inc		ebp
-    add		edi,	2
+    inc     esi
+    inc     ebp
+    add     edi,    2
 
-    dec		dword [tmp_width]
-    jg		FIRST_ROW
+    dec     dword [tmp_width]
+    jg      FIRST_ROW
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 HEIGHT_LOOP:
-    mov		[tmp_width],	ebp
+    mov     [tmp_width],    ebp
 WIDTH_LOOP:
-    movq	xmm1,	[esi+ebx*8]
-    movq	xmm2,	[esi]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubd	xmm1,	xmm2
-    movd	eax,	xmm1
-    mov		cx,		[edi]
-    add		eax,	ecx
+    movq    xmm1,   [esi+ebx*8]
+    movq    xmm2,   [esi]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubd   xmm1,   xmm2
+    movd    eax,    xmm1
+    mov     cx,     [edi]
+    add     eax,    ecx
 
-    mov		[edi+ebp*2],	ax
-    inc		dword [edx+eax*4]
+    mov     [edi+ebp*2],    ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    add		edi,	2
+    inc     esi
+    add     edi,    2
 
-    dec		dword [tmp_width]
-    jg		WIDTH_LOOP
+    dec     dword [tmp_width]
+    jg      WIDTH_LOOP
 
-    add		esi,	ebx
-    sub		esi,	ebp
+    add     esi,    ebx
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		HEIGHT_LOOP
+    dec     dword [height]
+    jg      HEIGHT_LOOP
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 
@@ -161,10 +161,10 @@
 %macro COUNT_SUM 3
 %define xmm_reg %1
 %define tmp_reg %2
-    movd	tmp_reg,	xmm_reg
-    inc		dword [edx+tmp_reg*4]
+    movd    tmp_reg,    xmm_reg
+    inc     dword [edx+tmp_reg*4]
 %if %3 == 1
-    psrldq	xmm_reg,	4
+    psrldq  xmm_reg,    4
 %endif
 %endmacro
 
@@ -178,177 +178,177 @@
 ; read extra (16 - (width % 8) ) mod 16 bytes of every line
 ; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
 WELS_EXTERN SumOf8x8BlockOfFrame_sse4
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp + 0
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp + 0
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
-    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
+    lea     ecx,    [ebx+ebx*2] ; 3*linesize
 
-    mov		[tmp_width],	eax
-    lea		ebp,	[esi+ebx*4]
+    mov     [tmp_width],    eax
+    lea     ebp,    [esi+ebx*4]
 FIRST_ROW_SSE4:
-    movdqu	xmm1,	[esi]
-    movdqu	xmm3,	[esi+ebx]
-    movdqu	xmm5,	[esi+ebx*2]
-    movdqu	xmm7,	[esi+ecx]
+    movdqu  xmm1,   [esi]
+    movdqu  xmm3,   [esi+ebx]
+    movdqu  xmm5,   [esi+ebx*2]
+    movdqu  xmm7,   [esi+ecx]
 
-    movdqa	xmm2,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm1,	xmm2			; 8 sums of line1
+    movdqa  xmm2,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm1,   xmm2            ; 8 sums of line1
 
-    movdqa	xmm4,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm3,	xmm4			; 8 sums of line2
+    movdqa  xmm4,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm3,   xmm4            ; 8 sums of line2
 
-    movdqa	xmm2,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm5,	xmm2			; 8 sums of line3
+    movdqa  xmm2,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm5,   xmm2            ; 8 sums of line3
 
-    movdqa	xmm4,	xmm7
-    mpsadbw	xmm7,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm7,	xmm4			; 8 sums of line4
+    movdqa  xmm4,   xmm7
+    mpsadbw xmm7,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm7,   xmm4            ; 8 sums of line4
 
-    paddw	xmm1,	xmm3
-    paddw	xmm5,	xmm7
-    paddw	xmm1,	xmm5			; sum the upper 4 lines first
+    paddw   xmm1,   xmm3
+    paddw   xmm5,   xmm7
+    paddw   xmm1,   xmm5            ; sum the upper 4 lines first
 
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    movdqa	xmm6,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm2,	xmm6
+    movdqa  xmm6,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm2,   xmm6
 
-    movdqa	xmm7,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm3,	xmm7
+    movdqa  xmm7,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm3,   xmm7
 
-    movdqa	xmm6,	xmm4
-    mpsadbw	xmm4,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm4,	xmm6
+    movdqa  xmm6,   xmm4
+    mpsadbw xmm4,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm4,   xmm6
 
-    movdqa	xmm7,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm5,	xmm7
+    movdqa  xmm7,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm5,   xmm7
 
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm1,	xmm2
-    paddw	xmm1,	xmm4			; sum of lines 1- 8
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm1,   xmm2
+    paddw   xmm1,   xmm4            ; sum of lines 1- 8
 
-    movdqu	[edi],	xmm1
+    movdqu  [edi],  xmm1
 
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	0
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	0
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    0
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		ebp,	[ebp+8]
-    lea		edi,	[edi+16]		; element size is 2
+    lea     esi,    [esi+8]
+    lea     ebp,    [ebp+8]
+    lea     edi,    [edi+16]        ; element size is 2
 
-    sub		dword [tmp_width], 8
-    jg		near FIRST_ROW_SSE4
+    sub     dword [tmp_width], 8
+    jg      near FIRST_ROW_SSE4
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 HEIGHT_LOOP_SSE4:
-    mov		ecx,	ebp
+    mov     ecx,    ebp
 WIDTH_LOOP_SSE4:
-    movdqu	xmm1,	[esi+ebx*8]
-    movdqu	xmm2,	[esi]
-    movdqu	xmm7,	[edi]
+    movdqu  xmm1,   [esi+ebx*8]
+    movdqu  xmm2,   [esi]
+    movdqu  xmm7,   [edi]
 
-    movdqa	xmm3,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm3,	xmm0,	100b
-    paddw	xmm1,	xmm3
+    movdqa  xmm3,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm3,   xmm0,   100b
+    paddw   xmm1,   xmm3
 
-    movdqa	xmm4,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm2,	xmm4
+    movdqa  xmm4,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm2,   xmm4
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqu	[edi+ebp*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqu  [edi+ebp*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	0
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	0
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    0
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		edi,	[edi+16]
+    lea     esi,    [esi+8]
+    lea     edi,    [edi+16]
 
-    sub		ecx,	8
-    jg		near WIDTH_LOOP_SSE4
+    sub     ecx,    8
+    jg      near WIDTH_LOOP_SSE4
 
-    lea		esi,	[esi+ebx]
-    sub		esi,	ebp
+    lea     esi,    [esi+ebx]
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		near HEIGHT_LOOP_SSE4
+    dec     dword [height]
+    jg      near HEIGHT_LOOP_SSE4
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 
@@ -357,153 +357,153 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;****************************************************************************************************************************************************
 WELS_EXTERN SumOf16x16BlockOfFrame_sse2
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
 
-    lea		ecx,	[ebx+ebx*2]
-    mov		[tmp_width],	eax
+    lea     ecx,    [ebx+ebx*2]
+    mov     [tmp_width],    eax
 FIRST_ROW_X16H:
-    movdqu	xmm1,	[esi]
-    movdqu	xmm2,	[esi+ebx]
-    movdqu	xmm3,	[esi+ebx*2]
-    movdqu	xmm4,	[esi+ecx]
+    movdqu  xmm1,   [esi]
+    movdqu  xmm2,   [esi+ebx]
+    movdqu  xmm3,   [esi+ebx*2]
+    movdqu  xmm4,   [esi+ecx]
 
-    psadbw  xmm1,	xmm0
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    paddw	xmm1,	xmm2
-    paddw	xmm3,	xmm4
-    paddw	xmm1,	xmm3
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    paddw   xmm1,   xmm2
+    paddw   xmm3,   xmm4
+    paddw   xmm1,   xmm3
 
-    lea		ebp,	[esi+ebx*4]
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    lea     ebp,    [esi+ebx*4]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    lea     ebp,    [ebp+ebx*4]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    movdqu	xmm2,	[ebp]
-    movdqu	xmm3,	[ebp+ebx]
-    movdqu	xmm4,	[ebp+ebx*2]
-    movdqu	xmm5,	[ebp+ecx]
+    lea     ebp,    [ebp+ebx*4]
+    movdqu  xmm2,   [ebp]
+    movdqu  xmm3,   [ebp+ebx]
+    movdqu  xmm4,   [ebp+ebx*2]
+    movdqu  xmm5,   [ebp+ecx]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    paddw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
     paddw xmm1, xmm2
-    movd	eax,	xmm1
-    mov		[edi],	ax
-    inc		dword [edx+eax*4]
+    movd    eax,    xmm1
+    mov     [edi],  ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    lea		edi,	[edi+2]
+    inc     esi
+    lea     edi,    [edi+2]
 
-    dec		dword [tmp_width]
-    jg		near FIRST_ROW_X16H
+    dec     dword [tmp_width]
+    jg      near FIRST_ROW_X16H
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 
-    mov		ecx,	ebx
-    sal		ecx,	4		; succeeded 16th line
+    mov     ecx,    ebx
+    sal     ecx,    4       ; succeeded 16th line
 HEIGHT_LOOP_X16:
-    mov		[tmp_width],	ebp
+    mov     [tmp_width],    ebp
 WIDTH_LOOP_X16:
-    movdqu	xmm1,	[esi+ecx]
-    movdqu	xmm2,	[esi]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    movdqu  xmm1,   [esi+ecx]
+    movdqu  xmm2,   [esi]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
-    paddw	xmm1,	xmm2
-    movd	eax,	xmm1
-    add		ax,	word [edi]
-    mov		[edi+ebp*2],	ax
-    inc		dword [edx+eax*4]
+    paddw   xmm1,   xmm2
+    movd    eax,    xmm1
+    add     ax, word [edi]
+    mov     [edi+ebp*2],    ax
+    inc     dword [edx+eax*4]
 
-    inc		esi
-    add		edi,	2
+    inc     esi
+    add     edi,    2
 
-    dec		dword [tmp_width]
-    jg		near WIDTH_LOOP_X16
+    dec     dword [tmp_width]
+    jg      near WIDTH_LOOP_X16
 
-    add		esi,	ebx
-    sub		esi,	ebp
+    add     esi,    ebx
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		near HEIGHT_LOOP_X16
+    dec     dword [height]
+    jg      near HEIGHT_LOOP_X16
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 ; requires:  width % 16 == 0 && height > 1
@@ -512,163 +512,163 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; try 8 mv via offset
-%macro   SUM_LINE_X16_SSE41  5	; ref, dst0, dst1, tmp0, tmp1
-    movdqu	%2,	[%1]
-    movdqu	%3,	[%1+8h]
-    movdqa	%4,	%2
-    movdqa	%5,	%3
+%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1
+    movdqu  %2, [%1]
+    movdqu  %3, [%1+8h]
+    movdqa  %4, %2
+    movdqa  %5, %3
 
-    mpsadbw	%2,	xmm0,	0	; 000 B
-    mpsadbw	%4,	xmm0,	5	; 101 B
-    mpsadbw	%3,	xmm0,	2	; 010 B
-    mpsadbw	%5,	xmm0,	7	; 111 B
-    paddw	%2,	%4
-    paddw	%3, %5
-    paddw	%2,	%3	; accumulate cost
-%endmacro	; end of SAD_16x16_LINE_SSE41
+    mpsadbw %2, xmm0,   0   ; 000 B
+    mpsadbw %4, xmm0,   5   ; 101 B
+    mpsadbw %3, xmm0,   2   ; 010 B
+    mpsadbw %5, xmm0,   7   ; 111 B
+    paddw   %2, %4
+    paddw   %3, %5
+    paddw   %2, %3  ; accumulate cost
+%endmacro   ; end of SAD_16x16_LINE_SSE41
 
 WELS_EXTERN SumOf16x16BlockOfFrame_sse4
-%define		pushsize		16
-%define		localsize		4
-%define		ref				esp + pushsize + localsize + 4
-%define		sum_ref			esp + pushsize + localsize + 20
-%define		times_of_sum	esp + pushsize + localsize + 24
-%define		width			esp + pushsize + localsize + 8
-%define		height			esp + pushsize + localsize + 12
-%define		linesize		esp + pushsize + localsize + 16
-%define		tmp_width		esp
-    push	ebx
-    push	ebp
-    push	esi
-    push	edi
-    sub		esp,	localsize
+%define     pushsize        16
+%define     localsize       4
+%define     ref             esp + pushsize + localsize + 4
+%define     sum_ref         esp + pushsize + localsize + 20
+%define     times_of_sum    esp + pushsize + localsize + 24
+%define     width           esp + pushsize + localsize + 8
+%define     height          esp + pushsize + localsize + 12
+%define     linesize        esp + pushsize + localsize + 16
+%define     tmp_width       esp
+    push    ebx
+    push    ebp
+    push    esi
+    push    edi
+    sub     esp,    localsize
 
-    pxor	xmm0,	xmm0
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		edx,	[times_of_sum]
-    mov		ebx,	[linesize]
-    mov		eax,	[width]
+    pxor    xmm0,   xmm0
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     edx,    [times_of_sum]
+    mov     ebx,    [linesize]
+    mov     eax,    [width]
 
-    lea		ecx,	[ebx+ebx*2]
-    mov		[tmp_width],	eax
+    lea     ecx,    [ebx+ebx*2]
+    mov     [tmp_width],    eax
 FIRST_ROW_X16_SSE4:
-    SUM_LINE_X16_SSE41	esi,		xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	esi+ebx,	xmm2, xmm3, xmm4, xmm5
-    SUM_LINE_X16_SSE41	esi+ebx*2,	xmm3, xmm4, xmm5, xmm6
-    SUM_LINE_X16_SSE41	esi+ecx,	xmm4, xmm5, xmm6, xmm7
-    paddw	xmm1, xmm2
-    paddw	xmm3, xmm4
-    paddw	xmm1, xmm3
+    SUM_LINE_X16_SSE41  esi,        xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  esi+ebx,    xmm2, xmm3, xmm4, xmm5
+    SUM_LINE_X16_SSE41  esi+ebx*2,  xmm3, xmm4, xmm5, xmm6
+    SUM_LINE_X16_SSE41  esi+ecx,    xmm4, xmm5, xmm6, xmm7
+    paddw   xmm1, xmm2
+    paddw   xmm3, xmm4
+    paddw   xmm1, xmm3
 
-    lea		ebp,	[esi+ebx*4]
-    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     ebp,    [esi+ebx*4]
+    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     ebp,    [ebp+ebx*4]
+    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		ebp,	[ebp+ebx*4]
-    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     ebp,    [ebp+ebx*4]
+    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    movdqa	[edi],	xmm1
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  [edi],  xmm1
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	1
-    COUNT_SUM	xmm1,	eax,	0
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	1
-    COUNT_SUM	xmm2,	eax,	0
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    1
+    COUNT_SUM   xmm1,   eax,    0
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    1
+    COUNT_SUM   xmm2,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		edi,	[edi+16]	; element size is 2
+    lea     esi,    [esi+8]
+    lea     edi,    [edi+16]    ; element size is 2
 
-    sub		dword [tmp_width], 8
-    jg		near FIRST_ROW_X16_SSE4
+    sub     dword [tmp_width], 8
+    jg      near FIRST_ROW_X16_SSE4
 
-    mov		esi,	[ref]
-    mov		edi,	[sum_ref]
-    mov		ebp,	[width]
-    dec		dword [height]
+    mov     esi,    [ref]
+    mov     edi,    [sum_ref]
+    mov     ebp,    [width]
+    dec     dword [height]
 
-    mov		ecx,	ebx
-    sal		ecx,	4		; succeeded 16th line
+    mov     ecx,    ebx
+    sal     ecx,    4       ; succeeded 16th line
 
 HEIGHT_LOOP_X16_SSE4:
-    mov		[tmp_width],	ebp
+    mov     [tmp_width],    ebp
 WIDTH_LOOP_X16_SSE4:
-    movdqa	xmm7,	[edi]
-    SUM_LINE_X16_SSE41	esi+ecx, xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	esi, xmm2, xmm3, xmm4, xmm5
+    movdqa  xmm7,   [edi]
+    SUM_LINE_X16_SSE41  esi+ecx, xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  esi, xmm2, xmm3, xmm4, xmm5
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqa	[edi+ebp*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqa  [edi+ebp*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	1
-    COUNT_SUM	xmm7,	eax,	0
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	1
-    COUNT_SUM	xmm6,	eax,	0
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    1
+    COUNT_SUM   xmm7,   eax,    0
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    1
+    COUNT_SUM   xmm6,   eax,    0
 
-    lea		esi,	[esi+8]
-    lea		edi,	[edi+16]
+    lea     esi,    [esi+8]
+    lea     edi,    [edi+16]
 
-    sub		dword [tmp_width], 8
-    jg		near WIDTH_LOOP_X16_SSE4
+    sub     dword [tmp_width], 8
+    jg      near WIDTH_LOOP_X16_SSE4
 
-    add		esi,	ebx
-    sub		esi,	ebp
+    add     esi,    ebx
+    sub     esi,    ebp
 
-    dec		dword [height]
-    jg		near HEIGHT_LOOP_X16_SSE4
+    dec     dword [height]
+    jg      near HEIGHT_LOOP_X16_SSE4
 
-    add		esp,	localsize
-    pop		edi
-    pop		esi
-    pop		ebp
-    pop		ebx
-%undef		pushsize
-%undef		localsize
-%undef		ref
-%undef		sum_ref
-%undef		times_of_sum
-%undef		width
-%undef		height
-%undef		linesize
-%undef		tmp_width
+    add     esp,    localsize
+    pop     edi
+    pop     esi
+    pop     ebp
+    pop     ebx
+%undef      pushsize
+%undef      localsize
+%undef      ref
+%undef      sum_ref
+%undef      times_of_sum
+%undef      width
+%undef      height
+%undef      linesize
+%undef      tmp_width
     ret
 
 
@@ -676,78 +676,78 @@
 ; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
 ;-----------------------------------------------------------------------------------------------------------------------------
 WELS_EXTERN FillQpelLocationByFeatureValue_sse2
-    push	esi
-    push	edi
-    push	ebx
-    push	ebp
+    push    esi
+    push    edi
+    push    ebx
+    push    ebp                         ; four saved registers = 16 bytes (_ps below)
 
-    %define _ps			16				; push size
-    %define	_ls			4				; local size
-    %define	sum_ref		esp+_ps+_ls+4
-    %define	pos_list	esp+_ps+_ls+16
-    %define width		esp+_ps+_ls+8
-    %define height		esp+_ps+_ls+12
-    %define	i_height	esp
-    sub		esp,	_ls
+    %define _ps         16              ; push size
+    %define _ls         4               ; local size
+    %define sum_ref     esp+_ps+_ls+4   ; 1st stack argument: feature values, read through esi
+    %define pos_list    esp+_ps+_ls+16  ; 4th stack argument: table of write cursors indexed by feature value
+    %define width       esp+_ps+_ls+8   ; 2nd stack argument
+    %define height      esp+_ps+_ls+12  ; 3rd stack argument
+    %define i_height    esp             ; local copy of height, counted down once per row
+    sub     esp,    _ls                 ; reserve the i_height slot
 
-    mov		esi,	[sum_ref]
-    mov		edi,	[pos_list]
-    mov		ebp,	[width]
-    mov		ebx,	[height]
-    mov		[i_height],	ebx
+    mov     esi,    [sum_ref]           ; esi = read pointer into the feature values
+    mov     edi,    [pos_list]          ; edi = base of the cursor table
+    mov     ebp,    [width]             ; ebp = features per row (reloaded into ecx every row)
+    mov     ebx,    [height]
+    mov     [i_height], ebx             ; ebx is reused as scratch below, so keep the row count on the stack
 
-    movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
-    movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
-    movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector
-    pxor	xmm4,	xmm4
-    pxor	xmm3,	xmm3				; y_qpel vector
+    movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
+    movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
+    movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
+    pxor    xmm4,   xmm4                ; all-zero, used to widen words to dwords
+    pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
-    movdqa	xmm2,	xmm5	; x_qpel vector
-    mov		ecx,	ebp
+    movdqa  xmm2,   xmm5    ; x_qpel vector, restarted from the initial offsets for each row
+    mov     ecx,    ebp     ; ecx = features left in this row
 HASH_WIDTH_LOOP_SSE2:
-    movq	xmm0,	[esi]			; load x8 sum
-    punpcklwd	xmm0,	xmm4
-    movdqa		xmm1,	xmm2
-    punpcklwd	xmm1,	xmm3
-%rep	3
-    movd	edx,	xmm0
-    lea		ebx,	[edi+edx*4]
-    mov		eax,	[ebx]
-    movd	[eax],	xmm1
-    mov		edx,	[eax+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		eax,	[eax+4]
-    mov		[ebx],	eax
-    psrldq	xmm1,	4
-    psrldq	xmm0,	4
+    movq    xmm0,   [esi]           ; load 8 bytes = 4 consecutive 16-bit feature values
+    punpcklwd   xmm0,   xmm4        ; zero-extend the 4 words to dwords
+    movdqa      xmm1,   xmm2
+    punpcklwd   xmm1,   xmm3        ; interleave: each dword = x_qpel (low word), y_qpel (high word)
+%rep    3
+    movd    edx,    xmm0            ; edx = current feature value
+    lea     ebx,    [edi+edx*4]     ; ebx = address of that value's 4-byte cursor slot
+    mov     eax,    [ebx]           ; eax = current write cursor
+    movd    [eax],  xmm1            ; store the (x_qpel, y_qpel) pair at the cursor
+    mov     edx,    [eax+4] ; explicitly load [eax+4] (value unused) due to a cache miss observed in VTune
+    lea     eax,    [eax+4]         ; advance the cursor past the 4-byte pair
+    mov     [ebx],  eax             ; write the advanced cursor back to the table
+    psrldq  xmm1,   4               ; bring the next (x, y) pair into the low dword
+    psrldq  xmm0,   4               ; bring the next feature value into the low dword
 %endrep
-    movd	edx,	xmm0
-    lea		ebx,	[edi+edx*4]
-    mov		eax,	[ebx]
-    movd	[eax],	xmm1
-    mov		edx,	[eax+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		eax,	[eax+4]
-    mov		[ebx],	eax
+    movd    edx,    xmm0            ; 4th element: same steps, no shift needed afterwards
+    lea     ebx,    [edi+edx*4]
+    mov     eax,    [ebx]
+    movd    [eax],  xmm1
+    mov     edx,    [eax+4] ; explicitly load [eax+4] (value unused) due to a cache miss observed in VTune
+    lea     eax,    [eax+4]
+    mov     [ebx],  eax
 
-    paddw	xmm2,	xmm7
-    lea		esi,	[esi+8]
-    sub		ecx,	4
+    paddw   xmm2,   xmm7            ; advance every x_qpel lane by the mv_x_inc_x4 step
+    lea     esi,    [esi+8]         ; 4 feature values (8 bytes) consumed
+    sub     ecx,    4               ; NOTE(review): the jnz below exits only at exactly 0, so width must be a positive multiple of 4 - confirm callers
     jnz near HASH_WIDTH_LOOP_SSE2
-    paddw	xmm3,	xmm6
-    dec	dword [i_height]
-    jnz	near HASH_HEIGHT_LOOP_SSE2
+    paddw   xmm3,   xmm6            ; advance y_qpel by the mv_y_inc_x4 step for the next row
+    dec dword [i_height]
+    jnz near HASH_HEIGHT_LOOP_SSE2
 
-    add		esp,	_ls
-    %undef	_ps
-    %undef	_ls
-    %undef	sum_ref
-    %undef	pos_list
-    %undef	width
-    %undef	height
-    %undef	i_height
-    pop		ebp
-    pop		ebx
-    pop		edi
-    pop		esi
+    add     esp,    _ls             ; release the i_height slot
+    %undef  _ps
+    %undef  _ls
+    %undef  sum_ref
+    %undef  pos_list
+    %undef  width
+    %undef  height
+    %undef  i_height
+    pop     ebp                     ; restore in reverse push order
+    pop     ebx
+    pop     edi
+    pop     esi
     ret
 
 ;---------------------------------------------------------------------------------------------------------------------------------------------------
@@ -755,74 +755,74 @@
 ;                        uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
 ;---------------------------------------------------------------------------------------------------------------------------------------------------
 WELS_EXTERN InitializeHashforFeature_sse2
-    push	ebx
-    push	esi
-    push	edi
-    push	ebp
-    %define	_ps	16	; push size
-    mov		edi,	[esp+_ps+16]	; pPositionOfSum
-    mov		ebp,	[esp+_ps+20]	; sum_idx_list
-    mov		esi,	[esp+_ps+4]     ; pTimesOfSum
-    mov		ebx,	[esp+_ps+8]     ; pBuf
-    mov		edx,	[esp+_ps+12]	; list_sz
-    sar		edx,	2
-    mov		ecx,	0
-    pxor	xmm7,	xmm7
+    push    ebx
+    push    esi
+    push    edi
+    push    ebp
+    %define _ps 16  ; push size
+    mov     edi,    [esp+_ps+16]    ; pPositionOfSum (4th stack argument): first pointer table written
+    mov     ebp,    [esp+_ps+20]    ; sum_idx_list (5th stack argument): second pointer table, gets the same values
+    mov     esi,    [esp+_ps+4]     ; pTimesOfSum (1st stack argument): dword counts, read only
+    mov     ebx,    [esp+_ps+8]     ; pBuf (2nd stack argument): running buffer position
+    mov     edx,    [esp+_ps+12]    ; list_sz (3rd stack argument)
+    sar     edx,    2               ; edx = number of whole 4-entry groups
+    mov     ecx,    0               ; ecx = byte offset into all three tables
+    pxor    xmm7,   xmm7            ; all-zero, compared against the counts below
 hash_assign_loop_x4_sse2:
-    movdqa	xmm0,	[esi+ecx]
-    pslld	xmm0,	2
+    movdqa  xmm0,   [esi+ecx]       ; 4 dword counts; movdqa requires esi+ecx to be 16-byte aligned
+    pslld   xmm0,   2               ; scale each count by 4 (later added to ebx as a byte size)
 
-    movdqa	xmm1,	xmm0
-    pcmpeqd	xmm1,	xmm7
-    movmskps	eax,	xmm1
+    movdqa  xmm1,   xmm0
+    pcmpeqd xmm1,   xmm7            ; lane = all-ones where the count is zero
+    movmskps    eax,    xmm1        ; eax bit i set <=> count i is zero
     cmp eax, 0x0f
-    je	near hash_assign_with_copy_sse2
+    je  near hash_assign_with_copy_sse2 ; all four counts zero: all four slots get the same pointer
 
-%assign x	0
+%assign x   0
 %rep 4
-    lea		eax,	[edi+ecx+x]
-    mov		[eax],	ebx
-    lea		eax,	[ebp+ecx+x]
-    mov		[eax],	ebx
-    movd	eax,	xmm0
-    add		ebx,	eax
-    psrldq	xmm0,	4
-%assign	x	x+4
+    lea     eax,    [edi+ecx+x]
+    mov     [eax],  ebx             ; first table: slot = current buffer position
+    lea     eax,    [ebp+ecx+x]
+    mov     [eax],  ebx             ; second table: same value
+    movd    eax,    xmm0            ; eax = scaled count of this entry
+    add     ebx,    eax             ; advance the buffer position by that many bytes
+    psrldq  xmm0,   4               ; bring the next scaled count into the low dword
+%assign x   x+4
 %endrep
     jmp near assign_next_sse2
 
 hash_assign_with_copy_sse2:
-    movd	xmm1,	ebx
-    pshufd	xmm2,	xmm1,	0
-    movdqa	[edi+ecx], xmm2
-    movdqa	[ebp+ecx], xmm2
+    movd    xmm1,   ebx
+    pshufd  xmm2,   xmm1,   0       ; broadcast ebx to all 4 dword lanes
+    movdqa  [edi+ecx], xmm2         ; aligned 16-byte store: 4 identical pointers (ebx does not advance)
+    movdqa  [ebp+ecx], xmm2
 
 assign_next_sse2:
-    add		ecx,	16
-    dec		edx
-    jnz		near hash_assign_loop_x4_sse2
+    add     ecx,    16              ; next group of 4 dword entries
+    dec     edx
+    jnz     near hash_assign_loop_x4_sse2   ; NOTE(review): counter is tested only after the body, so list_sz < 4 (edx == 0 on entry) is not handled - confirm callers
 
-    mov		edx,	[esp+_ps+12]	; list_sz
-    and		edx,	3
-    jz		near hash_assign_no_rem_sse2
+    mov     edx,    [esp+_ps+12]    ; list_sz
+    and     edx,    3               ; leftover entries (list_sz mod 4)
+    jz      near hash_assign_no_rem_sse2
 hash_assign_loop_x4_rem_sse2:
-    lea		eax,	[edi+ecx]
-    mov		[eax],	ebx
-    lea		eax,	[ebp+ecx]
-    mov		[eax],	ebx
-    mov		eax,	[esi+ecx]
-    sal		eax,	2
-    add		ebx,	eax
-    add		ecx,	4
-    dec		edx
-    jnz		near hash_assign_loop_x4_rem_sse2
+    lea     eax,    [edi+ecx]
+    mov     [eax],  ebx             ; scalar tail: same per-entry assignment as the unrolled path
+    lea     eax,    [ebp+ecx]
+    mov     [eax],  ebx
+    mov     eax,    [esi+ecx]       ; eax = count of this entry
+    sal     eax,    2               ; scale by 4, matching pslld above
+    add     ebx,    eax
+    add     ecx,    4               ; next dword entry
+    dec     edx
+    jnz     near hash_assign_loop_x4_rem_sse2
 
 hash_assign_no_rem_sse2:
-    %undef	_ps
-    pop		ebp
-    pop		edi
-    pop		esi
-    pop		ebx
+    %undef  _ps
+    pop     ebp                     ; restore in reverse push order
+    pop     edi
+    pop     esi
+    pop     ebx
     ret
 %else
 
@@ -843,47 +843,47 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
-    lea		r13,	[r0+r3*4]       ;rbp:r13
+    mov     r12,    r1              ;r12:tmp_width
+    lea     r13,    [r0+r3*4]       ;rbp:r13
 FIRST_ROW:
-    movq	xmm1,	[r0]
-    movq	xmm2,	[r0+r3]
-    movq	xmm3,	[r0+r3*2]
-    movq	xmm4,	[r0+r6]
+    movq    xmm1,   [r0]
+    movq    xmm2,   [r0+r3]
+    movq    xmm3,   [r0+r3*2]
+    movq    xmm4,   [r0+r6]
 
-    shufps	xmm1,	xmm2,	01000100b
-    shufps	xmm3,	xmm4,	01000100b
-    psadbw	xmm1,	xmm0
-    psadbw	xmm3,	xmm0
-    paddd	xmm1,	xmm3
+    shufps  xmm1,   xmm2,   01000100b
+    shufps  xmm3,   xmm4,   01000100b
+    psadbw  xmm1,   xmm0
+    psadbw  xmm3,   xmm0
+    paddd   xmm1,   xmm3
 
-    movq	xmm2,	[r13]
-    movq	xmm3,	[r13+r3]
-    movq	xmm4,	[r13+r3*2]
-    movq	xmm5,	[r13+r6]
+    movq    xmm2,   [r13]
+    movq    xmm3,   [r13+r3]
+    movq    xmm4,   [r13+r3*2]
+    movq    xmm5,   [r13+r6]
 
-    shufps	xmm2,	xmm3,	01000100b
-    shufps	xmm4,	xmm5,	01000100b
-    psadbw	xmm2,	xmm0
-    psadbw	xmm4,	xmm0
-    paddd	xmm2,	xmm4
+    shufps  xmm2,   xmm3,   01000100b
+    shufps  xmm4,   xmm5,   01000100b
+    psadbw  xmm2,   xmm0
+    psadbw  xmm4,   xmm0
+    paddd   xmm2,   xmm4
 
-    paddd	xmm1,	xmm2
-    pshufd	xmm2,	xmm1,	00001110b
-    paddd	xmm1,	xmm2
-    movd	r2d,	xmm1
-    mov		[r4],	r2w
-    inc		dword [r5+r2*4]
+    paddd   xmm1,   xmm2
+    pshufd  xmm2,   xmm1,   00001110b
+    paddd   xmm1,   xmm2
+    movd    r2d,    xmm1
+    mov     [r4],   r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    inc		r13
-    add		r4,	2
+    inc     r0
+    inc     r13
+    add     r4, 2
 
-    dec		r12
-    jg		FIRST_ROW
+    dec     r12
+    jg      FIRST_ROW
 
     pop r4
     pop r2
@@ -891,34 +891,34 @@
     mov r13, r2
     dec r13
 HEIGHT_LOOP:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP:
-    movq	xmm1,	[r0+r3*8]
-    movq	xmm2,	[r0]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubd	xmm1,	xmm2
-    movd	r2d,	xmm1
-    mov		r6w,	[r4]
-    add		r2d,	r6d
-    mov		[r4+r1*2],	r2w
-    inc		dword [r5+r2*4]
+    movq    xmm1,   [r0+r3*8]
+    movq    xmm2,   [r0]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubd   xmm1,   xmm2
+    movd    r2d,    xmm1
+    mov     r6w,    [r4]
+    add     r2d,    r6d
+    mov     [r4+r1*2],  r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    add		r4,	2
+    inc     r0
+    add     r4, 2
 
-    dec		r12
-    jg		WIDTH_LOOP
+    dec     r12
+    jg      WIDTH_LOOP
 
-    add		r0,	r3
-    sub		r0,	r1
+    add     r0, r3
+    sub     r0, r1
 
 
-    dec		r13
-    jg		HEIGHT_LOOP
+    dec     r13
+    jg      HEIGHT_LOOP
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -928,10 +928,10 @@
 %define xmm_reg %1
 %define tmp_dreg %2
 %define tmp_qreg %3
-    movd	tmp_dreg,	xmm_reg
-    inc		dword [r5+tmp_qreg*4]
+    movd    tmp_dreg,   xmm_reg
+    inc     dword [r5+tmp_qreg*4]
 %if %4 == 1
-    psrldq	xmm_reg,	4
+    psrldq  xmm_reg,    4
 %endif
 %endmacro
 
@@ -957,92 +957,92 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
-    lea		r13,	[r0+r3*4]       ;rbp:r13
+    mov     r12,    r1              ;r12:tmp_width
+    lea     r13,    [r0+r3*4]       ;rbp:r13
 FIRST_ROW_SSE4:
-    movdqu	xmm1,	[r0]
-    movdqu	xmm3,	[r0+r3]
-    movdqu	xmm5,	[r0+r3*2]
-    movdqu	xmm7,	[r0+r6]
+    movdqu  xmm1,   [r0]
+    movdqu  xmm3,   [r0+r3]
+    movdqu  xmm5,   [r0+r3*2]
+    movdqu  xmm7,   [r0+r6]
 
-    movdqa	xmm2,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm1,	xmm2			; 8 sums of line1
+    movdqa  xmm2,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm1,   xmm2            ; 8 sums of line1
 
-    movdqa	xmm4,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm3,	xmm4			; 8 sums of line2
+    movdqa  xmm4,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm3,   xmm4            ; 8 sums of line2
 
-    movdqa	xmm2,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm2,	xmm0,	100b
-    paddw	xmm5,	xmm2			; 8 sums of line3
+    movdqa  xmm2,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm2,   xmm0,   100b
+    paddw   xmm5,   xmm2            ; 8 sums of line3
 
-    movdqa	xmm4,	xmm7
-    mpsadbw	xmm7,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm7,	xmm4			; 8 sums of line4
+    movdqa  xmm4,   xmm7
+    mpsadbw xmm7,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm7,   xmm4            ; 8 sums of line4
 
-    paddw	xmm1,	xmm3
-    paddw	xmm5,	xmm7
-    paddw	xmm1,	xmm5			; sum the upper 4 lines first
+    paddw   xmm1,   xmm3
+    paddw   xmm5,   xmm7
+    paddw   xmm1,   xmm5            ; sum the upper 4 lines first
 
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    movdqa	xmm6,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm2,	xmm6
+    movdqa  xmm6,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm2,   xmm6
 
-    movdqa	xmm7,	xmm3
-    mpsadbw	xmm3,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm3,	xmm7
+    movdqa  xmm7,   xmm3
+    mpsadbw xmm3,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm3,   xmm7
 
-    movdqa	xmm6,	xmm4
-    mpsadbw	xmm4,	xmm0,	000b
-    mpsadbw	xmm6,	xmm0,	100b
-    paddw	xmm4,	xmm6
+    movdqa  xmm6,   xmm4
+    mpsadbw xmm4,   xmm0,   000b
+    mpsadbw xmm6,   xmm0,   100b
+    paddw   xmm4,   xmm6
 
-    movdqa	xmm7,	xmm5
-    mpsadbw	xmm5,	xmm0,	000b
-    mpsadbw	xmm7,	xmm0,	100b
-    paddw	xmm5,	xmm7
+    movdqa  xmm7,   xmm5
+    mpsadbw xmm5,   xmm0,   000b
+    mpsadbw xmm7,   xmm0,   100b
+    paddw   xmm5,   xmm7
 
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm1,	xmm2
-    paddw	xmm1,	xmm4			; sum of lines 1- 8
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm1,   xmm2
+    paddw   xmm1,   xmm4            ; sum of lines 1- 8
 
-    movdqu	[r4],	xmm1
+    movdqu  [r4],   xmm1
 
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 0
-    COUNT_SUM	xmm2,	r2d, r2 ,1
-    COUNT_SUM	xmm2,	r2d, r2 ,1
-    COUNT_SUM	xmm2,	r2d, r2 ,1
-    COUNT_SUM	xmm2,	r2d, r2 ,0
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 0
+    COUNT_SUM   xmm2,   r2d, r2 ,1
+    COUNT_SUM   xmm2,   r2d, r2 ,1
+    COUNT_SUM   xmm2,   r2d, r2 ,1
+    COUNT_SUM   xmm2,   r2d, r2 ,0
 
-    lea		r0,     [r0+8]
-    lea		r13,	[r13+8]
-    lea		r4,     [r4+16]		; element size is 2
+    lea     r0,     [r0+8]
+    lea     r13,    [r13+8]
+    lea     r4,     [r4+16]     ; element size is 2
 
-    sub		r12, 8
-    jg		near FIRST_ROW_SSE4
+    sub     r12, 8
+    jg      near FIRST_ROW_SSE4
 
     pop r4
     pop r2
@@ -1050,53 +1050,53 @@
     mov r13, r2
     dec r13
 HEIGHT_LOOP_SSE4:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP_SSE4:
-    movdqu	xmm1,	[r0+r3*8]
-    movdqu	xmm2,	[r0]
-    movdqu	xmm7,	[r4]
+    movdqu  xmm1,   [r0+r3*8]
+    movdqu  xmm2,   [r0]
+    movdqu  xmm7,   [r4]
 
-    movdqa	xmm3,	xmm1
-    mpsadbw	xmm1,	xmm0,	000b
-    mpsadbw	xmm3,	xmm0,	100b
-    paddw	xmm1,	xmm3
+    movdqa  xmm3,   xmm1
+    mpsadbw xmm1,   xmm0,   000b
+    mpsadbw xmm3,   xmm0,   100b
+    paddw   xmm1,   xmm3
 
-    movdqa	xmm4,	xmm2
-    mpsadbw	xmm2,	xmm0,	000b
-    mpsadbw	xmm4,	xmm0,	100b
-    paddw	xmm2,	xmm4
+    movdqa  xmm4,   xmm2
+    mpsadbw xmm2,   xmm0,   000b
+    mpsadbw xmm4,   xmm0,   100b
+    paddw   xmm2,   xmm4
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqu	[r4+r1*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqu  [r4+r1*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 0
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 0
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 0
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 0
 
-    lea		r0,	[r0+8]
-    lea		r4,	[r4+16]
+    lea     r0, [r0+8]
+    lea     r4, [r4+16]
 
-    sub		r12,	8
-    jg		near WIDTH_LOOP_SSE4
+    sub     r12,    8
+    jg      near WIDTH_LOOP_SSE4
 
-    lea		r0,	[r0+r3]
-    sub		r0,	r1
+    lea     r0, [r0+r3]
+    sub     r0, r1
 
-    dec		r13
-    jg		near HEIGHT_LOOP_SSE4
+    dec     r13
+    jg      near HEIGHT_LOOP_SSE4
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -1119,83 +1119,83 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
+    mov     r12,    r1              ;r12:tmp_width
 FIRST_ROW_X16H:
-    movdqu	xmm1,	[r0]
-    movdqu	xmm2,	[r0+r3]
-    movdqu	xmm3,	[r0+r3*2]
-    movdqu	xmm4,	[r0+r6]
+    movdqu  xmm1,   [r0]
+    movdqu  xmm2,   [r0+r3]
+    movdqu  xmm3,   [r0+r3*2]
+    movdqu  xmm4,   [r0+r6]
 
-    psadbw  xmm1,	xmm0
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    paddw	xmm1,	xmm2
-    paddw	xmm3,	xmm4
-    paddw	xmm1,	xmm3
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    paddw   xmm1,   xmm2
+    paddw   xmm3,   xmm4
+    paddw   xmm1,   xmm3
 
-    lea		r13,	[r0+r3*4]       ;ebp:r13
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    lea     r13,    [r0+r3*4]       ;ebp:r13
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		r13,	[r13+r3*4]
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    lea     r13,    [r13+r3*4]
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
+    paddw   xmm1,   xmm2
 
-    lea		r13,	[r13+r3*4]
-    movdqu	xmm2,	[r13]
-    movdqu	xmm3,	[r13+r3]
-    movdqu	xmm4,	[r13+r3*2]
-    movdqu	xmm5,	[r13+r6]
+    lea     r13,    [r13+r3*4]
+    movdqu  xmm2,   [r13]
+    movdqu  xmm3,   [r13+r3]
+    movdqu  xmm4,   [r13+r3*2]
+    movdqu  xmm5,   [r13+r6]
 
-    psadbw  xmm2,	xmm0
-    psadbw  xmm3,	xmm0
-    psadbw  xmm4,	xmm0
-    psadbw  xmm5,	xmm0
-    paddw	xmm2,	xmm3
-    paddw	xmm4,	xmm5
-    paddw	xmm2,	xmm4
+    psadbw  xmm2,   xmm0
+    psadbw  xmm3,   xmm0
+    psadbw  xmm4,   xmm0
+    psadbw  xmm5,   xmm0
+    paddw   xmm2,   xmm3
+    paddw   xmm4,   xmm5
+    paddw   xmm2,   xmm4
 
-    paddw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    paddw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
     paddw xmm1, xmm2
-    movd	r2d,	xmm1
-    mov		[r4],	r2w
-    inc		dword [r5+r2*4]
+    movd    r2d,    xmm1
+    mov     [r4],   r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    lea		r4,	[r4+2]
+    inc     r0
+    lea     r4, [r4+2]
 
-    dec		r12
-    jg		near FIRST_ROW_X16H
+    dec     r12
+    jg      near FIRST_ROW_X16H
 
     pop r4
     pop r2
@@ -1202,38 +1202,38 @@
     pop r0
     mov r13, r2
     dec r13
-    mov		r6,	r3
-    sal		r6,	4		; succeeded 16th line
+    mov     r6, r3
+    sal     r6, 4       ; succeeded 16th line
 HEIGHT_LOOP_X16:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP_X16:
-    movdqu	xmm1,	[r0+r6]
-    movdqu	xmm2,	[r0]
-    psadbw	xmm1,	xmm0
-    psadbw	xmm2,	xmm0
-    psubw	xmm1,	xmm2
-    movdqa	xmm2,	xmm1
+    movdqu  xmm1,   [r0+r6]
+    movdqu  xmm2,   [r0]
+    psadbw  xmm1,   xmm0
+    psadbw  xmm2,   xmm0
+    psubw   xmm1,   xmm2
+    movdqa  xmm2,   xmm1
     punpckhwd xmm2, xmm0
-    paddw	xmm1,	xmm2
-    movd	r2d,	xmm1
-    add		r2w,	word [r4]
-    mov		[r4+r1*2],	r2w
-    inc		dword [r5+r2*4]
+    paddw   xmm1,   xmm2
+    movd    r2d,    xmm1
+    add     r2w,    word [r4]
+    mov     [r4+r1*2],  r2w
+    inc     dword [r5+r2*4]
 
-    inc		r0
-    add		r4,	2
+    inc     r0
+    add     r4, 2
 
-    dec		r12
-    jg		near WIDTH_LOOP_X16
+    dec     r12
+    jg      near WIDTH_LOOP_X16
 
-    add		r0,	r3
-    sub		r0,	r1
+    add     r0, r3
+    sub     r0, r1
 
-    dec		r13
-    jg		near HEIGHT_LOOP_X16
+    dec     r13
+    jg      near HEIGHT_LOOP_X16
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -1244,20 +1244,20 @@
 ;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; try 8 mv via offset
-%macro   SUM_LINE_X16_SSE41  5	; ref, dst0, dst1, tmp0, tmp1
-    movdqu	%2,	[%1]
-    movdqu	%3,	[%1+8h]
-    movdqa	%4,	%2
-    movdqa	%5,	%3
+%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1 -- result is left in dst0; dst1, tmp0, tmp1 are overwritten
+    movdqu  %2, [%1]            ; bytes 0..15 of the line (unaligned load)
+    movdqu  %3, [%1+8h]         ; bytes 8..23 of the line
+    movdqa  %4, %2              ; copies, because mpsadbw overwrites its first operand
+    movdqa  %5, %3
 
-    mpsadbw	%2,	xmm0,	0	; 000 B
-    mpsadbw	%4,	xmm0,	5	; 101 B
-    mpsadbw	%3,	xmm0,	2	; 010 B
-    mpsadbw	%5,	xmm0,	7	; 111 B
-    paddw	%2,	%4
-    paddw	%3, %5
-    paddw	%2,	%3	; accumulate cost
-%endmacro	; end of SAD_16x16_LINE_SSE41
+    mpsadbw %2, xmm0,   0   ; 000b: window starts at byte 0 of %2, compared with dword 0 of xmm0
+    mpsadbw %4, xmm0,   5   ; 101b: window starts at byte 4 of %4, compared with dword 1 of xmm0
+    mpsadbw %3, xmm0,   2   ; 010b: window starts at byte 0 of %3, compared with dword 2 of xmm0
+    mpsadbw %5, xmm0,   7   ; 111b: window starts at byte 4 of %5, compared with dword 3 of xmm0
+    paddw   %2, %4          ; combine the two 4-byte terms taken from the first load
+    paddw   %3, %5          ; combine the two 4-byte terms taken from the second load
+    paddw   %2, %3  ; %2 = 8 words, word i covers 16 bytes from ref+i (plain byte sums if xmm0 is zero - presumably zeroed by the caller, verify)
+%endmacro   ; end of SUM_LINE_X16_SSE41
 
 WELS_EXTERN SumOf16x16BlockOfFrame_sse4
     %assign  push_num 0
@@ -1272,68 +1272,68 @@
     push r2
     push r4
 
-    pxor	xmm0,	xmm0
+    pxor    xmm0,   xmm0
     lea     r6, [r3+r3*2]
 
-    mov		r12,	r1              ;r12:tmp_width
+    mov     r12,    r1              ;r12:tmp_width
 FIRST_ROW_X16_SSE4:
-    SUM_LINE_X16_SSE41	r0,		xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	r0+r3,	xmm2, xmm3, xmm4, xmm5
-    SUM_LINE_X16_SSE41	r0+r3*2,xmm3, xmm4, xmm5, xmm6
-    SUM_LINE_X16_SSE41	r0+r6,	xmm4, xmm5, xmm6, xmm7
-    paddw	xmm1, xmm2
-    paddw	xmm3, xmm4
-    paddw	xmm1, xmm3
+    SUM_LINE_X16_SSE41  r0,     xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  r0+r3,  xmm2, xmm3, xmm4, xmm5
+    SUM_LINE_X16_SSE41  r0+r3*2,xmm3, xmm4, xmm5, xmm6
+    SUM_LINE_X16_SSE41  r0+r6,  xmm4, xmm5, xmm6, xmm7
+    paddw   xmm1, xmm2
+    paddw   xmm3, xmm4
+    paddw   xmm1, xmm3
 
-    lea		r13,	[r0+r3*4]
-    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     r13,    [r0+r3*4]
+    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		r13,	[r13+r3*4]
-    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     r13,    [r13+r3*4]
+    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    lea		r13,	[r13+r3*4]
-    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
-    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
-    paddw	xmm1, xmm2
+    lea     r13,    [r13+r3*4]
+    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
+    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw   xmm1, xmm2
 
-    movdqa	[r4],	xmm1
-    movdqa	xmm2,	xmm1
-    punpcklwd	xmm1,	xmm0
-    punpckhwd	xmm2,	xmm0
+    movdqa  [r4],   xmm1
+    movdqa  xmm2,   xmm1
+    punpcklwd   xmm1,   xmm0
+    punpckhwd   xmm2,   xmm0
 
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 1
-    COUNT_SUM	xmm1,	r2d, r2, 0
-    COUNT_SUM	xmm2,	r2d, r2, 1
-    COUNT_SUM	xmm2,	r2d, r2, 1
-    COUNT_SUM	xmm2,	r2d, r2, 1
-    COUNT_SUM	xmm2,	r2d, r2, 0
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 1
+    COUNT_SUM   xmm1,   r2d, r2, 0
+    COUNT_SUM   xmm2,   r2d, r2, 1
+    COUNT_SUM   xmm2,   r2d, r2, 1
+    COUNT_SUM   xmm2,   r2d, r2, 1
+    COUNT_SUM   xmm2,   r2d, r2, 0
 
-    lea		r0,	[r0+8]
-    lea		r4,	[r4+16]	; element size is 2
+    lea     r0, [r0+8]
+    lea     r4, [r4+16] ; element size is 2
 
-    sub		r12, 8
-    jg		near FIRST_ROW_X16_SSE4
+    sub     r12, 8
+    jg      near FIRST_ROW_X16_SSE4
 
     pop r4
     pop r2
@@ -1340,47 +1340,47 @@
     pop r0
     mov r13, r2
     dec r13
-    mov		r6,	r3
-    sal		r6,	4		; succeeded 16th line
+    mov     r6, r3
+    sal     r6, 4       ; succeeded 16th line
 
 HEIGHT_LOOP_X16_SSE4:
-    mov		r12,	r1
+    mov     r12,    r1
 WIDTH_LOOP_X16_SSE4:
-    movdqa	xmm7,	[r4]
-    SUM_LINE_X16_SSE41	r0+r6, xmm1, xmm2, xmm3, xmm4
-    SUM_LINE_X16_SSE41	r0, xmm2, xmm3, xmm4, xmm5
+    movdqa  xmm7,   [r4]
+    SUM_LINE_X16_SSE41  r0+r6, xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41  r0, xmm2, xmm3, xmm4, xmm5
 
-    paddw	xmm7,	xmm1
-    psubw	xmm7,	xmm2
-    movdqa	[r4+r1*2], xmm7
+    paddw   xmm7,   xmm1
+    psubw   xmm7,   xmm2
+    movdqa  [r4+r1*2], xmm7
 
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm7,	xmm0
-    punpckhwd	xmm6,	xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm7,   xmm0
+    punpckhwd   xmm6,   xmm0
 
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 1
-    COUNT_SUM	xmm7,	r2d, r2, 0
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 1
-    COUNT_SUM	xmm6,	r2d, r2, 0
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 1
+    COUNT_SUM   xmm7,   r2d, r2, 0
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 1
+    COUNT_SUM   xmm6,   r2d, r2, 0
 
-    lea		r0,	[r0+8]
-    lea		r4,	[r4+16]
+    lea     r0, [r0+8]
+    lea     r4, [r4+16]
 
-    sub		r12, 8
-    jg		near WIDTH_LOOP_X16_SSE4
+    sub     r12, 8
+    jg      near WIDTH_LOOP_X16_SSE4
 
-    add		r0,	r3
-    sub		r0,	r1
+    add     r0, r3
+    sub     r0, r1
 
-    dec		r13
-    jg		near HEIGHT_LOOP_X16_SSE4
+    dec     r13
+    jg      near HEIGHT_LOOP_X16_SSE4
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -1398,48 +1398,48 @@
     push r13
     mov     r12,    r2
 
-    movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
-    movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
-    movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector
-    pxor	xmm4,	xmm4
-    pxor	xmm3,	xmm3				; y_qpel vector
+    movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
+    movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
+    movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
+    pxor    xmm4,   xmm4
+    pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
-    movdqa	xmm2,	xmm5	; x_qpel vector
-    mov		r4,	r1
+    movdqa  xmm2,   xmm5    ; x_qpel vector
+    mov     r4, r1
 HASH_WIDTH_LOOP_SSE2:
-    movq	xmm0,	[r0]			; load x8 sum
-    punpcklwd	xmm0,	xmm4
-    movdqa		xmm1,	xmm2
-    punpcklwd	xmm1,	xmm3
-%rep	3
-    movd	r2d,	xmm0        ;edx:r3
-    lea		r5,     [r3+r2*8]   ;ebx:r5
-    mov		r6,     [r5]        ;eax:r6
-    movd	[r6],	xmm1
-    mov		r13,    [r6+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		r6,     [r6+4]
-    mov		[r5],	r6
-    psrldq	xmm1,	4
-    psrldq	xmm0,	4
+    movq    xmm0,   [r0]            ; load x8 sum
+    punpcklwd   xmm0,   xmm4
+    movdqa      xmm1,   xmm2
+    punpcklwd   xmm1,   xmm3
+%rep    3
+    movd    r2d,    xmm0        ;edx:r3
+    lea     r5,     [r3+r2*8]   ;ebx:r5
+    mov     r6,     [r5]        ;eax:r6
+    movd    [r6],   xmm1
+    mov     r13,    [r6+4]  ; explictly load eax+4 due cache miss from vtune observation
+    lea     r6,     [r6+4]
+    mov     [r5],   r6
+    psrldq  xmm1,   4
+    psrldq  xmm0,   4
 %endrep
-    movd	r2d,	xmm0
-    lea		r5,     [r3+r2*8]   ;ebx:r5
-    mov		r6,     [r5]        ;eax:r6
-    movd	[r6],	xmm1
-    mov		r13,    [r6+4]	; explictly load eax+4 due cache miss from vtune observation
-    lea		r6,     [r6+4]
-    mov		[r5],	r6
+    movd    r2d,    xmm0
+    lea     r5,     [r3+r2*8]   ;ebx:r5
+    mov     r6,     [r5]        ;eax:r6
+    movd    [r6],   xmm1
+    mov     r13,    [r6+4]  ; explictly load eax+4 due cache miss from vtune observation
+    lea     r6,     [r6+4]
+    mov     [r5],   r6
 
-    paddw	xmm2,	xmm7
-    lea		r0,     [r0+8]
-    sub		r4,     4
+    paddw   xmm2,   xmm7
+    lea     r0,     [r0+8]
+    sub     r4,     4
     jnz near HASH_WIDTH_LOOP_SSE2
-    paddw	xmm3,	xmm6
-    dec	r12
-    jnz	near HASH_HEIGHT_LOOP_SSE2
+    paddw   xmm3,   xmm6
+    dec r12
+    jnz near HASH_HEIGHT_LOOP_SSE2
 
-    pop		r13
-    pop		r12
+    pop     r13
+    pop     r12
     POP_XMM
     ret
 
@@ -1455,69 +1455,69 @@
     push r12
     push r13
     mov     r12,    r2
-    sar		r2,     2
-    mov		r5,     0       ;r5:ecx
+    sar     r2,     2
+    mov     r5,     0       ;r5:ecx
     xor     r6,     r6
-    pxor	xmm3,	xmm3
+    pxor    xmm3,   xmm3
 hash_assign_loop_x4_sse2:
-    movdqa	xmm0,	[r0+r5]
-    pslld	xmm0,	2
+    movdqa  xmm0,   [r0+r5]
+    pslld   xmm0,   2
 
-    movdqa	xmm1,	xmm0
-    pcmpeqd	xmm1,	xmm3
-    movmskps	r6,	xmm1
+    movdqa  xmm1,   xmm0
+    pcmpeqd xmm1,   xmm3
+    movmskps    r6, xmm1
     cmp     r6,     0x0f
-    jz	near hash_assign_with_copy_sse2
+    jz  near hash_assign_with_copy_sse2
 
-%assign x	0
+%assign x   0
 %rep 4
-    lea		r13,	[r3+r5*2+x]
-    mov		[r13],	r1
-    lea		r13,	[r4+r5*2+x]
-    mov		[r13],	r1
-    movd	r6d,	xmm0
-    add		r1,     r6
-    psrldq	xmm0,	4
-%assign	x	x+8
+    lea     r13,    [r3+r5*2+x]
+    mov     [r13],  r1
+    lea     r13,    [r4+r5*2+x]
+    mov     [r13],  r1
+    movd    r6d,    xmm0
+    add     r1,     r6
+    psrldq  xmm0,   4
+%assign x   x+8
 %endrep
     jmp near assign_next_sse2
 
 hash_assign_with_copy_sse2:
-    movq	xmm1,	r1
-    pshufd	xmm2,	xmm1,	01000100b
-    movdqa	[r3+r5*2], xmm2
-    movdqa	[r4+r5*2], xmm2
-    movdqa	[r3+r5*2+16], xmm2
-    movdqa	[r4+r5*2+16], xmm2
+    movq    xmm1,   r1
+    pshufd  xmm2,   xmm1,   01000100b
+    movdqa  [r3+r5*2], xmm2
+    movdqa  [r4+r5*2], xmm2
+    movdqa  [r3+r5*2+16], xmm2
+    movdqa  [r4+r5*2+16], xmm2
 
 assign_next_sse2:
-    add		r5,	16
-    dec		r2
-    jnz		near hash_assign_loop_x4_sse2
+    add     r5, 16
+    dec     r2
+    jnz     near hash_assign_loop_x4_sse2
 
-    and		r12,	3
-    jz		near hash_assign_no_rem_sse2
+    and     r12,    3
+    jz      near hash_assign_no_rem_sse2
 hash_assign_loop_x4_rem_sse2:
-    lea		r13,	[r3+r5*2]
-    mov		[r13],	r1
-    lea		r13,	[r4+r5*2]
-    mov		[r13],	r1
-    mov		r6d,	[r0+r5]
-    sal		r6,     2
-    add		r1,     r6
-    add		r5,     4
-    dec		r12
-    jnz		near hash_assign_loop_x4_rem_sse2
+    lea     r13,    [r3+r5*2]
+    mov     [r13],  r1
+    lea     r13,    [r4+r5*2]
+    mov     [r13],  r1
+    mov     r6d,    [r0+r5]
+    sal     r6,     2
+    add     r1,     r6
+    add     r5,     4
+    dec     r12
+    jnz     near hash_assign_loop_x4_rem_sse2
 
 hash_assign_no_rem_sse2:
     pop     r13
-    pop	    r12
+    pop     r12
     ret
 
 %endif
 
 ;**********************************************************************************************************************************
-;	int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;   int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
 ;**********************************************************************************************************************************
 WELS_EXTERN SumOf8x8SingleBlock_sse2
     %assign  push_num 0
@@ -1553,7 +1553,7 @@
     ret
 
 ;**********************************************************************************************************************************
-;	int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;   int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
 ;**********************************************************************************************************************************
 WELS_EXTERN SumOf16x16SingleBlock_sse2
     %assign  push_num 0