shithub: openh264

Download patch

ref: 186f4c0d29708590af145d8dfdff87b32b88a5b2
parent: 681b1da69899535a41b8aee4628811d4e529be8c
parent: f2314151e8623350363c0b1a5c4cfa1d7d460f6d
author: dongzha <dongzha@cisco.com>
date: Thu Aug 14 06:12:55 EDT 2014

Merge pull request #1267 from zhilwang/sumofblocks_scc

Add x86 32/64bit asm code for SumOfBlocks.

--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -244,6 +244,22 @@
 void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
                                const int32_t kiRefStride,
                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+
+#ifdef X86_ASM // prototypes for the x86 assembly block-sum routines
+extern "C"
+{
+int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
+int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
+void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf16x16BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf8x8BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); // sample_sc.asm header states: requires width % 8 == 0 && height > 1
+void SumOf16x16BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); // sample_sc.asm header states: requires width % 16 == 0 && height > 1; it uses aligned (movdqa) accesses on pFeatureOfBlock
+} // extern "C"
+#endif // X86_ASM
 #ifdef HAVE_NEON
 extern "C"
 {
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -102,22 +102,42 @@
     //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
     pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
     pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#if defined (X86_ASM)
+    if (uiCpuFlag & WELS_CPU_SSE2) {
+        //for feature search
+      pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
+      pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
+        //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+      pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_sse2;
+      pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_sse2;
+    }
+    if (uiCpuFlag & WELS_CPU_SSE41) {
+          //for feature search
+      pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse4;
+      pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse4;
+    }
+#endif
+
 #if defined (HAVE_NEON)
-    //for feature search
-    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
-    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
-    //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
-    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
-    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
+    if (uiCpuFlag & WELS_CPU_NEON) {
+      //for feature search
+      pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
+      pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
+      //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+      pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
+      pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
+    }
 #endif
 
 #if defined (HAVE_NEON_AARCH64)
-    //for feature search
-    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
-    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
-    //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
-    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
-    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
+    if (uiCpuFlag & WELS_CPU_NEON) {
+      //for feature search
+      pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
+      pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
+      //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+      pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
+      pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
+    }
 #endif
   }
 }
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -32,8 +32,1314 @@
 %include "asm_inc.asm"
 
 SECTION .text
+%ifdef X86_32
+;**********************************************************************************************************************
+;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+; For each of kiWidth x kiHeight positions: stores the 16-bit 8x8 pixel sum and increments pTimesOfFeatureValue[sum].
+WELS_EXTERN SumOf8x8BlockOfFrame_sse2
+%define		pushsize		16
+%define		localsize		4
+%define		ref				esp + pushsize + localsize + 4
+%define		sum_ref			esp + pushsize + localsize + 20
+%define		times_of_sum	esp + pushsize + localsize + 24
+%define		width			esp + pushsize + localsize + 8
+%define		height			esp + pushsize + localsize + 12
+%define		linesize		esp + pushsize + localsize + 16
+%define		tmp_width		esp + 0
+    push	ebx
+    push	ebp
+    push	esi
+    push	edi
+    sub		esp,	localsize	; one dword of scratch, used as tmp_width
 
+    pxor	xmm0,	xmm0		; xmm0 = 0, so psadbw against it gives plain byte sums
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		edx,	[times_of_sum]
+    mov		ebx,	[linesize]
+    mov		eax,	[width]
+    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+
+    mov		[tmp_width],	eax
+    lea		ebp,	[esi+ebx*4]	; ebp addresses rows 4..7 of the current block
+FIRST_ROW:				; first output row: full 8x8 sum at every x
+    movq	xmm1,	[esi]
+    movq	xmm2,	[esi+ebx]
+    movq	xmm3,	[esi+ebx*2]
+    movq	xmm4,	[esi+ecx]
+
+    shufps	xmm1,	xmm2,	01000100b	; pack two 8-byte rows into one register
+    shufps	xmm3,	xmm4,	01000100b
+    psadbw	xmm1,	xmm0
+    psadbw	xmm3,	xmm0
+    paddd	xmm1,	xmm3
+
+    movq	xmm2,	[ebp]
+    movq	xmm3,	[ebp+ebx]
+    movq	xmm4,	[ebp+ebx*2]
+    movq	xmm5,	[ebp+ecx]
+
+    shufps	xmm2,	xmm3,	01000100b
+    shufps	xmm4,	xmm5,	01000100b
+    psadbw	xmm2,	xmm0
+    psadbw	xmm4,	xmm0
+    paddd	xmm2,	xmm4
+
+    paddd	xmm1,	xmm2
+    pshufd	xmm2,	xmm1,	00001110b	; bring the high-qword partial sum down
+    paddd	xmm1,	xmm2
+    movd	eax,	xmm1			; eax = sum of the 8x8 block
+    mov		[edi],	ax
+    inc		dword [edx+eax*4]		; times_of_sum[sum]++
+
+    inc		esi
+    inc		ebp
+    add		edi,	2
+
+    dec		dword [tmp_width]
+    jg		FIRST_ROW
+
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		ebp,	[width]
+    dec		dword [height]			; height-1 rows remain; the stack argument slot serves as row counter
+HEIGHT_LOOP:
+    mov		[tmp_width],	ebp
+WIDTH_LOOP:
+    movq	xmm1,	[esi+ebx*8]		; row entering the block (8 lines below)
+    movq	xmm2,	[esi]			; row leaving the block
+    psadbw	xmm1,	xmm0
+    psadbw	xmm2,	xmm0
+    psubd	xmm1,	xmm2
+    movd	eax,	xmm1			; eax = entering row sum - leaving row sum (may be negative)
+    movzx	ecx,	word [edi]		; sum from the row above, zero-extended: ecx's upper half still held 3*linesize bits
+    add		eax,	ecx
+
+    mov		[edi+ebp*2],	ax		; store into the next output row (width entries ahead)
+    inc		dword [edx+eax*4]		; times_of_sum[sum]++
+
+    inc		esi
+    add		edi,	2
+
+    dec		dword [tmp_width]
+    jg		WIDTH_LOOP
+
+    add		esi,	ebx			; advance to the start of the next picture line
+    sub		esi,	ebp
+
+    dec		dword [height]
+    jg		HEIGHT_LOOP
+
+    add		esp,	localsize
+    pop		edi
+    pop		esi
+    pop		ebp
+    pop		ebx
+%undef		pushsize
+%undef		localsize
+%undef		ref
+%undef		sum_ref
+%undef		times_of_sum
+%undef		width
+%undef		height
+%undef		linesize
+%undef		tmp_width
+    ret
+
+
+%macro COUNT_SUM 3	; %1: xmm holding dword values, %2: 32-bit scratch register, %3: 1 = shift the next dword down afterwards
+%define xmm_reg %1
+%define tmp_reg %2
+    movd	tmp_reg,	xmm_reg		; low dword is the feature value
+    inc		dword [edx+tmp_reg*4]	; bump its counter; edx holds the counter table base
+%if %3 == 1
+    psrldq	xmm_reg,	4		; discard the consumed dword
+%endif
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; requires:  width % 8 == 0 && height > 1
+;-----------------------------------------------------------------------------
+;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+; Computes 8x8 block sums for 8 horizontal positions per iteration (mpsadbw against zero), stores them and counts each sum.
+; read extra (16 - (width % 8) ) mod 16 bytes of every line
+; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
+WELS_EXTERN SumOf8x8BlockOfFrame_sse4
+%define		pushsize		16
+%define		localsize		4
+%define		ref				esp + pushsize + localsize + 4
+%define		sum_ref			esp + pushsize + localsize + 20
+%define		times_of_sum	esp + pushsize + localsize + 24
+%define		width			esp + pushsize + localsize + 8
+%define		height			esp + pushsize + localsize + 12
+%define		linesize		esp + pushsize + localsize + 16
+%define		tmp_width		esp + 0
+    push	ebx
+    push	ebp
+    push	esi
+    push	edi
+    sub		esp,	localsize
+
+    pxor	xmm0,	xmm0		; zero reference for mpsadbw, so results are plain sums
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		edx,	[times_of_sum]
+    mov		ebx,	[linesize]
+    mov		eax,	[width]
+    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+
+    mov		[tmp_width],	eax
+    lea		ebp,	[esi+ebx*4]	; ebp addresses rows 4..7 of the block
+FIRST_ROW_SSE4:
+    movdqu	xmm1,	[esi]
+    movdqu	xmm3,	[esi+ebx]
+    movdqu	xmm5,	[esi+ebx*2]
+    movdqu	xmm7,	[esi+ecx]
+
+    movdqa	xmm2,	xmm1
+    mpsadbw	xmm1,	xmm0,	000b	; 4-byte sums starting at offsets 0..7
+    mpsadbw	xmm2,	xmm0,	100b	; 4-byte sums starting at offsets 4..11
+    paddw	xmm1,	xmm2			; 8 sums of line1
+
+    movdqa	xmm4,	xmm3
+    mpsadbw	xmm3,	xmm0,	000b
+    mpsadbw	xmm4,	xmm0,	100b
+    paddw	xmm3,	xmm4			; 8 sums of line2
+
+    movdqa	xmm2,	xmm5
+    mpsadbw	xmm5,	xmm0,	000b
+    mpsadbw	xmm2,	xmm0,	100b
+    paddw	xmm5,	xmm2			; 8 sums of line3
+
+    movdqa	xmm4,	xmm7
+    mpsadbw	xmm7,	xmm0,	000b
+    mpsadbw	xmm4,	xmm0,	100b
+    paddw	xmm7,	xmm4			; 8 sums of line4
+
+    paddw	xmm1,	xmm3
+    paddw	xmm5,	xmm7
+    paddw	xmm1,	xmm5			; sum the upper 4 lines first
+
+    movdqu	xmm2,	[ebp]
+    movdqu	xmm3,	[ebp+ebx]
+    movdqu	xmm4,	[ebp+ebx*2]
+    movdqu	xmm5,	[ebp+ecx]
+
+    movdqa	xmm6,	xmm2
+    mpsadbw	xmm2,	xmm0,	000b
+    mpsadbw	xmm6,	xmm0,	100b
+    paddw	xmm2,	xmm6			; 8 sums of line5
+
+    movdqa	xmm7,	xmm3
+    mpsadbw	xmm3,	xmm0,	000b
+    mpsadbw	xmm7,	xmm0,	100b
+    paddw	xmm3,	xmm7			; 8 sums of line6
+
+    movdqa	xmm6,	xmm4
+    mpsadbw	xmm4,	xmm0,	000b
+    mpsadbw	xmm6,	xmm0,	100b
+    paddw	xmm4,	xmm6			; 8 sums of line7
+
+    movdqa	xmm7,	xmm5
+    mpsadbw	xmm5,	xmm0,	000b
+    mpsadbw	xmm7,	xmm0,	100b
+    paddw	xmm5,	xmm7			; 8 sums of line8
+
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm1,	xmm2
+    paddw	xmm1,	xmm4			; sum of lines 1- 8
+
+    movdqu	[edi],	xmm1			; store 8 block sums at once
+
+    movdqa	xmm2,	xmm1
+    punpcklwd	xmm1,	xmm0		; widen sums 0..3 to dwords for table indexing
+    punpckhwd	xmm2,	xmm0		; widen sums 4..7
+
+    COUNT_SUM	xmm1,	eax,	1
+    COUNT_SUM	xmm1,	eax,	1
+    COUNT_SUM	xmm1,	eax,	1
+    COUNT_SUM	xmm1,	eax,	0
+    COUNT_SUM	xmm2,	eax,	1
+    COUNT_SUM	xmm2,	eax,	1
+    COUNT_SUM	xmm2,	eax,	1
+    COUNT_SUM	xmm2,	eax,	0
+
+    lea		esi,	[esi+8]
+    lea		ebp,	[ebp+8]
+    lea		edi,	[edi+16]		; element size is 2
+
+    sub		dword [tmp_width], 8	; 8 positions handled per iteration
+    jg		near FIRST_ROW_SSE4
+
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		ebp,	[width]
+    dec		dword [height]			; height-1 rows remain; the stack argument slot serves as row counter
+HEIGHT_LOOP_SSE4:
+    mov		ecx,	ebp			; ecx (3*linesize no longer needed) becomes the column counter
+WIDTH_LOOP_SSE4:
+    movdqu	xmm1,	[esi+ebx*8]		; row entering the blocks (8 lines below)
+    movdqu	xmm2,	[esi]			; row leaving the blocks
+    movdqu	xmm7,	[edi]			; 8 sums from the output row above
+
+    movdqa	xmm3,	xmm1
+    mpsadbw	xmm1,	xmm0,	000b
+    mpsadbw	xmm3,	xmm0,	100b
+    paddw	xmm1,	xmm3
+
+    movdqa	xmm4,	xmm2
+    mpsadbw	xmm2,	xmm0,	000b
+    mpsadbw	xmm4,	xmm0,	100b
+    paddw	xmm2,	xmm4
+
+    paddw	xmm7,	xmm1
+    psubw	xmm7,	xmm2			; previous sums + entering row - leaving row
+    movdqu	[edi+ebp*2], xmm7		; next output row is width entries ahead
+
+    movdqa	xmm6,	xmm7
+    punpcklwd	xmm7,	xmm0
+    punpckhwd	xmm6,	xmm0
+
+    COUNT_SUM	xmm7,	eax,	1
+    COUNT_SUM	xmm7,	eax,	1
+    COUNT_SUM	xmm7,	eax,	1
+    COUNT_SUM	xmm7,	eax,	0
+    COUNT_SUM	xmm6,	eax,	1
+    COUNT_SUM	xmm6,	eax,	1
+    COUNT_SUM	xmm6,	eax,	1
+    COUNT_SUM	xmm6,	eax,	0
+
+    lea		esi,	[esi+8]
+    lea		edi,	[edi+16]
+
+    sub		ecx,	8
+    jg		near WIDTH_LOOP_SSE4
+
+    lea		esi,	[esi+ebx]		; advance to the start of the next picture line
+    sub		esi,	ebp
+
+    dec		dword [height]
+    jg		near HEIGHT_LOOP_SSE4
+
+    add		esp,	localsize
+    pop		edi
+    pop		esi
+    pop		ebp
+    pop		ebx
+%undef		pushsize
+%undef		localsize
+%undef		ref
+%undef		sum_ref
+%undef		times_of_sum
+%undef		width
+%undef		height
+%undef		linesize
+%undef		tmp_width
+    ret
+
+
+;****************************************************************************************************************************************************
+;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+; For each of kiWidth x kiHeight positions: stores the 16-bit 16x16 pixel sum and increments pTimesOfFeatureValue[sum].
+WELS_EXTERN SumOf16x16BlockOfFrame_sse2
+%define		pushsize		16
+%define		localsize		4
+%define		ref				esp + pushsize + localsize + 4
+%define		sum_ref			esp + pushsize + localsize + 20
+%define		times_of_sum	esp + pushsize + localsize + 24
+%define		width			esp + pushsize + localsize + 8
+%define		height			esp + pushsize + localsize + 12
+%define		linesize		esp + pushsize + localsize + 16
+%define		tmp_width		esp
+    push	ebx
+    push	ebp
+    push	esi
+    push	edi
+    sub		esp,	localsize
+
+    pxor	xmm0,	xmm0		; xmm0 = 0, so psadbw against it gives plain byte sums
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		edx,	[times_of_sum]
+    mov		ebx,	[linesize]
+    mov		eax,	[width]
+
+    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+    mov		[tmp_width],	eax
+FIRST_ROW_X16H:				; first output row: full 16x16 sum at every x
+    movdqu	xmm1,	[esi]
+    movdqu	xmm2,	[esi+ebx]
+    movdqu	xmm3,	[esi+ebx*2]
+    movdqu	xmm4,	[esi+ecx]
+
+    psadbw  xmm1,	xmm0
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    paddw	xmm1,	xmm2
+    paddw	xmm3,	xmm4
+    paddw	xmm1,	xmm3		; rows 0..3, two partial sums (words 0 and 4)
+
+    lea		ebp,	[esi+ebx*4]	; rows 4..7
+    movdqu	xmm2,	[ebp]
+    movdqu	xmm3,	[ebp+ebx]
+    movdqu	xmm4,	[ebp+ebx*2]
+    movdqu	xmm5,	[ebp+ecx]
+
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    psadbw  xmm5,	xmm0
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm2,	xmm4
+
+    paddw	xmm1,	xmm2
+
+    lea		ebp,	[ebp+ebx*4]	; rows 8..11
+    movdqu	xmm2,	[ebp]
+    movdqu	xmm3,	[ebp+ebx]
+    movdqu	xmm4,	[ebp+ebx*2]
+    movdqu	xmm5,	[ebp+ecx]
+
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    psadbw  xmm5,	xmm0
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm2,	xmm4
+
+    paddw	xmm1,	xmm2
+
+    lea		ebp,	[ebp+ebx*4]	; rows 12..15
+    movdqu	xmm2,	[ebp]
+    movdqu	xmm3,	[ebp+ebx]
+    movdqu	xmm4,	[ebp+ebx*2]
+    movdqu	xmm5,	[ebp+ecx]
+
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    psadbw  xmm5,	xmm0
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm2,	xmm4
+
+    paddw	xmm1,	xmm2
+    movdqa	xmm2,	xmm1
+    punpckhwd xmm2, xmm0		; move the high-half partial sum (word 4) down to word 0
+    paddw xmm1, xmm2			; word 0 = total of the 16x16 block
+    movd	eax,	xmm1
+    mov		[edi],	ax
+    inc		dword [edx+eax*4]	; times_of_sum[sum]++
+
+    inc		esi
+    lea		edi,	[edi+2]
+
+    dec		dword [tmp_width]
+    jg		near FIRST_ROW_X16H
+
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		ebp,	[width]
+    dec		dword [height]		; height-1 rows remain; the stack argument slot serves as row counter
+
+    mov		ecx,	ebx
+    sal		ecx,	4		; ecx = 16*linesize: offset of the line 16 rows below
+HEIGHT_LOOP_X16:
+    mov		[tmp_width],	ebp
+WIDTH_LOOP_X16:
+    movdqu	xmm1,	[esi+ecx]	; row entering the block
+    movdqu	xmm2,	[esi]		; row leaving the block
+    psadbw	xmm1,	xmm0
+    psadbw	xmm2,	xmm0
+    psubw	xmm1,	xmm2		; 16-bit wrapping differences in words 0 and 4
+    movdqa	xmm2,	xmm1
+    punpckhwd xmm2, xmm0
+    paddw	xmm1,	xmm2		; word 0 = entering row sum - leaving row sum (mod 2^16)
+    movd	eax,	xmm1
+    add		ax,	word [edi]	; 16-bit add of the sum from the row above; upper half of eax stays 0
+    mov		[edi+ebp*2],	ax	; next output row is width entries ahead
+    inc		dword [edx+eax*4]
+
+    inc		esi
+    add		edi,	2
+
+    dec		dword [tmp_width]
+    jg		near WIDTH_LOOP_X16
+
+    add		esi,	ebx		; advance to the start of the next picture line
+    sub		esi,	ebp
+
+    dec		dword [height]
+    jg		near HEIGHT_LOOP_X16
+
+    add		esp,	localsize
+    pop		edi
+    pop		esi
+    pop		ebp
+    pop		ebx
+%undef		pushsize
+%undef		localsize
+%undef		ref
+%undef		sum_ref
+%undef		times_of_sum
+%undef		width
+%undef		height
+%undef		linesize
+%undef		tmp_width
+    ret
+
+; requires:  width % 16 == 0 && height > 1
+;-----------------------------------------------------------------------------------------------------------------------------
+;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;-----------------------------------------------------------------------------------------------------------------------------
+; SUM_LINE_X16_SSE41: sums of 16 consecutive bytes of one line at 8 consecutive offsets; callers zero xmm0 first
+%macro   SUM_LINE_X16_SSE41  5	; ref, dst0, dst1, tmp0, tmp1 (result in dst0; dst1, tmp0, tmp1 are clobbered)
+    movdqu	%2,	[%1]		; bytes 0..15
+    movdqu	%3,	[%1+8h]		; bytes 8..23
+    movdqa	%4,	%2
+    movdqa	%5,	%3
+
+    mpsadbw	%2,	xmm0,	0	; 000 B: 4-byte sums starting at bytes 0..7
+    mpsadbw	%4,	xmm0,	5	; 101 B: 4-byte sums starting at bytes 4..11
+    mpsadbw	%3,	xmm0,	2	; 010 B: 4-byte sums starting at bytes 8..15
+    mpsadbw	%5,	xmm0,	7	; 111 B: 4-byte sums starting at bytes 12..19
+    paddw	%2,	%4
+    paddw	%3, %5
+    paddw	%2,	%3	; dst0 word i = sum of bytes i..i+15
+%endmacro	; end of SUM_LINE_X16_SSE41
+
+WELS_EXTERN SumOf16x16BlockOfFrame_sse4
+%define		pushsize		16
+%define		localsize		4
+%define		ref				esp + pushsize + localsize + 4
+%define		sum_ref			esp + pushsize + localsize + 20
+%define		times_of_sum	esp + pushsize + localsize + 24
+%define		width			esp + pushsize + localsize + 8
+%define		height			esp + pushsize + localsize + 12
+%define		linesize		esp + pushsize + localsize + 16
+%define		tmp_width		esp
+    push	ebx
+    push	ebp
+    push	esi
+    push	edi
+    sub		esp,	localsize
+
+    pxor	xmm0,	xmm0		; zero reference required by SUM_LINE_X16_SSE41
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		edx,	[times_of_sum]
+    mov		ebx,	[linesize]
+    mov		eax,	[width]
+
+    lea		ecx,	[ebx+ebx*2]	; 3*linesize
+    mov		[tmp_width],	eax
+FIRST_ROW_X16_SSE4:			; first output row: full 16x16 sums for 8 positions per iteration
+    SUM_LINE_X16_SSE41	esi,		xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41	esi+ebx,	xmm2, xmm3, xmm4, xmm5
+    SUM_LINE_X16_SSE41	esi+ebx*2,	xmm3, xmm4, xmm5, xmm6
+    SUM_LINE_X16_SSE41	esi+ecx,	xmm4, xmm5, xmm6, xmm7
+    paddw	xmm1, xmm2
+    paddw	xmm3, xmm4
+    paddw	xmm1, xmm3		; xmm1 = rows 0..3
+
+    lea		ebp,	[esi+ebx*4]	; rows 4..7
+    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+
+    lea		ebp,	[ebp+ebx*4]	; rows 8..11
+    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+
+    lea		ebp,	[ebp+ebx*4]	; rows 12..15
+    SUM_LINE_X16_SSE41	ebp,		xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ebx,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ebx*2,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	ebp+ecx,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+
+    movdqa	[edi],	xmm1		; aligned store: sum_ref must be 16-byte aligned
+    movdqa	xmm2,	xmm1
+    punpcklwd	xmm1,	xmm0	; widen sums 0..3 to dwords for table indexing
+    punpckhwd	xmm2,	xmm0	; widen sums 4..7
+
+    COUNT_SUM	xmm1,	eax,	1
+    COUNT_SUM	xmm1,	eax,	1
+    COUNT_SUM	xmm1,	eax,	1
+    COUNT_SUM	xmm1,	eax,	0
+    COUNT_SUM	xmm2,	eax,	1
+    COUNT_SUM	xmm2,	eax,	1
+    COUNT_SUM	xmm2,	eax,	1
+    COUNT_SUM	xmm2,	eax,	0
+
+    lea		esi,	[esi+8]
+    lea		edi,	[edi+16]	; element size is 2
+
+    sub		dword [tmp_width], 8	; 8 positions handled per iteration
+    jg		near FIRST_ROW_X16_SSE4
+
+    mov		esi,	[ref]
+    mov		edi,	[sum_ref]
+    mov		ebp,	[width]
+    dec		dword [height]		; height-1 rows remain; the stack argument slot serves as row counter
+
+    mov		ecx,	ebx
+    sal		ecx,	4		; ecx = 16*linesize: offset of the line 16 rows below
+
+HEIGHT_LOOP_X16_SSE4:
+    mov		[tmp_width],	ebp
+WIDTH_LOOP_X16_SSE4:
+    movdqa	xmm7,	[edi]		; 8 sums from the output row above (aligned load)
+    SUM_LINE_X16_SSE41	esi+ecx, xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41	esi, xmm2, xmm3, xmm4, xmm5
+
+    paddw	xmm7,	xmm1		; add the row entering the blocks
+    psubw	xmm7,	xmm2		; subtract the row leaving the blocks
+    movdqa	[edi+ebp*2], xmm7	; aligned store: also needs width*2 to be a multiple of 16
+
+    movdqa	xmm6,	xmm7
+    punpcklwd	xmm7,	xmm0
+    punpckhwd	xmm6,	xmm0
+
+    COUNT_SUM	xmm7,	eax,	1
+    COUNT_SUM	xmm7,	eax,	1
+    COUNT_SUM	xmm7,	eax,	1
+    COUNT_SUM	xmm7,	eax,	0
+    COUNT_SUM	xmm6,	eax,	1
+    COUNT_SUM	xmm6,	eax,	1
+    COUNT_SUM	xmm6,	eax,	1
+    COUNT_SUM	xmm6,	eax,	0
+
+    lea		esi,	[esi+8]
+    lea		edi,	[edi+16]
+
+    sub		dword [tmp_width], 8
+    jg		near WIDTH_LOOP_X16_SSE4
+
+    add		esi,	ebx		; advance to the start of the next picture line
+    sub		esi,	ebp
+
+    dec		dword [height]
+    jg		near HEIGHT_LOOP_X16_SSE4
+
+    add		esp,	localsize
+    pop		edi
+    pop		esi
+    pop		ebp
+    pop		ebx
+%undef		pushsize
+%undef		localsize
+%undef		ref
+%undef		sum_ref
+%undef		times_of_sum
+%undef		width
+%undef		height
+%undef		linesize
+%undef		tmp_width
+    ret
+
+%else
+
+;**********************************************************************************************************************
+;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+; For each of kiWidth x kiHeight positions: stores the 16-bit 8x8 pixel sum and increments pTimesOfFeatureValue[sum].
+WELS_EXTERN SumOf8x8BlockOfFrame_sse2
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 6
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    push r12
+    push r13
+    push r0                         ; r0/r2/r4 are modified by the first-row pass and restored after it
+    push r2
+    push r4
+
+    pxor	xmm0,	xmm0		; xmm0 = 0, so psadbw against it gives plain byte sums
+    lea     r6, [r3+r3*2]           ; 3*stride
+
+    mov		r12,	r1              ;r12:tmp_width
+    lea		r13,	[r0+r3*4]       ; r13 addresses rows 4..7 (the role ebp has in the 32-bit version)
+FIRST_ROW:
+    movq	xmm1,	[r0]
+    movq	xmm2,	[r0+r3]
+    movq	xmm3,	[r0+r3*2]
+    movq	xmm4,	[r0+r6]
+
+    shufps	xmm1,	xmm2,	01000100b	; pack two 8-byte rows into one register
+    shufps	xmm3,	xmm4,	01000100b
+    psadbw	xmm1,	xmm0
+    psadbw	xmm3,	xmm0
+    paddd	xmm1,	xmm3
+
+    movq	xmm2,	[r13]
+    movq	xmm3,	[r13+r3]
+    movq	xmm4,	[r13+r3*2]
+    movq	xmm5,	[r13+r6]
+
+    shufps	xmm2,	xmm3,	01000100b
+    shufps	xmm4,	xmm5,	01000100b
+    psadbw	xmm2,	xmm0
+    psadbw	xmm4,	xmm0
+    paddd	xmm2,	xmm4
+
+    paddd	xmm1,	xmm2
+    pshufd	xmm2,	xmm1,	00001110b	; bring the high-qword partial sum down
+    paddd	xmm1,	xmm2
+    movd	r2d,	xmm1			; r2 = sum of the 8x8 block (height argument is saved on the stack)
+    mov		[r4],	r2w
+    inc		dword [r5+r2*4]			; pTimesOfFeatureValue[sum]++
+
+    inc		r0
+    inc		r13
+    add		r4,	2
+
+    dec		r12
+    jg		FIRST_ROW
+
+    pop r4
+    pop r2
+    pop r0
+    mov r13, r2
+    dec r13                         ; r13 = height-1 remaining rows
+HEIGHT_LOOP:
+    mov		r12,	r1
+WIDTH_LOOP:
+    movq	xmm1,	[r0+r3*8]		; row entering the block (8 lines below)
+    movq	xmm2,	[r0]			; row leaving the block
+    psadbw	xmm1,	xmm0
+    psadbw	xmm2,	xmm0
+    psubd	xmm1,	xmm2
+    movd	r2d,	xmm1			; entering row sum - leaving row sum (may be negative)
+    movzx	r6d,	word [r4]		; sum from the row above, zero-extended: r6 previously held 3*stride
+    add		r2d,	r6d
+    mov		[r4+r1*2],	r2w		; next output row is width entries ahead
+    inc		dword [r5+r2*4]
+
+    inc		r0
+    add		r4,	2
+
+    dec		r12
+    jg		WIDTH_LOOP
+
+    add		r0,	r3			; advance to the start of the next picture line
+    sub		r0,	r1
+
+
+    dec		r13
+    jg		HEIGHT_LOOP
+
+    pop		r13
+    pop		r12
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+
+%macro COUNT_SUM 4	; %1: xmm holding dword values, %2/%3: 32-bit and 64-bit names of one scratch register, %4: 1 = shift the next dword down afterwards
+%define xmm_reg %1
+%define tmp_dreg %2
+%define tmp_qreg %3
+    movd	tmp_dreg,	xmm_reg		; low dword is the feature value
+    inc		dword [r5+tmp_qreg*4]	; bump its counter; r5 holds the counter table base
+%if %4 == 1
+    psrldq	xmm_reg,	4		; discard the consumed dword
+%endif
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; requires:  width % 8 == 0 && height > 1
+;-----------------------------------------------------------------------------
+;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+; Computes 8x8 block sums for 8 horizontal positions per iteration (mpsadbw against zero), stores them and counts each sum.
+; read extra (16 - (width % 8) ) mod 16 bytes of every line
+; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
+WELS_EXTERN SumOf8x8BlockOfFrame_sse4
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    push r12
+    push r13
+    push r0                         ; r0/r2/r4 are modified by the first-row pass and restored after it
+    push r2
+    push r4
+
+    pxor	xmm0,	xmm0		; zero reference for mpsadbw, so results are plain sums
+    lea     r6, [r3+r3*2]           ; 3*stride
+
+    mov		r12,	r1              ;r12:tmp_width
+    lea		r13,	[r0+r3*4]       ; r13 addresses rows 4..7 (the role ebp has in the 32-bit version)
+FIRST_ROW_SSE4:
+    movdqu	xmm1,	[r0]
+    movdqu	xmm3,	[r0+r3]
+    movdqu	xmm5,	[r0+r3*2]
+    movdqu	xmm7,	[r0+r6]
+
+    movdqa	xmm2,	xmm1
+    mpsadbw	xmm1,	xmm0,	000b	; 4-byte sums starting at offsets 0..7
+    mpsadbw	xmm2,	xmm0,	100b	; 4-byte sums starting at offsets 4..11
+    paddw	xmm1,	xmm2			; 8 sums of line1
+
+    movdqa	xmm4,	xmm3
+    mpsadbw	xmm3,	xmm0,	000b
+    mpsadbw	xmm4,	xmm0,	100b
+    paddw	xmm3,	xmm4			; 8 sums of line2
+
+    movdqa	xmm2,	xmm5
+    mpsadbw	xmm5,	xmm0,	000b
+    mpsadbw	xmm2,	xmm0,	100b
+    paddw	xmm5,	xmm2			; 8 sums of line3
+
+    movdqa	xmm4,	xmm7
+    mpsadbw	xmm7,	xmm0,	000b
+    mpsadbw	xmm4,	xmm0,	100b
+    paddw	xmm7,	xmm4			; 8 sums of line4
+
+    paddw	xmm1,	xmm3
+    paddw	xmm5,	xmm7
+    paddw	xmm1,	xmm5			; sum the upper 4 lines first
+
+    movdqu	xmm2,	[r13]
+    movdqu	xmm3,	[r13+r3]
+    movdqu	xmm4,	[r13+r3*2]
+    movdqu	xmm5,	[r13+r6]
+
+    movdqa	xmm6,	xmm2
+    mpsadbw	xmm2,	xmm0,	000b
+    mpsadbw	xmm6,	xmm0,	100b
+    paddw	xmm2,	xmm6			; 8 sums of line5
+
+    movdqa	xmm7,	xmm3
+    mpsadbw	xmm3,	xmm0,	000b
+    mpsadbw	xmm7,	xmm0,	100b
+    paddw	xmm3,	xmm7			; 8 sums of line6
+
+    movdqa	xmm6,	xmm4
+    mpsadbw	xmm4,	xmm0,	000b
+    mpsadbw	xmm6,	xmm0,	100b
+    paddw	xmm4,	xmm6			; 8 sums of line7
+
+    movdqa	xmm7,	xmm5
+    mpsadbw	xmm5,	xmm0,	000b
+    mpsadbw	xmm7,	xmm0,	100b
+    paddw	xmm5,	xmm7			; 8 sums of line8
+
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm1,	xmm2
+    paddw	xmm1,	xmm4			; sum of lines 1- 8
+
+    movdqu	[r4],	xmm1			; store 8 block sums at once
+
+    movdqa	xmm2,	xmm1
+    punpcklwd	xmm1,	xmm0		; widen sums 0..3 to dwords for table indexing
+    punpckhwd	xmm2,	xmm0		; widen sums 4..7
+
+    COUNT_SUM	xmm1,	r2d, r2, 1
+    COUNT_SUM	xmm1,	r2d, r2, 1
+    COUNT_SUM	xmm1,	r2d, r2, 1
+    COUNT_SUM	xmm1,	r2d, r2, 0
+    COUNT_SUM	xmm2,	r2d, r2 ,1
+    COUNT_SUM	xmm2,	r2d, r2 ,1
+    COUNT_SUM	xmm2,	r2d, r2 ,1
+    COUNT_SUM	xmm2,	r2d, r2 ,0
+
+    lea		r0,     [r0+8]
+    lea		r13,	[r13+8]
+    lea		r4,     [r4+16]		; element size is 2
+
+    sub		r12, 8			; 8 positions handled per iteration
+    jg		near FIRST_ROW_SSE4
+
+    pop r4
+    pop r2
+    pop r0
+    mov r13, r2
+    dec r13                         ; r13 = height-1 remaining rows
+HEIGHT_LOOP_SSE4:
+    mov		r12,	r1
+WIDTH_LOOP_SSE4:
+    movdqu	xmm1,	[r0+r3*8]		; row entering the blocks (8 lines below)
+    movdqu	xmm2,	[r0]			; row leaving the blocks
+    movdqu	xmm7,	[r4]			; 8 sums from the output row above
+
+    movdqa	xmm3,	xmm1
+    mpsadbw	xmm1,	xmm0,	000b
+    mpsadbw	xmm3,	xmm0,	100b
+    paddw	xmm1,	xmm3
+
+    movdqa	xmm4,	xmm2
+    mpsadbw	xmm2,	xmm0,	000b
+    mpsadbw	xmm4,	xmm0,	100b
+    paddw	xmm2,	xmm4
+
+    paddw	xmm7,	xmm1
+    psubw	xmm7,	xmm2			; previous sums + entering row - leaving row
+    movdqu	[r4+r1*2], xmm7		; next output row is width entries ahead
+
+    movdqa	xmm6,	xmm7
+    punpcklwd	xmm7,	xmm0
+    punpckhwd	xmm6,	xmm0
+
+    COUNT_SUM	xmm7,	r2d, r2, 1
+    COUNT_SUM	xmm7,	r2d, r2, 1
+    COUNT_SUM	xmm7,	r2d, r2, 1
+    COUNT_SUM	xmm7,	r2d, r2, 0
+    COUNT_SUM	xmm6,	r2d, r2, 1
+    COUNT_SUM	xmm6,	r2d, r2, 1
+    COUNT_SUM	xmm6,	r2d, r2, 1
+    COUNT_SUM	xmm6,	r2d, r2, 0
+
+    lea		r0,	[r0+8]
+    lea		r4,	[r4+16]
+
+    sub		r12,	8
+    jg		near WIDTH_LOOP_SSE4
+
+    lea		r0,	[r0+r3]		; advance to the start of the next picture line
+    sub		r0,	r1
+
+    dec		r13
+    jg		near HEIGHT_LOOP_SSE4
+
+    pop		r13
+    pop		r12
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+
+;****************************************************************************************************************************************************
+;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+; For each of kiWidth x kiHeight positions: stores the 16-bit 16x16 pixel sum and increments pTimesOfFeatureValue[sum].
+WELS_EXTERN SumOf16x16BlockOfFrame_sse2
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 6
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    push r12
+    push r13
+    push r0                         ; r0/r2/r4 are modified by the first-row pass and restored after it
+    push r2
+    push r4
+
+    pxor	xmm0,	xmm0		; xmm0 = 0, so psadbw against it gives plain byte sums
+    lea     r6, [r3+r3*2]           ; 3*stride
+
+    mov		r12,	r1              ;r12:tmp_width
+FIRST_ROW_X16H:
+    movdqu	xmm1,	[r0]
+    movdqu	xmm2,	[r0+r3]
+    movdqu	xmm3,	[r0+r3*2]
+    movdqu	xmm4,	[r0+r6]
+
+    psadbw  xmm1,	xmm0
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    paddw	xmm1,	xmm2
+    paddw	xmm3,	xmm4
+    paddw	xmm1,	xmm3		; rows 0..3, two partial sums (words 0 and 4)
+
+    lea		r13,	[r0+r3*4]       ; rows 4..7 (r13 has the role ebp has in the 32-bit version)
+    movdqu	xmm2,	[r13]
+    movdqu	xmm3,	[r13+r3]
+    movdqu	xmm4,	[r13+r3*2]
+    movdqu	xmm5,	[r13+r6]
+
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    psadbw  xmm5,	xmm0
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm2,	xmm4
+
+    paddw	xmm1,	xmm2
+
+    lea		r13,	[r13+r3*4]	; rows 8..11
+    movdqu	xmm2,	[r13]
+    movdqu	xmm3,	[r13+r3]
+    movdqu	xmm4,	[r13+r3*2]
+    movdqu	xmm5,	[r13+r6]
+
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    psadbw  xmm5,	xmm0
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm2,	xmm4
+
+    paddw	xmm1,	xmm2
+
+    lea		r13,	[r13+r3*4]	; rows 12..15
+    movdqu	xmm2,	[r13]
+    movdqu	xmm3,	[r13+r3]
+    movdqu	xmm4,	[r13+r3*2]
+    movdqu	xmm5,	[r13+r6]
+
+    psadbw  xmm2,	xmm0
+    psadbw  xmm3,	xmm0
+    psadbw  xmm4,	xmm0
+    psadbw  xmm5,	xmm0
+    paddw	xmm2,	xmm3
+    paddw	xmm4,	xmm5
+    paddw	xmm2,	xmm4
+
+    paddw	xmm1,	xmm2
+    movdqa	xmm2,	xmm1
+    punpckhwd xmm2, xmm0		; move the high-half partial sum (word 4) down to word 0
+    paddw xmm1, xmm2			; word 0 = total of the 16x16 block
+    movd	r2d,	xmm1
+    mov		[r4],	r2w
+    inc		dword [r5+r2*4]		; pTimesOfFeatureValue[sum]++
+
+    inc		r0
+    lea		r4,	[r4+2]
+
+    dec		r12
+    jg		near FIRST_ROW_X16H
+
+    pop r4
+    pop r2
+    pop r0
+    mov r13, r2
+    dec r13                         ; r13 = height-1 remaining rows
+    mov		r6,	r3
+    sal		r6,	4		; r6 = 16*stride: offset of the line 16 rows below
+HEIGHT_LOOP_X16:
+    mov		r12,	r1
+WIDTH_LOOP_X16:
+    movdqu	xmm1,	[r0+r6]		; row entering the block
+    movdqu	xmm2,	[r0]		; row leaving the block
+    psadbw	xmm1,	xmm0
+    psadbw	xmm2,	xmm0
+    psubw	xmm1,	xmm2		; 16-bit wrapping differences in words 0 and 4
+    movdqa	xmm2,	xmm1
+    punpckhwd xmm2, xmm0
+    paddw	xmm1,	xmm2		; word 0 = entering row sum - leaving row sum (mod 2^16)
+    movd	r2d,	xmm1
+    add		r2w,	word [r4]	; 16-bit add of the sum from the row above; upper bits of r2 stay 0
+    mov		[r4+r1*2],	r2w	; next output row is width entries ahead
+    inc		dword [r5+r2*4]
+
+    inc		r0
+    add		r4,	2
+
+    dec		r12
+    jg		near WIDTH_LOOP_X16
+
+    add		r0,	r3		; advance to the start of the next picture line
+    sub		r0,	r1
+
+    dec		r13
+    jg		near HEIGHT_LOOP_X16
+
+    pop		r13
+    pop		r12
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+; requires:  width % 16 == 0 && height > 1
+;-----------------------------------------------------------------------------------------------------------------------------
+;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;-----------------------------------------------------------------------------------------------------------------------------
+; SUM_LINE_X16_SSE41: sums of 16 consecutive bytes of one line at 8 consecutive offsets; callers zero xmm0 first
+%macro   SUM_LINE_X16_SSE41  5	; ref, dst0, dst1, tmp0, tmp1 (result in dst0; dst1, tmp0, tmp1 are clobbered)
+    movdqu	%2,	[%1]		; bytes 0..15
+    movdqu	%3,	[%1+8h]		; bytes 8..23
+    movdqa	%4,	%2
+    movdqa	%5,	%3
+
+    mpsadbw	%2,	xmm0,	0	; 000 B: 4-byte sums starting at bytes 0..7
+    mpsadbw	%4,	xmm0,	5	; 101 B: 4-byte sums starting at bytes 4..11
+    mpsadbw	%3,	xmm0,	2	; 010 B: 4-byte sums starting at bytes 8..15
+    mpsadbw	%5,	xmm0,	7	; 111 B: 4-byte sums starting at bytes 12..19
+    paddw	%2,	%4
+    paddw	%3, %5
+    paddw	%2,	%3	; dst0 word i = sum of bytes i..i+15
+%endmacro	; end of SUM_LINE_X16_SSE41
+
+WELS_EXTERN SumOf16x16BlockOfFrame_sse4
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    push r12
+    push r13
+    push r0		; r0, r2 and r4 are modified by the first-row pass and popped back right after it
+    push r2
+    push r4
+; presumably r0..r5 carry the six prototype arguments in order (LOAD_6_PARA is defined outside this chunk -- confirm)
+    pxor	xmm0,	xmm0		; zero operand for mpsadbw and for the word->dword unpacks
+    lea     r6, [r3+r3*2]		; r6 = 3 * r3, used to address every fourth line
+
+    mov		r12,	r1              ;r12:tmp_width
+FIRST_ROW_X16_SSE4:
+    SUM_LINE_X16_SSE41	r0,		xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41	r0+r3,	xmm2, xmm3, xmm4, xmm5
+    SUM_LINE_X16_SSE41	r0+r3*2,xmm3, xmm4, xmm5, xmm6
+    SUM_LINE_X16_SSE41	r0+r6,	xmm4, xmm5, xmm6, xmm7
+    paddw	xmm1, xmm2
+    paddw	xmm3, xmm4
+    paddw	xmm1, xmm3		; xmm1 = line sums of lines 0..3
+
+    lea		r13,	[r0+r3*4]	; r13 -> line 4
+    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+
+    lea		r13,	[r13+r3*4]	; r13 -> line 8
+    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+
+    lea		r13,	[r13+r3*4]	; r13 -> line 12
+    SUM_LINE_X16_SSE41	r13,		xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r3,     xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r3*2,	xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+    SUM_LINE_X16_SSE41	r13+r6,     xmm2, xmm3, xmm4, xmm5
+    paddw	xmm1, xmm2
+; xmm1 now holds the 16x16 sums for the 8 horizontal positions starting at r0
+    movdqa	[r4],	xmm1		; aligned store of 8 word sums: r4 must be 16-byte aligned
+    movdqa	xmm2,	xmm1
+    punpcklwd	xmm1,	xmm0		; sums 0..3 zero-extended to dwords
+    punpckhwd	xmm2,	xmm0		; sums 4..7 zero-extended to dwords
+; COUNT_SUM is defined outside this chunk; presumably it tallies one dword lane per call (last argument 1 = then move on to the next lane) -- confirm against the macro
+    COUNT_SUM	xmm1,	r2d, r2, 1
+    COUNT_SUM	xmm1,	r2d, r2, 1
+    COUNT_SUM	xmm1,	r2d, r2, 1
+    COUNT_SUM	xmm1,	r2d, r2, 0
+    COUNT_SUM	xmm2,	r2d, r2, 1
+    COUNT_SUM	xmm2,	r2d, r2, 1
+    COUNT_SUM	xmm2,	r2d, r2, 1
+    COUNT_SUM	xmm2,	r2d, r2, 0
+
+    lea		r0,	[r0+8]		; next 8 horizontal positions
+    lea		r4,	[r4+16]	; element size is 2
+
+    sub		r12, 8
+    jg		near FIRST_ROW_X16_SSE4
+; first output row done: restore the original r4, r2 and r0
+    pop r4
+    pop r2
+    pop r0
+    mov r13, r2
+    dec r13			; r13 = r2 - 1 output rows still to produce
+    mov		r6,	r3
+    sal		r6,	4		; r6 = 16 * r3: offset from a window's top line to the line just below the window
+
+HEIGHT_LOOP_X16_SSE4:
+    mov		r12,	r1
+WIDTH_LOOP_X16_SSE4:
+    movdqa	xmm7,	[r4]		; 8 sums from the output row produced just before this one
+    SUM_LINE_X16_SSE41	r0+r6, xmm1, xmm2, xmm3, xmm4
+    SUM_LINE_X16_SSE41	r0, xmm2, xmm3, xmm4, xmm5
+; slide the 16-line window down by one: add the entering line (r0 + 16*r3), subtract the leaving line (r0)
+    paddw	xmm7,	xmm1
+    psubw	xmm7,	xmm2
+    movdqa	[r4+r1*2], xmm7		; store into the next output row, r1 words (r1*2 bytes) further on; aligned store
+
+    movdqa	xmm6,	xmm7
+    punpcklwd	xmm7,	xmm0		; sums 0..3 zero-extended to dwords
+    punpckhwd	xmm6,	xmm0		; sums 4..7 zero-extended to dwords
+
+    COUNT_SUM	xmm7,	r2d, r2, 1
+    COUNT_SUM	xmm7,	r2d, r2, 1
+    COUNT_SUM	xmm7,	r2d, r2, 1
+    COUNT_SUM	xmm7,	r2d, r2, 0
+    COUNT_SUM	xmm6,	r2d, r2, 1
+    COUNT_SUM	xmm6,	r2d, r2, 1
+    COUNT_SUM	xmm6,	r2d, r2, 1
+    COUNT_SUM	xmm6,	r2d, r2, 0
+
+    lea		r0,	[r0+8]
+    lea		r4,	[r4+16]
+
+    sub		r12, 8
+    jg		near WIDTH_LOOP_X16_SSE4
+; r0 advanced by r1 bytes across the row; step it to the start of the next picture line
+    add		r0,	r3
+    sub		r0,	r1
+
+    dec		r13
+    jg		near HEIGHT_LOOP_X16_SSE4
+
+    pop		r13
+    pop		r12
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+%endif
+
 ;**********************************************************************************************************************************
+;	int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;**********************************************************************************************************************************
+WELS_EXTERN SumOf8x8SingleBlock_sse2
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION  r1, r1d
+; sums the 64 bytes of the 8x8 block at r0 with line pitch r1 (presumably ref0 / linesize via LOAD_2_PARA); two 8-byte lines are packed per xmm register
+    pxor xmm0, xmm0
+    movq xmm1, [r0]           ; line 0 -> low qword
+    movhps xmm1, [r0+r1]      ; line 1 -> high qword
+    lea r0, [r0+2*r1]
+    movq xmm2, [r0]           ; lines 2 / 3
+    movhps xmm2, [r0+r1]
+    lea r0, [r0+2*r1]
+    movq xmm3, [r0]           ; lines 4 / 5
+    movhps xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movq xmm4, [r0]           ; lines 6 / 7
+    movhps xmm4, [r0+r1]
+; psadbw against zero leaves one byte-sum per qword, i.e. one sum per line
+    psadbw xmm1, xmm0
+    psadbw xmm2, xmm0
+    psadbw xmm3, xmm0
+    psadbw xmm4, xmm0
+    paddw xmm1, xmm2
+    paddw xmm3, xmm4
+    paddw xmm1, xmm3          ; low qword = even lines, high qword = odd lines
+
+    movdqa xmm2, xmm1
+    punpckhwd xmm2, xmm0      ; move the high-qword sum down to word 0
+    paddw xmm1, xmm2          ; word 0 = total of all 64 bytes (at most 64*255, fits in 16 bits)
+
+    movd retrd, xmm1          ; low dword of xmm1 (the total, upper word is zero) -> retrd
+    ret
+
+;**********************************************************************************************************************************
+;	int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;**********************************************************************************************************************************
+WELS_EXTERN SumOf16x16SingleBlock_sse2
+    %assign  push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 6
+    SIGN_EXTENSION  r1, r1d
+; sums the 256 bytes of the 16x16 block at r0 with line pitch r1 (presumably ref0 / linesize via LOAD_2_PARA), four lines per group
+    pxor xmm0, xmm0
+    movdqa xmm1, [r0]         ; movdqa is an aligned load: r0 and r1 must both be multiples of 16
+    movdqa xmm2, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm3, [r0]
+    movdqa xmm4, [r0+r1]
+    psadbw xmm1, xmm0         ; against zero: per-qword byte sums (left / right 8 bytes of the line)
+    psadbw xmm2, xmm0
+    psadbw xmm3, xmm0
+    psadbw xmm4, xmm0
+    paddw xmm1, xmm2
+    paddw xmm3, xmm4
+    paddw xmm1, xmm3          ; xmm1 = lines 0..3
+
+    lea r0, [r0+2*r1]
+    movdqa xmm2, [r0]
+    movdqa xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm4, [r0]
+    movdqa xmm5, [r0+r1]
+    psadbw xmm2, xmm0
+    psadbw xmm3, xmm0
+    psadbw xmm4, xmm0
+    psadbw xmm5, xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    paddw xmm2, xmm4          ; xmm2 = lines 4..7
+
+    paddw xmm1, xmm2
+
+    lea r0, [r0+2*r1]
+    movdqa xmm2, [r0]
+    movdqa xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm4, [r0]
+    movdqa xmm5, [r0+r1]
+    psadbw xmm2, xmm0
+    psadbw xmm3, xmm0
+    psadbw xmm4, xmm0
+    psadbw xmm5, xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    paddw xmm2, xmm4          ; xmm2 = lines 8..11
+
+    paddw xmm1, xmm2
+
+    lea r0, [r0+2*r1]
+    movdqa xmm2, [r0]
+    movdqa xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm4, [r0]
+    movdqa xmm5, [r0+r1]
+    psadbw xmm2, xmm0
+    psadbw xmm3, xmm0
+    psadbw xmm4, xmm0
+    psadbw xmm5, xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    paddw xmm2, xmm4          ; xmm2 = lines 12..15
+
+    paddw xmm1, xmm2          ; low / high qword = sum of the left / right 8 columns
+
+    movdqa xmm2, xmm1
+    punpckhwd xmm2, xmm0      ; move the high-qword sum down to word 0
+    paddw xmm1, xmm2          ; word 0 = total of all 256 bytes (at most 256*255 = 65280, still fits in 16 bits)
+
+    movd retrd, xmm1          ; low dword of xmm1 (the total, upper word is zero) -> retrd
+    POP_XMM
+    ret
+
+;**********************************************************************************************************************************
 ;
 ;   uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
 ;
@@ -222,4 +1528,3 @@
     POP_XMM
     LOAD_6_PARA_POP
     ret
-
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -92,6 +92,11 @@
 GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c)
 GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)
 
+#ifdef X86_ASM
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_sse2)
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_sse2)
+#endif // X86_ASM
+
 #ifdef HAVE_NEON
 GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)
 GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)
@@ -137,6 +142,23 @@
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 320)
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320)
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)
+
+#ifdef X86_ASM
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 6)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 6)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 640, 320)
+
+// The SSE4 variants use 8x2 / 16x2 as their smallest frames instead of 6x6: the 16x16 SSE4 kernel documents "width % 16 == 0 && height > 1" as a precondition.
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 2)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 2)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 640, 320)
+#endif // X86_ASM
 
 #ifdef HAVE_NEON
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)