ref: f2314151e8623350363c0b1a5c4cfa1d7d460f6d
parent: 681b1da69899535a41b8aee4628811d4e529be8c
author: zhiliang wang <zhilwang@cisco.com>
date: Wed Aug 13 07:18:39 EDT 2014
Add x86 32/64bit asm code for SumOfBlocks.
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -244,6 +244,22 @@
void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+
+#ifdef X86_ASM
+extern "C" // C linkage: these are defined in codec/encoder/core/x86/sample_sc.asm
+{
+int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
+int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride); // asm loads rows with movdqa: rows must be 16-byte aligned
+void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+ const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf16x16BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+ const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf8x8BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, // asm requires kiWidth % 8 == 0 && kiHeight > 1
+ const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf16x16BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, // asm requires kiWidth % 16 == 0 && kiHeight > 1
+ const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); // asm accesses pFeatureOfBlock with movdqa (16-byte aligned)
+}
+#endif // X86_ASM
#ifdef HAVE_NEON
extern "C"
{
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -102,22 +102,42 @@
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#if defined (X86_ASM)
+ if (uiCpuFlag & WELS_CPU_SSE2) {
+ //for feature search
+ pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
+ pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
+ //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+ pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_sse2;
+ pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_sse2;
+ }
+ if (uiCpuFlag & WELS_CPU_SSE41) {
+ //for feature search
+ pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse4;
+ pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse4;
+ }
+#endif
+
#if defined (HAVE_NEON)
- //for feature search
- pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
- pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
- //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
- pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
- pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
+ if (uiCpuFlag & WELS_CPU_NEON) {
+ //for feature search
+ pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
+ pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
+ //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+ pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
+ pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
+ }
#endif
#if defined (HAVE_NEON_AARCH64)
- //for feature search
- pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
- pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
- //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
- pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
- pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
+ if (uiCpuFlag & WELS_CPU_NEON) {
+ //for feature search
+ pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
+ pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
+ //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+ pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
+ pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
+ }
#endif
}
}
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -32,8 +32,1314 @@
%include "asm_inc.asm"
SECTION .text
+%ifdef X86_32
+;**********************************************************************************************************************
+;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;*********************************************************************************************************************
+WELS_EXTERN SumOf8x8BlockOfFrame_sse2
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp + 0
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize ; one dword of scratch: tmp_width
+ pxor xmm0, xmm0 ; xmm0 = 0, so psadbw against it yields plain byte sums
+ mov esi, [ref] ; esi = pRefPicture
+ mov edi, [sum_ref] ; edi = pFeatureOfBlock
+ mov edx, [times_of_sum] ; edx = pTimesOfFeatureValue
+ mov ebx, [linesize] ; ebx = kiRefStride
+ mov eax, [width] ; eax = kiWidth
+ lea ecx, [ebx+ebx*2] ; 3*linesize
+; Pass 1: the first output row gets a full 8x8 sum at every x.
+ mov [tmp_width], eax ; column counter
+ lea ebp, [esi+ebx*4] ; ebp = source row 4
+FIRST_ROW:
+ movq xmm1, [esi] ; rows 0..3, 8 bytes each
+ movq xmm2, [esi+ebx]
+ movq xmm3, [esi+ebx*2]
+ movq xmm4, [esi+ecx]
+
+ shufps xmm1, xmm2, 01000100b ; xmm1 = row0 | row1
+ shufps xmm3, xmm4, 01000100b ; xmm3 = row2 | row3
+ psadbw xmm1, xmm0
+ psadbw xmm3, xmm0
+ paddd xmm1, xmm3
+
+ movq xmm2, [ebp] ; rows 4..7
+ movq xmm3, [ebp+ebx]
+ movq xmm4, [ebp+ebx*2]
+ movq xmm5, [ebp+ecx]
+
+ shufps xmm2, xmm3, 01000100b ; xmm2 = row4 | row5
+ shufps xmm4, xmm5, 01000100b ; xmm4 = row6 | row7
+ psadbw xmm2, xmm0
+ psadbw xmm4, xmm0
+ paddd xmm2, xmm4
+
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 00001110b ; bring the high-qword partial sum down
+ paddd xmm1, xmm2
+ movd eax, xmm1 ; eax = sum of the 8x8 block at this x
+ mov [edi], ax ; store it as a 16-bit feature
+ inc dword [edx+eax*4] ; count one more block having this sum
+
+ inc esi ; next x
+ inc ebp
+ add edi, 2
+
+ dec dword [tmp_width]
+ jg FIRST_ROW
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+8) - row(y), one output row per iteration.
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width] ; ebp = kiWidth from here on
+ dec dword [height] ; the first output row is already done
+HEIGHT_LOOP:
+ mov [tmp_width], ebp
+WIDTH_LOOP:
+ movq xmm1, [esi+ebx*8] ; 8 bytes of the row entering the window
+ movq xmm2, [esi] ; 8 bytes of the row leaving the window
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubd xmm1, xmm2
+ movd eax, xmm1 ; eax = entering - leaving (may be negative)
+ movzx ecx, word [edi] ; FIX: zero-extend sum(x, y); a 16-bit load left stale 3*linesize bits in ecx
+ add eax, ecx ; eax = sum(x, y+1)
+
+ mov [edi+ebp*2], ax ; store into the next output row (kiWidth entries further)
+ inc dword [edx+eax*4] ; count one more block having this sum
+
+ inc esi
+ add edi, 2
+
+ dec dword [tmp_width]
+ jg WIDTH_LOOP
+
+ add esi, ebx ; esi += linesize - width: start of the next source row
+ sub esi, ebp
+
+ dec dword [height]
+ jg HEIGHT_LOOP
+
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
+ ret
+
+
+%macro COUNT_SUM 3 ; xmm with dword lanes, 32-bit scratch reg, 1 = shift to the next lane afterwards
+%define xmm_reg %1
+%define tmp_reg %2
+ movd tmp_reg, xmm_reg ; scratch = lowest dword lane
+ inc dword [edx+tmp_reg*4] ; increment the dword counter at index scratch (edx = table base set by the caller)
+%if %3 == 1
+ psrldq xmm_reg, 4 ; drop the consumed lane so the next one becomes the lowest
+%endif
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; requires: width % 8 == 0 && height > 1
+;-----------------------------------------------------------------------------
+;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;-----------------------------------------------------------------------------
+; read extra (16 - (width % 8) ) mod 16 bytes of every line
+; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
+WELS_EXTERN SumOf8x8BlockOfFrame_sse4
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp + 0
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize ; one dword of scratch: tmp_width
+
+ pxor xmm0, xmm0 ; xmm0 = 0, so mpsadbw against it yields plain 4-byte sums
+ mov esi, [ref] ; esi = pRefPicture
+ mov edi, [sum_ref] ; edi = pFeatureOfBlock
+ mov edx, [times_of_sum] ; edx = pTimesOfFeatureValue (used by COUNT_SUM)
+ mov ebx, [linesize] ; ebx = kiRefStride
+ mov eax, [width] ; eax = kiWidth
+ lea ecx, [ebx+ebx*2] ; 3*linesize
+; Pass 1: full 8x8 sums for the first output row, 8 x positions per iteration.
+ mov [tmp_width], eax
+ lea ebp, [esi+ebx*4] ; ebp = source row 4
+FIRST_ROW_SSE4:
+ movdqu xmm1, [esi] ; rows 0..3, 16 bytes each
+ movdqu xmm3, [esi+ebx]
+ movdqu xmm5, [esi+ebx*2]
+ movdqu xmm7, [esi+ecx]
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 000b ; word i = sum of bytes i..i+3
+ mpsadbw xmm2, xmm0, 100b ; word i = sum of bytes i+4..i+7
+ paddw xmm1, xmm2 ; 8 sums of line1
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm3, xmm4 ; 8 sums of line2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm2, xmm0, 100b
+ paddw xmm5, xmm2 ; 8 sums of line3
+
+ movdqa xmm4, xmm7
+ mpsadbw xmm7, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm7, xmm4 ; 8 sums of line4
+
+ paddw xmm1, xmm3
+ paddw xmm5, xmm7
+ paddw xmm1, xmm5 ; sum the upper 4 lines first
+
+ movdqu xmm2, [ebp] ; rows 4..7
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
+
+ movdqa xmm6, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm2, xmm6 ; 8 sums of line5
+
+ movdqa xmm7, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm3, xmm7 ; 8 sums of line6
+
+ movdqa xmm6, xmm4
+ mpsadbw xmm4, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm4, xmm6 ; 8 sums of line7
+
+ movdqa xmm7, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm5, xmm7 ; 8 sums of line8
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm1, xmm2
+ paddw xmm1, xmm4 ; sum of lines 1- 8
+
+ movdqu [edi], xmm1 ; store 8 consecutive 16-bit features
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm2, xmm0 ; widen sums 4..7 to dwords
+; one counter increment per stored sum
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 0
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 0
+
+ lea esi, [esi+8] ; advance 8 x positions
+ lea ebp, [ebp+8]
+ lea edi, [edi+16] ; element size is 2
+
+ sub dword [tmp_width], 8
+ jg near FIRST_ROW_SSE4
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+8) - row(y), 8 x positions per iteration.
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width] ; ebp = kiWidth from here on
+ dec dword [height] ; the first output row is already done
+HEIGHT_LOOP_SSE4:
+ mov ecx, ebp ; ecx = column counter (3*linesize is no longer needed)
+WIDTH_LOOP_SSE4:
+ movdqu xmm1, [esi+ebx*8] ; row entering the window
+ movdqu xmm2, [esi] ; row leaving the window
+ movdqu xmm7, [edi] ; 8 sums of the output row above
+
+ movdqa xmm3, xmm1
+ mpsadbw xmm1, xmm0, 000b
+ mpsadbw xmm3, xmm0, 100b
+ paddw xmm1, xmm3 ; 8 row sums of the entering row
+
+ movdqa xmm4, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm2, xmm4 ; 8 row sums of the leaving row
+
+ paddw xmm7, xmm1
+ psubw xmm7, xmm2
+ movdqu [edi+ebp*2], xmm7 ; store into the next output row (kiWidth entries further)
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm6, xmm0 ; widen sums 4..7 to dwords
+
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 0
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 0
+
+ lea esi, [esi+8]
+ lea edi, [edi+16]
+
+ sub ecx, 8
+ jg near WIDTH_LOOP_SSE4
+
+ lea esi, [esi+ebx] ; esi += linesize - width: start of the next source row
+ sub esi, ebp
+
+ dec dword [height]
+ jg near HEIGHT_LOOP_SSE4
+
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
+ ret
+
+
+;****************************************************************************************************************************************************
+;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;****************************************************************************************************************************************************
+WELS_EXTERN SumOf16x16BlockOfFrame_sse2
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize ; one dword of scratch: tmp_width
+
+ pxor xmm0, xmm0 ; xmm0 = 0, so psadbw against it yields plain byte sums
+ mov esi, [ref] ; esi = pRefPicture
+ mov edi, [sum_ref] ; edi = pFeatureOfBlock
+ mov edx, [times_of_sum] ; edx = pTimesOfFeatureValue
+ mov ebx, [linesize] ; ebx = kiRefStride
+ mov eax, [width] ; eax = kiWidth
+
+ lea ecx, [ebx+ebx*2] ; ecx = 3*linesize
+ mov [tmp_width], eax ; column counter for the first output row
+FIRST_ROW_X16H: ; pass 1: full 16x16 sum at every x of the first output row
+ movdqu xmm1, [esi] ; rows 0..3, 16 bytes each
+ movdqu xmm2, [esi+ebx]
+ movdqu xmm3, [esi+ebx*2]
+ movdqu xmm4, [esi+ecx]
+
+ psadbw xmm1, xmm0 ; each qword lane = sum of its 8 bytes
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3
+
+ lea ebp, [esi+ebx*4] ; rows 4..7
+ movdqu xmm2, [ebp]
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
+
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2
+
+ lea ebp, [ebp+ebx*4] ; rows 8..11
+ movdqu xmm2, [ebp]
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
+
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2
+
+ lea ebp, [ebp+ebx*4] ; rows 12..15
+ movdqu xmm2, [ebp]
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
+
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2 ; xmm1 = per-half sums of all 16 rows
+ movdqa xmm2, xmm1
+ punpckhwd xmm2, xmm0 ; move the high-qword sum into word 0 of xmm2
+ paddw xmm1, xmm2 ; word 0 = sum of the whole 16x16 block
+ movd eax, xmm1
+ mov [edi], ax ; store it as a 16-bit feature
+ inc dword [edx+eax*4] ; count one more block having this sum
+
+ inc esi ; next x
+ lea edi, [edi+2]
+
+ dec dword [tmp_width]
+ jg near FIRST_ROW_X16H
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+16) - row(y), one output row per iteration.
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width] ; ebp = kiWidth from here on
+ dec dword [height] ; the first output row is already done
+
+ mov ecx, ebx
+ sal ecx, 4 ; ecx = 16*linesize: offset of the row entering the window
+HEIGHT_LOOP_X16:
+ mov [tmp_width], ebp
+WIDTH_LOOP_X16:
+ movdqu xmm1, [esi+ecx] ; 16 bytes of the row entering the window
+ movdqu xmm2, [esi] ; 16 bytes of the row leaving the window
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubw xmm1, xmm2 ; per-half difference, modulo 2^16
+ movdqa xmm2, xmm1
+ punpckhwd xmm2, xmm0 ; move the high-half difference into word 0 of xmm2
+ paddw xmm1, xmm2 ; word 0 = entering - leaving, modulo 2^16
+ movd eax, xmm1
+ add ax, word [edi] ; 16-bit add: ax = sum(x, y+1); upper half of eax stays 0
+ mov [edi+ebp*2], ax ; store into the next output row (kiWidth entries further)
+ inc dword [edx+eax*4] ; count one more block having this sum
+
+ inc esi
+ add edi, 2
+
+ dec dword [tmp_width]
+ jg near WIDTH_LOOP_X16
+
+ add esi, ebx ; esi += linesize - width: start of the next source row
+ sub esi, ebp
+
+ dec dword [height]
+ jg near HEIGHT_LOOP_X16
+
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
+ ret
+
+; requires: width % 16 == 0 && height > 1
+;-----------------------------------------------------------------------------------------------------------------------------
+;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;-----------------------------------------------------------------------------------------------------------------------------
+; SUM_LINE_X16_SSE41: dst0 word i = sum of bytes [ref+i .. ref+i+15] for i = 0..7; reads 24 bytes at ref; xmm0 must be 0
+%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
+ movdqu %2, [%1] ; bytes 0..15
+ movdqu %3, [%1+8h] ; bytes 8..23
+ movdqa %4, %2
+ movdqa %5, %3
+
+ mpsadbw %2, xmm0, 0 ; 000 B: word i = sum of bytes i..i+3
+ mpsadbw %4, xmm0, 5 ; 101 B: word i = sum of bytes i+4..i+7
+ mpsadbw %3, xmm0, 2 ; 010 B: word i = sum of bytes i+8..i+11
+ mpsadbw %5, xmm0, 7 ; 111 B: word i = sum of bytes i+12..i+15
+ paddw %2, %4
+ paddw %3, %5
+ paddw %2, %3 ; dst0 = eight 16-byte sums; dst1, tmp0, tmp1 are clobbered
+%endmacro ; end of SUM_LINE_X16_SSE41
+
+WELS_EXTERN SumOf16x16BlockOfFrame_sse4
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize ; one dword of scratch: tmp_width
+
+ pxor xmm0, xmm0 ; xmm0 = 0, as SUM_LINE_X16_SSE41 and the unpacks below expect
+ mov esi, [ref] ; esi = pRefPicture
+ mov edi, [sum_ref] ; edi = pFeatureOfBlock
+ mov edx, [times_of_sum] ; edx = pTimesOfFeatureValue (used by COUNT_SUM)
+ mov ebx, [linesize] ; ebx = kiRefStride
+ mov eax, [width] ; eax = kiWidth
+
+ lea ecx, [ebx+ebx*2] ; ecx = 3*linesize
+ mov [tmp_width], eax ; column counter for the first output row
+FIRST_ROW_X16_SSE4: ; pass 1: full 16x16 sums for 8 x positions per iteration
+ SUM_LINE_X16_SSE41 esi, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 esi+ebx, xmm2, xmm3, xmm4, xmm5
+ SUM_LINE_X16_SSE41 esi+ebx*2, xmm3, xmm4, xmm5, xmm6
+ SUM_LINE_X16_SSE41 esi+ecx, xmm4, xmm5, xmm6, xmm7
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3 ; xmm1 = rows 0..3
+
+ lea ebp, [esi+ebx*4] ; rows 4..7
+ SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+
+ lea ebp, [ebp+ebx*4] ; rows 8..11
+ SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+
+ lea ebp, [ebp+ebx*4] ; rows 12..15
+ SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+
+ movdqa [edi], xmm1 ; aligned store: pFeatureOfBlock must be 16-byte aligned
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm2, xmm0 ; widen sums 4..7 to dwords
+; one counter increment per stored sum
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 0
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 0
+
+ lea esi, [esi+8] ; advance 8 x positions
+ lea edi, [edi+16] ; element size is 2
+
+ sub dword [tmp_width], 8
+ jg near FIRST_ROW_X16_SSE4
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+16) - row(y), 8 x positions per iteration.
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width] ; ebp = kiWidth from here on
+ dec dword [height] ; the first output row is already done
+
+ mov ecx, ebx
+ sal ecx, 4 ; ecx = 16*linesize: offset of the row entering the window
+
+HEIGHT_LOOP_X16_SSE4:
+ mov [tmp_width], ebp
+WIDTH_LOOP_X16_SSE4:
+ movdqa xmm7, [edi] ; 8 sums of the output row above (aligned load)
+ SUM_LINE_X16_SSE41 esi+ecx, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 esi, xmm2, xmm3, xmm4, xmm5
+
+ paddw xmm7, xmm1 ; + row entering the window
+ psubw xmm7, xmm2 ; - row leaving the window
+ movdqa [edi+ebp*2], xmm7 ; aligned store into the next output row (kiWidth entries further)
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm6, xmm0 ; widen sums 4..7 to dwords
+
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 0
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 0
+
+ lea esi, [esi+8]
+ lea edi, [edi+16]
+
+ sub dword [tmp_width], 8
+ jg near WIDTH_LOOP_X16_SSE4
+
+ add esi, ebx ; esi += linesize - width: start of the next source row
+ sub esi, ebp
+
+ dec dword [height]
+ jg near HEIGHT_LOOP_X16_SSE4
+
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
+ ret
+
+%else
+
+;**********************************************************************************************************************
+;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;*********************************************************************************************************************
+WELS_EXTERN SumOf8x8BlockOfFrame_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 6
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ push r12
+ push r13
+ push r0
+ push r2
+ push r4
+; As used below: r0 = source pointer, r1 = width, r2 = height, r3 = stride, r4 = feature output, r5 = counter table.
+ pxor xmm0, xmm0 ; xmm0 = 0, so psadbw against it yields plain byte sums
+ lea r6, [r3+r3*2] ; r6 = 3*stride
+; Pass 1: the first output row gets a full 8x8 sum at every x.
+ mov r12, r1 ;r12:tmp_width
+ lea r13, [r0+r3*4] ;r13 = source row 4 (plays the role of ebp in the x86-32 version)
+FIRST_ROW:
+ movq xmm1, [r0] ; rows 0..3, 8 bytes each
+ movq xmm2, [r0+r3]
+ movq xmm3, [r0+r3*2]
+ movq xmm4, [r0+r6]
+
+ shufps xmm1, xmm2, 01000100b ; xmm1 = row0 | row1
+ shufps xmm3, xmm4, 01000100b ; xmm3 = row2 | row3
+ psadbw xmm1, xmm0
+ psadbw xmm3, xmm0
+ paddd xmm1, xmm3
+
+ movq xmm2, [r13] ; rows 4..7
+ movq xmm3, [r13+r3]
+ movq xmm4, [r13+r3*2]
+ movq xmm5, [r13+r6]
+
+ shufps xmm2, xmm3, 01000100b ; xmm2 = row4 | row5
+ shufps xmm4, xmm5, 01000100b ; xmm4 = row6 | row7
+ psadbw xmm2, xmm0
+ psadbw xmm4, xmm0
+ paddd xmm2, xmm4
+
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 00001110b ; bring the high-qword partial sum down
+ paddd xmm1, xmm2
+ movd r2d, xmm1 ; r2 = sum of the 8x8 block (height was saved on the stack)
+ mov [r4], r2w ; store it as a 16-bit feature
+ inc dword [r5+r2*4] ; count one more block having this sum
+
+ inc r0 ; next x
+ inc r13
+ add r4, 2
+
+ dec r12
+ jg FIRST_ROW
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+8) - row(y), one output row per iteration.
+ pop r4 ; restore feature output, height and source pointer
+ pop r2
+ pop r0
+ mov r13, r2
+ dec r13 ; r13 = remaining output rows (the first one is done)
+HEIGHT_LOOP:
+ mov r12, r1
+WIDTH_LOOP:
+ movq xmm1, [r0+r3*8] ; 8 bytes of the row entering the window
+ movq xmm2, [r0] ; 8 bytes of the row leaving the window
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubd xmm1, xmm2
+ movd r2d, xmm1 ; r2d = entering - leaving (may be negative)
+ movzx r6d, word [r4] ; FIX: zero-extend sum(x, y); a 16-bit load left stale 3*stride bits in r6d
+ add r2d, r6d ; r2 = sum(x, y+1), zero-extended to 64 bits
+ mov [r4+r1*2], r2w ; store into the next output row (width entries further)
+ inc dword [r5+r2*4] ; count one more block having this sum
+
+ inc r0
+ add r4, 2
+
+ dec r12
+ jg WIDTH_LOOP
+
+ add r0, r3 ; r0 += stride - width: start of the next source row
+ sub r0, r1
+
+
+ dec r13
+ jg HEIGHT_LOOP
+
+ pop r13
+ pop r12
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
+%macro COUNT_SUM 4 ; xmm with dword lanes, 32-bit scratch reg, its 64-bit name, 1 = shift to the next lane afterwards
+%define xmm_reg %1
+%define tmp_dreg %2
+%define tmp_qreg %3
+ movd tmp_dreg, xmm_reg ; scratch = lowest dword lane (the 32-bit write clears the upper half of the 64-bit reg)
+ inc dword [r5+tmp_qreg*4] ; increment the dword counter at index scratch (r5 = table base set by the caller)
+%if %4 == 1
+ psrldq xmm_reg, 4 ; drop the consumed lane so the next one becomes the lowest
+%endif
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; requires: width % 8 == 0 && height > 1
+;-----------------------------------------------------------------------------
+;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;-----------------------------------------------------------------------------
+; read extra (16 - (width % 8) ) mod 16 bytes of every line
+; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
+WELS_EXTERN SumOf8x8BlockOfFrame_sse4
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ push r12
+ push r13
+ push r0
+ push r2
+ push r4
+; As used below: r0 = source pointer, r1 = width, r2 = height, r3 = stride, r4 = feature output, r5 = counter table.
+ pxor xmm0, xmm0 ; xmm0 = 0, so mpsadbw against it yields plain 4-byte sums
+ lea r6, [r3+r3*2] ; r6 = 3*stride
+; Pass 1: full 8x8 sums for the first output row, 8 x positions per iteration.
+ mov r12, r1 ;r12:tmp_width
+ lea r13, [r0+r3*4] ;r13 = source row 4 (plays the role of ebp in the x86-32 version)
+FIRST_ROW_SSE4:
+ movdqu xmm1, [r0] ; rows 0..3, 16 bytes each
+ movdqu xmm3, [r0+r3]
+ movdqu xmm5, [r0+r3*2]
+ movdqu xmm7, [r0+r6]
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 000b ; word i = sum of bytes i..i+3
+ mpsadbw xmm2, xmm0, 100b ; word i = sum of bytes i+4..i+7
+ paddw xmm1, xmm2 ; 8 sums of line1
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm3, xmm4 ; 8 sums of line2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm2, xmm0, 100b
+ paddw xmm5, xmm2 ; 8 sums of line3
+
+ movdqa xmm4, xmm7
+ mpsadbw xmm7, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm7, xmm4 ; 8 sums of line4
+
+ paddw xmm1, xmm3
+ paddw xmm5, xmm7
+ paddw xmm1, xmm5 ; sum the upper 4 lines first
+
+ movdqu xmm2, [r13] ; rows 4..7
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
+
+ movdqa xmm6, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm2, xmm6 ; 8 sums of line5
+
+ movdqa xmm7, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm3, xmm7 ; 8 sums of line6
+
+ movdqa xmm6, xmm4
+ mpsadbw xmm4, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm4, xmm6 ; 8 sums of line7
+
+ movdqa xmm7, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm5, xmm7 ; 8 sums of line8
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm1, xmm2
+ paddw xmm1, xmm4 ; sum of lines 1- 8
+
+ movdqu [r4], xmm1 ; store 8 consecutive 16-bit features
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm2, xmm0 ; widen sums 4..7 to dwords
+; one counter increment per stored sum (r2 is scratch here; height was saved on the stack)
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 0
+ COUNT_SUM xmm2, r2d, r2 ,1
+ COUNT_SUM xmm2, r2d, r2 ,1
+ COUNT_SUM xmm2, r2d, r2 ,1
+ COUNT_SUM xmm2, r2d, r2 ,0
+
+ lea r0, [r0+8] ; advance 8 x positions
+ lea r13, [r13+8]
+ lea r4, [r4+16] ; element size is 2
+
+ sub r12, 8
+ jg near FIRST_ROW_SSE4
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+8) - row(y), 8 x positions per iteration.
+ pop r4 ; restore feature output, height and source pointer
+ pop r2
+ pop r0
+ mov r13, r2
+ dec r13 ; r13 = remaining output rows (the first one is done)
+HEIGHT_LOOP_SSE4:
+ mov r12, r1
+WIDTH_LOOP_SSE4:
+ movdqu xmm1, [r0+r3*8] ; row entering the window
+ movdqu xmm2, [r0] ; row leaving the window
+ movdqu xmm7, [r4] ; 8 sums of the output row above
+
+ movdqa xmm3, xmm1
+ mpsadbw xmm1, xmm0, 000b
+ mpsadbw xmm3, xmm0, 100b
+ paddw xmm1, xmm3 ; 8 row sums of the entering row
+
+ movdqa xmm4, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm2, xmm4 ; 8 row sums of the leaving row
+
+ paddw xmm7, xmm1
+ psubw xmm7, xmm2
+ movdqu [r4+r1*2], xmm7 ; store into the next output row (width entries further)
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm6, xmm0 ; widen sums 4..7 to dwords
+
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 0
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 0
+
+ lea r0, [r0+8]
+ lea r4, [r4+16]
+
+ sub r12, 8
+ jg near WIDTH_LOOP_SSE4
+
+ lea r0, [r0+r3] ; r0 += stride - width: start of the next source row
+ sub r0, r1
+
+ dec r13
+ jg near HEIGHT_LOOP_SSE4
+
+ pop r13
+ pop r12
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
+;****************************************************************************************************************************************************
+;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;****************************************************************************************************************************************************
+WELS_EXTERN SumOf16x16BlockOfFrame_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 6
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ push r12
+ push r13
+ push r0
+ push r2
+ push r4
+; As used below: r0 = source pointer, r1 = width, r2 = height, r3 = stride, r4 = feature output, r5 = counter table.
+ pxor xmm0, xmm0 ; xmm0 = 0, so psadbw against it yields plain byte sums
+ lea r6, [r3+r3*2] ; r6 = 3*stride
+
+ mov r12, r1 ;r12:tmp_width
+FIRST_ROW_X16H: ; pass 1: full 16x16 sum at every x of the first output row
+ movdqu xmm1, [r0] ; rows 0..3, 16 bytes each
+ movdqu xmm2, [r0+r3]
+ movdqu xmm3, [r0+r3*2]
+ movdqu xmm4, [r0+r6]
+
+ psadbw xmm1, xmm0 ; each qword lane = sum of its 8 bytes
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3
+
+ lea r13, [r0+r3*4] ;r13 = rows 4..7 (plays the role of ebp in the x86-32 version)
+ movdqu xmm2, [r13]
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
+
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2
+
+ lea r13, [r13+r3*4] ; rows 8..11
+ movdqu xmm2, [r13]
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
+
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2
+
+ lea r13, [r13+r3*4] ; rows 12..15
+ movdqu xmm2, [r13]
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
+
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2 ; xmm1 = per-half sums of all 16 rows
+ movdqa xmm2, xmm1
+ punpckhwd xmm2, xmm0 ; move the high-qword sum into word 0 of xmm2
+ paddw xmm1, xmm2 ; word 0 = sum of the whole 16x16 block
+ movd r2d, xmm1 ; r2 is scratch here; height was saved on the stack
+ mov [r4], r2w ; store it as a 16-bit feature
+ inc dword [r5+r2*4] ; count one more block having this sum
+
+ inc r0 ; next x
+ lea r4, [r4+2]
+
+ dec r12
+ jg near FIRST_ROW_X16H
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+16) - row(y), one output row per iteration.
+ pop r4 ; restore feature output, height and source pointer
+ pop r2
+ pop r0
+ mov r13, r2
+ dec r13 ; r13 = remaining output rows (the first one is done)
+ mov r6, r3
+ sal r6, 4 ; r6 = 16*stride: offset of the row entering the window
+HEIGHT_LOOP_X16:
+ mov r12, r1
+WIDTH_LOOP_X16:
+ movdqu xmm1, [r0+r6] ; 16 bytes of the row entering the window
+ movdqu xmm2, [r0] ; 16 bytes of the row leaving the window
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubw xmm1, xmm2 ; per-half difference, modulo 2^16
+ movdqa xmm2, xmm1
+ punpckhwd xmm2, xmm0 ; move the high-half difference into word 0 of xmm2
+ paddw xmm1, xmm2 ; word 0 = entering - leaving, modulo 2^16
+ movd r2d, xmm1
+ add r2w, word [r4] ; 16-bit add: r2w = sum(x, y+1); the bits of r2 above 16 stay 0
+ mov [r4+r1*2], r2w ; store into the next output row (width entries further)
+ inc dword [r5+r2*4] ; count one more block having this sum
+
+ inc r0
+ add r4, 2
+
+ dec r12
+ jg near WIDTH_LOOP_X16
+
+ add r0, r3 ; r0 += stride - width: start of the next source row
+ sub r0, r1
+
+ dec r13
+ jg near HEIGHT_LOOP_X16
+
+ pop r13
+ pop r12
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+; requires: width % 16 == 0 && height > 1
+;-----------------------------------------------------------------------------------------------------------------------------
+;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+;-----------------------------------------------------------------------------------------------------------------------------
+; SUM_LINE_X16_SSE41: dst0 word i = sum of bytes [ref+i .. ref+i+15] for i = 0..7; reads 24 bytes at ref; xmm0 must be 0
+%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
+ movdqu %2, [%1] ; bytes 0..15
+ movdqu %3, [%1+8h] ; bytes 8..23
+ movdqa %4, %2
+ movdqa %5, %3
+
+ mpsadbw %2, xmm0, 0 ; 000 B: word i = sum of bytes i..i+3
+ mpsadbw %4, xmm0, 5 ; 101 B: word i = sum of bytes i+4..i+7
+ mpsadbw %3, xmm0, 2 ; 010 B: word i = sum of bytes i+8..i+11
+ mpsadbw %5, xmm0, 7 ; 111 B: word i = sum of bytes i+12..i+15
+ paddw %2, %4
+ paddw %3, %5
+ paddw %2, %3 ; dst0 = eight 16-byte sums; dst1, tmp0, tmp1 are clobbered
+%endmacro ; end of SUM_LINE_X16_SSE41
+
+WELS_EXTERN SumOf16x16BlockOfFrame_sse4
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ push r12
+ push r13
+ push r0
+ push r2
+ push r4
+; As used below: r0 = source pointer, r1 = width, r2 = height, r3 = stride, r4 = feature output, r5 = counter table.
+ pxor xmm0, xmm0 ; xmm0 = 0, as SUM_LINE_X16_SSE41 and the unpacks below expect
+ lea r6, [r3+r3*2] ; r6 = 3*stride
+
+ mov r12, r1 ;r12:tmp_width
+FIRST_ROW_X16_SSE4: ; pass 1: full 16x16 sums for 8 x positions per iteration
+ SUM_LINE_X16_SSE41 r0, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 r0+r3, xmm2, xmm3, xmm4, xmm5
+ SUM_LINE_X16_SSE41 r0+r3*2,xmm3, xmm4, xmm5, xmm6
+ SUM_LINE_X16_SSE41 r0+r6, xmm4, xmm5, xmm6, xmm7
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3 ; xmm1 = rows 0..3
+
+ lea r13, [r0+r3*4] ; rows 4..7
+ SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+
+ lea r13, [r13+r3*4] ; rows 8..11
+ SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+
+ lea r13, [r13+r3*4] ; rows 12..15
+ SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+
+ movdqa [r4], xmm1 ; aligned store: the feature output must be 16-byte aligned
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm2, xmm0 ; widen sums 4..7 to dwords
+; one counter increment per stored sum (r2 is scratch here; height was saved on the stack)
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 0
+ COUNT_SUM xmm2, r2d, r2, 1
+ COUNT_SUM xmm2, r2d, r2, 1
+ COUNT_SUM xmm2, r2d, r2, 1
+ COUNT_SUM xmm2, r2d, r2, 0
+
+ lea r0, [r0+8] ; advance 8 x positions
+ lea r4, [r4+16] ; element size is 2
+
+ sub r12, 8
+ jg near FIRST_ROW_X16_SSE4
+; Pass 2: sum(x, y+1) = sum(x, y) + row(y+16) - row(y), 8 x positions per iteration.
+ pop r4 ; restore feature output, height and source pointer
+ pop r2
+ pop r0
+ mov r13, r2
+ dec r13 ; r13 = remaining output rows (the first one is done)
+ mov r6, r3
+ sal r6, 4 ; r6 = 16*stride: offset of the row entering the window
+
+HEIGHT_LOOP_X16_SSE4:
+ mov r12, r1
+WIDTH_LOOP_X16_SSE4:
+ movdqa xmm7, [r4] ; 8 sums of the output row above (aligned load)
+ SUM_LINE_X16_SSE41 r0+r6, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 r0, xmm2, xmm3, xmm4, xmm5
+
+ paddw xmm7, xmm1 ; + row entering the window
+ psubw xmm7, xmm2 ; - row leaving the window
+ movdqa [r4+r1*2], xmm7 ; aligned store into the next output row (width entries further)
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0 ; widen sums 0..3 to dwords
+ punpckhwd xmm6, xmm0 ; widen sums 4..7 to dwords
+
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 0
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 0
+
+ lea r0, [r0+8]
+ lea r4, [r4+16]
+
+ sub r12, 8
+ jg near WIDTH_LOOP_X16_SSE4
+
+ add r0, r3 ; r0 += stride - width: start of the next source row
+ sub r0, r1
+
+ dec r13
+ jg near HEIGHT_LOOP_X16_SSE4
+
+ pop r13
+ pop r12
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+%endif
+
;**********************************************************************************************************************************
+; int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;**********************************************************************************************************************************
+WELS_EXTERN SumOf8x8SingleBlock_sse2
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+; As used below: r0 = block pointer (ref0), r1 = linesize; the result is the sum of 8 rows of 8 bytes.
+ pxor xmm0, xmm0 ; xmm0 = 0, so psadbw against it yields plain byte sums
+ movq xmm1, [r0] ; xmm1 = row0 | row1
+ movhps xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movq xmm2, [r0] ; xmm2 = row2 | row3
+ movhps xmm2, [r0+r1]
+ lea r0, [r0+2*r1]
+ movq xmm3, [r0] ; xmm3 = row4 | row5
+ movhps xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movq xmm4, [r0] ; xmm4 = row6 | row7
+ movhps xmm4, [r0+r1]
+
+ psadbw xmm1, xmm0 ; each qword lane = sum of one 8-byte row
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3 ; two partial sums, one per qword lane
+
+ movdqa xmm2, xmm1
+ punpckhwd xmm2, xmm0 ; move the high-qword sum into word 0 of xmm2
+ paddw xmm1, xmm2 ; word 0 = sum of all 64 bytes
+
+ movd retrd, xmm1 ; return it
+ ret
+
+;**********************************************************************************************************************************
+; int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+;**********************************************************************************************************************************
+WELS_EXTERN SumOf16x16SingleBlock_sse2
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 6
+ SIGN_EXTENSION r1, r1d
+; NOTE(review): every row is loaded with movdqa, which faults on addresses that are not 16-byte aligned - confirm callers guarantee it.
+ pxor xmm0, xmm0 ; xmm0 = 0, so psadbw against it yields plain byte sums
+ movdqa xmm1, [r0] ; rows 0..3 (r0 = block pointer, r1 = linesize)
+ movdqa xmm2, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm3, [r0]
+ movdqa xmm4, [r0+r1]
+ psadbw xmm1, xmm0 ; each qword lane = sum of its 8 bytes
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3 ; xmm1 = rows 0..3
+
+ lea r0, [r0+2*r1] ; rows 4..7
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2 ; xmm1 = rows 0..7
+
+ lea r0, [r0+2*r1] ; rows 8..11
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2 ; xmm1 = rows 0..11
+
+ lea r0, [r0+2*r1] ; rows 12..15
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
+
+ paddw xmm1, xmm2 ; xmm1 = per-half sums of all 16 rows
+
+ movdqa xmm2, xmm1
+ punpckhwd xmm2, xmm0 ; move the high-qword sum into word 0 of xmm2
+ paddw xmm1, xmm2 ; word 0 = sum of all 256 bytes
+
+ movd retrd, xmm1 ; return it
+ POP_XMM
+ ret
+
+;**********************************************************************************************************************************
;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
;
@@ -222,4 +1528,3 @@
POP_XMM
LOAD_6_PARA_POP
ret
-
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -92,6 +92,11 @@
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)
+#ifdef X86_ASM // SSE2 single-block sum routines, paired with the same _ref functions as the _c invocations above
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_sse2) // 8x8 routine
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_sse2) // 16x16 routine
+#endif // X86_ASM
+
#ifdef HAVE_NEON
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)
@@ -137,6 +142,31 @@
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)
+
+#ifdef X86_ASM // x86 frame-level block-sum routines, paired with the same _ref functions as the _c invocations above
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 6) // sse2 cases use size arguments (6, 6), (6, 320) and (640, 320); their meaning is set by the macro, defined elsewhere
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 6)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 640, 320)
+
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 2) // NOTE(review): the sse4 cases only use 8 / 16 / 640 as the first size argument where the sse2 cases use 6 - presumably a size restriction of the sse4 routines; confirm
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 2)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 640, 320)
+#endif // X86_ASM
+
+#ifdef HAVE_NEON // NOTE(review): this added block begins exactly like the HAVE_NEON block that follows it immediately - looks like a duplicate that would instantiate the same cases twice; confirm and drop one
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1) // NEON cases use size arguments (1, 1), (1, 320) and (640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
+#endif // HAVE_NEON
#ifdef HAVE_NEON
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)