ref: b35f5797de9d23878fe8d4178da05061e770685c
parent: 9d2e1a9384dc680758700b187485be1f30a8baed
author: zhiliang wang <zhilwang@cisco.com>
date: Thu Aug 14 14:41:52 EDT 2014
Add x86 32/64bit asm code for Scc_hash
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -252,6 +252,10 @@
#ifdef X86_ASM
extern "C"
{
+// SSE2 assembly routines for the feature-hash search (implemented in codec/encoder/core/x86/sample_sc.asm).
+//
+// InitializeHashforFeature_sse2: for every i < kiListSize, stores the current position inside pBuf into both
+// pLocationOfFeature[i] and pFeatureValuePointerList[i], then advances that position by
+// 2 * pTimesOfFeatureValue[i] uint16_t entries. The assembly uses aligned 16-byte accesses on
+// pTimesOfFeatureValue, pLocationOfFeature and pFeatureValuePointerList.
+void InitializeHashforFeature_sse2 (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+ uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+// FillQpelLocationByFeatureValue_sse2: walks the kiWidth x kiHeight feature map row by row and, for each entry,
+// writes its quarter-pel (x, y) pair through pFeatureValuePointerList[feature] and advances that pointer by
+// two uint16_t. The assembly consumes four entries per step, so kiWidth must be a multiple of 4.
+void FillQpelLocationByFeatureValue_sse2 (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+ uint16_t** pFeatureValuePointerList);
int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -107,6 +107,8 @@
#if defined (X86_ASM)
if (uiCpuFlag & WELS_CPU_SSE2) {
//for feature search
+ pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_sse2;
+ pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_sse2;
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -31,6 +31,16 @@
;*************************************************************************/
%include "asm_inc.asm"
+;***********************************************************************
+; Local Data (Read Only)
+;
+; Lane constants for FillQpelLocationByFeatureValue_sse2. Each symbol is
+; four 16-bit words (8 bytes) and is loaded with movq. Positions are kept
+; in quarter-pel units, i.e. 4 units per feature-map column or row.
+;***********************************************************************
+SECTION .rodata align=16
+
+ALIGN 16
+mv_x_inc_x4 dw 0x10, 0x10, 0x10, 0x10 ; x advance per inner iteration: 4 columns * 4 qpel
+mv_y_inc_x4 dw 0x04, 0x04, 0x04, 0x04 ; y advance per row: 1 row * 4 qpel
+mx_x_offset_x4 dw 0x00, 0x04, 0x08, 0x0C ; qpel x of the four columns handled together at the start of a row
+
SECTION .text
%ifdef X86_32
;**********************************************************************************************************************
@@ -661,6 +671,159 @@
%undef tmp_width
ret
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+;
+; x86-32 version, all arguments are read from the stack.
+;
+; Scans the kiWidth x kiHeight feature map row by row. For the feature value f found at (x, y) it stores the
+; 32-bit word ((y*4) << 16) | (x*4) at *pFeatureValuePointerList[f] and then advances
+; pFeatureValuePointerList[f] by 4 bytes (two uint16_t).
+;
+; Preconditions:
+;   - kiWidth is a multiple of 4: four feature values are consumed per inner iteration and the inner loop
+;     only stops when the remaining column count reaches exactly zero.
+;   - every pFeatureValuePointerList[f] points at writable storage large enough for all occurrences of f
+;     (presumably prepared by InitializeHashforFeature -- confirm against the caller).
+; A non-positive kiWidth or kiHeight returns without touching memory.
+;
+; Register roles:
+;   esi = read cursor in pFeatureOfBlock      edi = pFeatureValuePointerList
+;   ebp = kiWidth                             ecx = columns left in the current row
+;   [i_height] = rows left                    ebx = address of the list slot, eax = write pointer taken from it
+;   xmm2 = qpel x of the 4 current columns    xmm3 = qpel y of the current row (same value in every lane)
+;   xmm5/xmm7 = initial x lanes / x step      xmm6 = y step, xmm4 = zero
+;-----------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN FillQpelLocationByFeatureValue_sse2
+    push    esi
+    push    edi
+    push    ebx
+    push    ebp
+
+    %define _ps 16 ; push size
+    %define _ls 4 ; local size
+    %define sum_ref esp+_ps+_ls+4
+    %define pos_list esp+_ps+_ls+16
+    %define width esp+_ps+_ls+8
+    %define height esp+_ps+_ls+12
+    %define i_height esp
+    sub     esp, _ls
+
+    mov     esi, [sum_ref]
+    mov     edi, [pos_list]
+    mov     ebp, [width]
+    mov     ebx, [height]
+    ; Both loops below are "do ... while (--n != 0)": entering them with n <= 0 would never terminate.
+    test    ebp, ebp
+    jle     near FILL_QPEL_LOCATION_DONE_SSE2
+    test    ebx, ebx
+    jle     near FILL_QPEL_LOCATION_DONE_SSE2
+    mov     [i_height], ebx
+
+    movq    xmm7, [mv_x_inc_x4]     ; x_qpel inc
+    movq    xmm6, [mv_y_inc_x4]     ; y_qpel inc
+    movq    xmm5, [mx_x_offset_x4]  ; x_qpel vector
+    pxor    xmm4, xmm4
+    pxor    xmm3, xmm3              ; y_qpel vector
+HASH_HEIGHT_LOOP_SSE2:
+    movdqa  xmm2, xmm5              ; restart x_qpel at the first four columns
+    mov     ecx, ebp
+HASH_WIDTH_LOOP_SSE2:
+    movq    xmm0, [esi]             ; load 4 feature values (16 bit each)
+    punpcklwd xmm0, xmm4            ; zero-extend them to 4 dwords
+    movdqa  xmm1, xmm2
+    punpcklwd xmm1, xmm3            ; 4 dwords: low word = x_qpel, high word = y_qpel
+%rep 3
+    movd    edx, xmm0               ; edx = feature value of this column
+    lea     ebx, [edi+edx*4]        ; ebx = &pFeatureValuePointerList[feature]
+    mov     eax, [ebx]
+    movd    [eax], xmm1             ; store the (x, y) pair
+    prefetcht0 [eax+4]              ; warm the next slot of this list; a hint cannot fault past the buffer end
+    lea     eax, [eax+4]
+    mov     [ebx], eax              ; advance the list's write pointer
+    psrldq  xmm1, 4
+    psrldq  xmm0, 4
+%endrep
+    movd    edx, xmm0               ; fourth column: same steps, no lane shift needed afterwards
+    lea     ebx, [edi+edx*4]
+    mov     eax, [ebx]
+    movd    [eax], xmm1
+    prefetcht0 [eax+4]
+    lea     eax, [eax+4]
+    mov     [ebx], eax
+
+    paddw   xmm2, xmm7
+    lea     esi, [esi+8]
+    sub     ecx, 4
+    jnz     near HASH_WIDTH_LOOP_SSE2
+    paddw   xmm3, xmm6
+    dec     dword [i_height]
+    jnz     near HASH_HEIGHT_LOOP_SSE2
+
+FILL_QPEL_LOCATION_DONE_SSE2:
+    add     esp, _ls
+    %undef _ps
+    %undef _ls
+    %undef sum_ref
+    %undef pos_list
+    %undef width
+    %undef height
+    %undef i_height
+    pop     ebp
+    pop     ebx
+    pop     edi
+    pop     esi
+    ret
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+;                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
+;
+; x86-32 version, all arguments are read from the stack.
+;
+; For i = 0 .. kiListSize-1: stores the running buffer position into pLocationOfFeature[i] and
+; pFeatureValuePointerList[i], then advances the position by pTimesOfFeatureValue[i] * 4 bytes
+; (two uint16_t per counted occurrence). Entries are handled four at a time, followed by a scalar tail
+; for kiListSize % 4 entries. A group whose four counts are all zero takes a shortcut that writes the
+; same pointer to all eight slots with two 16-byte stores.
+;
+; Preconditions: pTimesOfFeatureValue, pLocationOfFeature and pFeatureValuePointerList are 16-byte aligned
+; (they are accessed with movdqa). kiListSize <= 0 returns without touching memory.
+;
+; Register roles:
+;   esi = pTimesOfFeatureValue    edi = pLocationOfFeature    ebp = pFeatureValuePointerList
+;   ebx = running position in pBuf    ecx = byte offset of entry i (i * 4)    edx = loop counter
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN InitializeHashforFeature_sse2
+    push    ebx
+    push    esi
+    push    edi
+    push    ebp
+    %define _ps 16 ; push size
+    mov     edi, [esp+_ps+16]       ; pLocationOfFeature
+    mov     ebp, [esp+_ps+20]       ; pFeatureValuePointerList
+    mov     esi, [esp+_ps+4]        ; pTimesOfFeatureValue
+    mov     ebx, [esp+_ps+8]        ; pBuf
+    mov     edx, [esp+_ps+12]       ; kiListSize
+    xor     ecx, ecx
+    pxor    xmm7, xmm7
+    test    edx, edx
+    jle     near hash_assign_no_rem_sse2    ; empty or negative list: nothing to assign
+    sar     edx, 2                          ; number of complete groups of four
+    jz      near hash_assign_rem_check_sse2 ; fewer than 4 entries: the dec/jnz loop below must not be entered
+hash_assign_loop_x4_sse2:
+    movdqa  xmm0, [esi+ecx]
+    pslld   xmm0, 2                 ; 4 counts -> 4 byte sizes
+
+    movdqa  xmm1, xmm0
+    pcmpeqd xmm1, xmm7
+    movmskps eax, xmm1
+    cmp     eax, 0x0f               ; all four sizes zero?
+    je      near hash_assign_with_copy_sse2
+
+%assign x 0
+%rep 4
+    mov     [edi+ecx+x], ebx
+    mov     [ebp+ecx+x], ebx
+    movd    eax, xmm0
+    add     ebx, eax                ; skip the space reserved for this entry
+    psrldq  xmm0, 4
+%assign x x+4
+%endrep
+    jmp     near assign_next_sse2
+
+hash_assign_with_copy_sse2:
+    movd    xmm1, ebx
+    pshufd  xmm2, xmm1, 0           ; broadcast the unchanged position to four pointers
+    movdqa  [edi+ecx], xmm2
+    movdqa  [ebp+ecx], xmm2
+
+assign_next_sse2:
+    add     ecx, 16
+    dec     edx
+    jnz     near hash_assign_loop_x4_sse2
+
+hash_assign_rem_check_sse2:
+    mov     edx, [esp+_ps+12]       ; kiListSize
+    and     edx, 3
+    jz      near hash_assign_no_rem_sse2
+hash_assign_loop_x4_rem_sse2:
+    mov     [edi+ecx], ebx
+    mov     [ebp+ecx], ebx
+    mov     eax, [esi+ecx]
+    sal     eax, 2
+    add     ebx, eax
+    add     ecx, 4
+    dec     edx
+    jnz     near hash_assign_loop_x4_rem_sse2
+
+hash_assign_no_rem_sse2:
+    %undef _ps
+    pop     ebp
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret
%else
;**********************************************************************************************************************
@@ -1221,6 +1384,146 @@
POP_XMM
LOAD_6_PARA_POP
ret
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+;
+; x86-64 version. After LOAD_4_PARA: r0 = pFeatureOfBlock, r1 = kiWidth, r2 = kiHeight,
+; r3 = pFeatureValuePointerList.
+;
+; Scans the kiWidth x kiHeight feature map row by row. For the feature value f found at (x, y) it stores the
+; 32-bit word ((y*4) << 16) | (x*4) at *pFeatureValuePointerList[f] and then advances
+; pFeatureValuePointerList[f] by 4 bytes (two uint16_t).
+;
+; Preconditions:
+;   - kiWidth is a multiple of 4: four feature values are consumed per inner iteration and the inner loop
+;     only stops when the remaining column count reaches exactly zero.
+;   - every pFeatureValuePointerList[f] points at writable storage large enough for all occurrences of f
+;     (presumably prepared by InitializeHashforFeature -- confirm against the caller).
+; A non-positive kiWidth or kiHeight returns without touching memory.
+;
+; Register roles:
+;   r0 = read cursor in pFeatureOfBlock      r3 = pFeatureValuePointerList
+;   r1 = kiWidth                             r4 = columns left in the current row
+;   r12 = rows left                          r2 = current feature value (kiHeight is moved to r12 first)
+;   r5 = address of the list slot            r6 = write pointer taken from it
+;   xmm2 = qpel x of the 4 current columns   xmm3 = qpel y of the current row (same value in every lane)
+;   xmm5/xmm7 = initial x lanes / x step     xmm6 = y step, xmm4 = zero
+;-----------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN FillQpelLocationByFeatureValue_sse2
+    %assign push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r2, r2d
+    push    r12
+
+    ; Both loops below are "do ... while (--n != 0)": entering them with n <= 0 would never terminate.
+    test    r1, r1
+    jle     near FILL_QPEL_LOCATION_DONE_SSE2
+    test    r2, r2
+    jle     near FILL_QPEL_LOCATION_DONE_SSE2
+    mov     r12, r2
+
+    movq    xmm7, [mv_x_inc_x4]     ; x_qpel inc
+    movq    xmm6, [mv_y_inc_x4]     ; y_qpel inc
+    movq    xmm5, [mx_x_offset_x4]  ; x_qpel vector
+    pxor    xmm4, xmm4
+    pxor    xmm3, xmm3              ; y_qpel vector
+HASH_HEIGHT_LOOP_SSE2:
+    movdqa  xmm2, xmm5              ; restart x_qpel at the first four columns
+    mov     r4, r1
+HASH_WIDTH_LOOP_SSE2:
+    movq    xmm0, [r0]              ; load 4 feature values (16 bit each)
+    punpcklwd xmm0, xmm4            ; zero-extend them to 4 dwords
+    movdqa  xmm1, xmm2
+    punpcklwd xmm1, xmm3            ; 4 dwords: low word = x_qpel, high word = y_qpel
+%rep 3
+    movd    r2d, xmm0               ; r2 = feature value of this column (upper half cleared)
+    lea     r5, [r3+r2*8]           ; r5 = &pFeatureValuePointerList[feature]
+    mov     r6, [r5]
+    movd    [r6], xmm1              ; store the (x, y) pair
+    prefetcht0 [r6+4]               ; warm the next slot of this list; a hint cannot fault past the buffer end
+    lea     r6, [r6+4]
+    mov     [r5], r6                ; advance the list's write pointer
+    psrldq  xmm1, 4
+    psrldq  xmm0, 4
+%endrep
+    movd    r2d, xmm0               ; fourth column: same steps, no lane shift needed afterwards
+    lea     r5, [r3+r2*8]
+    mov     r6, [r5]
+    movd    [r6], xmm1
+    prefetcht0 [r6+4]
+    lea     r6, [r6+4]
+    mov     [r5], r6
+
+    paddw   xmm2, xmm7
+    lea     r0, [r0+8]
+    sub     r4, 4
+    jnz     near HASH_WIDTH_LOOP_SSE2
+    paddw   xmm3, xmm6
+    dec     r12
+    jnz     near HASH_HEIGHT_LOOP_SSE2
+
+FILL_QPEL_LOCATION_DONE_SSE2:
+    pop     r12
+    POP_XMM
+    LOAD_4_PARA_POP                 ; counterpart of LOAD_4_PARA, as the other routines of this file do before ret
+    ret
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+;                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList );
+;
+; x86-64 version. After LOAD_5_PARA: r0 = pTimesOfFeatureValue, r1 = pBuf (becomes the running position),
+; r2 = kiListSize, r3 = pLocationOfFeature, r4 = pFeatureValuePointerList.
+;
+; For i = 0 .. kiListSize-1: stores the running buffer position into pLocationOfFeature[i] and
+; pFeatureValuePointerList[i], then advances the position by pTimesOfFeatureValue[i] * 4 bytes
+; (two uint16_t per counted occurrence). Entries are handled four at a time, followed by a scalar tail
+; for kiListSize % 4 entries. A group whose four counts are all zero takes a shortcut that writes the
+; same pointer to all eight slots with four 16-byte stores.
+;
+; Preconditions: pTimesOfFeatureValue, pLocationOfFeature and pFeatureValuePointerList are 16-byte aligned
+; (they are accessed with movdqa). kiListSize <= 0 returns without touching memory.
+;
+; Register roles:
+;   r5  = byte offset of entry i in pTimesOfFeatureValue (i * 4); pointer tables are indexed with r5*2 (i * 8)
+;   r2  = complete groups of four left        r12 = kiListSize, later the tail count (kiListSize & 3)
+;   r6  = scratch (zero mask / byte size of the current entry)
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN InitializeHashforFeature_sse2
+    %assign push_num 0
+    LOAD_5_PARA
+    SIGN_EXTENSION r2, r2d
+    push    r12
+    mov     r12, r2
+    xor     r5, r5
+    xor     r6, r6
+    pxor    xmm3, xmm3
+    test    r2, r2
+    jle     near hash_assign_no_rem_sse2    ; empty or negative list: nothing to assign
+    sar     r2, 2                           ; number of complete groups of four
+    jz      near hash_assign_rem_check_sse2 ; fewer than 4 entries: the dec/jnz loop below must not be entered
+hash_assign_loop_x4_sse2:
+    movdqa  xmm0, [r0+r5]
+    pslld   xmm0, 2                 ; 4 counts -> 4 byte sizes
+
+    movdqa  xmm1, xmm0
+    pcmpeqd xmm1, xmm3
+    movmskps r6, xmm1
+    cmp     r6, 0x0f                ; all four sizes zero?
+    jz      near hash_assign_with_copy_sse2
+
+%assign x 0
+%rep 4
+    mov     [r3+r5*2+x], r1
+    mov     [r4+r5*2+x], r1
+    movd    r6d, xmm0               ; zero-extends into r6
+    add     r1, r6                  ; skip the space reserved for this entry
+    psrldq  xmm0, 4
+%assign x x+8
+%endrep
+    jmp     near assign_next_sse2
+
+hash_assign_with_copy_sse2:
+    movq    xmm1, r1
+    pshufd  xmm2, xmm1, 01000100b   ; duplicate the unchanged 64-bit position into both halves
+    movdqa  [r3+r5*2], xmm2
+    movdqa  [r4+r5*2], xmm2
+    movdqa  [r3+r5*2+16], xmm2
+    movdqa  [r4+r5*2+16], xmm2
+
+assign_next_sse2:
+    add     r5, 16
+    dec     r2
+    jnz     near hash_assign_loop_x4_sse2
+
+hash_assign_rem_check_sse2:
+    and     r12, 3
+    jz      near hash_assign_no_rem_sse2
+hash_assign_loop_x4_rem_sse2:
+    mov     [r3+r5*2], r1
+    mov     [r4+r5*2], r1
+    mov     r6d, [r0+r5]
+    sal     r6, 2
+    add     r1, r6
+    add     r5, 4
+    dec     r12
+    jnz     near hash_assign_loop_x4_rem_sse2
+
+hash_assign_no_rem_sse2:
+    pop     r12
+    LOAD_5_PARA_POP                 ; counterpart of LOAD_5_PARA, as the other routines of this file do before ret
+    ret
%endif
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -6,6 +6,7 @@
#include "cpu_core.h"
#include "cpu.h"
#include "macros.h"
+#include "ls_defines.h"
#include "svc_motion_estimate.h"
using namespace WelsEnc;
@@ -77,6 +78,33 @@
}
}
+
+// Reference model for InitializeHashforFeature: entry i of both pointer tables receives the start of the
+// region reserved for feature value i inside pBuf; each region holds 2 * pTimesOfFeatureValue[i] uint16_t.
+void InitializeHashforFeature_ref (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                   uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList) {
+  uint16_t* pCursor = pBuf;
+  int32_t iIdx = 0;
+  while (iIdx < kiListSize) {
+    // both tables start out pointing at the same slot
+    pFeatureValuePointerList[iIdx] = pCursor;
+    pLocationOfFeature[iIdx] = pCursor;
+    // reserve two uint16_t (x and y) per counted occurrence
+    pCursor += (pTimesOfFeatureValue[iIdx] << 1);
+    ++iIdx;
+  }
+}
+// Reference model for FillQpelLocationByFeatureValue: visits the feature map row by row and appends the
+// quarter-pel position of every entry to the list selected by that entry's feature value.
+void FillQpelLocationByFeatureValue_ref (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+    uint16_t** pFeatureValuePointerList) {
+  const uint16_t* pRow = pFeatureOfBlock;
+  for (int32_t iRow = 0; iRow < kiHeight; ++iRow, pRow += kiWidth) {
+    const int32_t iQpelY = iRow << 2;   // 4 quarter-pel units per row
+    for (int32_t iCol = 0; iCol < kiWidth; ++iCol) {
+      uint16_t*& pSlot = pFeatureValuePointerList[pRow[iCol]];
+      // one 32-bit store: y in the upper half, x (in quarter-pel units) in the lower half
+      ST32 (pSlot, ((iQpelY << 16) | (iCol << 2)));
+      pSlot += 2;                        // the pair occupies two uint16_t
+    }
+  }
+}
+
#define GENERATE_SumOfSingleBlock(anchor, method) \
TEST (SVC_ME_FunTest, method) {\
ENFORCE_STACK_ALIGN_1D (uint8_t, uiRefBuf, 16*320, 16);\
@@ -135,6 +163,89 @@
delete[] pFeatureOfBlockBuff1; \
delete[] pFeatureOfBlockBuff2; \
}
+
+// Generates a gtest case that checks `method` against `anchor`: both are run on the occurrence counts of a
+// random kiWidth x kiHeight picture and must produce identical pLocationOfFeature / pFeaturePointValueList
+// tables. Every size in kiListSizes is checked. 65535 is not a multiple of 4, so an implementation that
+// handles four entries per step also has its tail path exercised; all 65536 table entries are compared, so
+// an entry at or beyond the requested size must stay null in both results.
+#define GENERATE_InitializeHashforFeature(anchor, method, kiWidth, kiHeight) \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, (kiWidth*kiHeight), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, (kiWidth*kiHeight)*2, 16) \
+ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, 65536, 16) \
+const int32_t kiListSizes[] = {65536, 65535}; \
+const int32_t kiNumListSizes = sizeof (kiListSizes) / sizeof (kiListSizes[0]); \
+for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { \
+ FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)); \
+ memset(pTimesOfFeatureValue, 0, 65536*sizeof(uint32_t)); \
+ SumOf8x8BlockOfFrame_c (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock,pTimesOfFeatureValue); \
+ for (int32_t n = 0; n < kiNumListSizes; n++) { \
+  const int32_t iActSize = kiListSizes[n]; \
+  /* start from null tables so that untouched entries compare equal */ \
+  memset(pLocationFeature0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pLocationFeature1, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList1, 0, 65536*sizeof(uint16_t*)); \
+  anchor ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0);\
+  method ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature1, pFeaturePointValueList1); \
+  for(int32_t j =0; j<65536; j++) { \
+   EXPECT_EQ (pLocationFeature0[j], pLocationFeature1[j]); \
+   EXPECT_EQ (pFeaturePointValueList0[j], pFeaturePointValueList1[j]); \
+  } \
+ } \
+} \
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff; \
+delete[] pLocationBuff1; \
+delete[] pTimesOfFeatureValueBuff; \
+delete[] pLocationFeature0Buff; \
+delete[] pFeaturePointValueList0Buff; \
+delete[] pLocationFeature1Buff; \
+delete[] pFeaturePointValueList1Buff; \
+}
+
+
+// Generates a gtest case that checks `method` against `anchor`: two identical sets of location lists are
+// prepared with InitializeHashforFeature_c on separate buffers, each routine fills one of them from the same
+// feature map, and the two location buffers must end up equal.
+#define GENERATE_FillQpelLocationByFeatureValue(anchor, method, kiWidth, kiHeight) \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
+const int32_t kiRefStride = ((((kiWidth+15)>>4)<<4)+16); \
+const int32_t kiRefSize = (kiHeight+16)*kiRefStride; \
+const int32_t kiBlockCount = kiWidth*kiHeight; \
+const int32_t kiTableSize = 65536; \
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, kiRefSize, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, kiBlockCount, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, kiBlockCount*2, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation2, pLocationBuff2, kiBlockCount*2, 16) \
+ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, kiTableSize, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, kiTableSize, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, kiTableSize, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, kiTableSize, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, kiTableSize, 16) \
+for (int32_t iRound = 0; iRound < SVC_ME_TEST_NUM; ++iRound) { \
+ /* fresh random picture, cleared counters and tables */ \
+ FillWithRandomData (pRefPicture, kiRefSize); \
+ memset(pTimesOfFeatureValue, 0, kiTableSize*sizeof(uint32_t)); \
+ memset(pLocationFeature0, 0, kiTableSize*sizeof(uint16_t*)); \
+ memset(pFeaturePointValueList0, 0, kiTableSize*sizeof(uint16_t*)); \
+ memset(pLocationFeature1, 0, kiTableSize*sizeof(uint16_t*)); \
+ memset(pFeaturePointValueList1, 0, kiTableSize*sizeof(uint16_t*)); \
+ SumOf8x8BlockOfFrame_c (pRefPicture, kiWidth, kiHeight, kiRefStride, pFeatureOfBlock, pTimesOfFeatureValue); \
+ int32_t iActSize = kiTableSize; \
+ /* identical list layout for both routines, each on its own location buffer */ \
+ InitializeHashforFeature_c (pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0); \
+ InitializeHashforFeature_c (pTimesOfFeatureValue, pLocation2, iActSize, pLocationFeature1, pFeaturePointValueList1); \
+ anchor (pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList0); \
+ method (pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList1); \
+ for (int32_t iPos = 0; iPos < kiBlockCount*2; ++iPos) { \
+  EXPECT_EQ (pLocation1[iPos], pLocation2[iPos]); \
+ } \
+} \
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff; \
+delete[] pLocationBuff1; \
+delete[] pLocationBuff2; \
+delete[] pTimesOfFeatureValueBuff; \
+delete[] pLocationFeature0Buff; \
+delete[] pFeaturePointValueList0Buff; \
+delete[] pLocationFeature1Buff; \
+delete[] pFeaturePointValueList1Buff; \
+}
+
+// Instantiate the hash-table tests: the _c implementations are always compared with the local _ref models,
+// the SSE2 assembly versions only when the build defines X86_ASM. The fill test uses a width of 16, a
+// multiple of 4 as the SSE2 routine's four-columns-per-step inner loop requires.
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 16, 16)
+#ifdef X86_ASM
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 16, 16)
+#endif
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)