shithub: openh264

Download patch

ref: 87107c50a37ee1ad6ba30272bf1823a880f090b5
parent: 136e169c0105e700b1c04e1383f6b4d9041094a5
parent: ef888894046228861825f4acdab09bf6298a6037
author: dongzha <dongzha@cisco.com>
date: Fri Aug 15 06:12:18 EDT 2014

Merge pull request #1275 from zhilwang/x86_hash_scc

Add x86 32/64bit asm code and UT for SCC hash functions

--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -252,6 +252,10 @@
 #ifdef X86_ASM
 extern "C"
 {
+void InitializeHashforFeature_sse2 (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList); // SSE2 version, implemented in codec/encoder/core/x86/sample_sc.asm
+void FillQpelLocationByFeatureValue_sse2 (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+                                           uint16_t** pFeatureValuePointerList); // SSE2 version, implemented in codec/encoder/core/x86/sample_sc.asm
 int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
 int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
 void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -107,6 +107,8 @@
 #if defined (X86_ASM)
     if (uiCpuFlag & WELS_CPU_SSE2) {
         //for feature search
+      pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_sse2;
+      pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_sse2;
       pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
       pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
         //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -31,6 +31,16 @@
 ;*************************************************************************/
 %include "asm_inc.asm"
 
+;***********************************************************************
+; Local Data (Read Only): 4 x 16-bit vectors, each loaded with movq by FillQpelLocationByFeatureValue_sse2
+;***********************************************************************
+SECTION .rodata align=16
+
+ALIGN 16
+mv_x_inc_x4		dw	0x10, 0x10, 0x10, 0x10	; added to the x_qpel vector after each group of 4 columns
+mv_y_inc_x4		dw	0x04, 0x04, 0x04, 0x04	; added to the y_qpel vector after each row
+mx_x_offset_x4	dw	0x00, 0x04, 0x08, 0x0C	; x_qpel vector at the start of every row (columns 0..3, each times 4)
+
 SECTION .text
 %ifdef X86_32
 ;**********************************************************************************************************************
@@ -661,6 +671,159 @@
 %undef		tmp_width
     ret
 
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+; x86-32: for every feature value, stores the dword x_qpel | (y_qpel << 16) at pFeatureValuePointerList[feature] and then advances that pointer by 4 bytes
+WELS_EXTERN FillQpelLocationByFeatureValue_sse2
+    push	esi
+    push	edi
+    push	ebx
+    push	ebp
+; NOTE(review): both loops exit only when their counter hits exactly 0 - kiWidth is presumably a positive multiple of 4 and kiHeight > 0; confirm with callers
+    %define _ps			16				; push size (the 4 registers saved above)
+    %define	_ls			4				; local size (the i_height slot)
+    %define	sum_ref		esp+_ps+_ls+4		; 1st argument: pFeatureOfBlock
+    %define	pos_list	esp+_ps+_ls+16		; 4th argument: pFeatureValuePointerList
+    %define width		esp+_ps+_ls+8		; 2nd argument: kiWidth
+    %define height		esp+_ps+_ls+12		; 3rd argument: kiHeight
+    %define	i_height	esp				; local: rows still to process
+    sub		esp,	_ls
+
+    mov		esi,	[sum_ref]			; esi = read cursor over the 16-bit feature values
+    mov		edi,	[pos_list]			; edi = base of the per-feature pointer table
+    mov		ebp,	[width]				; ebp = kiWidth, reloaded into ecx for every row
+    mov		ebx,	[height]
+    mov		[i_height],	ebx			; row counter is kept on the stack; ebx is reused as scratch below
+
+    movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
+    movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
+    movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector
+    pxor	xmm4,	xmm4				; zero, used to widen words to dwords
+    pxor	xmm3,	xmm3				; y_qpel vector
+HASH_HEIGHT_LOOP_SSE2:
+    movdqa	xmm2,	xmm5	; x_qpel vector, restarted at the first 4 columns of the row
+    mov		ecx,	ebp	; ecx = columns left in this row
+HASH_WIDTH_LOOP_SSE2:
+    movq	xmm0,	[esi]			; load 4 x 16-bit feature values (8 bytes)
+    punpcklwd	xmm0,	xmm4			; zero-extend them to 4 dwords
+    movdqa		xmm1,	xmm2
+    punpcklwd	xmm1,	xmm3			; xmm1 = 4 dwords of x_qpel | (y_qpel << 16)
+%rep	3
+    movd	edx,	xmm0			; edx = feature value of the current lane
+    lea		ebx,	[edi+edx*4]		; ebx = &pFeatureValuePointerList[feature]
+    mov		eax,	[ebx]			; eax = current write position of that list
+    movd	[eax],	xmm1			; store the position dword
+    mov		edx,	[eax+4]	; explicitly load eax+4 (result unused) due to cache miss from vtune observation; NOTE(review): reads 4 bytes past the slot just written
+    lea		eax,	[eax+4]
+    mov		[ebx],	eax			; advance the list pointer by one dword
+    psrldq	xmm1,	4			; bring the next lane down to lane 0
+    psrldq	xmm0,	4
+%endrep
+    movd	edx,	xmm0			; fourth lane: same steps, no shifts needed afterwards
+    lea		ebx,	[edi+edx*4]
+    mov		eax,	[ebx]
+    movd	[eax],	xmm1
+    mov		edx,	[eax+4]	; explicitly load eax+4 (result unused) due to cache miss from vtune observation
+    lea		eax,	[eax+4]
+    mov		[ebx],	eax
+
+    paddw	xmm2,	xmm7			; x_qpel values of the next 4 columns
+    lea		esi,	[esi+8]			; 4 feature values consumed
+    sub		ecx,	4
+    jnz near HASH_WIDTH_LOOP_SSE2
+    paddw	xmm3,	xmm6			; y_qpel value of the next row
+    dec	dword [i_height]
+    jnz	near HASH_HEIGHT_LOOP_SSE2
+
+    add		esp,	_ls
+    %undef	_ps
+    %undef	_ls
+    %undef	sum_ref
+    %undef	pos_list
+    %undef	width
+    %undef	height
+    %undef	i_height
+    pop		ebp
+    pop		ebx
+    pop		edi
+    pop		esi
+    ret
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+;                        uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
+; x86-32: entry i of both pointer tables = pBuf + sum of (pTimesOfFeatureValue[0..i-1] << 2) bytes; all three arrays must be 16-byte aligned (movdqa); kiListSize >= 0
+WELS_EXTERN InitializeHashforFeature_sse2
+    push	ebx
+    push	esi
+    push	edi
+    push	ebp
+    %define	_ps	16	; push size
+    mov		edi,	[esp+_ps+16]	; pLocationOfFeature
+    mov		ebp,	[esp+_ps+20]	; pFeatureValuePointerList
+    mov		esi,	[esp+_ps+4]     ; pTimesOfFeatureValue
+    mov		ebx,	[esp+_ps+8]     ; pBuf, used as the running slice pointer
+    mov		edx,	[esp+_ps+12]	; kiListSize
+    xor		ecx,	ecx		; ecx = byte offset into the three arrays (cleared before sar: xor clobbers the flags)
+    pxor	xmm7,	xmm7
+    sar		edx,	2		; edx = number of whole groups of 4 entries
+    jz		near hash_assign_tail_sse2	; fewer than 4 entries: skip the 4-wide loop, it would otherwise wrap its counter
+hash_assign_loop_x4_sse2:
+    movdqa	xmm0,	[esi+ecx]	; 4 occurrence counts
+    pslld	xmm0,	2		; count -> bytes (two uint16_t per occurrence)
+    movdqa	xmm1,	xmm0
+    pcmpeqd	xmm1,	xmm7
+    movmskps	eax,	xmm1
+    cmp eax, 0x0f			; all four counts zero?
+    je	near hash_assign_with_copy_sse2
+; general case: assign the running pointer to each entry, then advance it by that entry's byte size
+%assign x	0
+%rep 4
+    lea		eax,	[edi+ecx+x]
+    mov		[eax],	ebx
+    lea		eax,	[ebp+ecx+x]
+    mov		[eax],	ebx
+    movd	eax,	xmm0
+    add		ebx,	eax
+    psrldq	xmm0,	4
+%assign	x	x+4
+%endrep
+    jmp near assign_next_sse2
+; fast path: four empty slices share the same start pointer, written with one 16-byte store per table
+hash_assign_with_copy_sse2:
+    movd	xmm1,	ebx
+    pshufd	xmm2,	xmm1,	0
+    movdqa	[edi+ecx], xmm2
+    movdqa	[ebp+ecx], xmm2
+
+assign_next_sse2:
+    add		ecx,	16
+    dec		edx
+    jnz		near hash_assign_loop_x4_sse2
+hash_assign_tail_sse2:
+    mov		edx,	[esp+_ps+12]	; kiListSize
+    and		edx,	3		; edx = entries left over after the groups of 4
+    jz		near hash_assign_no_rem_sse2
+hash_assign_loop_x4_rem_sse2:
+    lea		eax,	[edi+ecx]
+    mov		[eax],	ebx
+    lea		eax,	[ebp+ecx]
+    mov		[eax],	ebx
+    mov		eax,	[esi+ecx]
+    sal		eax,	2
+    add		ebx,	eax
+    add		ecx,	4
+    dec		edx
+    jnz		near hash_assign_loop_x4_rem_sse2
+
+hash_assign_no_rem_sse2:
+    %undef	_ps
+    pop		ebp
+    pop		edi
+    pop		esi
+    pop		ebx
+    ret
 %else
 
 ;**********************************************************************************************************************
@@ -1220,6 +1383,135 @@
     pop		r12
     POP_XMM
     LOAD_6_PARA_POP
+    ret
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+; x86-64: for every feature value, stores the dword x_qpel | (y_qpel << 16) at pFeatureValuePointerList[feature] and then advances that pointer by 4 bytes
+WELS_EXTERN FillQpelLocationByFeatureValue_sse2
+    %assign  push_num 0
+    LOAD_4_PARA			; presumably leaves the four arguments in r0..r3 in declaration order (macro lives in asm_inc.asm) - confirm
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    push r12
+    push r13
+    mov     r12,    r2		; r12 = rows left; r2 is reused as scratch below
+; NOTE(review): both loops exit only when their counter hits exactly 0 - kiWidth is presumably a positive multiple of 4 and kiHeight > 0; confirm with callers
+    movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
+    movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
+    movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector
+    pxor	xmm4,	xmm4				; zero, used to widen words to dwords
+    pxor	xmm3,	xmm3				; y_qpel vector
+HASH_HEIGHT_LOOP_SSE2:
+    movdqa	xmm2,	xmm5	; x_qpel vector, restarted at the first 4 columns of the row
+    mov		r4,	r1	; r4 = columns left in this row
+HASH_WIDTH_LOOP_SSE2:
+    movq	xmm0,	[r0]			; load 4 x 16-bit feature values (8 bytes)
+    punpcklwd	xmm0,	xmm4			; zero-extend them to 4 dwords
+    movdqa		xmm1,	xmm2
+    punpcklwd	xmm1,	xmm3			; xmm1 = 4 dwords of x_qpel | (y_qpel << 16)
+%rep	3
+    movd	r2d,	xmm0        ; r2 = feature value of the current lane (edx in the 32-bit version)
+    lea		r5,     [r3+r2*8]   ; r5 = &pFeatureValuePointerList[feature], 8-byte pointers (ebx in the 32-bit version)
+    mov		r6,     [r5]        ; r6 = current write position of that list (eax in the 32-bit version)
+    movd	[r6],	xmm1        ; store the position dword
+    mov		r13,    [r6+4]	; explicitly load r6+4 (result unused) due to cache miss from vtune observation; NOTE(review): 8-byte read starting past the slot just written
+    lea		r6,     [r6+4]
+    mov		[r5],	r6          ; advance the list pointer by one dword
+    psrldq	xmm1,	4           ; bring the next lane down to lane 0
+    psrldq	xmm0,	4
+%endrep
+    movd	r2d,	xmm0        ; fourth lane: same steps, no shifts needed afterwards
+    lea		r5,     [r3+r2*8]   ; r5 = &pFeatureValuePointerList[feature]
+    mov		r6,     [r5]        ; r6 = current write position of that list
+    movd	[r6],	xmm1
+    mov		r13,    [r6+4]	; explicitly load r6+4 (result unused) due to cache miss from vtune observation
+    lea		r6,     [r6+4]
+    mov		[r5],	r6
+
+    paddw	xmm2,	xmm7        ; x_qpel values of the next 4 columns
+    lea		r0,     [r0+8]      ; 4 feature values consumed
+    sub		r4,     4
+    jnz near HASH_WIDTH_LOOP_SSE2
+    paddw	xmm3,	xmm6        ; y_qpel value of the next row
+    dec	r12
+    jnz	near HASH_HEIGHT_LOOP_SSE2
+; NOTE(review): no LOAD_4_PARA_POP before ret, unlike the LOAD_6_PARA_POP that ends the preceding function - confirm it expands to nothing on 64-bit targets
+    pop		r13
+    pop		r12
+    POP_XMM
+    ret
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+;                                 uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+; x86-64: entry i of both pointer tables = pBuf + sum of (pTimesOfFeatureValue[0..i-1] << 2) bytes; all three arrays must be 16-byte aligned (movdqa); kiListSize >= 0
+; registers as used below: r0 = pTimesOfFeatureValue, r1 = running slice pointer (starts at pBuf), r2 = kiListSize, r3 = pLocationOfFeature, r4 = pFeatureValuePointerList
+WELS_EXTERN InitializeHashforFeature_sse2
+    %assign  push_num 0
+    LOAD_5_PARA
+    SIGN_EXTENSION  r2, r2d
+    push r12
+    push r13
+    mov     r12,    r2		; keep kiListSize for the remainder pass
+    xor     r5,     r5		; r5 = byte offset into pTimesOfFeatureValue; the pointer tables use r5*2 (ecx in the 32-bit version)
+    xor     r6,     r6
+    pxor	xmm3,	xmm3
+    sar		r2,     2		; r2 = number of whole groups of 4 entries (done last so its flags reach the jz)
+    jz		near hash_assign_tail_sse2	; fewer than 4 entries: skip the 4-wide loop, it would otherwise wrap its counter
+hash_assign_loop_x4_sse2:
+    movdqa	xmm0,	[r0+r5]		; 4 occurrence counts
+    pslld	xmm0,	2		; count -> bytes (two uint16_t per occurrence)
+    movdqa	xmm1,	xmm0
+    pcmpeqd	xmm1,	xmm3
+    movmskps	r6,	xmm1
+    cmp     r6,     0x0f		; all four counts zero?
+    jz	near hash_assign_with_copy_sse2
+; general case: assign the running pointer to each entry, then advance it by that entry's byte size
+%assign x	0
+%rep 4
+    lea		r13,	[r3+r5*2+x]
+    mov		[r13],	r1
+    lea		r13,	[r4+r5*2+x]
+    mov		[r13],	r1
+    movd	r6d,	xmm0
+    add		r1,     r6
+    psrldq	xmm0,	4
+%assign	x	x+8
+%endrep
+    jmp near assign_next_sse2
+; fast path: four empty slices share the same start pointer, written as 2 x 16 bytes (four 8-byte pointers) per table
+hash_assign_with_copy_sse2:
+    movq	xmm1,	r1
+    pshufd	xmm2,	xmm1,	01000100b
+    movdqa	[r3+r5*2], xmm2
+    movdqa	[r4+r5*2], xmm2
+    movdqa	[r3+r5*2+16], xmm2
+    movdqa	[r4+r5*2+16], xmm2
+
+assign_next_sse2:
+    add		r5,	16
+    dec		r2
+    jnz		near hash_assign_loop_x4_sse2
+hash_assign_tail_sse2:
+    and		r12,	3		; r12 = entries left over after the groups of 4
+    jz		near hash_assign_no_rem_sse2
+hash_assign_loop_x4_rem_sse2:
+    lea		r13,	[r3+r5*2]
+    mov		[r13],	r1
+    lea		r13,	[r4+r5*2]
+    mov		[r13],	r1
+    mov		r6d,	[r0+r5]
+    sal		r6,     2
+    add		r1,     r6
+    add		r5,     4
+    dec		r12
+    jnz		near hash_assign_loop_x4_rem_sse2
+
+hash_assign_no_rem_sse2:
+    pop     r13
+    pop	    r12
     ret
 
 %endif
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -6,6 +6,7 @@
 #include "cpu_core.h"
 #include "cpu.h"
 #include "macros.h"
+#include "ls_defines.h"
 #include "svc_motion_estimate.h"
 
 using namespace WelsEnc;
@@ -77,6 +78,33 @@
   }
 }
 
+
+void InitializeHashforFeature_ref (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                 uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList) {
+  uint16_t* pCursor = pBuf;  // reference model: start of the slice handed to the next feature value
+  for (int32_t iFeature = 0; iFeature < kiListSize; iFeature++) {
+    uint16_t* const pSlice = pCursor;
+    pFeatureValuePointerList[iFeature] = pSlice;
+    pLocationOfFeature[iFeature] = pSlice;
+    pCursor = pSlice + (pTimesOfFeatureValue[iFeature] << 1);  // two uint16_t per occurrence
+  }
+}
+void FillQpelLocationByFeatureValue_ref (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+                                       uint16_t** pFeatureValuePointerList) {
+  // Reference model: walk the feature map row by row and append each position,
+  // packed as (y * 4) << 16 | (x * 4), to the list selected by that position's feature value.
+  uint16_t* pRow = pFeatureOfBlock;
+  for (int32_t iRow = 0; iRow < kiHeight; ++iRow, pRow += kiWidth) {
+    const int32_t iQpelY = iRow * 4;
+    for (int32_t iCol = 0; iCol < kiWidth; ++iCol) {
+      const uint16_t uiKey = pRow[iCol];
+      uint16_t* pDst = pFeatureValuePointerList[uiKey];
+      ST32 (pDst, ((iQpelY << 16) | (iCol << 2)));
+      pFeatureValuePointerList[uiKey] = pDst + 2;
+    }
+  }
+}
+
 #define GENERATE_SumOfSingleBlock(anchor, method) \
 TEST (SVC_ME_FunTest, method) {\
   ENFORCE_STACK_ALIGN_1D (uint8_t,  uiRefBuf,   16*320, 16);\
@@ -135,6 +163,93 @@
 delete[] pFeatureOfBlockBuff1; \
 delete[] pFeatureOfBlockBuff2; \
 }
+
+#define GENERATE_InitializeHashforFeature(anchor, method, kiWidth, kiHeight) /* emits one TEST: anchor and method must build identical pointer tables */ \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) { /* test name: <method>_<kiWidth>x<kiHeight> */ \
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) /* (kiHeight+16) rows of (kiWidth rounded up to 16, plus 16) bytes */ \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, (kiWidth*kiHeight), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, (kiWidth*kiHeight)*2, 16) /* the pBuf passed to both implementations */ \
+ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, 65536, 16) /* tables with suffix 0 receive the anchor output */ \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, 65536, 16) /* tables with suffix 1 receive the method output */ \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, 65536, 16) \
+for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { /* each round uses fresh random picture data */ \
+  FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)); \
+  memset(pTimesOfFeatureValue, 0, 65536*sizeof(uint32_t)); \
+  memset(pLocationFeature0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pLocationFeature1, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList1, 0, 65536*sizeof(uint16_t*)); \
+  SumOf8x8BlockOfFrame_c (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock,pTimesOfFeatureValue); /* presumably fills pFeatureOfBlock and the per-feature counts; defined outside this patch */ \
+  int32_t iActSize = 65536;\
+  anchor ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0); /* expected result */ \
+  method ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature1, pFeaturePointValueList1); /* implementation under test, same counts and same pBuf */ \
+  for(int32_t j =0; j<65536; j++) { /* both tables must match entry by entry */ \
+    EXPECT_EQ (pLocationFeature0[j], pLocationFeature1[j]); \
+    EXPECT_EQ (pFeaturePointValueList0[j], pFeaturePointValueList1[j]); \
+  } \
+} \
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff; \
+delete[] pLocationBuff1; \
+delete[] pTimesOfFeatureValueBuff; \
+delete[] pLocationFeature0Buff; \
+delete[] pFeaturePointValueList0Buff; \
+delete[] pLocationFeature1Buff; \
+delete[] pFeaturePointValueList1Buff; \
+}
+
+
+#define GENERATE_FillQpelLocationByFeatureValue(anchor, method, kiWidth, kiHeight) /* emits one TEST: anchor and method must write identical location buffers */ \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) { /* test name: <method>_<kiWidth>x<kiHeight> */ \
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) /* (kiHeight+16) rows of (kiWidth rounded up to 16, plus 16) bytes */ \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, (kiWidth*kiHeight), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, (kiWidth*kiHeight)*2, 16) /* buffer written through the anchor's pointer list */ \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation2, pLocationBuff2, (kiWidth*kiHeight)*2, 16) /* buffer written through the method's pointer list */ \
+ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, 65536, 16) \
+for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { /* each round uses fresh random picture data */ \
+  FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)); \
+  memset(pTimesOfFeatureValue, 0, 65536*sizeof(uint32_t)); \
+  memset(pLocationFeature0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pLocationFeature1, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList1, 0, 65536*sizeof(uint16_t*)); \
+  SumOf8x8BlockOfFrame_c (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock,pTimesOfFeatureValue); /* presumably fills pFeatureOfBlock and the per-feature counts; defined outside this patch */ \
+  int32_t iActSize = 65536; \
+  InitializeHashforFeature_c ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0); /* pointer list 0 targets pLocation1 */ \
+  InitializeHashforFeature_c( pTimesOfFeatureValue, pLocation2, iActSize, pLocationFeature1, pFeaturePointValueList1); /* pointer list 1 targets pLocation2, same layout */ \
+  anchor(pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList0); /* expected result */ \
+  method(pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList1); /* implementation under test */ \
+  for(int32_t j =0; j<kiWidth*kiHeight*2; j++) { /* compare every uint16_t of the two location buffers */ \
+    EXPECT_EQ (pLocation1[j], pLocation2[j]); \
+  } \
+} \
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff; \
+delete[] pLocationBuff1; \
+delete[] pLocationBuff2; \
+delete[] pTimesOfFeatureValueBuff; \
+delete[] pLocationFeature0Buff; \
+delete[] pFeaturePointValueList0Buff; \
+delete[] pLocationFeature1Buff; \
+delete[] pFeaturePointValueList1Buff; \
+}
+
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 10, 10) // C version against the local reference, 10x10
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 16, 16) // C version against the local reference, 16x16
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 640, 320) // C version against the local reference, 640x320
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 640, 320) // C version against the local reference, 640x320
+#ifdef X86_ASM // the same four cases for the SSE2 assembly versions
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 10, 10) // SSE2 version against the local reference, 10x10
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 16, 16) // SSE2 version against the local reference, 16x16
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 640, 320) // SSE2 version against the local reference, 640x320
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 640, 320) // SSE2 version against the local reference, 640x320
+#endif // X86_ASM
 
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)