ref: c17a58efdfa8c03e3a1ae8e7f78483d48700499c
dir: /codec/encoder/core/x86/sample_sc.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16

ALIGN 16
; Constants used by FillQpelLocationByFeatureValue_sse2 (four 16-bit lanes each).
mv_x_inc_x4     dw 0x10, 0x10, 0x10, 0x10   ; added to the x vector after every 4 samples
mv_y_inc_x4     dw 0x04, 0x04, 0x04, 0x04   ; added to the y vector after every row
mx_x_offset_x4  dw 0x00, 0x04, 0x08, 0x0C   ; initial x vector of every row

SECTION .text

%ifdef X86_32

;**********************************************************************************************************************
;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; For every (x, y) with 0 <= x < kiWidth, 0 <= y < kiHeight this stores the sum of the 8x8 pixel
; block whose top-left corner is (x, y) into pFeatureOfBlock[y*kiWidth + x] and increments
; pTimesOfFeatureValue[sum].
;
; Row 0 is computed directly (8 rows of 8 bytes per position); every following row is derived
; from the row above:  sum(x, y+1) = sum(x, y) - rowsum(x, y) + rowsum(x, y+8).
;
; Register roles: esi = ref cursor, edi = output cursor, edx = histogram base, ebx = linesize.
; Note: the row loop below is a do-while that runs after decrementing kiHeight, so it executes
;       at least once; the code as written therefore expects kiHeight > 1.
;*********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
%define pushsize        16
%define localsize       4
%define ref             esp + pushsize + localsize + 4
%define sum_ref         esp + pushsize + localsize + 20
%define times_of_sum    esp + pushsize + localsize + 24
%define width           esp + pushsize + localsize + 8
%define height          esp + pushsize + localsize + 12
%define linesize        esp + pushsize + localsize + 16
%define tmp_width       esp + 0
    push ebx
    push ebp
    push esi
    push edi
    sub esp, localsize

    pxor xmm0, xmm0                     ; xmm0 = 0, psadbw against it yields plain byte sums
    mov esi, [ref]
    mov edi, [sum_ref]
    mov edx, [times_of_sum]
    mov ebx, [linesize]
    mov eax, [width]
    lea ecx, [ebx+ebx*2]                ; 3*linesize

    mov [tmp_width], eax
    lea ebp, [esi+ebx*4]                ; ebp = ref + 4*linesize (rows 4..7 of the block)
FIRST_ROW:
    ; rows 0..3
    movq xmm1, [esi]
    movq xmm2, [esi+ebx]
    movq xmm3, [esi+ebx*2]
    movq xmm4, [esi+ecx]

    shufps xmm1, xmm2, 01000100b        ; pack two 8-byte rows into one register
    shufps xmm3, xmm4, 01000100b
    psadbw xmm1, xmm0
    psadbw xmm3, xmm0
    paddd xmm1, xmm3

    ; rows 4..7
    movq xmm2, [ebp]
    movq xmm3, [ebp+ebx]
    movq xmm4, [ebp+ebx*2]
    movq xmm5, [ebp+ecx]

    shufps xmm2, xmm3, 01000100b
    shufps xmm4, xmm5, 01000100b
    psadbw xmm2, xmm0
    psadbw xmm4, xmm0
    paddd xmm2, xmm4

    paddd xmm1, xmm2
    pshufd xmm2, xmm1, 00001110b        ; bring the high-qword partial sum down
    paddd xmm1, xmm2
    movd eax, xmm1                      ; eax = 8x8 sum
    mov [edi], ax
    inc dword [edx+eax*4]               ; histogram of sums

    inc esi
    inc ebp
    add edi, 2

    dec dword [tmp_width]
    jg FIRST_ROW

    mov esi, [ref]
    mov edi, [sum_ref]
    mov ebp, [width]
    dec dword [height]
HEIGHT_LOOP:
    mov [tmp_width], ebp
WIDTH_LOOP:
    movq xmm1, [esi+ebx*8]              ; row entering the window
    movq xmm2, [esi]                    ; row leaving the window
    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    psubd xmm1, xmm2
    movd eax, xmm1                      ; eax = rowsum(y+8) - rowsum(y)
    ; Zero-extend the previous sum. A 16-bit load into cx would keep bits 16..31 of
    ; 3*linesize in ecx and corrupt the histogram index for large or negative strides.
    movzx ecx, word [edi]
    add eax, ecx
    mov [edi+ebp*2], ax                 ; store one output row (width*2 bytes) further down
    inc dword [edx+eax*4]

    inc esi
    add edi, 2

    dec dword [tmp_width]
    jg WIDTH_LOOP

    add esi, ebx
    sub esi, ebp                        ; esi += linesize - width: start of the next picture row

    dec dword [height]
    jg HEIGHT_LOOP

    add esp, localsize
    pop edi
    pop esi
    pop ebp
    pop ebx
%undef pushsize
%undef localsize
%undef ref
%undef sum_ref
%undef times_of_sum
%undef width
%undef height
%undef linesize
%undef tmp_width
    ret


; COUNT_SUM xmm_reg, tmp_reg, shift
;   Increments the histogram entry [edx + 4*dword0(xmm_reg)], using tmp_reg as scratch.
;   When shift == 1, xmm_reg is then shifted right by 4 bytes to expose the next dword.
%macro COUNT_SUM 3
%define xmm_reg %1
%define tmp_reg %2
    movd tmp_reg, xmm_reg
    inc dword [edx+tmp_reg*4]
%if %3 == 1
    psrldq xmm_reg, 4
%endif
%endmacro


;-----------------------------------------------------------------------------
; requires:  width % 8 == 0 && height > 1
;-----------------------------------------------------------------------------
;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------
; Same result as SumOf8x8BlockOfFrame_sse2, but eight horizontal positions are produced per
; iteration: mpsadbw against zero with offsets 000b/100b gives the sums of bytes [i..i+3] and
; [i+4..i+7] for i = 0..7, i.e. eight 8-byte row sums from one 16-byte load.
; read extra (16 - (width % 8) ) mod 16 bytes of every line
; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
WELS_EXTERN SumOf8x8BlockOfFrame_sse4
%define pushsize        16
%define localsize       4
%define ref             esp + pushsize + localsize + 4
%define sum_ref         esp + pushsize + localsize + 20
%define times_of_sum    esp + pushsize + localsize + 24
%define width           esp + pushsize + localsize + 8
%define height          esp + pushsize + localsize + 12
%define linesize        esp + pushsize + localsize + 16
%define tmp_width       esp + 0
    push ebx
    push ebp
    push esi
    push edi
    sub esp, localsize

    pxor xmm0, xmm0
    mov esi, [ref]
    mov edi, [sum_ref]
    mov edx, [times_of_sum]
    mov ebx, [linesize]
    mov eax, [width]
    lea ecx, [ebx+ebx*2]                ; 3*linesize

    mov [tmp_width], eax
    lea ebp, [esi+ebx*4]
FIRST_ROW_SSE4:
    movdqu xmm1, [esi]
    movdqu xmm3, [esi+ebx]
    movdqu xmm5, [esi+ebx*2]
    movdqu xmm7, [esi+ecx]

    movdqa xmm2, xmm1
    mpsadbw xmm1, xmm0, 000b
    mpsadbw xmm2, xmm0, 100b
    paddw xmm1, xmm2                    ; 8 sums of line1
    movdqa xmm4, xmm3
    mpsadbw xmm3, xmm0, 000b
    mpsadbw xmm4, xmm0, 100b
    paddw xmm3, xmm4                    ; 8 sums of line2
    movdqa xmm2, xmm5
    mpsadbw xmm5, xmm0, 000b
    mpsadbw xmm2, xmm0, 100b
    paddw xmm5, xmm2                    ; 8 sums of line3
    movdqa xmm4, xmm7
    mpsadbw xmm7, xmm0, 000b
    mpsadbw xmm4, xmm0, 100b
    paddw xmm7, xmm4                    ; 8 sums of line4
    paddw xmm1, xmm3
    paddw xmm5, xmm7
    paddw xmm1, xmm5                    ; sum the upper 4 lines first

    movdqu xmm2, [ebp]
    movdqu xmm3, [ebp+ebx]
    movdqu xmm4, [ebp+ebx*2]
    movdqu xmm5, [ebp+ecx]

    movdqa xmm6, xmm2
    mpsadbw xmm2, xmm0, 000b
    mpsadbw xmm6, xmm0, 100b
    paddw xmm2, xmm6
    movdqa xmm7, xmm3
    mpsadbw xmm3, xmm0, 000b
    mpsadbw xmm7, xmm0, 100b
    paddw xmm3, xmm7
    movdqa xmm6, xmm4
    mpsadbw xmm4, xmm0, 000b
    mpsadbw xmm6, xmm0, 100b
    paddw xmm4, xmm6
    movdqa xmm7, xmm5
    mpsadbw xmm5, xmm0, 000b
    mpsadbw xmm7, xmm0, 100b
    paddw xmm5, xmm7
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm1, xmm2
    paddw xmm1, xmm4                    ; sum of lines 1- 8

    movdqu [edi], xmm1                  ; store 8 block sums

    movdqa xmm2, xmm1
    punpcklwd xmm1, xmm0                ; widen sums 0..3 to dwords
    punpckhwd xmm2, xmm0                ; widen sums 4..7 to dwords

    COUNT_SUM xmm1, eax, 1
    COUNT_SUM xmm1, eax, 1
    COUNT_SUM xmm1, eax, 1
    COUNT_SUM xmm1, eax, 0
    COUNT_SUM xmm2, eax, 1
    COUNT_SUM xmm2, eax, 1
    COUNT_SUM xmm2, eax, 1
    COUNT_SUM xmm2, eax, 0

    lea esi, [esi+8]
    lea ebp, [ebp+8]
    lea edi, [edi+16]                   ; element size is 2

    sub dword [tmp_width], 8
    jg near FIRST_ROW_SSE4

    mov esi, [ref]
    mov edi, [sum_ref]
    mov ebp, [width]
    dec dword [height]
HEIGHT_LOOP_SSE4:
    mov ecx, ebp                        ; ecx = remaining positions in this row
WIDTH_LOOP_SSE4:
    movdqu xmm1, [esi+ebx*8]            ; row entering the window
    movdqu xmm2, [esi]                  ; row leaving the window
    movdqu xmm7, [edi]                  ; 8 sums of the row above

    movdqa xmm3, xmm1
    mpsadbw xmm1, xmm0, 000b
    mpsadbw xmm3, xmm0, 100b
    paddw xmm1, xmm3

    movdqa xmm4, xmm2
    mpsadbw xmm2, xmm0, 000b
    mpsadbw xmm4, xmm0, 100b
    paddw xmm2, xmm4

    paddw xmm7, xmm1
    psubw xmm7, xmm2

    movdqu [edi+ebp*2], xmm7

    movdqa xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

    COUNT_SUM xmm7, eax, 1
    COUNT_SUM xmm7, eax, 1
    COUNT_SUM xmm7, eax, 1
    COUNT_SUM xmm7, eax, 0
    COUNT_SUM xmm6, eax, 1
    COUNT_SUM xmm6, eax, 1
    COUNT_SUM xmm6, eax, 1
    COUNT_SUM xmm6, eax, 0

    lea esi, [esi+8]
    lea edi, [edi+16]

    sub ecx, 8
    jg near WIDTH_LOOP_SSE4

    lea esi, [esi+ebx]
    sub esi, ebp                        ; esi += linesize - width

    dec dword [height]
    jg near HEIGHT_LOOP_SSE4

    add esp, localsize
    pop edi
    pop esi
    pop ebp
    pop ebx
%undef pushsize
%undef localsize
%undef ref
%undef sum_ref
%undef times_of_sum
%undef width
%undef height
%undef linesize
%undef tmp_width
    ret


;****************************************************************************************************************************************************
;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; 16x16 counterpart of SumOf8x8BlockOfFrame_sse2: row 0 sums sixteen 16-byte rows per position,
; later rows add the row 16 lines below and subtract the row leaving the window.
; The row loop is a do-while entered after decrementing kiHeight (expects kiHeight > 1).
;****************************************************************************************************************************************************
WELS_EXTERN SumOf16x16BlockOfFrame_sse2
%define pushsize        16
%define localsize       4
%define ref             esp + pushsize + localsize + 4
%define sum_ref         esp + pushsize + localsize + 20
%define times_of_sum    esp + pushsize + localsize + 24
%define width           esp + pushsize + localsize + 8
%define height          esp + pushsize + localsize + 12
%define linesize        esp + pushsize + localsize + 16
%define tmp_width       esp
    push ebx
    push ebp
    push esi
    push edi
    sub esp, localsize

    pxor xmm0, xmm0
    mov esi, [ref]
    mov edi, [sum_ref]
    mov edx, [times_of_sum]
    mov ebx, [linesize]
    mov eax, [width]

    lea ecx, [ebx+ebx*2]                ; 3*linesize
    mov [tmp_width], eax
FIRST_ROW_X16H:
    ; rows 0..3
    movdqu xmm1, [esi]
    movdqu xmm2, [esi+ebx]
    movdqu xmm3, [esi+ebx*2]
    movdqu xmm4, [esi+ecx]

    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    paddw xmm1, xmm2
    paddw xmm3, xmm4
    paddw xmm1, xmm3

    ; rows 4..7
    lea ebp, [esi+ebx*4]
    movdqu xmm2, [ebp]
    movdqu xmm3, [ebp+ebx]
    movdqu xmm4, [ebp+ebx*2]
    movdqu xmm5, [ebp+ecx]

    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    psadbw xmm5, xmm0
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm2, xmm4

    paddw xmm1, xmm2

    ; rows 8..11
    lea ebp, [ebp+ebx*4]
    movdqu xmm2, [ebp]
    movdqu xmm3, [ebp+ebx]
    movdqu xmm4, [ebp+ebx*2]
    movdqu xmm5, [ebp+ecx]

    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    psadbw xmm5, xmm0
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm2, xmm4

    paddw xmm1, xmm2

    ; rows 12..15
    lea ebp, [ebp+ebx*4]
    movdqu xmm2, [ebp]
    movdqu xmm3, [ebp+ebx]
    movdqu xmm4, [ebp+ebx*2]
    movdqu xmm5, [ebp+ecx]

    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    psadbw xmm5, xmm0
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm2, xmm4

    paddw xmm1, xmm2

    ; fold the two psadbw halves (words 0 and 4) into word 0
    movdqa xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw xmm1, xmm2
    movd eax, xmm1
    mov [edi], ax
    inc dword [edx+eax*4]

    inc esi
    lea edi, [edi+2]

    dec dword [tmp_width]
    jg near FIRST_ROW_X16H

    mov esi, [ref]
    mov edi, [sum_ref]
    mov ebp, [width]
    dec dword [height]

    mov ecx, ebx
    sal ecx, 4                          ; ecx = 16*linesize: offset of the row entering the window
HEIGHT_LOOP_X16:
    mov [tmp_width], ebp
WIDTH_LOOP_X16:
    movdqu xmm1, [esi+ecx]
    movdqu xmm2, [esi]
    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    psubw xmm1, xmm2
    movdqa xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw xmm1, xmm2
    movd eax, xmm1                      ; low word = row delta, upper word of eax is 0
    add ax, word [edi]                  ; 16-bit add keeps eax a valid zero-extended index
    mov [edi+ebp*2], ax
    inc dword [edx+eax*4]

    inc esi
    add edi, 2

    dec dword [tmp_width]
    jg near WIDTH_LOOP_X16

    add esi, ebx
    sub esi, ebp

    dec dword [height]
    jg near HEIGHT_LOOP_X16

    add esp, localsize
    pop edi
    pop esi
    pop ebp
    pop ebx
%undef pushsize
%undef localsize
%undef ref
%undef sum_ref
%undef times_of_sum
%undef width
%undef height
%undef linesize
%undef tmp_width
    ret


; requires:  width % 16 == 0 && height > 1
;-----------------------------------------------------------------------------------------------------------------------------
;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------------------------------------------------------

; SUM_LINE_X16_SSE41 ref, dst0, dst1, tmp0, tmp1
;   dst0 = eight 16-bit sums of the 16 bytes starting at ref+i, i = 0..7 (xmm0 must be zero).
;   Loads 24 bytes starting at ref (two unaligned 16-byte loads at ref and ref+8).
;   Clobbers dst1, tmp0, tmp1.
%macro SUM_LINE_X16_SSE41 5             ; ref, dst0, dst1, tmp0, tmp1
    movdqu %2, [%1]
    movdqu %3, [%1+8h]
    movdqa %4, %2
    movdqa %5, %3

    mpsadbw %2, xmm0, 0                 ; 000 B
    mpsadbw %4, xmm0, 5                 ; 101 B
    mpsadbw %3, xmm0, 2                 ; 010 B
    mpsadbw %5, xmm0, 7                 ; 111 B
    paddw %2, %4
    paddw %3, %5
    paddw %2, %3                        ; accumulate cost
%endmacro                               ; end of SUM_LINE_X16_SSE41

; Eight positions per iteration; sum_ref is accessed with movdqa and must be 16-byte aligned.
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
%define pushsize        16
%define localsize       4
%define ref             esp + pushsize + localsize + 4
%define sum_ref         esp + pushsize + localsize + 20
%define times_of_sum    esp + pushsize + localsize + 24
%define width           esp + pushsize + localsize + 8
%define height          esp + pushsize + localsize + 12
%define linesize        esp + pushsize + localsize + 16
%define tmp_width       esp
    push ebx
    push ebp
    push esi
    push edi
    sub esp, localsize

    pxor xmm0, xmm0
    mov esi, [ref]
    mov edi, [sum_ref]
    mov edx, [times_of_sum]
    mov ebx, [linesize]
    mov eax, [width]

    lea ecx, [ebx+ebx*2]                ; 3*linesize
    mov [tmp_width], eax
FIRST_ROW_X16_SSE4:
    SUM_LINE_X16_SSE41 esi, xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41 esi+ebx, xmm2, xmm3, xmm4, xmm5
    SUM_LINE_X16_SSE41 esi+ebx*2, xmm3, xmm4, xmm5, xmm6
    SUM_LINE_X16_SSE41 esi+ecx, xmm4, xmm5, xmm6, xmm7
    paddw xmm1, xmm2
    paddw xmm3, xmm4
    paddw xmm1, xmm3

    lea ebp, [esi+ebx*4]
    SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2

    lea ebp, [ebp+ebx*4]
    SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2

    lea ebp, [ebp+ebx*4]
    SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2

    movdqa [edi], xmm1

    movdqa xmm2, xmm1
    punpcklwd xmm1, xmm0
    punpckhwd xmm2, xmm0

    COUNT_SUM xmm1, eax, 1
    COUNT_SUM xmm1, eax, 1
    COUNT_SUM xmm1, eax, 1
    COUNT_SUM xmm1, eax, 0
    COUNT_SUM xmm2, eax, 1
    COUNT_SUM xmm2, eax, 1
    COUNT_SUM xmm2, eax, 1
    COUNT_SUM xmm2, eax, 0

    lea esi, [esi+8]
    lea edi, [edi+16]                   ; element size is 2

    sub dword [tmp_width], 8
    jg near FIRST_ROW_X16_SSE4

    mov esi, [ref]
    mov edi, [sum_ref]
    mov ebp, [width]
    dec dword [height]

    mov ecx, ebx
    sal ecx, 4                          ; ecx = 16*linesize: offset of the row entering the window
HEIGHT_LOOP_X16_SSE4:
    mov [tmp_width], ebp
WIDTH_LOOP_X16_SSE4:
    movdqa xmm7, [edi]                  ; 8 sums of the row above
    SUM_LINE_X16_SSE41 esi+ecx, xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41 esi, xmm2, xmm3, xmm4, xmm5

    paddw xmm7, xmm1
    psubw xmm7, xmm2

    movdqa [edi+ebp*2], xmm7

    movdqa xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

    COUNT_SUM xmm7, eax, 1
    COUNT_SUM xmm7, eax, 1
    COUNT_SUM xmm7, eax, 1
    COUNT_SUM xmm7, eax, 0
    COUNT_SUM xmm6, eax, 1
    COUNT_SUM xmm6, eax, 1
    COUNT_SUM xmm6, eax, 1
    COUNT_SUM xmm6, eax, 0

    lea esi, [esi+8]
    lea edi, [edi+16]

    sub dword [tmp_width], 8
    jg near WIDTH_LOOP_X16_SSE4

    add esi, ebx
    sub esi, ebp

    dec dword [height]
    jg near HEIGHT_LOOP_X16_SSE4

    add esp, localsize
    pop edi
    pop esi
    pop ebp
    pop ebx
%undef pushsize
%undef localsize
%undef ref
%undef sum_ref
%undef times_of_sum
%undef width
%undef height
%undef linesize
%undef tmp_width
    ret


;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;
; For every feature value v read from pFeatureOfBlock (row by row, four per iteration):
;     p = pFeatureValuePointerList[v];  *(uint32_t*)p = x | (y << 16);  pFeatureValuePointerList[v] = p + 4 bytes;
; where x starts at 0 on each row and advances by 4 per sample, and y advances by 4 per row.
; The inner loop ends on "sub ecx, 4 / jnz", so kiWidth must be a non-zero multiple of 4.
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    push esi
    push edi
    push ebx
    push ebp

%define _ps         16                  ; push size
%define _ls         4                   ; local size
%define sum_ref     esp+_ps+_ls+4
%define pos_list    esp+_ps+_ls+16
%define width       esp+_ps+_ls+8
%define height      esp+_ps+_ls+12
%define i_height    esp
    sub esp, _ls

    mov esi, [sum_ref]
    mov edi, [pos_list]
    mov ebp, [width]
    mov ebx, [height]
    mov [i_height], ebx

    movq xmm7, [mv_x_inc_x4]            ; x_qpel inc
    movq xmm6, [mv_y_inc_x4]            ; y_qpel inc
    movq xmm5, [mx_x_offset_x4]         ; x_qpel vector
    pxor xmm4, xmm4
    pxor xmm3, xmm3                     ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
    movdqa xmm2, xmm5                   ; x_qpel vector
    mov ecx, ebp
HASH_WIDTH_LOOP_SSE2:
    movq xmm0, [esi]                    ; load four 16-bit feature values
    punpcklwd xmm0, xmm4                ; widen them to dwords
    movdqa xmm1, xmm2
    punpcklwd xmm1, xmm3                ; four dwords of x | (y << 16)
%rep 3
    movd edx, xmm0
    lea ebx, [edi+edx*4]                ; ebx = &pFeatureValuePointerList[v]
    mov eax, [ebx]
    movd [eax], xmm1
    mov edx, [eax+4]                    ; explicitly load eax+4 due cache miss from vtune observation
    lea eax, [eax+4]
    mov [ebx], eax
    psrldq xmm1, 4
    psrldq xmm0, 4
%endrep
    movd edx, xmm0
    lea ebx, [edi+edx*4]
    mov eax, [ebx]
    movd [eax], xmm1
    mov edx, [eax+4]                    ; explicitly load eax+4 due cache miss from vtune observation
    lea eax, [eax+4]
    mov [ebx], eax

    paddw xmm2, xmm7
    lea esi, [esi+8]
    sub ecx, 4
    jnz near HASH_WIDTH_LOOP_SSE2
    paddw xmm3, xmm6
    dec dword [i_height]
    jnz near HASH_HEIGHT_LOOP_SSE2

    add esp, _ls
%undef _ps
%undef _ls
%undef sum_ref
%undef pos_list
%undef width
%undef height
%undef i_height
    pop ebp
    pop ebx
    pop edi
    pop esi
    ret


;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
;                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;
; For i = 0 .. kiListSize-1:
;     pLocationOfFeature[i] = pFeatureValuePointerList[i] = pBuf;  pBuf += 4 * pTimesOfFeatureValue[i] bytes;
; Entries are processed four at a time with aligned 16-byte accesses (movdqa) on all three arrays,
; followed by a scalar loop for the kiListSize % 4 remaining entries.
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
    push ebx
    push esi
    push edi
    push ebp
%define _ps 16                          ; push size
    mov edi, [esp+_ps+16]               ; pPositionOfSum
    mov ebp, [esp+_ps+20]               ; sum_idx_list
    mov esi, [esp+_ps+4]                ; pTimesOfSum
    mov ebx, [esp+_ps+8]                ; pBuf
    mov edx, [esp+_ps+12]               ; list_sz
    sar edx, 2                          ; number of full groups of four
    mov ecx, 0                          ; ecx = byte offset into the arrays
    pxor xmm7, xmm7
    ; The x4 loop is a do-while; skip it when there is no full group (list_sz < 4),
    ; otherwise its counter would be decremented through zero.
    test edx, edx
    jle near hash_assign_tail_sse2
hash_assign_loop_x4_sse2:
    movdqa xmm0, [esi+ecx]
    pslld xmm0, 2                       ; counts -> byte sizes

    movdqa xmm1, xmm0
    pcmpeqd xmm1, xmm7
    movmskps eax, xmm1
    cmp eax, 0x0f                       ; all four counts zero?
    je near hash_assign_with_copy_sse2

%assign x 0
%rep 4
    lea eax, [edi+ecx+x]
    mov [eax], ebx
    lea eax, [ebp+ecx+x]
    mov [eax], ebx
    movd eax, xmm0
    add ebx, eax
    psrldq xmm0, 4
%assign x x+4
%endrep
    jmp near assign_next_sse2

hash_assign_with_copy_sse2:
    ; all four entries are empty: they all receive the same, unchanged pBuf
    movd xmm1, ebx
    pshufd xmm2, xmm1, 0
    movdqa [edi+ecx], xmm2
    movdqa [ebp+ecx], xmm2

assign_next_sse2:
    add ecx, 16
    dec edx
    jnz near hash_assign_loop_x4_sse2

hash_assign_tail_sse2:
    mov edx, [esp+_ps+12]               ; list_sz
    and edx, 3
    jz near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
    lea eax, [edi+ecx]
    mov [eax], ebx
    lea eax, [ebp+ecx]
    mov [eax], ebx
    mov eax, [esi+ecx]
    sal eax, 2
    add ebx, eax
    add ecx, 4
    dec edx
    jnz near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
%undef _ps
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret

%else

;**********************************************************************************************************************
;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; 64-bit build of the routine documented in the X86_32 branch above (same algorithm).
; r0 = ref cursor, r1 = width, r2 = height (then scratch), r3 = linesize, r4 = output cursor,
; r5 = histogram base, r6 = 3*linesize / scratch, r12 = column counter, r13 = row pointer / row counter.
;*********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 6
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    push r12
    push r13
    push r0                             ; saved so the second pass can restart from the
    push r2                             ; original ref / height / output pointer
    push r4

    pxor xmm0, xmm0
    lea r6, [r3+r3*2]                   ; 3*linesize

    mov r12, r1                         ; r12:tmp_width
    lea r13, [r0+r3*4]                  ; r13 = ref + 4*linesize (ebp in the X86_32 variant)
FIRST_ROW:
    movq xmm1, [r0]
    movq xmm2, [r0+r3]
    movq xmm3, [r0+r3*2]
    movq xmm4, [r0+r6]

    shufps xmm1, xmm2, 01000100b
    shufps xmm3, xmm4, 01000100b
    psadbw xmm1, xmm0
    psadbw xmm3, xmm0
    paddd xmm1, xmm3

    movq xmm2, [r13]
    movq xmm3, [r13+r3]
    movq xmm4, [r13+r3*2]
    movq xmm5, [r13+r6]

    shufps xmm2, xmm3, 01000100b
    shufps xmm4, xmm5, 01000100b
    psadbw xmm2, xmm0
    psadbw xmm4, xmm0
    paddd xmm2, xmm4

    paddd xmm1, xmm2
    pshufd xmm2, xmm1, 00001110b
    paddd xmm1, xmm2
    movd r2d, xmm1                      ; writing r2d also clears the upper half of r2
    mov [r4], r2w
    inc dword [r5+r2*4]

    inc r0
    inc r13
    add r4, 2

    dec r12
    jg FIRST_ROW

    pop r4
    pop r2
    pop r0
    mov r13, r2
    dec r13                             ; r13 = height - 1 remaining rows
HEIGHT_LOOP:
    mov r12, r1
WIDTH_LOOP:
    movq xmm1, [r0+r3*8]
    movq xmm2, [r0]
    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    psubd xmm1, xmm2
    movd r2d, xmm1
    ; Zero-extend the previous sum. A 16-bit load into r6w would keep bits 16..31 of
    ; 3*linesize in r6d and corrupt the histogram index for large or negative strides.
    movzx r6d, word [r4]
    add r2d, r6d
    mov [r4+r1*2], r2w
    inc dword [r5+r2*4]

    inc r0
    add r4, 2

    dec r12
    jg WIDTH_LOOP

    add r0, r3
    sub r0, r1

    dec r13
    jg HEIGHT_LOOP

    pop r13
    pop r12
    POP_XMM
    LOAD_6_PARA_POP
    ret


; COUNT_SUM xmm_reg, tmp_dreg, tmp_qreg, shift
;   Increments the histogram entry [r5 + 4*dword0(xmm_reg)]. tmp_dreg/tmp_qreg must be the
;   32-bit and 64-bit names of the same scratch register. When shift == 1, xmm_reg is then
;   shifted right by 4 bytes to expose the next dword.
%macro COUNT_SUM 4
%define xmm_reg  %1
%define tmp_dreg %2
%define tmp_qreg %3
    movd tmp_dreg, xmm_reg
    inc dword [r5+tmp_qreg*4]
%if %4 == 1
    psrldq xmm_reg, 4
%endif
%endmacro


;-----------------------------------------------------------------------------
; requires:  width % 8 == 0 && height > 1
;-----------------------------------------------------------------------------
;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------
; 64-bit build; same algorithm as the X86_32 variant above.
; read extra (16 - (width % 8) ) mod 16 bytes of every line
; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
WELS_EXTERN SumOf8x8BlockOfFrame_sse4
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    push r12
    push r13
    push r0
    push r2
    push r4

    pxor xmm0, xmm0
    lea r6, [r3+r3*2]                   ; 3*linesize

    mov r12, r1                         ; r12:tmp_width
    lea r13, [r0+r3*4]                  ; r13 = ref + 4*linesize (ebp in the X86_32 variant)
FIRST_ROW_SSE4:
    movdqu xmm1, [r0]
    movdqu xmm3, [r0+r3]
    movdqu xmm5, [r0+r3*2]
    movdqu xmm7, [r0+r6]

    movdqa xmm2, xmm1
    mpsadbw xmm1, xmm0, 000b
    mpsadbw xmm2, xmm0, 100b
    paddw xmm1, xmm2                    ; 8 sums of line1
    movdqa xmm4, xmm3
    mpsadbw xmm3, xmm0, 000b
    mpsadbw xmm4, xmm0, 100b
    paddw xmm3, xmm4                    ; 8 sums of line2
    movdqa xmm2, xmm5
    mpsadbw xmm5, xmm0, 000b
    mpsadbw xmm2, xmm0, 100b
    paddw xmm5, xmm2                    ; 8 sums of line3
    movdqa xmm4, xmm7
    mpsadbw xmm7, xmm0, 000b
    mpsadbw xmm4, xmm0, 100b
    paddw xmm7, xmm4                    ; 8 sums of line4
    paddw xmm1, xmm3
    paddw xmm5, xmm7
    paddw xmm1, xmm5                    ; sum the upper 4 lines first

    movdqu xmm2, [r13]
    movdqu xmm3, [r13+r3]
    movdqu xmm4, [r13+r3*2]
    movdqu xmm5, [r13+r6]

    movdqa xmm6, xmm2
    mpsadbw xmm2, xmm0, 000b
    mpsadbw xmm6, xmm0, 100b
    paddw xmm2, xmm6
    movdqa xmm7, xmm3
    mpsadbw xmm3, xmm0, 000b
    mpsadbw xmm7, xmm0, 100b
    paddw xmm3, xmm7
    movdqa xmm6, xmm4
    mpsadbw xmm4, xmm0, 000b
    mpsadbw xmm6, xmm0, 100b
    paddw xmm4, xmm6
    movdqa xmm7, xmm5
    mpsadbw xmm5, xmm0, 000b
    mpsadbw xmm7, xmm0, 100b
    paddw xmm5, xmm7
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm1, xmm2
    paddw xmm1, xmm4                    ; sum of lines 1- 8

    movdqu [r4], xmm1

    movdqa xmm2, xmm1
    punpcklwd xmm1, xmm0
    punpckhwd xmm2, xmm0

    COUNT_SUM xmm1, r2d, r2, 1
    COUNT_SUM xmm1, r2d, r2, 1
    COUNT_SUM xmm1, r2d, r2, 1
    COUNT_SUM xmm1, r2d, r2, 0
    COUNT_SUM xmm2, r2d, r2, 1
    COUNT_SUM xmm2, r2d, r2, 1
    COUNT_SUM xmm2, r2d, r2, 1
    COUNT_SUM xmm2, r2d, r2, 0

    lea r0, [r0+8]
    lea r13, [r13+8]
    lea r4, [r4+16]                     ; element size is 2

    sub r12, 8
    jg near FIRST_ROW_SSE4

    pop r4
    pop r2
    pop r0
    mov r13, r2
    dec r13                             ; r13 = height - 1 remaining rows
HEIGHT_LOOP_SSE4:
    mov r12, r1
WIDTH_LOOP_SSE4:
    movdqu xmm1, [r0+r3*8]
    movdqu xmm2, [r0]
    movdqu xmm7, [r4]

    movdqa xmm3, xmm1
    mpsadbw xmm1, xmm0, 000b
    mpsadbw xmm3, xmm0, 100b
    paddw xmm1, xmm3

    movdqa xmm4, xmm2
    mpsadbw xmm2, xmm0, 000b
    mpsadbw xmm4, xmm0, 100b
    paddw xmm2, xmm4

    paddw xmm7, xmm1
    psubw xmm7, xmm2

    movdqu [r4+r1*2], xmm7

    movdqa xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

    COUNT_SUM xmm7, r2d, r2, 1
    COUNT_SUM xmm7, r2d, r2, 1
    COUNT_SUM xmm7, r2d, r2, 1
    COUNT_SUM xmm7, r2d, r2, 0
    COUNT_SUM xmm6, r2d, r2, 1
    COUNT_SUM xmm6, r2d, r2, 1
    COUNT_SUM xmm6, r2d, r2, 1
    COUNT_SUM xmm6, r2d, r2, 0

    lea r0, [r0+8]
    lea r4, [r4+16]

    sub r12, 8
    jg near WIDTH_LOOP_SSE4

    lea r0, [r0+r3]
    sub r0, r1

    dec r13
    jg near HEIGHT_LOOP_SSE4

    pop r13
    pop r12
    POP_XMM
    LOAD_6_PARA_POP
    ret


;****************************************************************************************************************************************************
;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; 64-bit build; same algorithm as the X86_32 variant above.
;****************************************************************************************************************************************************
WELS_EXTERN SumOf16x16BlockOfFrame_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 6
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    push r12
    push r13
    push r0
    push r2
    push r4

    pxor xmm0, xmm0
    lea r6, [r3+r3*2]                   ; 3*linesize

    mov r12, r1                         ; r12:tmp_width
FIRST_ROW_X16H:
    ; rows 0..3
    movdqu xmm1, [r0]
    movdqu xmm2, [r0+r3]
    movdqu xmm3, [r0+r3*2]
    movdqu xmm4, [r0+r6]

    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    paddw xmm1, xmm2
    paddw xmm3, xmm4
    paddw xmm1, xmm3

    ; rows 4..7
    lea r13, [r0+r3*4]                  ; r13 plays the role of ebp in the X86_32 variant
    movdqu xmm2, [r13]
    movdqu xmm3, [r13+r3]
    movdqu xmm4, [r13+r3*2]
    movdqu xmm5, [r13+r6]

    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    psadbw xmm5, xmm0
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm2, xmm4

    paddw xmm1, xmm2

    ; rows 8..11
    lea r13, [r13+r3*4]
    movdqu xmm2, [r13]
    movdqu xmm3, [r13+r3]
    movdqu xmm4, [r13+r3*2]
    movdqu xmm5, [r13+r6]

    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    psadbw xmm5, xmm0
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm2, xmm4

    paddw xmm1, xmm2

    ; rows 12..15
    lea r13, [r13+r3*4]
    movdqu xmm2, [r13]
    movdqu xmm3, [r13+r3]
    movdqu xmm4, [r13+r3*2]
    movdqu xmm5, [r13+r6]

    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    psadbw xmm5, xmm0
    paddw xmm2, xmm3
    paddw xmm4, xmm5
    paddw xmm2, xmm4

    paddw xmm1, xmm2

    ; fold the two psadbw halves (words 0 and 4) into word 0
    movdqa xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw xmm1, xmm2
    movd r2d, xmm1
    mov [r4], r2w
    inc dword [r5+r2*4]

    inc r0
    lea r4, [r4+2]

    dec r12
    jg near FIRST_ROW_X16H

    pop r4
    pop r2
    pop r0
    mov r13, r2
    dec r13                             ; r13 = height - 1 remaining rows

    mov r6, r3
    sal r6, 4                           ; r6 = 16*linesize: offset of the row entering the window
HEIGHT_LOOP_X16:
    mov r12, r1
WIDTH_LOOP_X16:
    movdqu xmm1, [r0+r6]
    movdqu xmm2, [r0]
    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    psubw xmm1, xmm2
    movdqa xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw xmm1, xmm2
    movd r2d, xmm1                      ; low word = row delta, the rest of r2 is 0
    add r2w, word [r4]                  ; 16-bit add keeps r2 a valid zero-extended index
    mov [r4+r1*2], r2w
    inc dword [r5+r2*4]

    inc r0
    add r4, 2

    dec r12
    jg near WIDTH_LOOP_X16

    add r0, r3
    sub r0, r1

    dec r13
    jg near HEIGHT_LOOP_X16

    pop r13
    pop r12
    POP_XMM
    LOAD_6_PARA_POP
    ret


; requires:  width % 16 == 0 && height > 1
;-----------------------------------------------------------------------------------------------------------------------------
;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------------------------------------------------------

; SUM_LINE_X16_SSE41 ref, dst0, dst1, tmp0, tmp1
;   dst0 = eight 16-bit sums of the 16 bytes starting at ref+i, i = 0..7 (xmm0 must be zero).
;   Loads 24 bytes starting at ref (two unaligned 16-byte loads at ref and ref+8).
;   Clobbers dst1, tmp0, tmp1.
%macro SUM_LINE_X16_SSE41 5             ; ref, dst0, dst1, tmp0, tmp1
    movdqu %2, [%1]
    movdqu %3, [%1+8h]
    movdqa %4, %2
    movdqa %5, %3

    mpsadbw %2, xmm0, 0                 ; 000 B
    mpsadbw %4, xmm0, 5                 ; 101 B
    mpsadbw %3, xmm0, 2                 ; 010 B
    mpsadbw %5, xmm0, 7                 ; 111 B
    paddw %2, %4
    paddw %3, %5
    paddw %2, %3                        ; accumulate cost
%endmacro                               ; end of SUM_LINE_X16_SSE41

; 64-bit build; same algorithm as the X86_32 variant above. The output buffer is accessed with
; movdqa and must be 16-byte aligned.
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    push r12
    push r13
    push r0
    push r2
    push r4

    pxor xmm0, xmm0
    lea r6, [r3+r3*2]                   ; 3*linesize

    mov r12, r1                         ; r12:tmp_width
FIRST_ROW_X16_SSE4:
    SUM_LINE_X16_SSE41 r0, xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41 r0+r3, xmm2, xmm3, xmm4, xmm5
    SUM_LINE_X16_SSE41 r0+r3*2, xmm3, xmm4, xmm5, xmm6
    SUM_LINE_X16_SSE41 r0+r6, xmm4, xmm5, xmm6, xmm7
    paddw xmm1, xmm2
    paddw xmm3, xmm4
    paddw xmm1, xmm3

    lea r13, [r0+r3*4]
    SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2

    lea r13, [r13+r3*4]
    SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2

    lea r13, [r13+r3*4]
    SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
    paddw xmm1, xmm2

    movdqa [r4], xmm1

    movdqa xmm2, xmm1
    punpcklwd xmm1, xmm0
    punpckhwd xmm2, xmm0

    COUNT_SUM xmm1, r2d, r2, 1
    COUNT_SUM xmm1, r2d, r2, 1
    COUNT_SUM xmm1, r2d, r2, 1
    COUNT_SUM xmm1, r2d, r2, 0
    COUNT_SUM xmm2, r2d, r2, 1
    COUNT_SUM xmm2, r2d, r2, 1
    COUNT_SUM xmm2, r2d, r2, 1
    COUNT_SUM xmm2, r2d, r2, 0

    lea r0, [r0+8]
    lea r4, [r4+16]                     ; element size is 2

    sub r12, 8
    jg near FIRST_ROW_X16_SSE4

    pop r4
    pop r2
    pop r0
    mov r13, r2
    dec r13                             ; r13 = height - 1 remaining rows

    mov r6, r3
    sal r6, 4                           ; r6 = 16*linesize: offset of the row entering the window
HEIGHT_LOOP_X16_SSE4:
    mov r12, r1
WIDTH_LOOP_X16_SSE4:
    movdqa xmm7, [r4]
    SUM_LINE_X16_SSE41 r0+r6, xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41 r0, xmm2, xmm3, xmm4, xmm5

    paddw xmm7, xmm1
    psubw xmm7, xmm2

    movdqa [r4+r1*2], xmm7

    movdqa xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

    COUNT_SUM xmm7, r2d, r2, 1
    COUNT_SUM xmm7, r2d, r2, 1
    COUNT_SUM xmm7, r2d, r2, 1
    COUNT_SUM xmm7, r2d, r2, 0
    COUNT_SUM xmm6, r2d, r2, 1
    COUNT_SUM xmm6, r2d, r2, 1
    COUNT_SUM xmm6, r2d, r2, 1
    COUNT_SUM xmm6, r2d, r2, 0

    lea r0, [r0+8]
    lea r4, [r4+16]

    sub r12, 8
    jg near WIDTH_LOOP_X16_SSE4

    add r0, r3
    sub r0, r1

    dec r13
    jg near HEIGHT_LOOP_X16_SSE4

    pop r13
    pop r12
    POP_XMM
    LOAD_6_PARA_POP
    ret


;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;
; 64-bit build; same algorithm as the X86_32 variant above, with 8-byte pointer-list entries.
; r0 = feature cursor, r1 = width, r3 = pointer list, r12 = row counter, r4 = column counter,
; r2 / r5 / r6 / r13 = scratch.
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    push r12
    push r13
    mov r12, r2                         ; r12 = remaining rows

    movq xmm7, [mv_x_inc_x4]            ; x_qpel inc
    movq xmm6, [mv_y_inc_x4]            ; y_qpel inc
    movq xmm5, [mx_x_offset_x4]         ; x_qpel vector
    pxor xmm4, xmm4
    pxor xmm3, xmm3                     ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
    movdqa xmm2, xmm5                   ; x_qpel vector
    mov r4, r1
HASH_WIDTH_LOOP_SSE2:
    movq xmm0, [r0]                     ; load four 16-bit feature values
    punpcklwd xmm0, xmm4                ; widen them to dwords
    movdqa xmm1, xmm2
    punpcklwd xmm1, xmm3                ; four dwords of x | (y << 16)
%rep 3
    movd r2d, xmm0                      ; r2 = feature value (edx in the X86_32 variant)
    lea r5, [r3+r2*8]                   ; r5 = &pFeatureValuePointerList[v] (ebx in X86_32)
    mov r6, [r5]                        ; r6 = current write pointer (eax in X86_32)
    movd [r6], xmm1
    mov r13, [r6+4]                     ; explicitly load r6+4 due cache miss from vtune observation
    lea r6, [r6+4]
    mov [r5], r6
    psrldq xmm1, 4
    psrldq xmm0, 4
%endrep
    movd r2d, xmm0
    lea r5, [r3+r2*8]
    mov r6, [r5]
    movd [r6], xmm1
    mov r13, [r6+4]                     ; explicitly load r6+4 due cache miss from vtune observation
    lea r6, [r6+4]
    mov [r5], r6

    paddw xmm2, xmm7
    lea r0, [r0+8]
    sub r4, 4
    jnz near HASH_WIDTH_LOOP_SSE2
    paddw xmm3, xmm6
    dec r12
    jnz near HASH_HEIGHT_LOOP_SSE2

    pop r13
    pop r12
    POP_XMM
    ret


;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
;                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
;
; 64-bit build; same algorithm as the X86_32 variant above, with 8-byte pointer entries.
; r0 = pTimesOfFeatureValue, r1 = pBuf (running), r2 = number of x4 groups, r3 / r4 = the two
; pointer lists, r5 = byte offset into the count array (pointer arrays use r5*2), r12 = kiListSize.
; (The X86_32 code comments call these pTimesOfSum, pBuf, list_sz, pPositionOfSum, sum_idx_list.)
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION r2, r2d
    push r12
    push r13
    mov r12, r2
    sar r2, 2                           ; number of full groups of four
    mov r5, 0                           ; r5 plays the role of ecx in the X86_32 variant
    xor r6, r6
    pxor xmm3, xmm3
    ; The x4 loop is a do-while; skip it when there is no full group (list size < 4),
    ; otherwise its counter would be decremented through zero.
    test r2, r2
    jle near hash_assign_tail_sse2
hash_assign_loop_x4_sse2:
    movdqa xmm0, [r0+r5]
    pslld xmm0, 2                       ; counts -> byte sizes

    movdqa xmm1, xmm0
    pcmpeqd xmm1, xmm3
    movmskps r6, xmm1
    cmp r6, 0x0f                        ; all four counts zero?
    jz near hash_assign_with_copy_sse2

%assign x 0
%rep 4
    lea r13, [r3+r5*2+x]
    mov [r13], r1
    lea r13, [r4+r5*2+x]
    mov [r13], r1
    movd r6d, xmm0
    add r1, r6
    psrldq xmm0, 4
%assign x x+8
%endrep
    jmp near assign_next_sse2

hash_assign_with_copy_sse2:
    ; all four entries are empty: they all receive the same, unchanged pBuf
    movq xmm1, r1
    pshufd xmm2, xmm1, 01000100b        ; duplicate the 64-bit pointer into both halves
    movdqa [r3+r5*2], xmm2
    movdqa [r4+r5*2], xmm2
    movdqa [r3+r5*2+16], xmm2
    movdqa [r4+r5*2+16], xmm2

assign_next_sse2:
    add r5, 16
    dec r2
    jnz near hash_assign_loop_x4_sse2

hash_assign_tail_sse2:
    and r12, 3
    jz near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
    lea r13, [r3+r5*2]
    mov [r13], r1
    lea r13, [r4+r5*2]
    mov [r13], r1
    mov r6d, [r0+r5]
    sal r6, 2
    add r1, r6
    add r5, 4
    dec r12
    jnz near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
    pop r13
    pop r12
    ret

%endif

;**********************************************************************************************************************************
;   int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;
;   Returns the sum of the 64 bytes of the 8x8 block at ref0 (rows are linesize bytes apart).
;   Uses unaligned 8-byte loads, so ref0 needs no particular alignment.
;**********************************************************************************************************************************
WELS_EXTERN SumOf8x8SingleBlock_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    pxor xmm0, xmm0
    movq xmm1, [r0]                     ; two rows per register: low half / high half
    movhps xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movq xmm2, [r0]
    movhps xmm2, [r0+r1]
    lea r0, [r0+2*r1]
    movq xmm3, [r0]
    movhps xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movq xmm4, [r0]
    movhps xmm4, [r0+r1]

    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    psadbw xmm4, xmm0
    paddw xmm1, xmm2
    paddw xmm3, xmm4
    paddw xmm1, xmm3

    ; fold the two psadbw halves (words 0 and 4) into word 0
    movdqa xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw xmm1, xmm2

    movd retrd, xmm1
    ret
;**********************************************************************************************************************************
;   int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;
;   Returns the sum of the 256 bytes of the 16x16 block at ref0 (rows linesize bytes apart).
;   Rows are loaded with movdqa, so every row address must be 16-byte aligned.
;**********************************************************************************************************************************
WELS_EXTERN SumOf16x16SingleBlock_sse2
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 6
    SIGN_EXTENSION r1, r1d

    pxor    xmm0, xmm0                  ; zero operand: psadbw against 0 sums the bytes

    ; rows 0..3 -> accumulator xmm1
    movdqa  xmm1, [r0]
    movdqa  xmm2, [r0+r1]
    lea     r0, [r0+2*r1]
    movdqa  xmm3, [r0]
    movdqa  xmm4, [r0+r1]
    psadbw  xmm1, xmm0
    psadbw  xmm2, xmm0
    psadbw  xmm3, xmm0
    psadbw  xmm4, xmm0
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm1, xmm3

    ; rows 4..7, 8..11, 12..15: sum four rows in xmm2, then add to xmm1
%rep 3
    lea     r0, [r0+2*r1]
    movdqa  xmm2, [r0]
    movdqa  xmm3, [r0+r1]
    lea     r0, [r0+2*r1]
    movdqa  xmm4, [r0]
    movdqa  xmm5, [r0+r1]
    psadbw  xmm2, xmm0
    psadbw  xmm3, xmm0
    psadbw  xmm4, xmm0
    psadbw  xmm5, xmm0
    paddw   xmm2, xmm3
    paddw   xmm4, xmm5
    paddw   xmm2, xmm4
    paddw   xmm1, xmm2
%endrep

    ; fold the high-qword partial sum onto the low one
    movdqa  xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1, xmm2

    movd    retrd, xmm1
    POP_XMM
    ret

;**********************************************************************************************************************************
;
;   uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
;   \note:
;       src needs to be 16-byte aligned; alignment of ref is optional
;   \return value:
;       the minimal SAD cost; the corresponding index is stored through index_min_cost
;**********************************************************************************************************************************
; Tries 8 horizontally adjacent candidate positions (ref+0 .. ref+7) at once.
; xmm7 accumulates the eight 16-bit SAD costs; xmm0..xmm4 are scratch.

; One 16-pixel row, pointers left untouched.
; Each mpsadbw yields eight SADs of one 4-byte src group against eight sliding
; 4-byte windows of ref; the four immediates cover src bytes 0-3, 4-7, 8-11, 12-15.
%macro SAD_16x16_LINE_SSE41E 4  ; src, ref, stride_src, stride_ref
    movdqa  xmm0, [%1]              ; src row (aligned)
    movdqu  xmm1, [%2]              ; ref bytes 0..15
    movdqu  xmm2, [%2+8h]           ; ref bytes 8..23
    movdqa  xmm3, xmm1
    movdqa  xmm4, xmm2

    mpsadbw xmm1, xmm0, 0           ; 000 B: src 0-3   vs ref windows starting at 0..7
    paddw   xmm7, xmm1
    mpsadbw xmm3, xmm0, 5           ; 101 B: src 4-7   vs ref windows starting at 4..11
    paddw   xmm7, xmm3
    mpsadbw xmm2, xmm0, 2           ; 010 B: src 8-11  vs ref windows starting at 8..15
    paddw   xmm7, xmm2
    mpsadbw xmm4, xmm0, 7           ; 111 B: src 12-15 vs ref windows starting at 12..19
    paddw   xmm7, xmm4
%endmacro   ; end of SAD_16x16_LINE_SSE41E

; One 16-pixel row, then step both pointers to the next row.
%macro SAD_16x16_LINE_SSE41  4  ; src, ref, stride_src, stride_ref
    SAD_16x16_LINE_SSE41E %1, %2, %3, %4
    add     %1, %3
    add     %2, %4
%endmacro   ; end of SAD_16x16_LINE_SSE41

WELS_EXTERN SampleSad16x16Hor8_sse41
    ; r0 = src, r1 = stride_src, r2 = ref, r3 = stride_ref, r4 = base_cost, r5 = index_min_cost
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    pxor    xmm7, xmm7              ; clear the eight SAD accumulators

    ; rows 0..14 advance the pointers, the last row does not need to
%rep 15
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
%endrep
    SAD_16x16_LINE_SSE41E   r0, r2, r1, r3

    ; widen SADs and base costs to 32 bits and add them:
    ; xmm4 = total cost of candidates 0..3, xmm5 = total cost of candidates 4..7
    pxor    xmm0, xmm0
    movdqa  xmm6, xmm7
    punpcklwd xmm6, xmm0
    punpckhwd xmm7, xmm0
    movdqa  xmm5, [r4]              ; base_cost[8] (aligned load)
    movdqa  xmm4, xmm5
    punpcklwd xmm4, xmm0
    punpckhwd xmm5, xmm0
    paddd   xmm4, xmm6
    paddd   xmm5, xmm7

    ; horizontal unsigned minimum, broadcast to every lane of xmm2
    movdqa  xmm3, xmm4
    pminud  xmm3, xmm5
    pshufd  xmm2, xmm3, 01001110B
    pminud  xmm2, xmm3
    pshufd  xmm3, xmm2, 10110001B
    pminud  xmm2, xmm3
    movd    retrd, xmm2             ; return value = minimal cost

    ; locate the first lane equal to the minimum: low half first, then high half
    pcmpeqd xmm4, xmm2
    movmskps r2d, xmm4
    bsf     r1d, r2d
    jnz     near WRITE_INDEX        ; mask was non-zero: index 0..3 is in r1d

    pcmpeqd xmm5, xmm2
    movmskps r2d, xmm5
    bsf     r1d, r2d
    add     r1d, 4                  ; index 4..7

WRITE_INDEX:
    mov     [r5], r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret

;**********************************************************************************************************************************
;
;   uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
;   \note:
;       src and ref need not be 16-byte aligned (inter 8x8); base_cost is loaded with movdqa
;   \return value:
;       the minimal SAD cost; the corresponding index is stored through index_min_cost
;
;**********************************************************************************************************************************
; Tries 8 horizontally adjacent candidate positions (ref+0 .. ref+7) at once.
; xmm7 accumulates the eight 16-bit costs; xmm0..xmm2 are scratch.

; One 8-pixel row, pointers left untouched.
%macro SAD_8x8_LINE_SSE41E  4   ; src, ref, stride_src, stride_ref
    movdqu  xmm0, [%1]              ; src row (only bytes 0..7 are used)
    movdqu  xmm1, [%2]              ; ref bytes 0..15
    movdqa  xmm2, xmm1

    mpsadbw xmm1, xmm0, 0           ; 000 B: src 0-3 vs ref windows starting at 0..7
    paddw   xmm7, xmm1
    mpsadbw xmm2, xmm0, 5           ; 101 B: src 4-7 vs ref windows starting at 4..11
    paddw   xmm7, xmm2
%endmacro   ; end of SAD_8x8_LINE_SSE41E

; One 8-pixel row, then step both pointers to the next row.
%macro SAD_8x8_LINE_SSE41  4    ; src, ref, stride_src, stride_ref
    SAD_8x8_LINE_SSE41E %1, %2, %3, %4
    add     %1, %3
    add     %2, %4
%endmacro   ; end of SAD_8x8_LINE_SSE41

WELS_EXTERN SampleSad8x8Hor8_sse41
    ; r0 = src, r1 = stride_src, r2 = ref, r3 = stride_ref, r4 = base_cost, r5 = index_min_cost
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    movdqa  xmm7, [r4]              ; start the accumulators from the base cost list

    ; rows 0..6 advance the pointers, the last row does not need to
%rep 7
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
%endrep
    SAD_8x8_LINE_SSE41E r0, r2, r1, r3

    phminposuw xmm0, xmm7           ; horizontal search for the minimal cost and its index
    movd    retrd, xmm0             ; DEST[15:0] <- MIN, DEST[31:16] <- INDEX
    mov     r1d, retrd
    and     retrd, 0xFFFF           ; return value = minimal cost
    sar     r1d, 16                 ; r1d = index of the minimum
    mov     [r5], r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret