ref: e66b3488c5843a5be0a0d21544c25d135989c4d4
dir: /codec/encoder/core/asm/satd_sad.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  satd_sad.asm
;*
;*  Abstract
;*      WelsSampleSatd4x4_sse2
;*      WelsSampleSatd8x8_sse2
;*      WelsSampleSatd16x8_sse2
;*      WelsSampleSatd8x16_sse2
;*      WelsSampleSatd16x16_sse2
;*
;*      WelsSampleSad16x8_sse2
;*      WelsSampleSad16x16_sse2
;*
;*  History
;*      8/5/2009 Created
;*     24/9/2009 modified
;*
;*
;*************************************************************************/

; Syntax: NASM, 32-bit code. Every routine in this file reads its arguments
; from the stack relative to esp and returns its result in eax.
%include "asm_inc.asm"
BITS 32

;***********************************************************************
; Data
;***********************************************************************
SECTION .rodata align=16

; Multiplier tables for pmaddubsw / pmaddwd / paddd. All are 16-byte aligned
; because they are loaded with movdqa.
align 16
HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
align 16
HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
align 16
PDW1: dw 1,1,1,1,1,1,1,1
align 16
PDQ2: dw 2,0,0,0,2,0,0,0
align 16
HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1

;***********************************************************************
; Code
;***********************************************************************
SECTION .text

;***********************************************************************
;
;Pixel_satd_wxh_sse2 BEGIN
;
;***********************************************************************

; MMX_DW_1_2REG dst, tmp
; Sets every 16-bit lane of dst to 1 (0 - (-1)); tmp is left as all-ones.
%macro MMX_DW_1_2REG 2
    pxor %1, %1
    pcmpeqw %2, %2
    psubw %1, %2
%endmacro

; SSE2_SumWHorizon1 acc, tmp
; Folds the eight words of acc into its low word with unsigned saturating
; adds (shift by 8, 4, then 2 bytes). tmp is clobbered; the upper words of
; acc hold partial sums afterwards.
%macro SSE2_SumWHorizon1 2
    movdqa %2, %1
    psrldq %2, 8
    paddusw %1, %2
    movdqa %2, %1
    psrldq %2, 4
    paddusw %1, %2
    movdqa %2, %1
    psrldq %2, 2
    paddusw %1, %2
%endmacro

; SSE2_HDMTwo4x4 r0, r1, r2, r3, tmp
; Two layers of SSE2_SumSub butterflies over four registers.
; SSE2_SumSub is defined in asm_inc.asm (not visible here).
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
    SSE2_SumSub %1, %2, %5
    SSE2_SumSub %3, %4, %5
    SSE2_SumSub %2, %4, %5
    SSE2_SumSub %1, %3, %5
%endmacro

; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
; acc += |a| + |b| + |c| + |d| per word lane, using saturating adds.
; WELS_AbsW comes from asm_inc.asm; a, b, c, d and both temporaries are
; overwritten.
%macro SSE2_SumAbs4 7
    WELS_AbsW %1, %3
    WELS_AbsW %2, %3
    WELS_AbsW %4, %6
    WELS_AbsW %5, %6
    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

; SSE2_SumWHorizon acc, tmp, zero
; Horizontal sum of the eight words of acc into its low dword. The third
; operand is interleaved with the words to widen them, so it must be zero.
%macro SSE2_SumWHorizon 3
    movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
    paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
    punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
    movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
    paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
    pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
    paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro

; SSE2_GetSatd8x8
; Processes eight rows as two groups of four and accumulates absolute
; transformed differences into xmm6.
; In:    eax/ebx = first block pointer/stride, ecx/edx = second block
;        pointer/stride, xmm7 = scratch/zero operand passed to
;        SSE2_LoadDiff8P (asm_inc.asm), xmm6 = running accumulator.
; Out:   xmm6 updated; eax and ecx advanced by six rows, so callers add two
;        more rows to reach the next 8-row group.
; Clobb: xmm0-xmm5.
%macro SSE2_GetSatd8x8 0
    SSE2_LoadDiff8P xmm0,xmm4,xmm7,[eax],[ecx]
    SSE2_LoadDiff8P xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
    SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]

    SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
    SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
    SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
    SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6

    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_LoadDiff8P xmm0,xmm4,xmm7,[eax],[ecx]
    SSE2_LoadDiff8P xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
    SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]

    SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
    SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
    SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
    SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  after `push ebx` the four stack arguments sit at [esp+8..20]:
;      eax = first block, ebx = its stride, ecx = second block, edx = its
;      stride.
; Out: eax = 16-bit horizontal sum of absolute coefficients, halved.
; ebx is saved and restored; xmm0-xmm7 are used as scratch.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
align 16
WelsSampleSatd4x4_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]

    ; Gather four 4-byte rows of each block: rows 0/2 and rows 1/3 are packed
    ; pairwise into one register each.
    movd xmm0, [eax]
    movd xmm1, [eax+ebx]
    lea eax , [eax+2*ebx]
    movd xmm2, [eax]
    movd xmm3, [eax+ebx]
    punpckldq xmm0, xmm2
    punpckldq xmm1, xmm3

    movd xmm4, [ecx]
    movd xmm5, [ecx+edx]
    lea ecx , [ecx+2*edx]
    movd xmm6, [ecx]
    movd xmm7, [ecx+edx]
    punpckldq xmm4, xmm6
    punpckldq xmm5, xmm7

    ; Widen bytes to words (xmm6 = 0) and form the differences.
    pxor xmm6, xmm6
    punpcklbw xmm0, xmm6
    punpcklbw xmm1, xmm6
    punpcklbw xmm4, xmm6
    punpcklbw xmm5, xmm6

    psubw xmm0, xmm4
    psubw xmm1, xmm5

    ; Sum/difference butterflies interleaved with SSE2_XSawp shuffles
    ; (SSE2_XSawp is defined in asm_inc.asm).
    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    psubw xmm2, xmm1
    SSE2_XSawp qdq, xmm0, xmm2, xmm3

    movdqa xmm4, xmm0
    paddw xmm0, xmm3
    psubw xmm4, xmm3

    movdqa xmm2, xmm0
    punpcklwd xmm0, xmm4
    punpckhwd xmm4, xmm2

    SSE2_XSawp dq, xmm0, xmm4, xmm3
    SSE2_XSawp qdq, xmm0, xmm3, xmm5
; Tail of WelsSampleSatd4x4_sse2 (the routine starts on the preceding source
; line): last two butterfly stages, absolute values, horizontal sum.
    movdqa xmm7, xmm0
    paddw xmm0, xmm5
    psubw xmm7, xmm5

    SSE2_XSawp qdq, xmm0, xmm7, xmm1

    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    psubw xmm2, xmm1

    ; xmm6 is still zero here (it was only used as the unpack source), so it
    ; now becomes the accumulator.
    WELS_AbsW xmm0, xmm3
    paddusw xmm6, xmm0
    WELS_AbsW xmm2, xmm4
    paddusw xmm6, xmm2
    SSE2_SumWHorizon1 xmm6, xmm4
    movd eax, xmm6
    and eax, 0xffff ; only the low word holds the complete sum
    shr eax, 1
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  [esp+8..20] after `push ebx` -> eax/ebx = first block pointer/stride,
;      ecx/edx = second block pointer/stride.
; Out: eax = sum of the halved per-lane accumulators.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse2
align 16
WelsSampleSatd8x8_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]
    pxor xmm6, xmm6 ; accumulator
    pxor xmm7, xmm7 ; zero operand for SSE2_GetSatd8x8 / SSE2_SumWHorizon
    SSE2_GetSatd8x8
    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd eax, xmm6
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Same register setup as the 8x8 routine; runs SSE2_GetSatd8x8 twice, the
; second time eight rows further down.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse2
align 16
WelsSampleSatd8x16_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]
    pxor xmm6, xmm6
    pxor xmm7, xmm7

    SSE2_GetSatd8x8
    ; The macro leaves the pointers six rows in; add two more for row 8.
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_GetSatd8x8

    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd eax, xmm6
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Same register setup as the 8x8 routine; runs SSE2_GetSatd8x8 on the left
; 8 columns, reloads both pointers from the stack and repeats 8 bytes to
; the right.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
align 16
WelsSampleSatd16x8_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]
    pxor xmm6, xmm6
    pxor xmm7, xmm7

    SSE2_GetSatd8x8
    mov eax, [esp+8]
    mov ecx, [esp+16]
    add eax, 8
    add ecx, 8
    SSE2_GetSatd8x8

    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd eax, xmm6
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  [esp+8..20] after `push ebx` -> eax/ebx = first block pointer/stride,
;      ecx/edx = second block pointer/stride.
; Out: eax = sum of the halved per-lane accumulators over four 8x8 quadrants
;      (left column top/bottom, then right column top/bottom).
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse2
align 16
WelsSampleSatd16x16_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]
    pxor xmm6, xmm6
    pxor xmm7, xmm7

    SSE2_GetSatd8x8
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_GetSatd8x8

    ; Restart from the original pointers, 8 bytes to the right.
    mov eax, [esp+8]
    mov ecx, [esp+16]
    add eax, 8
    add ecx, 8

    SSE2_GetSatd8x8
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_GetSatd8x8

    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd eax, xmm6
    pop ebx
    ret

;***********************************************************************
;
;Pixel_satd_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************

; SSE41_I16x16Get8WSumSub io, tmp1, tmp2
; Transforms the byte vector in io using the constant registers the caller
; must preload (xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1), adds the
; intermediate dword results into xmm4 ("for dc"), then packs to words and
; scales by 4 (psllw 2). tmp1 and tmp2 are clobbered.
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
    pmaddubsw %1, xmm5
    movdqa %2, %1
    pmaddwd %1, xmm7
    pmaddwd %2, xmm6
    movdqa %3, %1
    punpckldq %1, %2
    punpckhdq %2, %3
    movdqa %3, %1
    punpcklqdq %1, %2
    punpckhqdq %3, %2
    paddd xmm4, %1 ;for dc
    paddd xmm4, %3 ;for dc
    packssdw %1, %3
    psllw %1, 2
%endmacro

; SSE41_ChromaGet8WSumSub io, tmp1, tmp2, sums
; Same transform as SSE41_I16x16Get8WSumSub, but instead of accumulating
; into xmm4 it writes the low quadwords of the two intermediate dword
; vectors to the fourth operand.
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
    pmaddubsw %1, xmm5
    movdqa %2, %1
    pmaddwd %1, xmm7
    pmaddwd %2, xmm6
    movdqa %3, %1
    punpckldq %1, %2
    punpckhdq %2, %3
    movdqa %3, %1
    punpcklqdq %1, %2
    punpckhqdq %3, %2
;    paddd xmm4, %1 ;for dc
;    paddd xmm4, %3 ;for dc
    movdqa %4, %1
    punpcklqdq %4, %3
    packssdw %1, %3
    psllw %1, 2
%endmacro

; SSE41_GetX38x4SatdDec
; Loads four 8-byte rows from [eax] (stride ebx), widens them to words and
; transforms them. eax is advanced by four rows.
; (The macro body continues on the next source line.)
%macro SSE41_GetX38x4SatdDec 0
    pxor xmm7, xmm7
    movq xmm0, [eax]
    movq xmm1, [eax+ebx]
    lea eax, [eax+2*ebx]
    movq xmm2, [eax]
    movq xmm3, [eax+ebx]
    lea eax, [eax+2*ebx]
    punpcklbw xmm0, xmm7
; Tail of %macro SSE41_GetX38x4SatdDec (opened on the preceding source line).
    punpcklbw xmm1, xmm7
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
    SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
    SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
    ;doesn't need another transpose
%endmacro

; SSE41_GetX38x4SatdV <unused>, offset
; Accumulates into xmm4 (the "V" cost) the absolute difference between the
; transformed block (xmm7, xmm1, xmm3, xmm2) and a vector that is zero
; except for word lanes 0 and 4, which are taken from the temp buffer at
; [esi+offset+..]. %1 is not referenced.
%macro SSE41_GetX38x4SatdV 2
    pxor xmm0, xmm0
    pinsrw xmm0, word[esi+%2], 0
    pinsrw xmm0, word[esi+%2+8], 4
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
    pxor xmm0, xmm0
    pinsrw xmm0, word[esi+%2+2], 0
    pinsrw xmm0, word[esi+%2+10], 4
    psubsw xmm0, xmm1
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
    pxor xmm0, xmm0
    pinsrw xmm0, word[esi+%2+4], 0
    pinsrw xmm0, word[esi+%2+12], 4
    psubsw xmm0, xmm3
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
    pxor xmm0, xmm0
    pinsrw xmm0, word[esi+%2+6], 0
    pinsrw xmm0, word[esi+%2+14], 4
    psubsw xmm0, xmm2
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
%endmacro

; SSE41_GetX38x4SatdH index, <unused>, base
; Accumulates the "H" cost into xmm5: the four words at
; [esi+base+8*index], duplicated to both halves, are compared against xmm7;
; the other three rows contribute their plain absolute values. The sum
; |xmm1|+|xmm2|+|xmm3| is left in xmm2 for the DC macro that follows.
; %2 is not referenced.
%macro SSE41_GetX38x4SatdH 3
    movq xmm0, [esi+%3+8*%1]
    punpcklqdq xmm0, xmm0
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm5, xmm0
    pabsw xmm1, xmm1
    pabsw xmm2, xmm2
    pabsw xmm3, xmm3
    paddw xmm2, xmm1;for DC
    paddw xmm2, xmm3;for DC
    paddw xmm5, xmm2
%endmacro

; SSE41_I16X16GetX38x4SatdDC
; Accumulates the "DC" cost into xmm6 using the DC term held in mm4.
; Expects xmm2 as left by SSE41_GetX38x4SatdH.
%macro SSE41_I16X16GetX38x4SatdDC 0
    pxor xmm0, xmm0
    movq2dq xmm0, mm4
    punpcklqdq xmm0, xmm0
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm6, xmm0
    paddw xmm6, xmm2
%endmacro

; SSE41_ChromaGetX38x4SatdDC index
; Chroma variant: the DC terms come from [esi+32+16*index].
; NOTE: the macro shifts its argument left by 4 in place, so the register
; passed as %1 is modified on return.
%macro SSE41_ChromaGetX38x4SatdDC 1
    shl %1, 4
    movdqa xmm0, [esi+32+%1]
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm6, xmm0
    paddw xmm6, xmm2
%endmacro

; One 8x4 step for the 16x16 search: transform, then V / H / DC costs.
%macro SSE41_I16x16GetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV %1, %2
    SSE41_GetX38x4SatdH %1, %2, 32
    SSE41_I16X16GetX38x4SatdDC
%endmacro

; One 8x4 step for the chroma search (H data at offset 16 in the buffer).
; %1 is clobbered by SSE41_ChromaGetX38x4SatdDC.
%macro SSE41_ChromaGetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV %1, %2
    SSE41_GetX38x4SatdH %1, %2, 16
    SSE41_ChromaGetX38x4SatdDC %1
%endmacro

; SSE41_HSum8W acc, ones, tmp
; Horizontal sum of the eight words of acc into its low dword; the second
; operand must hold 1 in every word lane (pmaddwd pairs and widens).
%macro SSE41_HSum8W 3
    pmaddwd %1, %2
    movhlps %3, %1
    paddd %1, %3
    pshuflw %3, %1,0Eh
    paddd %1, %3
%endmacro

;-----------------------------------------------------------------------
; WelsIntra16x16Combined3Satd_sse41
; Evaluates the V, H and DC 16x16 candidates in one pass and returns the
; cheapest cost in eax, storing the chosen mode index through [esp+32].
; Stack arguments after the three pushes:
;   [esp+16] -> ecx : block whose row above and left column are read
;   [esp+20] -> edx : stride for ecx
;   [esp+24] -> eax : block being compared
;   [esp+28] -> ebx : stride for eax
;   [esp+32]        : pointer receiving the selected mode (dword)
;   [esp+36]        : value doubled and added to the H and DC costs
;   [esp+40] -> esi : temp buffer, written with movdqa (16-byte aligned)
; Uses MMX register mm4; WELSEMMS is issued before returning.
;-----------------------------------------------------------------------
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
WelsIntra16x16Combined3Satd_sse41:
    push ebx
    push esi
    push edi
    mov ecx, [esp+16]
    mov edx, [esp+20]
    mov eax, [esp+24]
    mov ebx, [esp+28]
    mov esi, [esp+40]
;temp_satd
; (Middle part of WelsIntra16x16Combined3Satd_sse41; the prologue is on the
; preceding source line. At this point ecx/edx = neighbour block/stride,
; eax/ebx = compared block/stride, esi = temp buffer.)
    pxor xmm4, xmm4 ; running sum of neighbour samples, "for dc"
    movdqa xmm5, [HSumSubDB1]
    movdqa xmm6, [HSumSubDW1]
    movdqa xmm7, [PDW1]

    ; Row above the block -> transformed into [esi] and [esi+16].
    sub ecx, edx
    movdqu xmm0, [ecx]
    movhlps xmm1, xmm0
    punpcklqdq xmm0, xmm0
    punpcklqdq xmm1, xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa [esi], xmm0 ;V
    movdqa [esi+16], xmm1

    ; Left column: gather the 16 bytes at [row-1], two rows per step.
    add ecx, edx
    pinsrb xmm0, byte[ecx-1], 0
    pinsrb xmm0, byte[ecx+edx-1], 1
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 2
    pinsrb xmm0, byte[ecx+edx-1], 3
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 4
    pinsrb xmm0, byte[ecx+edx-1], 5
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 6
    pinsrb xmm0, byte[ecx+edx-1], 7
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 8
    pinsrb xmm0, byte[ecx+edx-1], 9
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 10
    pinsrb xmm0, byte[ecx+edx-1], 11
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 12
    pinsrb xmm0, byte[ecx+edx-1], 13
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 14
    pinsrb xmm0, byte[ecx+edx-1], 15
    movhlps xmm1, xmm0
    punpcklqdq xmm0, xmm0
    punpcklqdq xmm1, xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa [esi+32], xmm0 ;H
    movdqa [esi+48], xmm1

    ; DC term: ((sum + 16) >> 5) << 4, parked in mm4 for the DC macro.
    movd ecx, xmm4 ;dc
    add ecx, 16 ;(sum+16)
    shr ecx, 5 ;((sum+16)>>5)
    shl ecx, 4 ;
    movd mm4, ecx ; mm4 copy DC

    pxor xmm4, xmm4 ;V
    pxor xmm5, xmm5 ;H
    pxor xmm6, xmm6 ;DC

    ; Outer pass 1: left 8 columns (edi = 0); pass 2: right 8 columns
    ; (edi = 16, eax reloaded + 8). Inner loop: four 8x4 strips, ecx = 0..3.
    mov ecx, 0
    mov edi, 0
.loop16x16_get_satd:
.loopStart1:
    SSE41_I16x16GetX38x4Satd ecx, edi
    inc ecx
    cmp ecx, 4
    jl .loopStart1
    cmp edi, 16
    je .loop16x16_get_satd_end
    mov eax, [esp+24]
    add eax, 8
    mov ecx, 0
    add edi, 16
    jmp .loop16x16_get_satd
.loop16x16_get_satd_end:
    MMX_DW_1_2REG xmm0, xmm1 ; xmm0 = words of 1 for SSE41_HSum8W
    psrlw xmm4, 1 ;/2
    psrlw xmm5, 1 ;/2
    psrlw xmm6, 1 ;/2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1

    ; comparing order: DC H V
    movd ebx, xmm6 ;DC
    movd edi, xmm5 ;H
    movd ecx, xmm4 ;V
    mov edx, [esp+36]
    shl edx, 1
    add edi, edx ; H and DC carry the extra 2*[esp+36]; V does not
    add ebx, edx
    mov edx, [esp+32]
    cmp ebx, edi
    jge near not_dc_16x16
    cmp ebx, ecx
    jge near not_dc_h_16x16

    ; for DC mode
; Tail of WelsIntra16x16Combined3Satd_sse41 (started on earlier source
; lines): ebx/edi/ecx = DC/H/V costs, edx = pointer receiving the mode.
    mov dword[edx], 2;I16_PRED_DC
    mov eax, ebx
    jmp near return_satd_intra_16x16_x3
not_dc_16x16:
    ; for H mode
    cmp edi, ecx
    jge near not_dc_h_16x16
    mov dword[edx], 1;I16_PRED_H
    mov eax, edi
    jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
    ; for V mode
    mov dword[edx], 0;I16_PRED_V
    mov eax, ecx
return_satd_intra_16x16_x3:
    WELSEMMS
    pop edi
    pop esi
    pop ebx
    ret

; SSE41_ChromaGetX38x8Satd
; One 8x8 plane of the chroma search.
; In:    ecx/edx = neighbour block/stride, eax/ebx = compared block/stride,
;        esi = temp buffer (movdqa stores at +0, +16, +32, +48).
; Out:   xmm4 / xmm5 / xmm6 = per-lane V / H / DC costs for this plane.
; Clobb: eax, ecx, xmm0-xmm7.
; The macro defines the non-local label loop_chroma_satdx3_cb_cr, so it can
; only be expanded once per file.
%macro SSE41_ChromaGetX38x8Satd 0
    movdqa xmm5, [HSumSubDB1]
    movdqa xmm6, [HSumSubDW1]
    movdqa xmm7, [PDW1]

    ; Row above -> [esi]; its partial sums land in xmm4.
    sub ecx, edx
    movq xmm0, [ecx]
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa [esi], xmm0 ;V

    ; Left column (8 bytes at [row-1]) -> [esi+16]; partial sums in xmm1.
    add ecx, edx
    pinsrb xmm0, byte[ecx-1], 0
    pinsrb xmm0, byte[ecx+edx-1], 1
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 2
    pinsrb xmm0, byte[ecx+edx-1], 3
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 4
    pinsrb xmm0, byte[ecx+edx-1], 5
    lea ecx, [ecx+2*edx]
    pinsrb xmm0, byte[ecx-1], 6
    pinsrb xmm0, byte[ecx+edx-1], 7
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    movdqa [esi+16], xmm0 ;H

    ;(sum+2)>>2
    movdqa xmm6, [PDQ2]
    movdqa xmm5, xmm4
    punpckhqdq xmm5, xmm1
    paddd xmm5, xmm6
    psrld xmm5, 2
    ;(sum1+sum2+4)>>3
    paddd xmm6, xmm6
    paddd xmm4, xmm1
    paddd xmm4, xmm6
    psrld xmm4, 3
    ;satd *16
    pslld xmm5, 4
    pslld xmm4, 4
    ;temp satd
    movdqa xmm6, xmm4
    punpcklqdq xmm4, xmm5
    psllq xmm4, 32 ; shift pair keeps only the low dword of each qword
    psrlq xmm4, 32
    movdqa [esi+32], xmm4
    punpckhqdq xmm5, xmm6
    psllq xmm5, 32
    psrlq xmm5, 32
    movdqa [esi+48], xmm5

    pxor xmm4, xmm4 ;V
    pxor xmm5, xmm5 ;H
    pxor xmm6, xmm6 ;DC
    ; Two 8x4 strips. SSE41_ChromaGetX38x4Satd shifts ecx left by 4 on each
    ; pass (0 -> 0 -> inc 1; 1 -> 16 -> inc 17), so the loop still ends
    ; after exactly two iterations.
    mov ecx, 0
loop_chroma_satdx3_cb_cr:
    SSE41_ChromaGetX38x4Satd ecx, 0
    inc ecx
    cmp ecx, 2
    jl loop_chroma_satdx3_cb_cr
%endmacro

; SSEReg2MMX src, mmLow, mmHigh
; Copies the low and high qwords of src into two MMX registers; src is left
; with its high qword duplicated.
%macro SSEReg2MMX 3
    movdq2q %2, %1
    movhlps %1, %1
    movdq2q %3, %1
%endmacro

; MMXReg2SSE dst, tmp, mmLow, mmHigh
; Rebuilds a 128-bit value in dst from two MMX registers; tmp is clobbered.
%macro MMXReg2SSE 4
    movq2dq %1, %3
    movq2dq %2, %4
    punpcklqdq %1, %2
%endmacro

;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41

;-----------------------------------------------------------------------
; WelsIntraChroma8x8Combined3Satd_sse41
; Runs SSE41_ChromaGetX38x8Satd on two planes, adds their V/H/DC costs and
; returns the cheapest in eax, storing the mode index through [esp+32].
; Stack arguments after the three pushes:
;   [esp+16] -> ecx, [esp+20] -> edx : first neighbour block / stride
;   [esp+24] -> eax, [esp+28] -> ebx : first compared block / stride
;   [esp+32]        : pointer receiving the selected mode (dword)
;   [esp+36]        : value doubled and added to the H and V costs
;   [esp+40] -> esi : temp buffer (16-byte aligned, movdqa stores)
;   [esp+44] -> ecx, [esp+48] -> eax : second plane (same strides)
; The first plane's costs are parked in mm0-mm3, mm5 and mm6 while the
; second plane runs; WELSEMMS is issued before returning.
;-----------------------------------------------------------------------
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
WelsIntraChroma8x8Combined3Satd_sse41:
    push ebx
    push esi
    push edi
    mov ecx, [esp+16]
    mov edx, [esp+20]
    mov eax, [esp+24]
    mov ebx, [esp+28]
    mov esi, [esp+40] ;temp_satd
    xor edi, edi ; plane counter
loop_chroma_satdx3:
    SSE41_ChromaGetX38x8Satd
    cmp edi, 1
    je loop_chroma_satdx3end
    inc edi
    SSEReg2MMX xmm4, mm0,mm1
    SSEReg2MMX xmm5, mm2,mm3
    SSEReg2MMX xmm6, mm5,mm6
    mov ecx, [esp+44]
    mov eax, [esp+48]
    jmp loop_chroma_satdx3
loop_chroma_satdx3end:
    MMXReg2SSE xmm0, xmm3, mm0, mm1
    MMXReg2SSE xmm1, xmm3, mm2, mm3
    MMXReg2SSE xmm2, xmm3, mm5, mm6

    paddw xmm4, xmm0
    paddw xmm5, xmm1
    paddw xmm6, xmm2

    MMX_DW_1_2REG xmm0, xmm1 ; xmm0 = words of 1 for SSE41_HSum8W
    psrlw xmm4, 1 ;/2
    psrlw xmm5, 1 ;/2
    psrlw xmm6, 1 ;/2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1
    ; comparing order: DC H V
    movd ebx, xmm6 ;DC
    movd edi, xmm5 ;H
    movd ecx, xmm4 ;V
    mov edx, [esp+36]
    shl edx, 1
    add edi, edx ; here H and V carry the extra 2*[esp+36]; DC does not
    add ecx, edx
    mov edx, [esp+32]
    cmp ebx, edi
    jge near not_dc_8x8
    cmp ebx, ecx
    jge near not_dc_h_8x8

    ; for DC mode
    mov dword[edx], 0;I8_PRED_DC
    mov eax, ebx
    jmp near return_satd_intra_8x8_x3
not_dc_8x8:
    ; for H mode
    cmp edi, ecx
    jge near not_dc_h_8x8
    mov dword[edx], 1;I8_PRED_H
    mov eax, edi
    jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
    ; for V mode
    mov dword[edx], 2;I8_PRED_V
    mov eax, ecx
return_satd_intra_8x8_x3:
    WELSEMMS
    pop edi
    pop esi
    pop ebx
    ret

;***********************************************************************
;
;Pixel_satd_intra_sse2 END
;
;***********************************************************************

; SSSE3_Get16BSadHVDC predRow, srcRow
; In:  predRow = 16-byte temp-buffer slot whose first dword holds one byte
;      value, srcRow = 16-byte row accessed with movdqa / psadbw,
;      xmm1 = 0 (pshufb mask), xmm7 = DC row, xmm5 = top row.
; Out: predRow is overwritten with the byte broadcast to all 16 lanes;
;      xmm4 += SAD(src, xmm7), xmm2 += SAD(src, xmm5),
;      xmm3 += SAD(src, broadcast row).
; Clobb: xmm0, xmm6.
%macro SSSE3_Get16BSadHVDC 2
    movd xmm6,%1
    pshufb xmm6,xmm1
    movdqa %1, xmm6
    movdqa xmm0,%2
    psadbw xmm0,xmm7
    paddw xmm4,xmm0
    movdqa xmm0,%2
    psadbw xmm0,xmm5
    paddw xmm2,xmm0
    psadbw xmm6,%2
    paddw xmm3,xmm6
%endmacro

; WelsAddDCValue srcByte, tmpReg, dstMem, sumReg
; tmpReg = zero-extended byte at srcByte; stores it to dstMem and adds it
; to sumReg.
%macro WelsAddDCValue 4
    movzx %2, byte %1
    mov %3, %2
    add %4, %2
%endmacro

;***********************************************************************
;
;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************

;-----------------------------------------------------------------------
; WelsIntra16x16Combined3Sad_ssse3
; SAD-based counterpart of the combined 16x16 V/H/DC search.
; Stack arguments after the three pushes:
;   [esp+16] -> ecx : block whose row above and left column are read
;   [esp+20] -> edx : stride for ecx
;   [esp+40] -> edi : temp buffer, 16 rows of 16 bytes
; The remaining arguments are read further down in the routine.
;-----------------------------------------------------------------------
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
WelsIntra16x16Combined3Sad_ssse3:
    push ebx
    push esi
    push edi
    mov ecx, [esp+16]
    mov edx, [esp+20]
    mov edi, [esp+40] ;temp_sad
    sub ecx, edx
; Body of WelsIntra16x16Combined3Sad_ssse3 (prologue on the preceding source
; line). Here ecx = address of the row above the block, edx = its stride,
; edi = temp buffer.
    movdqa xmm5,[ecx] ; top row (aligned load); kept as the V prediction
    pxor xmm0,xmm0
    psadbw xmm0,xmm5 ; SAD against zero = sum of the 16 top samples
    movhlps xmm1,xmm0
    paddw xmm0,xmm1
    movd eax,xmm0 ; eax = running sum for the DC value

    ; Left column: store each byte at [row-1] into its own 16-byte slot of
    ; the temp buffer and add it to eax. ebx = 3*stride.
    add ecx,edx
    lea ebx, [edx+2*edx]
    WelsAddDCValue [ecx-1 ], esi, [edi ], eax
    WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
    WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
    lea ecx, [ecx+4*edx]
    add edi, 64
    WelsAddDCValue [ecx-1 ], esi, [edi ], eax
    WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
    WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
    lea ecx, [ecx+4*edx]
    add edi, 64
    WelsAddDCValue [ecx-1 ], esi, [edi ], eax
    WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
    WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
    lea ecx, [ecx+4*edx]
    add edi, 64
    WelsAddDCValue [ecx-1 ], esi, [edi ], eax
    WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
    WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
    sub edi, 192 ; back to the start of the temp buffer

    ; DC = (sum + 16) >> 5, broadcast to all 16 bytes of xmm7.
    add eax,10h
    shr eax,5
    movd xmm7,eax
    pxor xmm1,xmm1
    pshufb xmm7,xmm1
    pxor xmm4,xmm4 ; DC SAD
    pxor xmm3,xmm3 ; H SAD
    pxor xmm2,xmm2 ; V SAD
    ;sad begin
    mov eax, [esp+24] ; compared block
    mov ebx, [esp+28] ; its stride
    lea esi, [ebx+2*ebx] ; 3*stride
    SSSE3_Get16BSadHVDC [edi], [eax]
    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
    add edi, 64
    lea eax, [eax+4*ebx]
    SSSE3_Get16BSadHVDC [edi], [eax]
    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
    add edi, 64
    lea eax, [eax+4*ebx]
    SSSE3_Get16BSadHVDC [edi], [eax]
    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
    add edi, 64
    lea eax, [eax+4*ebx]
    SSSE3_Get16BSadHVDC [edi], [eax]
    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]

    ; Pack V into dword 0 and H into dword 1 of xmm3, then fold the halves.
    pslldq xmm3,4
    por xmm3,xmm2
    movhlps xmm1,xmm3
    paddw xmm3,xmm1
    movhlps xmm0,xmm4
    paddw xmm4,xmm0
    ; comparing order: DC H V
    movd ebx, xmm4 ;DC
    movd ecx, xmm3 ;V
    psrldq xmm3, 4
    movd esi, xmm3 ;H
    mov eax, [esp+36] ;lamda
    shl eax, 1
    add esi, eax
    add ebx, eax
    mov edx, [esp+32] ; pointer receiving the selected mode
    cmp ebx, esi
    jge near not_dc_16x16_sad
    cmp ebx, ecx
    jge near not_dc_h_16x16_sad
    ; for DC mode
    mov dword[edx], 2;I16_PRED_DC
    mov eax, ebx
    ; edi is 192 bytes past the buffer start; rewind and fill all 16 rows
    ; with the DC row.
    sub edi, 192
%assign x 0
%rep 16
    movdqa [edi+16*x], xmm7
%assign x x+1
%endrep
    jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
    ; for H mode
    ; (no store needed: the buffer already holds the broadcast rows)
    cmp esi, ecx
    jge near not_dc_h_16x16_sad
    mov dword[edx], 1;I16_PRED_H
    mov eax, esi
    jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
    ; for V mode
    mov dword[edx], 0;I16_PRED_V
    mov eax, ecx
    sub edi, 192
%assign x 0
%rep 16
    movdqa [edi+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
    pop edi
    pop esi
    pop ebx
    ret

;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************

;SSE4.1
; SSE41_GetSatd8x4
; Accumulates one 8x4 strip into xmm6.
; In:    eax/ebx = first block pointer/stride, esi = 3*ebx,
;        ecx/edx = second block pointer/stride, edi = 3*edx,
;        xmm7 = byte multiplier table for pmaddubsw, xmm6 = accumulator.
; Clobb: xmm0-xmm5. Pointers are not advanced.
; (The macro body continues on the next source line.)
%macro SSE41_GetSatd8x4 0
    movq xmm0, [eax]
    punpcklqdq xmm0, xmm0
    pmaddubsw xmm0, xmm7
    movq xmm1, [eax+ebx]
    punpcklqdq xmm1, xmm1
    pmaddubsw xmm1, xmm7

    movq xmm2, [ecx]
    punpcklqdq xmm2, xmm2
    pmaddubsw xmm2, xmm7
    movq xmm3, [ecx+edx]
    punpcklqdq xmm3, xmm3
    pmaddubsw xmm3, xmm7
    psubsw xmm0, xmm2
    psubsw xmm1, xmm3

    movq xmm2, [eax+2*ebx]
    punpcklqdq xmm2, xmm2
    pmaddubsw xmm2, xmm7
    movq xmm3, [eax+esi]
    punpcklqdq xmm3, xmm3
    pmaddubsw xmm3, xmm7

    movq xmm4, [ecx+2*edx]
    punpcklqdq xmm4, xmm4
    pmaddubsw xmm4, xmm7
    movq xmm5, [ecx+edi]
    punpcklqdq xmm5, xmm5
    pmaddubsw xmm5, xmm7
    psubsw xmm2, xmm4
    psubsw xmm3, xmm5

    SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
    pabsw xmm0, xmm0
    pabsw xmm2, xmm2
    pabsw xmm1, xmm1
    pabsw xmm3, xmm3

    ; Swap odd/even words between the pair and keep the per-lane maximum.
    movdqa xmm4, xmm3
    pblendw xmm3, xmm1, 0xAA
    pslld xmm1, 16
    psrld xmm4, 16
    por xmm1, xmm4
    pmaxuw xmm1, xmm3
    paddw xmm6, xmm1

    movdqa xmm4, xmm0
; Tail of %macro SSE41_GetSatd8x4 (opened on the preceding source line):
; second word-swap / maximum step, accumulated into xmm6.
    pblendw xmm0, xmm2, 0xAA
    pslld xmm2, 16
    psrld xmm4, 16
    por xmm2, xmm4
    pmaxuw xmm0, xmm2
    paddw xmm6, xmm0
%endmacro

; SSSE3_SumWHorizon dstReg32, src, tmp1, tmp2
; dstReg32 = sum of the eight words of src. tmp1 is set to words of 1 and
; tmp2 is clobbered; src is destroyed.
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
    MMX_DW_1_2REG %3, %4
    pmaddwd %2, %3
    movhlps %4, %2
    paddd %2, %4
    pshuflw %4, %2,0Eh
    paddd %2, %4
    movd %1, %2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  [esp+8..20] after `push ebx` -> eax/ebx = first block pointer/stride,
;      ecx/edx = second block pointer/stride.
; Out: eax = horizontal sum produced by SSSE3_SumWHorizon.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
WelsSampleSatd4x4_sse41:
    push ebx
    mov eax,[esp+8]
    mov ebx,[esp+12]
    mov ecx,[esp+16]
    mov edx,[esp+20]
    movdqa xmm4,[HSwapSumSubDB1]
    ; Each register receives two 4-byte rows, every row duplicated so that
    ; pmaddubsw yields both pair sums and pair differences.
    movd xmm2,[ecx]
    movd xmm5,[ecx+edx]
    shufps xmm2,xmm5,0
    movd xmm3,[ecx+edx*2]
    lea ecx, [edx*2+ecx]
    movd xmm5,[ecx+edx]
    shufps xmm3,xmm5,0
    movd xmm0,[eax]
    movd xmm5,[eax+ebx]
    shufps xmm0,xmm5,0
    movd xmm1,[eax+ebx*2]
    lea eax, [ebx*2+eax]
    movd xmm5,[eax+ebx]
    shufps xmm1,xmm5,0
    pmaddubsw xmm0,xmm4
    pmaddubsw xmm1,xmm4
    pmaddubsw xmm2,xmm4
    pmaddubsw xmm3,xmm4
    psubw xmm0,xmm2
    psubw xmm1,xmm3
    movdqa xmm2,xmm0
    paddw xmm0,xmm1
    psubw xmm1,xmm2
    movdqa xmm2,xmm0
    punpcklqdq xmm0,xmm1
    punpckhqdq xmm2,xmm1
    movdqa xmm1,xmm0
    paddw xmm0,xmm2
    psubw xmm2,xmm1
    movdqa xmm1,xmm0
    pblendw xmm0,xmm2,0AAh
    pslld xmm2,16
    psrld xmm1,16
    por xmm2,xmm1
    pabsw xmm0,xmm0
    pabsw xmm2,xmm2
    pmaxsw xmm0,xmm2
    SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  [esp+16..28] after three pushes -> eax/ebx = first block
;      pointer/stride, ecx/edx = second block pointer/stride.
; Out: eax = sum over two 8x4 strips.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
align 16
WelsSampleSatd8x8_sse41:
    push ebx
    push esi
    push edi
    mov eax, [esp+16]
    mov ebx, [esp+20]
    mov ecx, [esp+24]
    mov edx, [esp+28]
    movdqa xmm7, [HSumSubDB1]
    lea esi, [ebx+ebx*2] ; 3 * first stride
    lea edi, [edx+edx*2] ; 3 * second stride
    pxor xmm6, xmm6 ; accumulator
    SSE41_GetSatd8x4
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    SSE41_GetSatd8x4
    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
    pop edi
    pop esi
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  [esp+pushsize+4..16] after four pushes -> eax/ebx = first block
;      pointer/stride, ecx/edx = second block pointer/stride.
; Out: eax = sum over four 8x4 strips (ebp counts the strips).
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
align 16
WelsSampleSatd8x16_sse41:
    push ebx
    push esi
    push edi
    push ebp
%define pushsize 16
    mov eax, [esp+pushsize+4]
    mov ebx, [esp+pushsize+8]
    mov ecx, [esp+pushsize+12]
    mov edx, [esp+pushsize+16]
    movdqa xmm7, [HSumSubDB1]
    lea esi, [ebx+ebx*2]
    lea edi, [edx+edx*2]
    pxor xmm6, xmm6
    mov ebp, 0
loop_get_satd_8x16:
    SSE41_GetSatd8x4
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    inc ebp
    cmp ebp, 4
    jl loop_get_satd_8x16
    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
    ; Release the single-line macro so it does not leak into later code,
    ; matching what WelsSampleSatd16x16_sse41 does.
%undef pushsize
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  [esp+16..28] after three pushes (same roles as the 8x8 routine).
; Out: eax = sum over the left and right 8x8 halves.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
align 16
WelsSampleSatd16x8_sse41:
    push ebx
    push esi
    push edi
    mov eax, [esp+16]
    mov ebx, [esp+20]
    mov ecx, [esp+24]
    mov edx, [esp+28]
    movdqa xmm7, [HSumSubDB1]
    lea esi, [ebx+ebx*2]
    lea edi, [edx+edx*2]
    pxor xmm6, xmm6
    SSE41_GetSatd8x4
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    SSE41_GetSatd8x4
    ; Right half: reload the original pointers and move 8 bytes across.
    mov eax, [esp+16]
    mov ecx, [esp+24]
    add eax, 8
    add ecx, 8
    SSE41_GetSatd8x4
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    SSE41_GetSatd8x4
    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
    pop edi
    pop esi
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  [esp+pushsize+4..16] after four pushes (same roles as above).
; Out: eax = sum over eight 8x4 strips: four on the left 8 columns, then
;      four on the right 8 columns.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse41
align 16
WelsSampleSatd16x16_sse41:
    push ebx
    push esi
    push edi
    push ebp
%define pushsize 16
    mov eax, [esp+pushsize+4]
    mov ebx, [esp+pushsize+8]
    mov ecx, [esp+pushsize+12]
    mov edx, [esp+pushsize+16]
    movdqa xmm7, [HSumSubDB1]
    lea esi, [ebx+ebx*2]
    lea edi, [edx+edx*2]
    pxor xmm6, xmm6
    mov ebp, 0
loop_get_satd_16x16_left:
    SSE41_GetSatd8x4
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    inc ebp
    cmp ebp, 4
    jl loop_get_satd_16x16_left

    mov eax, [esp+pushsize+4]
    mov ecx, [esp+pushsize+12]
    add eax, 8
    add ecx, 8
    mov ebp, 0
loop_get_satd_16x16_right:
    SSE41_GetSatd8x4
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    inc ebp
    cmp ebp, 4
    jl loop_get_satd_16x16_right
    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
%undef pushsize
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret

;***********************************************************************
;
;Pixel_satd_wxh_sse41 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_sad_wxh_sse2 BEGIN
;
;***********************************************************************

; SSE2_GetSad2x16
; Advances eax and ecx by two rows, then adds the SAD of the next two
; 16-byte rows to xmm0. [ecx] is read unaligned; [eax] goes through MOVDQ
; (defined in asm_inc.asm). Clobbers xmm1, xmm2.
%macro SSE2_GetSad2x16 0
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqu xmm1, [ecx]
    MOVDQ xmm2, [eax];[eax] must aligned 16
    psadbw xmm1, xmm2
    paddw xmm0, xmm1
    movdqu xmm1, [ecx+edx]
    MOVDQ xmm2, [eax+ebx]
    psadbw xmm1, xmm2
    paddw xmm0, xmm1
%endmacro

; SSE2_GetSad4x16
; Adds the SAD of four 16-byte rows to xmm7. Needs esi = 3*ebx and
; edi = 3*edx; pointers are not advanced. Clobbers xmm0-xmm2.
%macro SSE2_GetSad4x16 0
    movdqu xmm0, [ecx]
    MOVDQ xmm2, [eax]
    psadbw xmm0, xmm2
    paddw xmm7, xmm0
    movdqu xmm1, [ecx+edx]
    MOVDQ xmm2, [eax+ebx]
    psadbw xmm1, xmm2
    paddw xmm7, xmm1
    movdqu xmm1, [ecx+2*edx]
    MOVDQ xmm2, [eax+2*ebx];[eax] must aligned 16
    psadbw xmm1, xmm2
    paddw xmm7, xmm1
    movdqu xmm1, [ecx+edi]
    MOVDQ xmm2, [eax+esi]
    psadbw xmm1, xmm2
    paddw xmm7, xmm1
%endmacro

; SSE2_GetSad8x4
; Adds the SAD of four 8-byte rows to xmm6, packing two rows per register.
; eax and ecx are advanced by two rows. Clobbers xmm0-xmm3.
%macro SSE2_GetSad8x4 0
    movq xmm0, [eax]
    movq xmm1, [eax+ebx]
    lea eax, [eax+2*ebx]
    movhps xmm0, [eax]
    movhps xmm1, [eax+ebx]

    movq xmm2, [ecx]
    movq xmm3, [ecx+edx]
    lea ecx, [ecx+2*edx]
    movhps xmm2, [ecx]
    movhps xmm3, [ecx+edx]
    psadbw xmm0, xmm2
    psadbw xmm1, xmm3
    paddw xmm6, xmm0
    paddw xmm6, xmm1
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
;First
;parameter can align to 16 bytes,
;In wels, the third parameter can't align to 16 bytes.
;
; In:  [esp+_STACK_SIZE+4..16] after three pushes -> eax/ebx = first block
;      pointer/stride (read through MOVDQ), ecx/edx = second block
;      pointer/stride (read with movdqu).
; Out: eax = SAD over 16 rows of 16 bytes.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
align 16
WelsSampleSad16x16_sse2:
    push ebx
    push edi
    push esi

%define _STACK_SIZE 12

    mov eax, [esp+_STACK_SIZE+4 ]
    mov ebx, [esp+_STACK_SIZE+8 ]
    lea esi, [3*ebx]
    mov ecx, [esp+_STACK_SIZE+12]
    mov edx, [esp+_STACK_SIZE+16]
    lea edi, [3*edx]

    pxor xmm7, xmm7 ; accumulator
    SSE2_GetSad4x16
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    SSE2_GetSad4x16
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    SSE2_GetSad4x16
    lea eax, [eax+4*ebx]
    lea ecx, [ecx+4*edx]
    SSE2_GetSad4x16
    ; psadbw leaves one partial sum per 64-bit half; add the two halves.
    movhlps xmm0, xmm7
    paddw xmm0, xmm7
    movd eax, xmm0

%undef _STACK_SIZE

    pop esi
    pop edi
    pop ebx
    ret

;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
;First parameter can align to 16 bytes,
;In wels, the third parameter can't align to 16 bytes.
;
; In:  [esp+8..20] after `push ebx` -> eax/ebx and ecx/edx as above.
; Out: eax = SAD over 8 rows of 16 bytes (2 rows inline + 3 x 2 rows).
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
align 16
WelsSampleSad16x8_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]
    movdqu xmm0, [ecx]
    MOVDQ xmm2, [eax]
    psadbw xmm0, xmm2
    movdqu xmm1, [ecx+edx]
    MOVDQ xmm2, [eax+ebx]
    psadbw xmm1, xmm2
    paddw xmm0, xmm1

    SSE2_GetSad2x16
    SSE2_GetSad2x16
    SSE2_GetSad2x16

    movhlps xmm1, xmm0
    paddw xmm0, xmm1
    movd eax, xmm0
    pop ebx
    ret

;-----------------------------------------------------------------------
; WelsSampleSad8x16_sse2
; In:  [esp+8..20] after `push ebx` -> eax/ebx = first block pointer/stride,
;      ecx/edx = second block pointer/stride.
; Out: eax = SAD over 16 rows of 8 bytes (four SSE2_GetSad8x4 steps; the
;      macro advances two rows, the lea pair the other two).
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSad8x16_sse2
WelsSampleSad8x16_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]
    pxor xmm6, xmm6

    SSE2_GetSad8x4
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_GetSad8x4
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_GetSad8x4
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_GetSad8x4

    movhlps xmm0, xmm6
    paddw xmm0, xmm6
    movd eax, xmm0
    pop ebx
    ret

; CACHE_SPLIT_CHECK reg, width, cacheline
; Masks reg (destructively) to its offset within the cache line and compares
; it with the last offset at which a `width`-byte access still fits. For
; (8, 64) this is `and reg, 0x3f` / `cmp reg, 56`. Flags are the result.
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
and %1, 0x1f|(%3>>1)
cmp %1, (32-%2)|(%3>>1)
%endmacro

;-----------------------------------------------------------------------
; WelsSampleSad8x8_sse21
; In:  [esp+4] / [esp+8] = first block pointer/stride, [esp+12] / [esp+16]
;      = second block pointer/stride (offsets at entry, before any push).
; Out: eax = SAD over 8 rows of 8 bytes.
; If an 8-byte read from the second block could cross a 64-byte boundary,
; each row is rebuilt from two 8-byte-aligned loads shifted and OR-ed
; together; otherwise the plain SSE2_GetSad8x4 path is used.
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSad8x8_sse21
WelsSampleSad8x8_sse21:
    mov ecx, [esp+12]
    mov edx, ecx
    CACHE_SPLIT_CHECK edx, 8, 64
    jle near .pixel_sad_8x8_nsplit
    push ebx
    push edi
    mov eax, [esp+12]
    mov ebx, [esp+16]

    pxor xmm7, xmm7

    ; edi = misalignment in bytes, ecx = pointer rounded down to 8;
    ; xmm5 = 8*misalignment (right shift), xmm6 = 8*(8 - misalignment).
    mov edi, ecx
    and edi, 0x07
    sub ecx, edi
    mov edx, 8
    sub edx, edi

    shl edi, 3
    shl edx, 3
    movd xmm5, edi
    movd xmm6, edx
    mov edi, 8
    add edi, ecx ; edi = next aligned 8-byte group
    mov edx, [esp+24]

    movq xmm0, [eax]
    movhps xmm0, [eax+ebx]

    movq xmm1, [ecx]
    movq xmm2, [edi]
    movhps xmm1, [ecx+edx]
    movhps xmm2, [edi+edx]
    psrlq xmm1, xmm5
    psllq xmm2, xmm6
    por xmm1, xmm2

    psadbw xmm0, xmm1
    paddw xmm7, xmm0

    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    lea edi, [edi+2*edx]

    movq xmm0, [eax]
    movhps xmm0, [eax+ebx]

    movq xmm1, [ecx]
    movq xmm2, [edi]
    movhps xmm1, [ecx+edx]
    movhps xmm2, [edi+edx]
    psrlq xmm1, xmm5
    psllq xmm2, xmm6
    por xmm1, xmm2

    psadbw xmm0, xmm1
    paddw xmm7, xmm0

    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    lea edi, [edi+2*edx]

    movq xmm0, [eax]
    movhps xmm0, [eax+ebx]

    movq xmm1, [ecx]
    movq xmm2, [edi]
    movhps xmm1, [ecx+edx]
    movhps xmm2, [edi+edx]
    psrlq xmm1, xmm5
    psllq xmm2, xmm6
    por xmm1, xmm2

    psadbw xmm0, xmm1
    paddw xmm7, xmm0

    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    lea edi, [edi+2*edx]

    movq xmm0, [eax]
    movhps xmm0, [eax+ebx]

    movq xmm1, [ecx]
    movq xmm2, [edi]
    movhps xmm1, [ecx+edx]
    movhps xmm2, [edi+edx]
    psrlq xmm1, xmm5
    psllq xmm2, xmm6
    por xmm1, xmm2

    psadbw xmm0, xmm1
    paddw xmm7, xmm0

    movhlps xmm0, xmm7
    paddw xmm0, xmm7
    movd eax, xmm0
    pop edi
    jmp .return ; shared epilogue pops ebx
.pixel_sad_8x8_nsplit:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov edx, [esp+20] ; ecx was already loaded before the check
    pxor xmm6, xmm6
    SSE2_GetSad8x4
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    SSE2_GetSad8x4
    movhlps xmm0, xmm6
    paddw xmm0, xmm6
    movd eax, xmm0
.return:
    pop ebx
    ret

;***********************************************************************
;
;Pixel_sad_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************

; SSE2_Get4LW16Sad prevSrc, curSrc, nextSrc, refRow, refAddr
; One row step of the four-neighbour SAD. refRow holds the 16 reference
; bytes at refAddr.
;   xmm5 += SAD(prevSrc, refRow)       (reference one row below prevSrc)
;   xmm4 += SAD(nextSrc, refRow)       (reference one row above nextSrc)
;   xmm6 += SAD(curSrc, [refAddr-1])   (reference one byte to the left)
;   xmm7 += SAD(curSrc, [refAddr+1])   (reference one byte to the right)
; prevSrc and refRow are destroyed; curSrc and nextSrc are preserved.
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
    psadbw %1, %4
    paddw xmm5, %1
    psadbw %4, %3
    paddw xmm4, %4
    movdqu %4, [%5-1]
    psadbw %4, %2
    paddw xmm6, %4
    movdqu %4, [%5+1]
    psadbw %4, %2
    paddw xmm7, %4
%endmacro

;-----------------------------------------------------------------------
; WelsSampleSadFour16x16_sse2
; Computes four 16x16 SADs of the source block against the reference
; shifted by -stride, +stride, -1 and +1.
; In:  after `push ebx`: [esp+8] -> eax = source (movdqa, 16-byte aligned),
;      [esp+12] -> ebx = source stride, [esp+16] -> ecx = reference,
;      [esp+20] -> edx = reference stride, [esp+24] = output pointer.
; Out: four dwords stored with movdqa at the output pointer (so it must be
;      16-byte aligned), in the order -stride, +stride, -1, +1.
; xmm0/xmm1/xmm2 rotate through the roles previous/current/next source row.
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSadFour16x16_sse2
WelsSampleSadFour16x16_sse2:
    push ebx
    mov eax, [esp+8]
    mov ebx, [esp+12]
    mov ecx, [esp+16]
    mov edx, [esp+20]
    pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
    pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
    pxor xmm6, xmm6 ;sad pRefMb-1
    pxor xmm7, xmm7 ;sad pRefMb+1
    ; Rows 0 and 1 are handled by hand: row 0 has no previous source row.
    movdqa xmm0, [eax]
    sub ecx, edx ; ecx = reference row -1
    movdqu xmm3, [ecx]
    psadbw xmm3, xmm0
    paddw xmm4, xmm3

    movdqa xmm1, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    psadbw xmm3, xmm1
    paddw xmm4, xmm3

    movdqu xmm2, [ecx+edx-1]
    psadbw xmm2, xmm0
    paddw xmm6, xmm2

    movdqu xmm3, [ecx+edx+1]
    psadbw xmm3, xmm0
    paddw xmm7, xmm3

    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqa xmm2, [eax]
    movdqu xmm3, [ecx]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
    movdqa xmm0, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqa xmm1, [eax]
    movdqu xmm3, [ecx]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
    movdqa xmm2, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqa xmm0, [eax]
    movdqu xmm3, [ecx]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
    movdqa xmm1, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqa xmm2, [eax]
    movdqu xmm3, [ecx]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
    movdqa xmm0, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqa xmm1, [eax]
    movdqu xmm3, [ecx]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
    movdqa xmm2, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqa xmm0, [eax]
    movdqu xmm3, [ecx]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
    movdqa xmm1, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
    lea eax, [eax+2*ebx]
    lea ecx, [ecx+2*edx]
    movdqa xmm2, [eax]
    movdqu xmm3, [ecx]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
    movdqa xmm0, [eax+ebx]
    movdqu xmm3, [ecx+edx]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
    ; Final rows by hand: xmm2 = source row 14, xmm0 = source row 15; the
    ; last row has no next source row.
    lea ecx, [ecx+2*edx]
    movdqu xmm3, [ecx]
    psadbw xmm2, xmm3
    paddw xmm5, xmm2

    movdqu xmm2, [ecx-1]
    psadbw xmm2, xmm0
    paddw xmm6, xmm2

    movdqu xmm3, [ecx+1]
    psadbw xmm3, xmm0
    paddw xmm7, xmm3

    movdqu xmm3, [ecx+edx]
    psadbw xmm0, xmm3
    paddw xmm5, xmm0

    ; Fold each accumulator's two halves and pack the four results.
    mov ecx, [esp+24]
    movhlps xmm0, xmm4
    paddw xmm4, xmm0
    movhlps xmm0, xmm5
    paddw xmm5, xmm0
    movhlps xmm0, xmm6
    paddw xmm6, xmm0
    movhlps xmm0, xmm7
    paddw xmm7, xmm0
    punpckldq xmm4, xmm5
    punpckldq xmm6, xmm7
    punpcklqdq xmm4, xmm6
    movdqa [ecx],xmm4
    pop ebx
    ret

;-----------------------------------------------------------------------
; WelsSampleSadFour16x8_sse2
; 8-row variant of the routine above, with edi as the reference pointer.
; In:  after two pushes: [esp+12] -> eax = source, [esp+16] -> ebx = source
;      stride, [esp+20] -> edi = reference, [esp+24] -> edx = reference
;      stride.
; (The routine continues on the next source line.)
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSadFour16x8_sse2
WelsSampleSadFour16x8_sse2:
    push ebx
    push edi
    mov eax, [esp+12]
    mov ebx, [esp+16]
    mov edi, [esp+20]
    mov edx, [esp+24]
    pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
    pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
    pxor xmm6, xmm6 ;sad pRefMb-1
    pxor xmm7, xmm7 ;sad pRefMb+1
    movdqa xmm0, [eax]
    sub edi, edx
    movdqu xmm3, [edi]
    psadbw xmm3, xmm0
    paddw xmm4, xmm3

    movdqa xmm1, [eax+ebx]
    movdqu xmm3, [edi+edx]
    psadbw xmm3, xmm1
    paddw xmm4, xmm3

    movdqu xmm2, [edi+edx-1]
    psadbw xmm2, xmm0
    paddw xmm6, xmm2

    movdqu xmm3, [edi+edx+1]
    psadbw xmm3, xmm0
    paddw xmm7, xmm3

    lea eax, [eax+2*ebx]
    lea edi, [edi+2*edx]
    movdqa xmm2, [eax]
    movdqu xmm3, [edi]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi
    movdqa xmm0, [eax+ebx]
    movdqu xmm3, [edi+edx]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi+edx
    lea eax, [eax+2*ebx]
    lea edi, [edi+2*edx]
    movdqa xmm1, [eax]
    movdqu xmm3, [edi]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi
    movdqa xmm2, [eax+ebx]
    movdqu xmm3, [edi+edx]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi+edx
    lea eax, [eax+2*ebx]
    lea edi, [edi+2*edx]
    movdqa xmm0, [eax]
movdqu xmm3, [edi] SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi movdqa xmm1, [eax+ebx] movdqu xmm3, [edi+edx] SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi+edx lea edi, [edi+2*edx] movdqu xmm3, [edi] psadbw xmm0, xmm3 paddw xmm5, xmm0 movdqu xmm0, [edi-1] psadbw xmm0, xmm1 paddw xmm6, xmm0 movdqu xmm3, [edi+1] psadbw xmm3, xmm1 paddw xmm7, xmm3 movdqu xmm3, [edi+edx] psadbw xmm1, xmm3 paddw xmm5, xmm1 mov edi, [esp+28] movhlps xmm0, xmm4 paddw xmm4, xmm0 movhlps xmm0, xmm5 paddw xmm5, xmm0 movhlps xmm0, xmm6 paddw xmm6, xmm0 movhlps xmm0, xmm7 paddw xmm7, xmm0 punpckldq xmm4, xmm5 punpckldq xmm6, xmm7 punpcklqdq xmm4, xmm6 movdqa [edi],xmm4 pop edi pop ebx ret WELS_EXTERN WelsSampleSadFour8x16_sse2 WelsSampleSadFour8x16_sse2: push ebx push edi mov eax, [esp+12] mov ebx, [esp+16] mov edi, [esp+20] mov edx, [esp+24] pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref pxor xmm6, xmm6 ;sad pRefMb-1 pxor xmm7, xmm7 ;sad pRefMb+1 movq xmm0, [eax] movhps xmm0, [eax+ebx] sub edi, edx movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq 
xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 mov edi, [esp+28] movhlps xmm0, xmm4 paddw xmm4, xmm0 movhlps xmm0, xmm5 paddw xmm5, xmm0 movhlps xmm0, xmm6 paddw xmm6, xmm0 movhlps xmm0, xmm7 paddw xmm7, xmm0 punpckldq xmm4, 
xmm5 punpckldq xmm6, xmm7 punpcklqdq xmm4, xmm6 movdqa [edi],xmm4 pop edi pop ebx ret WELS_EXTERN WelsSampleSadFour8x8_sse2 WelsSampleSadFour8x8_sse2: push ebx push edi mov eax, [esp+12] mov ebx, [esp+16] mov edi, [esp+20] mov edx, [esp+24] pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref pxor xmm6, xmm6 ;sad pRefMb-1 pxor xmm7, xmm7 ;sad pRefMb+1 movq xmm0, [eax] movhps xmm0, [eax+ebx] sub edi, edx movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 movq xmm0, [eax] movhps xmm0, [eax+ebx] psadbw xmm3, xmm0 paddw xmm4, xmm3 movq xmm1, [edi+edx-1] movq xmm3, [edi+edx+1] lea eax, [eax+2*ebx] lea edi, [edi+2*edx] movhps xmm1, [edi-1] movhps xmm3, [edi+1] psadbw xmm1, xmm0 paddw xmm6, xmm1 psadbw xmm3, xmm0 paddw xmm7, xmm3 movq xmm3, [edi] movhps xmm3, [edi+edx] psadbw xmm0, xmm3 paddw xmm5, xmm0 mov edi, [esp+28] movhlps xmm0, xmm4 paddw xmm4, xmm0 movhlps xmm0, xmm5 paddw xmm5, xmm0 movhlps xmm0, xmm6 paddw xmm6, xmm0 movhlps xmm0, xmm7 paddw xmm7, xmm0 
punpckldq xmm4, xmm5 punpckldq xmm6, xmm7 punpcklqdq xmm4, xmm6 movdqa [edi],xmm4 pop edi pop ebx ret WELS_EXTERN WelsSampleSadFour4x4_sse2 WelsSampleSadFour4x4_sse2: push ebx push edi mov eax, [esp+12] mov ebx, [esp+16] mov edi, [esp+20] mov edx, [esp+24] movd xmm0, [eax] movd xmm1, [eax+ebx] lea eax, [eax+2*ebx] movd xmm2, [eax] movd xmm3, [eax+ebx] punpckldq xmm0, xmm1 punpckldq xmm2, xmm3 punpcklqdq xmm0, xmm2 sub edi, edx movd xmm1, [edi] movd xmm2, [edi+edx] punpckldq xmm1, xmm2 movd xmm2, [edi+edx-1] movd xmm3, [edi+edx+1] lea edi, [edi+2*edx] movd xmm4, [edi] movd xmm5, [edi-1] punpckldq xmm2, xmm5 movd xmm5, [edi+1] punpckldq xmm3, xmm5 movd xmm5, [edi+edx] punpckldq xmm4, xmm5 punpcklqdq xmm1, xmm4 ;-L movd xmm5, [edi+edx-1] movd xmm6, [edi+edx+1] lea edi, [edi+2*edx] movd xmm7, [edi-1] punpckldq xmm5, xmm7 punpcklqdq xmm2, xmm5 ;-1 movd xmm7, [edi+1] punpckldq xmm6, xmm7 punpcklqdq xmm3, xmm6 ;+1 movd xmm6, [edi] movd xmm7, [edi+edx] punpckldq xmm6, xmm7 punpcklqdq xmm4, xmm6 ;+L psadbw xmm1, xmm0 psadbw xmm2, xmm0 psadbw xmm3, xmm0 psadbw xmm4, xmm0 movhlps xmm0, xmm1 paddw xmm1, xmm0 movhlps xmm0, xmm2 paddw xmm2, xmm0 movhlps xmm0, xmm3 paddw xmm3, xmm0 movhlps xmm0, xmm4 paddw xmm4, xmm0 mov edi, [esp+28] punpckldq xmm1, xmm4 punpckldq xmm2, xmm3 punpcklqdq xmm1, xmm2 movdqa [edi],xmm1 pop edi pop ebx ret ;*********************************************************************** ; ;Pixel_sad_4_wxh_sse2 END ; ;*********************************************************************** WELS_EXTERN WelsSampleSad4x4_mmx align 16 ;*********************************************************************** ; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t ) ;*********************************************************************** WelsSampleSad4x4_mmx: push ebx %define pushsize 4 %define pix1address esp+pushsize+4 %define pix1stride esp+pushsize+8 %define pix2address esp+pushsize+12 %define pix2stride esp+pushsize+16 mov eax, [pix1address] 
mov ebx, [pix1stride ] mov ecx, [pix2address] mov edx, [pix2stride ] movd mm0, [eax] movd mm1, [eax+ebx] punpckldq mm0, mm1 movd mm3, [ecx] movd mm4, [ecx+edx] punpckldq mm3, mm4 psadbw mm0, mm3 lea eax, [eax+2*ebx] lea ecx, [ecx+2*edx] movd mm1, [eax] movd mm2, [eax+ebx] punpckldq mm1, mm2 movd mm3, [ecx] movd mm4, [ecx+edx] punpckldq mm3, mm4 psadbw mm1, mm3 paddw mm0, mm1 movd eax, mm0 WELSEMMS pop ebx %undef pushsize %undef pix1address %undef pix1stride %undef pix2address %undef pix2stride ret