ref: 11f0a111772d964b30ee428979d2c25e11ee8df4
dir: /codec/encoder/core/x86/sample_sc.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. ;* ;*************************************************************************/ %include "asm_inc.asm" SECTION .text ;********************************************************************************************************************************** ; ; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost ) ; ; \note: ; src need align with 16 bytes, ref is optional ; \return value: ; return minimal SAD cost, according index carried by index_min_cost ;********************************************************************************************************************************** ; try 8 mv via offset ; xmm7 store sad costs %macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref movdqa xmm0, [%1] movdqu xmm1, [%2] movdqu xmm2, [%2+8h] movdqa xmm3, xmm1 movdqa xmm4, xmm2 mpsadbw xmm1, xmm0, 0 ; 000 B paddw xmm7, xmm1 ; accumulate cost mpsadbw xmm3, xmm0, 5 ; 101 B paddw xmm7, xmm3 ; accumulate cost mpsadbw xmm2, xmm0, 2 ; 010 B paddw xmm7, xmm2 ; accumulate cost mpsadbw xmm4, xmm0, 7 ; 111 B paddw xmm7, xmm4 ; accumulate cost add %1, %3 add %2, %4 %endmacro ; end of SAD_16x16_LINE_SSE41 %macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref movdqa xmm0, [%1] movdqu xmm1, [%2] movdqu xmm2, [%2+8h] movdqa xmm3, xmm1 movdqa xmm4, xmm2 mpsadbw xmm1, xmm0, 0 ; 000 B paddw xmm7, xmm1 ; accumulate cost mpsadbw xmm3, xmm0, 5 ; 101 B paddw xmm7, xmm3 ; accumulate cost mpsadbw xmm2, xmm0, 2 ; 010 B paddw xmm7, xmm2 ; accumulate cost mpsadbw xmm4, xmm0, 7 ; 111 B paddw xmm7, xmm4 ; accumulate cost %endmacro ; end of SAD_16x16_LINE_SSE41E WELS_EXTERN SampleSad16x16Hor8_sse41 ;push ebx ;push esi ;mov eax, [esp+12] ; src ;mov ecx, [esp+16] ; stride_src ;mov ebx, [esp+20] ; ref ;mov edx, [esp+24] ; stride_ref ;mov esi, [esp+28] ; base_cost %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 SIGN_EXTENSION r1, r1d SIGN_EXTENSION r3, r3d pxor xmm7, xmm7 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41E r0, r2, r1, r3 pxor xmm0, xmm0 movdqa xmm6, xmm7 punpcklwd xmm6, xmm0 punpckhwd xmm7, xmm0 movdqa xmm5, [r4] movdqa xmm4, xmm5 punpcklwd xmm4, xmm0 punpckhwd xmm5, xmm0 paddd xmm4, xmm6 paddd xmm5, xmm7 movdqa xmm3, xmm4 pminud xmm3, xmm5 pshufd xmm2, xmm3, 01001110B pminud xmm2, xmm3 pshufd xmm3, xmm2, 10110001B pminud xmm2, xmm3 movd retrd, xmm2 pcmpeqd xmm4, xmm2 movmskps r2d, xmm4 bsf r1d, r2d jnz near WRITE_INDEX pcmpeqd xmm5, xmm2 movmskps r2d, xmm5 bsf r1d, r2d add r1d, 4 WRITE_INDEX: mov [r5], r1d POP_XMM LOAD_6_PARA_POP ret ;********************************************************************************************************************************** ; ; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost ) ; ; \note: ; src and ref is optional to align with 16 due inter 8x8 ; \return value: ; return minimal SAD cost, according index carried by index_min_cost ; ;********************************************************************************************************************************** ; try 8 mv via offset ; xmm7 store sad costs %macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref movdqu xmm0, [%1] movdqu xmm1, [%2] movdqa xmm2, xmm1 mpsadbw xmm1, xmm0, 0 ; 000 B paddw xmm7, xmm1 ; accumulate cost mpsadbw xmm2, xmm0, 5 ; 101 B paddw xmm7, xmm2 ; accumulate cost add %1, %3 add %2, %4 %endmacro ; end of SAD_8x8_LINE_SSE41 %macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref movdqu xmm0, [%1] movdqu xmm1, [%2] movdqa xmm2, xmm1 mpsadbw xmm1, xmm0, 0 ; 000 B paddw xmm7, xmm1 ; accumulate cost mpsadbw xmm2, xmm0, 5 ; 101 B paddw xmm7, xmm2 ; accumulate cost %endmacro ; end of SAD_8x8_LINE_SSE41E WELS_EXTERN SampleSad8x8Hor8_sse41 %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 SIGN_EXTENSION r1, r1d SIGN_EXTENSION r3, r3d movdqa xmm7, [r4] ; load base cost list SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41E r0, r2, r1, r3 phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX mov r1d, retrd and retrd, 0xFFFF sar r1d, 16 mov [r5], r1d POP_XMM LOAD_6_PARA_POP ret