ref: a913cc853e517c2a5a0f79cc72cd5df590d82317
dir: /codec/decoder/core/asm/mb_copy.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  mb_copy.asm
;*
;*  Abstract
;*      mb_copy and mb_copy1
;*
;*  History
;*      15/09/2009 Created
;*      12/28/2009 Modified with larger throughput
;*      12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;*                 WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
;*********************************************************************************************/
; Syntax : NASM (Intel operand order), 32-bit code.
; ABI    : every routine below reads its arguments from the stack relative to
;          esp and saves/restores each of esi/edi/ebp/ebx that it modifies.
;          eax, ecx, edx and the MMX/XMM registers used are not preserved.
%include "asm_inc.asm"
BITS 32

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************

;SECTION .rodata data align=16

;*******************************************************************************
; Various memory constants (trigonometric values or rounding values)
;*******************************************************************************

ALIGN 16

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

WELS_EXTERN PixelAvgWidthEq4_mmx
WELS_EXTERN PixelAvgWidthEq8_mmx
WELS_EXTERN PixelAvgWidthEq16_sse2

WELS_EXTERN McCopyWidthEq4_mmx
WELS_EXTERN McCopyWidthEq8_mmx
WELS_EXTERN McCopyWidthEq16_sse2


ALIGN 16
;*******************************************************************************
; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
;                              uint8_t *pSrcA, int iSrcAStride,
;                              uint8_t *pSrcB, int iSrcBStride,
;                              int iHeight );
;
; Writes the per-byte rounded average (pavgb) of a 4-byte-wide row of pSrcA
; and pSrcB to pDst, one row per iteration, for iHeight rows.
;
; Registers: edi = pDst, eax = iDstStride, esi = pSrcA, ecx = iSrcAStride,
;            ebp = pSrcB, edx = iSrcBStride, ebx = rows remaining.
; Precondition: iHeight >= 1 (the loop body runs before the counter is tested).
;*******************************************************************************
PixelAvgWidthEq4_mmx:
    push    esi
    push    edi
    push    ebp
    push    ebx

    ; Four registers pushed, so the first argument is at esp + 4*4 + 4.
    mov     edi, [esp+20]           ; pDst
    mov     eax, [esp+24]           ; iDstStride
    mov     esi, [esp+28]           ; pSrcA
    mov     ecx, [esp+32]           ; iSrcAStride
    mov     ebp, [esp+36]           ; pSrcB
    mov     edx, [esp+40]           ; iSrcBStride
    mov     ebx, [esp+44]           ; iHeight
ALIGN 4
.height_loop:
    movd    mm0, [ebp]              ; 4 bytes of pSrcB
    ; Load exactly 4 bytes of pSrcA. A memory operand on pavgb would read
    ; 8 bytes, i.e. 4 bytes beyond the row being processed.
    movd    mm1, [esi]
    pavgb   mm0, mm1
    movd    [edi], mm0

    lea     edi, [edi+eax]
    lea     esi, [esi+ecx]
    lea     ebp, [ebp+edx]
    dec     ebx
    jnz     .height_loop

    WELSEMMS                        ; macro from asm_inc.asm (presumably emms - confirm)
    pop     ebx
    pop     ebp
    pop     edi
    pop     esi
    ret


ALIGN 16
;*******************************************************************************
; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
;                              uint8_t *pSrcA, int iSrcAStride,
;                              uint8_t *pSrcB, int iSrcBStride,
;                              int iHeight );
;
; Same operation as PixelAvgWidthEq4_mmx for 8-byte-wide rows, two rows per
; iteration.
;
; Registers: edi = pDst, eax = iDstStride, esi = pSrcA, ecx = iSrcAStride,
;            ebp = pSrcB, edx = iSrcBStride, ebx = rows remaining.
; Precondition: iHeight is a positive multiple of 2 (the counter steps by 2
;               and the loop only exits when it reaches exactly zero).
;*******************************************************************************
PixelAvgWidthEq8_mmx:
    push    esi
    push    edi
    push    ebp
    push    ebx

    mov     edi, [esp+20]           ; pDst
    mov     eax, [esp+24]           ; iDstStride
    mov     esi, [esp+28]           ; pSrcA
    mov     ecx, [esp+32]           ; iSrcAStride
    mov     ebp, [esp+36]           ; pSrcB
    mov     edx, [esp+40]           ; iSrcBStride
    mov     ebx, [esp+44]           ; iHeight
ALIGN 4
.height_loop:
    ; row 0
    movq    mm0, [esi]
    pavgb   mm0, [ebp]
    movq    [edi], mm0
    ; row 1
    movq    mm0, [esi+ecx]
    pavgb   mm0, [ebp+edx]
    movq    [edi+eax], mm0

    lea     esi, [esi+2*ecx]
    lea     ebp, [ebp+2*edx]
    lea     edi, [edi+2*eax]
    sub     ebx, 2
    jnz     .height_loop

    WELSEMMS                        ; macro from asm_inc.asm (presumably emms - confirm)
    pop     ebx
    pop     ebp
    pop     edi
    pop     esi
    ret


ALIGN 16
;*******************************************************************************
; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
;                                uint8_t *pSrcA, int iSrcAStride,
;                                uint8_t *pSrcB, int iSrcBStride,
;                                int iHeight );
;
; Same operation for 16-byte-wide rows, four rows per iteration. All loads and
; stores are unaligned (movdqu), so none of the three buffers needs 16-byte
; alignment.
;
; Registers: edi = pDst, eax = iDstStride, esi = pSrcA, ecx = iSrcAStride,
;            ebp = pSrcB, edx = iSrcBStride, ebx = rows remaining.
; Precondition: iHeight is a positive multiple of 4 (the counter steps by 4
;               and the loop only exits when it reaches exactly zero).
;*******************************************************************************
PixelAvgWidthEq16_sse2:
    push    esi
    push    edi
    push    ebp
    push    ebx

    mov     edi, [esp+20]           ; pDst
    mov     eax, [esp+24]           ; iDstStride
    mov     esi, [esp+28]           ; pSrcA
    mov     ecx, [esp+32]           ; iSrcAStride
    mov     ebp, [esp+36]           ; pSrcB
    mov     edx, [esp+40]           ; iSrcBStride
    mov     ebx, [esp+44]           ; iHeight
ALIGN 4
.height_loop:
    ; pSrcB is loaded with movdqu and averaged register-to-register: a 128-bit
    ; memory operand on pavgb would fault on a row that is not 16-byte aligned.

    ; row 0
    movdqu  xmm0, [esi]
    movdqu  xmm1, [ebp]
    pavgb   xmm0, xmm1
    movdqu  [edi], xmm0
    ; row 1
    movdqu  xmm0, [esi+ecx]
    movdqu  xmm1, [ebp+edx]
    pavgb   xmm0, xmm1
    movdqu  [edi+eax], xmm0

    lea     esi, [esi+2*ecx]
    lea     ebp, [ebp+2*edx]
    lea     edi, [edi+2*eax]

    ; row 2
    movdqu  xmm0, [esi]
    movdqu  xmm1, [ebp]
    pavgb   xmm0, xmm1
    movdqu  [edi], xmm0
    ; row 3
    movdqu  xmm0, [esi+ecx]
    movdqu  xmm1, [ebp+edx]
    pavgb   xmm0, xmm1
    movdqu  [edi+eax], xmm0

    lea     esi, [esi+2*ecx]
    lea     ebp, [ebp+2*edx]
    lea     edi, [edi+2*eax]
    sub     ebx, 4
    jnz     .height_loop

    ; No MMX register is used here; the macro is kept so the FPU/MMX state
    ; seen by existing callers does not change.
    WELSEMMS
    pop     ebx
    pop     ebp
    pop     edi
    pop     esi
    ret


ALIGN 16
;*******************************************************************************
; void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
;                            uint8_t *pDst, int iDstStride, int iHeight )
;
; Copies iHeight rows of 4 bytes from pSrc to pDst through a general-purpose
; register.
;
; Registers: esi = pSrc, eax = iSrcStride, edi = pDst, ecx = iDstStride,
;            edx = rows remaining, ebx = row data.
; Precondition: iHeight >= 1 (the loop body runs before the counter is tested).
;*******************************************************************************
McCopyWidthEq4_mmx:
    push    esi
    push    edi
    push    ebx

    ; Three registers pushed, so the first argument is at esp + 3*4 + 4.
    mov     esi, [esp+16]           ; pSrc
    mov     eax, [esp+20]           ; iSrcStride
    mov     edi, [esp+24]           ; pDst
    mov     ecx, [esp+28]           ; iDstStride
    mov     edx, [esp+32]           ; iHeight
ALIGN 4
.height_loop:
    mov     ebx, [esi]
    mov     [edi], ebx

    add     esi, eax
    add     edi, ecx
    dec     edx
    jnz     .height_loop

    ; No MMX register is used here; the macro is kept so the FPU/MMX state
    ; seen by existing callers does not change.
    WELSEMMS
    pop     ebx
    pop     edi
    pop     esi
    ret


ALIGN 16
;*******************************************************************************
; void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
;                            uint8_t *pDst, int iDstStride, int iHeight )
;
; Copies iHeight rows of 8 bytes from pSrc to pDst through mm0.
;
; Registers: esi = pSrc, eax = iSrcStride, edi = pDst, ecx = iDstStride,
;            edx = rows remaining.
; Precondition: iHeight >= 1 (the loop body runs before the counter is tested).
;*******************************************************************************
McCopyWidthEq8_mmx:
    push    esi
    push    edi

    ; Two registers pushed, so the first argument is at esp + 2*4 + 4.
    mov     esi, [esp+12]           ; pSrc
    mov     eax, [esp+16]           ; iSrcStride
    mov     edi, [esp+20]           ; pDst
    mov     ecx, [esp+24]           ; iDstStride
    mov     edx, [esp+28]           ; iHeight
ALIGN 4
.height_loop:
    movq    mm0, [esi]
    movq    [edi], mm0

    add     esi, eax
    add     edi, ecx
    dec     edx
    jnz     .height_loop

    WELSEMMS                        ; macro from asm_inc.asm (presumably emms - confirm)
    pop     edi
    pop     esi
    ret


ALIGN 16
;*******************************************************************************
; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride,
;                              uint8_t *pDst, int iDstStride, int iHeight )
;
; Copies iHeight rows of 16 bytes from pSrc to pDst, two rows per iteration,
; using two 8-byte halves per row so no 16-byte alignment is required.
;
; Registers: esi = pSrc, eax = iSrcStride, edi = pDst, edx = iDstStride,
;            ecx = rows remaining.
; Precondition: iHeight is a positive multiple of 2 (the counter steps by 2
;               and the loop only exits when it reaches exactly zero).
;*******************************************************************************

; Read 16 bytes from an unaligned address.
;   %1 = destination xmm register, %2 = address expression (without brackets)
%macro SSE_READ_UNA 2
    movq    %1, [%2]
    movhps  %1, [%2+8]
%endmacro

; Write 16 bytes to an unaligned address.
;   %1 = address expression (without brackets), %2 = source xmm register
%macro SSE_WRITE_UNA 2
    movq    [%1], %2
    movhps  [%1+8], %2
%endmacro

McCopyWidthEq16_sse2:
    push    esi
    push    edi

    mov     esi, [esp+12]           ; pSrc
    mov     eax, [esp+16]           ; iSrcStride
    mov     edi, [esp+20]           ; pDst
    mov     edx, [esp+24]           ; iDstStride
    mov     ecx, [esp+28]           ; iHeight
ALIGN 4
.height_loop:
    SSE_READ_UNA    xmm0, esi
    SSE_READ_UNA    xmm1, esi+eax
    SSE_WRITE_UNA   edi, xmm0
    SSE_WRITE_UNA   edi+edx, xmm1

    lea     esi, [esi+eax*2]
    lea     edi, [edi+edx*2]
    sub     ecx, 2
    jnz     .height_loop

    pop     edi
    pop     esi
    ret