ref: 8a0bc0a8b9c729dd7d7ff1f2b34f02922194bb5f
dir: /codec/common/vaa.asm/
;*! ;* \copy ;* Copyright (c) 2010-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. ;* ;* ;* vaa.asm ;* ;* Abstract ;* sse2 for pVaa routines ;* ;* History ;* 04/14/2010 Created ;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3) ;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement ;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 ;* ;*************************************************************************/ %include "asm_inc.asm" ;*********************************************************************** ; Macros and other preprocessor constants ;*********************************************************************** ; by comparing it outperforms than phaddw(SSSE3) sets %macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp ; @sum_8x2 begin pshufd %2, %1, 04Eh ; 01001110 B paddw %1, %2 pshuflw %2, %1, 04Eh ; 01001110 B paddw %1, %2 pshuflw %2, %1, 0B1h ; 10110001 B paddw %1, %2 ; end of @sum_8x2 %endmacro ; END of SUM_WORD_8x2_SSE2 %macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 movdqa %1, [r0 ] ; line 0 movdqa %2, [r0+r1] ; line 1 movdqa %3, %1 punpcklbw %1, xmm7 punpckhbw %3, xmm7 movdqa %4, %2 punpcklbw %4, xmm7 punpckhbw %2, xmm7 paddw %1, %4 paddw %2, %3 movdqa %3, [r0+r2] ; line 2 movdqa %4, [r0+r3] ; line 3 movdqa %5, %3 punpcklbw %3, xmm7 punpckhbw %5, xmm7 movdqa %6, %4 punpcklbw %6, xmm7 punpckhbw %4, xmm7 paddw %3, %6 paddw %4, %5 paddw %1, %3 ; block 0, 1 paddw %2, %4 ; block 2, 3 pshufd %3, %1, 0B1h pshufd %4, %2, 0B1h paddw %1, %3 paddw %2, %4 movdqa %3, %1 movdqa %4, %2 pshuflw %5, %1, 0B1h pshufhw %6, %3, 0B1h paddw %1, %5 paddw %3, %6 pshuflw %5, %2, 0B1h pshufhw %6, %4, 0B1h paddw %2, %5 paddw %4, %6 punpcklwd %1, %2 punpckhwd %3, %4 punpcklwd %1, %3 psraw %1, $04 %endmacro %macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 movdqa %1, [r0 ] ; line 0 movdqa %2, [r0+r1] ; line 1 movdqa %3, %1 punpcklbw %1, xmm7 punpckhbw %3, xmm7 movdqa %4, %2 punpcklbw %4, xmm7 punpckhbw %2, xmm7 paddw %1, %4 paddw %2, %3 movdqa %3, [r0+r2] ; line 2 movdqa %4, [r0+r3] ; line 3 movdqa %5, %3 punpcklbw %3, xmm7 punpckhbw %5, xmm7 movdqa %6, %4 punpcklbw %6, xmm7 punpckhbw %4, xmm7 paddw %3, %6 paddw %4, %5 paddw %1, %3 ; block 0, 1 paddw %2, %4 ; block 2, 3 phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; .. phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... psraw %1, $04 %endmacro ;*********************************************************************** ; Local Data (Read Only) ;*********************************************************************** ;SECTION .rodata align=16 ;ALIGN 16 ;pack1_8x2: ; dw 1, 1, 1, 1, 1, 1, 1, 1 ;*********************************************************************** ; Code ;*********************************************************************** SECTION .text ; , 6/7/2010 WELS_EXTERN AnalysisVaaInfoIntra_sse2 ;*********************************************************************** ; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize ); ;*********************************************************************** ALIGN 16 AnalysisVaaInfoIntra_sse2: %assign push_num 0 LOAD_2_PARA SIGN_EXTENTION r1,r1d %ifdef X86_32 push r3 push r4 push r5 push r6 %assign push_num push_num+4 %endif mov r5,r7 and r5,0fh sub r7,r5 sub r7,32 mov r2,r1 sal r2,$01 ;r2 = 2*iLineSize mov r3,r2 add r3,r1 ;r3 = 3*iLineSize mov r4,r2 sal r4,$01 ;r4 = 4*iLineSize pxor xmm7, xmm7 ; loops VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7], xmm0 lea r0, [r0+r4] VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7+8], xmm0 lea r0, [r0+r4] VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7+16], xmm0 lea r0, [r0+r4] VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7+24], xmm0 movdqa xmm0, [r7] ; block 0~7 movdqa xmm1, [r7+16] ; block 8~15 movdqa xmm2, xmm0 paddw xmm0, xmm1 SUM_WORD_8x2_SSE2 xmm0, xmm3 pmullw xmm1, xmm1 pmullw xmm2, xmm2 movdqa xmm3, xmm1 movdqa xmm4, xmm2 punpcklwd xmm1, xmm7 punpckhwd xmm3, xmm7 punpcklwd xmm2, xmm7 punpckhwd xmm4, xmm7 paddd xmm1, xmm2 paddd xmm3, xmm4 paddd xmm1, xmm3 pshufd xmm2, xmm1, 01Bh paddd xmm1, xmm2 pshufd xmm2, xmm1, 0B1h paddd xmm1, xmm2 movd r2d, xmm0 and r2, 0ffffh ; effective low work truncated mov r3, r2 imul r2, r3 sar r2, $04 movd retrd, xmm1 sub retrd, r2d add r7,32 add r7,r5 %ifdef X86_32 pop r6 pop r5 pop r4 pop r3 %endif ret WELS_EXTERN AnalysisVaaInfoIntra_ssse3 ;*********************************************************************** ; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize ); ;*********************************************************************** ALIGN 16 AnalysisVaaInfoIntra_ssse3: %assign push_num 0 LOAD_2_PARA SIGN_EXTENTION r1,r1d %ifdef X86_32 push r3 push r4 push r5 push r6 %assign push_num push_num+4 %endif mov r5,r7 and r5,0fh sub r7,r5 sub r7,32 mov r2,r1 sal r2,$01 ;r2 = 2*iLineSize mov r3,r2 add r3,r1 ;r3 = 3*iLineSize mov r4,r2 sal r4,$01 ;r4 = 4*iLineSize pxor xmm7, xmm7 ; loops VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7],xmm0 lea r0,[r0+r4] VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 movq [r7+8],xmm1 lea r0,[r0+r4] VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7+16],xmm0 lea r0,[r0+r4] VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 movq [r7+24],xmm1 movdqa xmm0,[r7] movdqa xmm1,[r7+16] movdqa xmm2, xmm0 paddw xmm0, xmm1 SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets pmullw xmm1, xmm1 pmullw xmm2, xmm2 movdqa xmm3, xmm1 movdqa xmm4, xmm2 punpcklwd xmm1, xmm7 punpckhwd xmm3, xmm7 punpcklwd xmm2, xmm7 punpckhwd xmm4, xmm7 paddd xmm1, xmm2 paddd xmm3, xmm4 paddd xmm1, xmm3 pshufd xmm2, xmm1, 01Bh paddd xmm1, xmm2 pshufd xmm2, xmm1, 0B1h paddd xmm1, xmm2 movd r2d, xmm0 and r2, 0ffffh ; effective low work truncated mov r3, r2 imul r2, r3 sar r2, $04 movd retrd, xmm1 sub retrd, r2d add r7,32 add r7,r5 %ifdef X86_32 pop r6 pop r5 pop r4 pop r3 %endif ret WELS_EXTERN MdInterAnalysisVaaInfo_sse41 ;*********************************************************************** ; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 ) ;*********************************************************************** ALIGN 16 MdInterAnalysisVaaInfo_sse41: %assign push_num 0 LOAD_1_PARA movdqa xmm0,[r0] pshufd xmm1, xmm0, 01Bh paddd xmm1, xmm0 pshufd xmm2, xmm1, 0B1h paddd xmm1, xmm2 psrad xmm1, 02h ; iAverageSad movdqa xmm2, xmm1 psrad xmm2, 06h movdqa xmm3, xmm0 ; iSadBlock psrad xmm3, 06h psubd xmm3, xmm2 pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets pshufd xmm4, xmm3, 01Bh paddd xmm4, xmm3 pshufd xmm3, xmm4, 0B1h paddd xmm3, xmm4 movd r0d, xmm3 cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD jb near .threshold_exit pshufd xmm0, xmm0, 01Bh pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad movmskps retrd, xmm0 ret .threshold_exit: mov retrd, 15 ret WELS_EXTERN MdInterAnalysisVaaInfo_sse2 ;*********************************************************************** ; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 ) ;*********************************************************************** ALIGN 16 MdInterAnalysisVaaInfo_sse2: %assign push_num 0 LOAD_1_PARA movdqa xmm0, [r0] pshufd xmm1, xmm0, 01Bh paddd xmm1, xmm0 pshufd xmm2, xmm1, 0B1h paddd xmm1, xmm2 psrad xmm1, 02h ; iAverageSad movdqa xmm2, xmm1 psrad xmm2, 06h movdqa xmm3, xmm0 ; iSadBlock psrad xmm3, 06h psubd xmm3, xmm2 ; to replace pmulld functionality as below movdqa xmm2, xmm3 pmuludq xmm2, xmm3 pshufd xmm4, xmm3, 0B1h pmuludq xmm4, xmm4 movdqa xmm5, xmm2 punpckldq xmm5, xmm4 punpckhdq xmm2, xmm4 punpcklqdq xmm5, xmm2 pshufd xmm4, xmm5, 01Bh paddd xmm4, xmm5 pshufd xmm5, xmm4, 0B1h paddd xmm5, xmm4 movd r0d, xmm5 cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD jb near .threshold_exit pshufd xmm0, xmm0, 01Bh pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad movmskps retrd, xmm0 ret .threshold_exit: mov retrd, 15 ret