shithub: openh264

ref: d821fbfd1c8fd5f0eae94a722e70e04a68e72324
dir: /codec/encoder/core/asm/score.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  score.asm
;*
;*  Abstract
;*      scan/score/count of sse2
;*
;*  History
;*      8/21/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Macros
;***********************************************************************

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16

;align 16
;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
align 16
sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
align 16
sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
align 16
sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1
align 16
pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
align 16
pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
align 16
pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
align 16
pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128

align 16
nozero_count_table:
db  0,1,1,2,1,2,2,3,1,2
db  2,3,2,3,3,4,1,2,2,3
db  2,3,3,4,2,3,3,4,3,4
db  4,5,1,2,2,3,2,3,3,4
db  2,3,3,4,3,4,4,5,2,3
db  3,4,3,4,4,5,3,4,4,5
db  4,5,5,6,1,2,2,3,2,3
db  3,4,2,3,3,4,3,4,4,5
db  2,3,3,4,3,4,4,5,3,4
db  4,5,4,5,5,6,2,3,3,4
db  3,4,4,5,3,4,4,5,4,5
db  5,6,3,4,4,5,4,5,5,6
db  4,5,5,6,5,6,6,7,1,2
db  2,3,2,3,3,4,2,3,3,4
db  3,4,4,5,2,3,3,4,3,4
db  4,5,3,4,4,5,4,5,5,6
db  2,3,3,4,3,4,4,5,3,4
db  4,5,4,5,5,6,3,4,4,5
db  4,5,5,6,4,5,5,6,5,6
db  6,7,2,3,3,4,3,4,4,5
db  3,4,4,5,4,5,5,6,3,4
db  4,5,4,5,5,6,4,5,5,6
db  5,6,6,7,3,4,4,5,4,5
db  5,6,4,5,5,6,5,6,6,7
db  4,5,5,6,5,6,6,7,5,6
db  6,7,6,7,7,8

align 16
high_mask_table:
	db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
	db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
	db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
	db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
	db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
	db  5, 8, 5, 7, 8,11, 6, 8, 8,11
	db  9,11,12,15, 0, 1, 1, 4, 1, 3
	db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
	db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
	db  7,10, 8,10,11,14, 3, 4, 4, 7
	db  5, 7, 8,11, 5, 7, 7,10, 8,10
	db 11,14, 6, 7, 8,11, 8,10,11,14
	db  9,11,11,14,12,14,15,18, 0, 0
	db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
	db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
	db  7,10, 5, 7, 7,10, 8,10,11,14
	db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
	db  6, 9, 7, 9,10,13, 5, 6, 7,10
	db  7, 9,10,13, 8,10,10,13,11,13
	db 14,17, 3, 4, 4, 7, 4, 6, 7,10
	db  5, 7, 7,10, 8,10,11,14, 5, 6
	db  7,10, 7, 9,10,13, 8,10,10,13
	db 11,13,14,17, 6, 7, 7,10, 8,10
	db 11,14, 8,10,10,13,11,13,14,17
	db  9,10,11,14,11,13,14,17,12,14
	db 14,17,15,17,18,21

align 16
low_mask_table:
    db  0, 3, 2, 6, 2, 5, 5, 9, 1, 5
    db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
    db  4, 7, 7,11, 4, 8, 7,11, 8,11
    db 11,15, 1, 4, 3, 7, 4, 7, 7,11
    db  3, 7, 6,10, 7,10,10,14, 4, 7
    db  7,11, 7,10,10,14, 7,11,10,14
    db 11,14,14,18, 0, 4, 3, 7, 3, 6
    db  6,10, 3, 7, 6,10, 7,10,10,14
    db  3, 6, 6,10, 6, 9, 9,13, 6,10
    db  9,13,10,13,13,17, 4, 7, 6,10
    db  7,10,10,14, 6,10, 9,13,10,13
    db 13,17, 7,10,10,14,10,13,13,17
    db 10,14,13,17,14,17,17,21, 0, 3
    db  3, 7, 3, 6, 6,10, 2, 6, 5, 9
    db  6, 9, 9,13, 3, 6, 6,10, 6, 9
    db  9,13, 6,10, 9,13,10,13,13,17
    db  3, 6, 5, 9, 6, 9, 9,13, 5, 9
    db  8,12, 9,12,12,16, 6, 9, 9,13
    db  9,12,12,16, 9,13,12,16,13,16
    db 16,20, 3, 7, 6,10, 6, 9, 9,13
    db  6,10, 9,13,10,13,13,17, 6, 9
    db  9,13, 9,12,12,16, 9,13,12,16
    db 13,16,16,20, 7,10, 9,13,10,13
    db 13,17, 9,13,12,16,13,16,16,20
    db 10,13,13,17,13,16,16,20,13,17
    db 16,20,17,20,20,24


SECTION .text

;***********************************************************************
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_sse2
WelsScan4x4DcAc_sse2:
	%ifdef X86_32
	push r3
	%assign push_num 1
	%else
	%assign push_num 0
	%endif
	LOAD_2_PARA
	;mov        eax, [esp+8]
	movdqa     xmm0, [r1]			; 7 6 5 4 3 2 1 0
	movdqa     xmm1, [r1+16]		; f e d c b a 9 8
	pextrw     r2d, xmm0, 7			; ecx = 7
	pextrw     r3d, xmm1, 2			; edx = a
	pextrw     r1d, xmm0, 5			; eax = 5
	pinsrw     xmm1, r2d, 2			; f e d c b 7 9 8
	pinsrw     xmm0, r1d, 7			; 5 6 5 4 3 2 1 0
	pextrw     r2d, xmm1, 0			; ecx = 8
	pinsrw     xmm0, r2d, 5			; 5 6 8 4 3 2 1 0
	pinsrw     xmm1, r3d, 0			; f e d c b 7 9 a
	pshufd     xmm2, xmm0, 0xd8		; 5 6 3 2 8 4 1 0
	pshufd     xmm3, xmm1, 0xd8		; f e b 7 d c 9 a
	pshufhw    xmm0, xmm2, 0x93		; 6 3 2 5 8 4 1 0
	pshuflw    xmm1, xmm3, 0x39		; f e b 7 a d c 9
	;mov        eax,  [esp+4]
	movdqa     [r0],xmm0
	movdqa     [r0+16], xmm1
	%ifdef X86_32
	pop r3
	%endif
	ret

;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_ssse3
WelsScan4x4DcAc_ssse3:
	%assign push_num 0
	LOAD_2_PARA
	;mov        eax, [esp+8]
	movdqa     xmm0, [r1]
	movdqa     xmm1, [r1+16]
	pextrw		r2d,  xmm0, 7			; ecx = [7]
	pextrw		r1d,  xmm1, 0			; eax = [8]
	pinsrw		xmm0, r1d, 7			; xmm0[7]	=	[8]
	pinsrw		xmm1, r2d, 0			; xmm1[0]	=	[7]
	pshufb		xmm1, [pb_scanacdc_maskb]
	pshufb		xmm0, [pb_scanacdc_maska]

	;mov        eax,  [esp+4]
	movdqa     [r0],xmm0
	movdqa     [r0+16], xmm1
	ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4Ac_sse2
WelsScan4x4Ac_sse2:
	%assign push_num 0
	LOAD_2_PARA
	;mov        eax, [esp+8]
	movdqa     xmm0, [r1]
	movdqa     xmm1, [r1+16]
	movdqa     xmm2, xmm0
	punpcklqdq xmm0, xmm1
	punpckhqdq xmm2, xmm1

	movdqa     xmm3, xmm0
	punpckldq  xmm0, xmm2
	punpckhdq  xmm3, xmm2
	pextrw     r1d , xmm0, 3
	pextrw     r2d , xmm0, 7
	pinsrw     xmm0, r1d,  7
	pextrw     r1d,  xmm3, 4
	pinsrw     xmm3, r2d,  4
	pextrw     r2d,  xmm3, 0
	pinsrw     xmm3, r1d,  0
	pinsrw     xmm0, r2d,  3

	pshufhw    xmm1, xmm0, 0x93
	pshuflw    xmm2, xmm3, 0x39

    movdqa     xmm3, xmm2
    psrldq     xmm1, 2
    pslldq     xmm3, 14
    por        xmm1, xmm3
    psrldq     xmm2, 2
	;mov        eax,  [esp+4]
	movdqa     [r0],xmm1
	movdqa     [r0+16], xmm2
	ret


;***********************************************************************
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
	;push      ebx
	;mov       eax,  [esp+8]
	%ifdef X86_32
	push r3
	%assign push_num 1
	%else
	%assign push_num 0
	%endif
	LOAD_1_PARA
	movdqa    xmm0, [r0]
	movdqa    xmm1, [r0+16]

	packsswb  xmm0, xmm1
	; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
	xor r3, r3
    pxor      xmm3, xmm3
    pcmpeqb   xmm0, xmm3
    pmovmskb  r3d,  xmm0

    xor       r3,  0xffff

	xor       r0,  r0
	mov       r2,  7
	mov       r1,  8
.loop_low8_find1:
	bt        r3,  r2
	jc        .loop_high8_find1
	dec		  r2
	jnz      .loop_low8_find1
.loop_high8_find1:
	bt        r3, r1
	jc        .find1end
	inc       r1
	cmp       r1,16
	jb        .loop_high8_find1
.find1end:
	sub       r1, r2
	sub       r1, 1
	lea	  r2,  [i_ds_table]
	add       r0b,  [r2+r1]
	mov       r1, r3
	and       r3, 0xff
	shr       r1, 8
	and       r1, 0xff
	lea	  r2 , [low_mask_table]
	add       r0b,  [r2 +r3]
	lea	  r2, [high_mask_table]
	add       r0b,  [r2+r1]
	%ifdef X86_32
	pop r3
	%else
	mov retrd, r0d
	%endif
	;pop       ebx
	ret


;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsGetNoneZeroCount_sse2
WelsGetNoneZeroCount_sse2:
	%assign push_num 0
	LOAD_1_PARA
	;mov       eax,  [esp+4]
	movdqa    xmm0, [r0]
	movdqa    xmm1, [r0+16]
	pxor      xmm2, xmm2
	pcmpeqw   xmm0, xmm2
	pcmpeqw   xmm1, xmm2
	packsswb  xmm1, xmm0
	xor r1, r1
	pmovmskb  r1d,  xmm1
	xor       r1d,  0xffff
	mov       r2,  r1
	and       r1,  0xff
	shr       r2,  8
;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
;	xor       retr,  retr
	;add       al,  [nozero_count_table+r2]
	lea 	  r0 , [nozero_count_table]
	movzx	  r2, byte [r0+r2]
	movzx	  r1,   byte [r0+r1]
	mov	  retrq, r2
	add	  retrq, r1
	;add       al,  [nozero_count_table+r1]
	ret