shithub: openh264

ref: 90deb80b501ed138fe9371658c24d0f95ebad59e
dir: /codec/encoder/core/x86/coeff.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  memzero.asm
;*
;*  Abstract
;*     cavlc
;*
;*  History
;*      09/08/2010 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"



%ifdef X86_32
SECTION .rodata align=16

align 16
sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8

ALIGN  16
sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1

align 16
byte_1pos_table:
    db 0,0,0,0,0,0,0,0, ;0
    db 0,0,0,0,0,0,0,1, ;1
    db 1,0,0,0,0,0,0,1, ;2
    db 1,0,0,0,0,0,0,2, ;3
    db 2,0,0,0,0,0,0,1, ;4
    db 2,0,0,0,0,0,0,2, ;5
    db 2,1,0,0,0,0,0,2, ;6
    db 2,1,0,0,0,0,0,3, ;7
    db 3,0,0,0,0,0,0,1, ;8
    db 3,0,0,0,0,0,0,2, ;9
    db 3,1,0,0,0,0,0,2, ;10
    db 3,1,0,0,0,0,0,3, ;11
    db 3,2,0,0,0,0,0,2, ;12
    db 3,2,0,0,0,0,0,3, ;13
    db 3,2,1,0,0,0,0,3, ;14
    db 3,2,1,0,0,0,0,4, ;15
    db 4,0,0,0,0,0,0,1, ;16
    db 4,0,0,0,0,0,0,2, ;17
    db 4,1,0,0,0,0,0,2, ;18
    db 4,1,0,0,0,0,0,3, ;19
    db 4,2,0,0,0,0,0,2, ;20
    db 4,2,0,0,0,0,0,3, ;21
    db 4,2,1,0,0,0,0,3, ;22
    db 4,2,1,0,0,0,0,4, ;23
    db 4,3,0,0,0,0,0,2, ;24
    db 4,3,0,0,0,0,0,3, ;25
    db 4,3,1,0,0,0,0,3, ;26
    db 4,3,1,0,0,0,0,4, ;27
    db 4,3,2,0,0,0,0,3, ;28
    db 4,3,2,0,0,0,0,4, ;29
    db 4,3,2,1,0,0,0,4, ;30
    db 4,3,2,1,0,0,0,5, ;31
    db 5,0,0,0,0,0,0,1, ;32
    db 5,0,0,0,0,0,0,2, ;33
    db 5,1,0,0,0,0,0,2, ;34
    db 5,1,0,0,0,0,0,3, ;35
    db 5,2,0,0,0,0,0,2, ;36
    db 5,2,0,0,0,0,0,3, ;37
    db 5,2,1,0,0,0,0,3, ;38
    db 5,2,1,0,0,0,0,4, ;39
    db 5,3,0,0,0,0,0,2, ;40
    db 5,3,0,0,0,0,0,3, ;41
    db 5,3,1,0,0,0,0,3, ;42
    db 5,3,1,0,0,0,0,4, ;43
    db 5,3,2,0,0,0,0,3, ;44
    db 5,3,2,0,0,0,0,4, ;45
    db 5,3,2,1,0,0,0,4, ;46
    db 5,3,2,1,0,0,0,5, ;47
    db 5,4,0,0,0,0,0,2, ;48
    db 5,4,0,0,0,0,0,3, ;49
    db 5,4,1,0,0,0,0,3, ;50
    db 5,4,1,0,0,0,0,4, ;51
    db 5,4,2,0,0,0,0,3, ;52
    db 5,4,2,0,0,0,0,4, ;53
    db 5,4,2,1,0,0,0,4, ;54
    db 5,4,2,1,0,0,0,5, ;55
    db 5,4,3,0,0,0,0,3, ;56
    db 5,4,3,0,0,0,0,4, ;57
    db 5,4,3,1,0,0,0,4, ;58
    db 5,4,3,1,0,0,0,5, ;59
    db 5,4,3,2,0,0,0,4, ;60
    db 5,4,3,2,0,0,0,5, ;61
    db 5,4,3,2,1,0,0,5, ;62
    db 5,4,3,2,1,0,0,6, ;63
    db 6,0,0,0,0,0,0,1, ;64
    db 6,0,0,0,0,0,0,2, ;65
    db 6,1,0,0,0,0,0,2, ;66
    db 6,1,0,0,0,0,0,3, ;67
    db 6,2,0,0,0,0,0,2, ;68
    db 6,2,0,0,0,0,0,3, ;69
    db 6,2,1,0,0,0,0,3, ;70
    db 6,2,1,0,0,0,0,4, ;71
    db 6,3,0,0,0,0,0,2, ;72
    db 6,3,0,0,0,0,0,3, ;73
    db 6,3,1,0,0,0,0,3, ;74
    db 6,3,1,0,0,0,0,4, ;75
    db 6,3,2,0,0,0,0,3, ;76
    db 6,3,2,0,0,0,0,4, ;77
    db 6,3,2,1,0,0,0,4, ;78
    db 6,3,2,1,0,0,0,5, ;79
    db 6,4,0,0,0,0,0,2, ;80
    db 6,4,0,0,0,0,0,3, ;81
    db 6,4,1,0,0,0,0,3, ;82
    db 6,4,1,0,0,0,0,4, ;83
    db 6,4,2,0,0,0,0,3, ;84
    db 6,4,2,0,0,0,0,4, ;85
    db 6,4,2,1,0,0,0,4, ;86
    db 6,4,2,1,0,0,0,5, ;87
    db 6,4,3,0,0,0,0,3, ;88
    db 6,4,3,0,0,0,0,4, ;89
    db 6,4,3,1,0,0,0,4, ;90
    db 6,4,3,1,0,0,0,5, ;91
    db 6,4,3,2,0,0,0,4, ;92
    db 6,4,3,2,0,0,0,5, ;93
    db 6,4,3,2,1,0,0,5, ;94
    db 6,4,3,2,1,0,0,6, ;95
    db 6,5,0,0,0,0,0,2, ;96
    db 6,5,0,0,0,0,0,3, ;97
    db 6,5,1,0,0,0,0,3, ;98
    db 6,5,1,0,0,0,0,4, ;99
    db 6,5,2,0,0,0,0,3, ;100
    db 6,5,2,0,0,0,0,4, ;101
    db 6,5,2,1,0,0,0,4, ;102
    db 6,5,2,1,0,0,0,5, ;103
    db 6,5,3,0,0,0,0,3, ;104
    db 6,5,3,0,0,0,0,4, ;105
    db 6,5,3,1,0,0,0,4, ;106
    db 6,5,3,1,0,0,0,5, ;107
    db 6,5,3,2,0,0,0,4, ;108
    db 6,5,3,2,0,0,0,5, ;109
    db 6,5,3,2,1,0,0,5, ;110
    db 6,5,3,2,1,0,0,6, ;111
    db 6,5,4,0,0,0,0,3, ;112
    db 6,5,4,0,0,0,0,4, ;113
    db 6,5,4,1,0,0,0,4, ;114
    db 6,5,4,1,0,0,0,5, ;115
    db 6,5,4,2,0,0,0,4, ;116
    db 6,5,4,2,0,0,0,5, ;117
    db 6,5,4,2,1,0,0,5, ;118
    db 6,5,4,2,1,0,0,6, ;119
    db 6,5,4,3,0,0,0,4, ;120
    db 6,5,4,3,0,0,0,5, ;121
    db 6,5,4,3,1,0,0,5, ;122
    db 6,5,4,3,1,0,0,6, ;123
    db 6,5,4,3,2,0,0,5, ;124
    db 6,5,4,3,2,0,0,6, ;125
    db 6,5,4,3,2,1,0,6, ;126
    db 6,5,4,3,2,1,0,7, ;127
    db 7,0,0,0,0,0,0,1, ;128
    db 7,0,0,0,0,0,0,2, ;129
    db 7,1,0,0,0,0,0,2, ;130
    db 7,1,0,0,0,0,0,3, ;131
    db 7,2,0,0,0,0,0,2, ;132
    db 7,2,0,0,0,0,0,3, ;133
    db 7,2,1,0,0,0,0,3, ;134
    db 7,2,1,0,0,0,0,4, ;135
    db 7,3,0,0,0,0,0,2, ;136
    db 7,3,0,0,0,0,0,3, ;137
    db 7,3,1,0,0,0,0,3, ;138
    db 7,3,1,0,0,0,0,4, ;139
    db 7,3,2,0,0,0,0,3, ;140
    db 7,3,2,0,0,0,0,4, ;141
    db 7,3,2,1,0,0,0,4, ;142
    db 7,3,2,1,0,0,0,5, ;143
    db 7,4,0,0,0,0,0,2, ;144
    db 7,4,0,0,0,0,0,3, ;145
    db 7,4,1,0,0,0,0,3, ;146
    db 7,4,1,0,0,0,0,4, ;147
    db 7,4,2,0,0,0,0,3, ;148
    db 7,4,2,0,0,0,0,4, ;149
    db 7,4,2,1,0,0,0,4, ;150
    db 7,4,2,1,0,0,0,5, ;151
    db 7,4,3,0,0,0,0,3, ;152
    db 7,4,3,0,0,0,0,4, ;153
    db 7,4,3,1,0,0,0,4, ;154
    db 7,4,3,1,0,0,0,5, ;155
    db 7,4,3,2,0,0,0,4, ;156
    db 7,4,3,2,0,0,0,5, ;157
    db 7,4,3,2,1,0,0,5, ;158
    db 7,4,3,2,1,0,0,6, ;159
    db 7,5,0,0,0,0,0,2, ;160
    db 7,5,0,0,0,0,0,3, ;161
    db 7,5,1,0,0,0,0,3, ;162
    db 7,5,1,0,0,0,0,4, ;163
    db 7,5,2,0,0,0,0,3, ;164
    db 7,5,2,0,0,0,0,4, ;165
    db 7,5,2,1,0,0,0,4, ;166
    db 7,5,2,1,0,0,0,5, ;167
    db 7,5,3,0,0,0,0,3, ;168
    db 7,5,3,0,0,0,0,4, ;169
    db 7,5,3,1,0,0,0,4, ;170
    db 7,5,3,1,0,0,0,5, ;171
    db 7,5,3,2,0,0,0,4, ;172
    db 7,5,3,2,0,0,0,5, ;173
    db 7,5,3,2,1,0,0,5, ;174
    db 7,5,3,2,1,0,0,6, ;175
    db 7,5,4,0,0,0,0,3, ;176
    db 7,5,4,0,0,0,0,4, ;177
    db 7,5,4,1,0,0,0,4, ;178
    db 7,5,4,1,0,0,0,5, ;179
    db 7,5,4,2,0,0,0,4, ;180
    db 7,5,4,2,0,0,0,5, ;181
    db 7,5,4,2,1,0,0,5, ;182
    db 7,5,4,2,1,0,0,6, ;183
    db 7,5,4,3,0,0,0,4, ;184
    db 7,5,4,3,0,0,0,5, ;185
    db 7,5,4,3,1,0,0,5, ;186
    db 7,5,4,3,1,0,0,6, ;187
    db 7,5,4,3,2,0,0,5, ;188
    db 7,5,4,3,2,0,0,6, ;189
    db 7,5,4,3,2,1,0,6, ;190
    db 7,5,4,3,2,1,0,7, ;191
    db 7,6,0,0,0,0,0,2, ;192
    db 7,6,0,0,0,0,0,3, ;193
    db 7,6,1,0,0,0,0,3, ;194
    db 7,6,1,0,0,0,0,4, ;195
    db 7,6,2,0,0,0,0,3, ;196
    db 7,6,2,0,0,0,0,4, ;197
    db 7,6,2,1,0,0,0,4, ;198
    db 7,6,2,1,0,0,0,5, ;199
    db 7,6,3,0,0,0,0,3, ;200
    db 7,6,3,0,0,0,0,4, ;201
    db 7,6,3,1,0,0,0,4, ;202
    db 7,6,3,1,0,0,0,5, ;203
    db 7,6,3,2,0,0,0,4, ;204
    db 7,6,3,2,0,0,0,5, ;205
    db 7,6,3,2,1,0,0,5, ;206
    db 7,6,3,2,1,0,0,6, ;207
    db 7,6,4,0,0,0,0,3, ;208
    db 7,6,4,0,0,0,0,4, ;209
    db 7,6,4,1,0,0,0,4, ;210
    db 7,6,4,1,0,0,0,5, ;211
    db 7,6,4,2,0,0,0,4, ;212
    db 7,6,4,2,0,0,0,5, ;213
    db 7,6,4,2,1,0,0,5, ;214
    db 7,6,4,2,1,0,0,6, ;215
    db 7,6,4,3,0,0,0,4, ;216
    db 7,6,4,3,0,0,0,5, ;217
    db 7,6,4,3,1,0,0,5, ;218
    db 7,6,4,3,1,0,0,6, ;219
    db 7,6,4,3,2,0,0,5, ;220
    db 7,6,4,3,2,0,0,6, ;221
    db 7,6,4,3,2,1,0,6, ;222
    db 7,6,4,3,2,1,0,7, ;223
    db 7,6,5,0,0,0,0,3, ;224
    db 7,6,5,0,0,0,0,4, ;225
    db 7,6,5,1,0,0,0,4, ;226
    db 7,6,5,1,0,0,0,5, ;227
    db 7,6,5,2,0,0,0,4, ;228
    db 7,6,5,2,0,0,0,5, ;229
    db 7,6,5,2,1,0,0,5, ;230
    db 7,6,5,2,1,0,0,6, ;231
    db 7,6,5,3,0,0,0,4, ;232
    db 7,6,5,3,0,0,0,5, ;233
    db 7,6,5,3,1,0,0,5, ;234
    db 7,6,5,3,1,0,0,6, ;235
    db 7,6,5,3,2,0,0,5, ;236
    db 7,6,5,3,2,0,0,6, ;237
    db 7,6,5,3,2,1,0,6, ;238
    db 7,6,5,3,2,1,0,7, ;239
    db 7,6,5,4,0,0,0,4, ;240
    db 7,6,5,4,0,0,0,5, ;241
    db 7,6,5,4,1,0,0,5, ;242
    db 7,6,5,4,1,0,0,6, ;243
    db 7,6,5,4,2,0,0,5, ;244
    db 7,6,5,4,2,0,0,6, ;245
    db 7,6,5,4,2,1,0,6, ;246
    db 7,6,5,4,2,1,0,7, ;247
    db 7,6,5,4,3,0,0,5, ;248
    db 7,6,5,4,3,0,0,6, ;249
    db 7,6,5,4,3,1,0,6, ;250
    db 7,6,5,4,3,1,0,7, ;251
    db 7,6,5,4,3,2,0,6, ;252
    db 7,6,5,4,3,2,0,7, ;253
    db 7,6,5,4,3,2,1,7, ;254
    db 7,6,5,4,3,2,1,8, ;255

;***********************************************************************
; Code
;***********************************************************************
SECTION .text



;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
    push ebx
    push edi
    push esi

    mov         eax,    [esp+16]    ;coffLevel
    mov         edi,    [esp+24]    ;Level
    mov         ebx,    [esp+32]    ;endIdx
    cmp         ebx,    3
    jne         .Level16
    pxor        xmm1,   xmm1
    movq        xmm0,   [eax]   ; removed QWORD
    jmp         .Cal_begin
.Level16:
    movdqa      xmm0,   [eax]
    movdqa      xmm1,   [eax+16]
.Cal_begin:
    movdqa      xmm2,   xmm0
    packsswb    xmm0,   xmm1
    movdqa      xmm4,   xmm0
    pxor        xmm3,   xmm3
    pcmpgtb     xmm0,   xmm3
    pcmpgtb     xmm3,   xmm4
    por         xmm0,   xmm3
    pmovmskb    edx,    xmm0
    cmp         edx,    0
    je near   .return
    movdqa      xmm6,   [sse2_b_1]
    pcmpeqw     xmm7,   xmm7    ;generate -1
    mov         ebx,    0xff
    ;pinsrw     xmm6,   ebx,    3

    mov       bl,   dh

    lea       ebx,  [byte_1pos_table+8*ebx]
    movq      xmm0, [ebx]
    pextrw    ecx,  xmm0, 3
    shr       ecx,  8
    mov       dh,   cl

.loopHighFind0:
    cmp       ecx,   0
    je        .loopHighFind0End
    ;mov       esi, [ebx]
    ;and       esi, 0xff
    movzx     esi, byte [ebx]
    add       esi, 8
    mov       esi, [eax+2*esi]
    mov       [edi], si
    add       edi,   2
    ;add       ebx,   1
    inc       ebx
    dec       ecx
    jmp       .loopHighFind0
.loopHighFind0End:
    mov       cl,   dh
    cmp       cl,   8
    pand      xmm0, xmm6
    jne       .LowByteFind0
    sub       edi,   2
    mov       esi,   [eax+16]
    mov       [edi], esi
    add       edi,   2
.LowByteFind0:
    and       edx,  0xff
    lea       ebx,  [byte_1pos_table+8*edx]
    movq      xmm1, [ebx]
    pextrw    esi,  xmm1, 3
    or        esi,  0xff
    or        ecx,  0xff00
    and       ecx,  esi
    shr       esi,  8
    pand      xmm1, xmm6
.loopLowFind0:
    cmp       esi, 0
    je        .loopLowFind0End
    ;mov       edx, [ebx]
    ;and       edx, 0xff
    movzx     edx,  byte [ebx]
    mov       edx, [eax+2*edx]
    mov       [edi], dx
    add       edi,   2
    ;add       ebx,   1
    inc       ebx
    dec       esi
    jmp       .loopLowFind0
.loopLowFind0End:
    cmp       ch,  8
    jne       .getLevelEnd
    sub       edi, 2
    mov       edx, [eax]
    mov       [edi], dx
.getLevelEnd:
    mov      edx, [esp+28]  ;total_coeffs
    ;mov      ebx,   ecx
    ;and      ebx,   0xff
    movzx    ebx,   byte cl
    add      cl,    ch
    mov      [edx], cl
;getRun
    movq     xmm5, [sse2_b8]
    paddb    xmm0, xmm5
    pxor     xmm2, xmm2
    pxor     xmm3, xmm3
    mov      eax,  8
    sub      eax,  ebx
    shl      eax,  3
    shl      ebx,  3
    pinsrw   xmm2, ebx, 0
    pinsrw   xmm3, eax, 0
    psllq    xmm0, xmm3
    psrlq    xmm0, xmm3
    movdqa   xmm4, xmm1
    psllq    xmm1, xmm2
    psrlq    xmm4, xmm3
    punpcklqdq xmm1, xmm4
    por      xmm0,  xmm1

    pextrw   eax,   xmm0, 0
    and      eax,   0xff
    inc      eax
    sub      al,    cl
    movdqa   xmm1,  xmm0
    paddb    xmm1,  xmm7
    psrldq   xmm0,  1
    psubb    xmm1,  xmm0
    mov      ecx,   [esp+20] ;run
    movdqa   [ecx], xmm1
;getRunEnd
.return:
    pop esi
    pop edi
    pop ebx
    ret
%endif