shithub: openh264

ref: 597b4eef73c3032175fded97504a8db6ccd33b6f
dir: /codec/common/x86/deblock.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

SECTION .rodata align=16

ALIGN   16
FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4

ALIGN   16
WELS_DB1_16:
    times 16 db 1
WELS_DB127_16:
    times 16 db 127
WELS_DB96_16:
    times 16 db 96
WELS_SHUFB0000111122223333:
    times 4 db 0
    times 4 db 1
    times 4 db 2
    times 4 db 3


SECTION .text

; Unsigned byte absolute difference.
; a=%1 b=%2 clobber=%3
; Subtract once in each direction with saturation and return the maximum.
%macro SSE2_AbsDiffUB 3
    movdqa   %3, %2
    psubusb  %3, %1
    psubusb  %1, %2
    por      %1, %3
%endmacro

; Unsigned byte compare less than.
; lhs=%1 rhs^0x7f=%2 0x7f=%3
; No unsigned byte lt/gt compare instruction available; xor by 0x7f and use a
; signed compare. Some other options do exist. This one allows modifying the lhs
; without mov and uses a bitwise op which can be executed on most ports on
; common architectures.
%macro SSE2_CmpltUB 3
    pxor     %1, %3
    pcmpgtb  %1, %2
%endmacro

; Unsigned byte compare greater than or equal.
%macro SSE2_CmpgeUB 2
    pminub   %1, %2
    pcmpeqb  %1, %2
%endmacro

; Clip unsigned bytes to ref +/- diff.
; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4
%macro SSE2_ClipUB 4
    movdqa   %4, %2
    psubusb  %4, %3
    paddusb  %3, %2
    pmaxub   %1, %4
    pminub   %1, %3
%endmacro

; (a + b + 1 - c) >> 1
; a=%1 b=%2 c=%3 [out:a^b&c]=%4
%macro SSE2_AvgbFloor1 4
    movdqa   %4, %1
    pxor     %4, %2
    pavgb    %1, %2
    pand     %4, %3
    psubb    %1, %4
%endmacro

; (a + b + carry) >> 1
; a=%1 b=%2 carry-1=%3
%macro SSE2_AvgbFloor2 3
    pxor     %1, %3
    pxor     %2, %3
    pavgb    %1, %2
    pxor     %1, %3
%endmacro

; a = (a & m) | (b & ~m)
; a=%1 b=%2 m=%3
%macro SSE2_Blend 3
    pand     %1, %3
    pandn    %3, %2
    por      %1, %3
%endmacro


;*******************************************************************************
;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta, int8_t * pTC)
;*******************************************************************************

WELS_EXTERN DeblockLumaLt4V_ssse3
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    pxor     xmm3, xmm3
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
    pshufb   xmm1, xmm3                       ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm3                       ; iBeta  ^ 0x7f
    mov      r2, r1                           ; iStride
    neg      r1                               ; -iStride
    lea      r3, [r0 + r1]                    ; pPix - iStride

    ; Compute masks to enable/disable deblocking.
    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm3           ; |p0 - q0|
    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm3           ; |q1 - q0|
    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
    pand     xmm6, xmm7                       ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
    movdqa   xmm0, xmm7
    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p2 - p0|
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
    MOVDQ    xmm3, [r0 + 0 * r2]              ; q0
    movdqa   xmm1, xmm5
    SSE2_AbsDiffUB xmm5, xmm3, xmm4           ; |q2 - q0|
    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta

    pavgb    xmm3, [r3 + 0 * r1]
    pcmpeqw  xmm2, xmm2  ; FFh
    pxor     xmm3, xmm2
    ; (p2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm0, xmm2
    pavgb    xmm0, xmm3
    pxor     xmm0, xmm2
    ; (q2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm1, xmm2
    pavgb    xmm1, xmm3
    pxor     xmm1, xmm2

    movd     xmm3, [r4]
    pshufb   xmm3, [WELS_SHUFB0000111122223333] ; iTc
    movdqa   xmm4, xmm3  ; iTc0 = iTc
    pcmpgtb  xmm3, xmm2  ; iTc > -1 ? 0xff : 0x00
    pand     xmm6, xmm3  ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
    movdqa   xmm3, xmm4
    psubb    xmm3, xmm7  ; iTc -= bDeltaP2P0 ? -1 : 0
    psubb    xmm3, xmm5  ; iTc -= bDeltaQ2Q0 ? -1 : 0
    pand     xmm3, xmm6  ; iTc &= bDeltaP0Q0P1P0Q1Q0 ? 0xff : 0
    pand     xmm7, xmm6  ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm5, xmm6  ; bDeltaQ2Q0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm7, xmm4  ; iTc0 & (bDeltaP2P0 ? 0xff : 0)
    pand     xmm5, xmm4  ; iTc0 & (bDeltaQ2Q0 ? 0xff : 0)

    MOVDQ    xmm4, [r3 + 1 * r1]
    SSE2_ClipUB xmm0, xmm4, xmm7, xmm6  ; clip p1.
    MOVDQ    xmm6, [r0 + 1 * r2]
    MOVDQ    [r3 + 1 * r1], xmm0        ; store p1.
    SSE2_ClipUB xmm1, xmm6, xmm5, xmm7  ; clip q1.
    MOVDQ    [r0 + 1 * r2], xmm1        ; store q1.

    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
    ; Bias so that unsigned saturation can be used.
    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
    ; subtracted from the biased value.
    MOVDQ    xmm1, [r3 + 0 * r1] ; p0
    MOVDQ    xmm0, [r0 + 0 * r2] ; q0
    movdqa   xmm7, xmm1
    psubusb  xmm7, xmm0  ; clip(p0 - q0, 0, 255)
    ; ((p1 - q1) >> 2) + 0xc0
    pxor     xmm6, xmm2  ; q1 ^ 0xff aka -q1 - 1 & 0xff
    pavgb    xmm4, xmm6  ; (((p1 - q1 + 0x100) >> 1)
    pavgb    xmm4, xmm2  ;  + 0x100) >> 1
    psubusb  xmm4, xmm7  ; -= clip(p0 - q0, 0, 255) saturate.
    psubusb  xmm0, xmm1  ; (clip(q0 - p0, 0, 255)
    pavgb    xmm0, xmm4  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1

    ; Unbias and split into a non-negative and a non-positive part.
    ; Clip each part to iTc via minub.
    ; Add/subtract each part to/from p0/q0 and clip.
    movdqa   xmm6, [WELS_DB96_16]
    psubusb  xmm6, xmm0
    psubusb  xmm0, [WELS_DB96_16]
    pminub   xmm6, xmm3
    pminub   xmm0, xmm3
    psubusb  xmm1, xmm6
    paddusb  xmm1, xmm0
    paddusb  xmm6, [r0 + 0 * r2]
    psubusb  xmm6, xmm0
    MOVDQ    [r3 + 0 * r1], xmm1  ; store p0.
    MOVDQ    [r0 + 0 * r2], xmm6  ; store q0.

    POP_XMM
    LOAD_5_PARA_POP
    ret


; Deblock 3x16 luma pixels for the eq4 case.
;
; Compose 8-bit averages from pavgbs. Ie. (p1 + p0 + p2 + q0 + 2) >> 2 can be
; written as (((p1 + p0) >> 1) + ((p2 + q0 + (p1 ^ p0 & 1)) >> 1) + 1) >> 1,
; which maps to 3 pavgbs.
;
; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12
%macro SSE2_DeblockLumaEq4_3x16P 12
    movdqa   %7, %3
    movdqa   %8, %6
    MOVDQ    %10, [%1 + 1 * %2]                      ; p1
    SSE2_Blend %7, %10, %8                           ; t0 = bDeltaP2P0 ? q0 : p1
    movdqa   %8, %6
    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
    SSE2_Blend %9, %4, %8                            ; t1 = bDeltaP2P0 ? p2 : q1
    SSE2_AvgbFloor1 %4,  %9,   %12, %8               ; t1 = (t1 + q1) >> 1
    SSE2_AvgbFloor1 %10, [%1], %12, %8               ; (p0 + p1) >> 1, p0 ^ p1
    pxor     %8, %12
    SSE2_AvgbFloor1 %7, %4, %8, %9                   ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1
    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
    SSE2_AvgbFloor1 %3, %9, %8, %4                   ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pavgb    %7, %10                                 ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2
    movdqa   %8, %10
    pxor     %8, %3                                  ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pand     %8, %12                                 ; & 1
    pavgb    %10, %3                                 ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2
    pand     %6, %5                                  ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
%if %11
    MOVDQ    %3, [%1 + 0 * %2]                       ; p0
    movdqa   %4, %5
    SSE2_Blend %7, %3, %4                            ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%else
    SSE2_Blend %7, [%1 + 0 * %2], %5                 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%endif
    MOVDQ    [%1 + 0 * %2], %7                       ; store p0
    add      %1, %2
    movdqa   %7, %10
    psubb    %10, %8                                 ; (p0 + p1 + p2 + q0) >> 2
    psubb    %8, %12
    MOVDQ    %4, [%1 + (3 - 1) * %2]                 ; p3
    SSE2_AvgbFloor2 %4, %9, %8                       ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1
    pavgb    %10, %4                                 ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2
    movdqa   %8, %6
    SSE2_Blend %10, [%1 + (2 - 1) * %2], %8          ; p2out = bDeltaP2P0 ? p2' : p2
    MOVDQ    [%1 + (2 - 1) * %2], %10                ; store p2
%if %11
    MOVDQ    %4, [%1 + (1 - 1) * %2]                 ; p1
    SSE2_Blend %7, %4, %6                            ; p1out = bDeltaP2P0 ? p1' : p1
%else
    SSE2_Blend %7, [%1 + (1 - 1) * %2], %6           ; p1out = bDeltaP2P0 ? p1' : p1
%endif
    MOVDQ    [%1 + (1 - 1) * %2], %7                 ; store p1
%endmacro


;*******************************************************************************
;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta)
;*******************************************************************************

WELS_EXTERN DeblockLumaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 10
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    shr      r2, 2
    add      r2, 1
    movd     xmm3, r2d
    pxor     xmm4, xmm4
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
    pshufb   xmm1, xmm4                       ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm4                       ; iBeta  ^ 0x7f
    pshufb   xmm3, xmm4                       ; (iAlpha >> 2) + 1
    mov      r2, r1                           ; iStride
    neg      r1                               ; -iStride
    lea      r3, [r0 + r1]                    ; pPix - iStride

    ; Compute masks to enable/disable filtering.
    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm5           ; |p0 - q0|
    SSE2_CmpgeUB xmm3, xmm6                   ; |p0 - q0| < (iAlpha >> 2) + 2
    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm5           ; |q1 - q0|
    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
    pand     xmm6, xmm7                       ; & bDeltaP0Q0

    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p2 - p0|
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
    pand     xmm7, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2

    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
    SSE2_AbsDiffUB xmm5, xmm0, xmm4           ; |q2 - q0|
    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
    pand     xmm5, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2

%ifdef X86_32
    ; Push xmm5 to free up one register. Align stack so as to ensure that failed
    ; store forwarding penalty cannot occur (up to ~50 cycles for 128-bit on IVB).
    mov      r2, esp
    sub      esp,  16
    and      esp, -16
    movdqa   [esp], xmm5
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
    movdqa   xmm5, [esp]
    mov      esp, r2
    neg      r1
    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
%else
    movdqa   xmm9, [WELS_DB1_16]
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
    SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9
%endif

    POP_XMM
    LOAD_4_PARA_POP
    ret


;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0                       ; iAlpha
    mov      r3, r2
    neg      r3                               ; -iStride

    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr

    movdqa   xmm4, xmm0
    SSE2_AbsDiffUB xmm4, xmm2, xmm5           ; |p0 - q0|
    SSE2_CmpgeUB xmm4, xmm7                   ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha

    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr

    movdqa   xmm5, xmm1
    SSE2_AbsDiffUB xmm5, xmm0, xmm7           ; |q1 - q0|
    movdqa   xmm6, xmm3
    SSE2_AbsDiffUB xmm6, xmm2, xmm7           ; |p1 - p0|
    pmaxub   xmm5, xmm6                       ; max(|q1 - q0|, |p1 - p0|)

    pxor     xmm6, xmm6
    movd     xmm7, arg5d
    pshufb   xmm7, xmm6                       ; iBeta

    SSE2_CmpgeUB xmm5, xmm7                   ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por      xmm4, xmm5                       ; | !bDeltaP0Q0

%ifidni arg6, r5
    movd     xmm7, [arg6]
%else
    mov      r2, arg6
    movd     xmm7, [r2]
%endif
    punpckldq xmm7, xmm7
    punpcklbw xmm7, xmm7                      ; iTc
    pcmpeqw  xmm6, xmm6                       ; FFh
    movdqa   xmm5, xmm7
    pcmpgtb  xmm5, xmm6                       ; iTc > -1 ? FFh : FFh
    pandn    xmm4, xmm7                       ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    pand     xmm4, xmm5                       ; &= (iTc > -1 ? FFh : 00h)

    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
    ; Bias so that unsigned saturation can be used.
    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
    ; subtracted from the biased value.
    movdqa   xmm7, xmm2
    psubusb  xmm7, xmm0  ; clip(p0 - q0, 0, 255)
    ; ((p1 - q1) >> 2) + 0xc0
    pxor     xmm1, xmm6  ; q1 ^ 0xff aka -q1 - 1 & 0xff
    pavgb    xmm3, xmm1  ; (((p1 - q1 + 0x100) >> 1)
    pavgb    xmm3, xmm6  ;  + 0x100) >> 1
    psubusb  xmm3, xmm7  ; -= clip(p0 - q0, 0, 255) saturate.
    movdqa   xmm5, xmm0
    psubusb  xmm5, xmm2  ; (clip(q0 - p0, 0, 255)
    pavgb    xmm5, xmm3  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1

    ; Unbias and split into a non-negative and a non-positive part.
    ; Clip each part to iTc via minub.
    ; Add/subtract each part to/from p0/q0 and clip.
    movdqa   xmm6, [WELS_DB96_16]
    psubusb  xmm6, xmm5
    psubusb  xmm5, [WELS_DB96_16]
    pminub   xmm6, xmm4
    pminub   xmm5, xmm4
    psubusb  xmm2, xmm6
    paddusb  xmm2, xmm5
    paddusb  xmm0, xmm6
    psubusb  xmm0, xmm5

    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
    movlps   [r0         ], xmm0              ; store q0 cb
    movhps   [r1         ], xmm0              ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


;********************************************************************************
;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;********************************************************************************

WELS_EXTERN DeblockChromaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0                       ; iAlpha
    mov      r3, r2
    neg      r3                               ; -iStride

    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr

    movdqa   xmm4, xmm0
    SSE2_AbsDiffUB xmm4, xmm2, xmm5           ; |p0 - q0|
    SSE2_CmpgeUB xmm4, xmm7                   ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha

    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr

    movdqa   xmm5, xmm1
    SSE2_AbsDiffUB xmm5, xmm0, xmm7           ; |q1 - q0|
    movdqa   xmm6, xmm3
    SSE2_AbsDiffUB xmm6, xmm2, xmm7           ; |p1 - p0|
    pmaxub   xmm5, xmm6                       ; max(|q1 - q0|, |p1 - p0|)

    pxor     xmm6, xmm6
    movd     xmm7, arg5d
    pshufb   xmm7, xmm6                       ; iBeta

    SSE2_CmpgeUB xmm5, xmm7                   ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por      xmm4, xmm5                       ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0

    WELS_DB1 xmm7
    movdqa   xmm5, xmm2
    SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6    ; (p0 + q1) >> 1
    pavgb    xmm2, xmm3                       ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
    movdqa   xmm6, xmm4
    SSE2_Blend xmm5, xmm2, xmm4               ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0

    SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4    ; (q0 + p1) >> 1
    pavgb    xmm3, xmm1                       ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
    SSE2_Blend xmm0, xmm3, xmm6               ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0

    movlps   [r0 + 1 * r3], xmm5              ; store p0 cb
    movhps   [r1 + 1 * r3], xmm5              ; store p0 cr
    movlps   [r0 + 0 * r2], xmm0              ; store q0 cb
    movhps   [r1 + 0 * r2], xmm0              ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


%ifdef  WIN64


WELS_EXTERN DeblockChromaEq4H_ssse3
    mov         rax,rsp
    mov         [rax+20h],rbx
    push        rdi
    PUSH_XMM 16
    sub         rsp,140h
    mov         rdi,rdx
    lea         eax,[r8*4]
    movsxd      r10,eax
    mov         eax,[rcx-2]
    mov         [rsp+10h],eax
    lea         rbx,[r10+rdx-2]
    lea         r11,[r10+rcx-2]
    movdqa      xmm5,[rsp+10h]
    movsxd      r10,r8d
    mov         eax,[r10+rcx-2]
    lea         rdx,[r10+r10*2]
    mov         [rsp+20h],eax
    mov         eax,[rcx+r10*2-2]
    mov         [rsp+30h],eax
    mov         eax,[rdx+rcx-2]
    movdqa      xmm2,[rsp+20h]
    mov         [rsp+40h],eax
    mov         eax, [rdi-2]
    movdqa      xmm4,[rsp+30h]
    mov         [rsp+50h],eax
    mov         eax,[r10+rdi-2]
    movdqa      xmm3,[rsp+40h]
    mov         [rsp+60h],eax
    mov         eax,[rdi+r10*2-2]
    punpckldq   xmm5,[rsp+50h]
    mov         [rsp+70h],eax
    mov         eax, [rdx+rdi-2]
    punpckldq   xmm2, [rsp+60h]
    mov          [rsp+80h],eax
    mov         eax,[r11]
    punpckldq   xmm4, [rsp+70h]
    mov         [rsp+50h],eax
    mov         eax,[rbx]
    punpckldq   xmm3,[rsp+80h]
    mov         [rsp+60h],eax
    mov         eax,[r10+r11]
    movdqa      xmm0, [rsp+50h]
    punpckldq   xmm0, [rsp+60h]
    punpcklqdq  xmm5,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[r10+rbx]
    movdqa      xmm0,[rsp+50h]
    movdqa      xmm1,xmm5
    mov         [rsp+60h],eax
    mov         eax,[r11+r10*2]
    punpckldq   xmm0, [rsp+60h]
    punpcklqdq  xmm2,xmm0
    punpcklbw   xmm1,xmm2
    punpckhbw   xmm5,xmm2
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[rbx+r10*2]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    mov         eax, [rdx+r11]
    movdqa      xmm15,xmm1
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm4,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax, [rdx+rbx]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    punpckldq   xmm0, [rsp+60h]
    punpcklqdq  xmm3,xmm0
    movdqa      xmm0,xmm4
    punpcklbw   xmm0,xmm3
    punpckhbw   xmm4,xmm3
    punpcklwd   xmm15,xmm0
    punpckhwd   xmm1,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm12,xmm15
    punpcklwd   xmm0,xmm4
    punpckhwd   xmm5,xmm4
    punpckldq   xmm12,xmm0
    punpckhdq   xmm15,xmm0
    movdqa      xmm0,xmm1
    movdqa      xmm11,xmm12
    punpckldq   xmm0,xmm5
    punpckhdq   xmm1,xmm5
    punpcklqdq  xmm11,xmm0
    punpckhqdq  xmm12,xmm0
    movsx       eax,r9w
    movdqa      xmm14,xmm15
    punpcklqdq  xmm14,xmm1
    punpckhqdq  xmm15,xmm1
    pxor        xmm1,xmm1
    movd        xmm0,eax
    movdqa      xmm4,xmm12
    movdqa      xmm8,xmm11
    movsx       eax,word [rsp+170h + 160] ; iBeta
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm4,xmm1
    punpckhbw   xmm12,xmm1
    movdqa      xmm9,xmm14
    movdqa      xmm7,xmm15
    movdqa      xmm10,xmm15
    pshufd      xmm13,xmm0,0
    punpcklbw   xmm9,xmm1
    punpckhbw   xmm14,xmm1
    movdqa      xmm6,xmm13
    movd        xmm0,eax
    movdqa      [rsp],xmm11
    mov         eax,2
    cwde
    punpckhbw   xmm11,xmm1
    punpckhbw   xmm10,xmm1
    punpcklbw   xmm7,xmm1
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm8,xmm1
    pshufd      xmm3,xmm0,0
    movdqa      xmm1,xmm8
    movdqa      xmm0,xmm4
    psubw       xmm0,xmm9
    psubw       xmm1,xmm4
    movdqa      xmm2,xmm3
    pabsw       xmm0,xmm0
    pcmpgtw     xmm6,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm3
    pcmpgtw     xmm2,xmm0
    pand        xmm6,xmm2
    movdqa      xmm0,xmm7
    movdqa      xmm2,xmm3
    psubw       xmm0,xmm9
    pabsw       xmm0,xmm0
    pcmpgtw     xmm1,xmm0
    pand        xmm6,xmm1
    movdqa      xmm0,xmm12
    movdqa      xmm1,xmm11
    psubw       xmm0,xmm14
    psubw       xmm1,xmm12
    movdqa      xmm5,xmm6
    pabsw       xmm0,xmm0
    pcmpgtw     xmm13,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm8
    pcmpgtw     xmm2,xmm0
    paddw       xmm1,xmm8
    movdqa      xmm0,xmm10
    pand        xmm13,xmm2
    psubw       xmm0,xmm14
    paddw       xmm1,xmm4
    movdqa      xmm2,xmm11
    pabsw       xmm0,xmm0
    paddw       xmm2,xmm11
    paddw       xmm1,xmm7
    pcmpgtw     xmm3,xmm0
    paddw       xmm2,xmm12
    movd        xmm0,eax
    pand        xmm13,xmm3
    paddw       xmm2,xmm10
    punpcklwd   xmm0,xmm0
    pshufd      xmm3,xmm0,0
    movdqa      xmm0,xmm6
    paddw       xmm1,xmm3
    pandn       xmm0,xmm4
    paddw       xmm2,xmm3
    psraw       xmm1,2
    pand        xmm5,xmm1
    por         xmm5,xmm0
    paddw       xmm7,xmm7
    paddw       xmm10,xmm10
    psraw       xmm2,2
    movdqa      xmm1,xmm13
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm12
    pand        xmm1,xmm2
    paddw       xmm7,xmm9
    por         xmm1,xmm0
    paddw       xmm10,xmm14
    paddw       xmm7,xmm8
    movdqa      xmm0,xmm13
    packuswb    xmm5,xmm1
    paddw       xmm7,xmm3
    paddw       xmm10,xmm11
    movdqa      xmm1,xmm6
    paddw       xmm10,xmm3
    pandn       xmm6,xmm9
    psraw       xmm7,2
    pand        xmm1,xmm7
    psraw       xmm10,2
    pandn       xmm13,xmm14
    pand        xmm0,xmm10
    por         xmm1,xmm6
    movdqa      xmm6,[rsp]
    movdqa      xmm4,xmm6
    por         xmm0,xmm13
    punpcklbw   xmm4,xmm5
    punpckhbw   xmm6,xmm5
    movdqa      xmm3,xmm4
    packuswb    xmm1,xmm0
    movdqa      xmm0,xmm1
    punpckhbw   xmm1,xmm15
    punpcklbw   xmm0,xmm15
    punpcklwd   xmm3,xmm0
    punpckhwd   xmm4,xmm0
    movdqa      xmm0,xmm6
    movdqa      xmm2,xmm3
    punpcklwd   xmm0,xmm1
    punpckhwd   xmm6,xmm1
    movdqa      xmm1,xmm4
    punpckldq   xmm2,xmm0
    punpckhdq   xmm3,xmm0
    punpckldq   xmm1,xmm6
    movdqa      xmm0,xmm2
    punpcklqdq  xmm0,xmm1
    punpckhdq   xmm4,xmm6
    punpckhqdq  xmm2,xmm1
    movdqa      [rsp+10h],xmm0
    movdqa      [rsp+60h],xmm2
    movdqa      xmm0,xmm3
    mov         eax,[rsp+10h]
    mov         [rcx-2],eax
    mov         eax,[rsp+60h]
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm3,xmm4
    mov         [r10+rcx-2],eax
    movdqa      [rsp+20h],xmm0
    mov         eax, [rsp+20h]
    movdqa      [rsp+70h],xmm3
    mov         [rcx+r10*2-2],eax
    mov         eax,[rsp+70h]
    mov         [rdx+rcx-2],eax
    mov         eax,[rsp+18h]
    mov         [r11],eax
    mov         eax,[rsp+68h]
    mov         [r10+r11],eax
    mov         eax,[rsp+28h]
    mov         [r11+r10*2],eax
    mov         eax,[rsp+78h]
    mov         [rdx+r11],eax
    mov         eax,[rsp+14h]
    mov         [rdi-2],eax
    mov         eax,[rsp+64h]
    mov         [r10+rdi-2],eax
    mov         eax,[rsp+24h]
    mov         [rdi+r10*2-2],eax
    mov         eax, [rsp+74h]
    mov         [rdx+rdi-2],eax
    mov         eax, [rsp+1Ch]
    mov         [rbx],eax
    mov         eax, [rsp+6Ch]
    mov         [r10+rbx],eax
    mov         eax,[rsp+2Ch]
    mov         [rbx+r10*2],eax
    mov         eax,[rsp+7Ch]
    mov         [rdx+rbx],eax
    lea         rsp,[rsp+140h]
    POP_XMM
    mov         rbx, [rsp+28h]
    pop         rdi
    ret



WELS_EXTERN DeblockChromaLt4H_ssse3
    mov         rax,rsp
    push        rbx
    push        rbp
    push        rsi
    push        rdi
    push        r12
    PUSH_XMM 16
    sub         rsp,170h

    movsxd      rsi,r8d
    lea         eax,[r8*4]
    mov         r11d,r9d
    movsxd      r10,eax
    mov         eax, [rcx-2]
    mov         r12,rdx
    mov         [rsp+40h],eax
    mov         eax, [rsi+rcx-2]
    lea         rbx,[r10+rcx-2]
    movdqa      xmm5,[rsp+40h]
    mov         [rsp+50h],eax
    mov         eax, [rcx+rsi*2-2]
    lea         rbp,[r10+rdx-2]
    movdqa      xmm2, [rsp+50h]
    mov         [rsp+60h],eax
    lea         r10,[rsi+rsi*2]
    mov         rdi,rcx
    mov         eax,[r10+rcx-2]
    movdqa      xmm4,[rsp+60h]
    mov         [rsp+70h],eax
    mov         eax,[rdx-2]
    mov         [rsp+80h],eax
    mov         eax, [rsi+rdx-2]
    movdqa      xmm3,[rsp+70h]
    mov         [rsp+90h],eax
    mov         eax,[rdx+rsi*2-2]
    punpckldq   xmm5,[rsp+80h]
    mov         [rsp+0A0h],eax
    mov         eax, [r10+rdx-2]
    punpckldq   xmm2,[rsp+90h]
    mov         [rsp+0B0h],eax
    mov         eax, [rbx]
    punpckldq   xmm4,[rsp+0A0h]
    mov         [rsp+80h],eax
    mov         eax,[rbp]
    punpckldq   xmm3,[rsp+0B0h]
    mov         [rsp+90h],eax
    mov         eax,[rsi+rbx]
    movdqa      xmm0,[rsp+80h]
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm5,xmm0
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax,[rsi+rbp]
    movdqa      xmm0,[rsp+80h]
    movdqa      xmm1,xmm5
    mov         [rsp+90h],eax
    mov         eax,[rbx+rsi*2]
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm2,xmm0
    punpcklbw   xmm1,xmm2
    punpckhbw   xmm5,xmm2
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax,[rbp+rsi*2]
    movdqa      xmm0, [rsp+80h]
    mov         [rsp+90h],eax
    mov         eax,[r10+rbx]
    movdqa      xmm7,xmm1
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm4,xmm0
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax, [r10+rbp]
    movdqa      xmm0,[rsp+80h]
    mov         [rsp+90h],eax
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm3,xmm0
    movdqa      xmm0,xmm4
    punpcklbw   xmm0,xmm3
    punpckhbw   xmm4,xmm3
    punpcklwd   xmm7,xmm0
    punpckhwd   xmm1,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm6,xmm7
    punpcklwd   xmm0,xmm4
    punpckhwd   xmm5,xmm4
    punpckldq   xmm6,xmm0
    punpckhdq   xmm7,xmm0
    movdqa      xmm0,xmm1
    punpckldq   xmm0,xmm5
    mov         rax, [rsp+1C8h+160]    ; pTC
    punpckhdq   xmm1,xmm5
    movdqa      xmm9,xmm6
    punpckhqdq  xmm6,xmm0
    punpcklqdq  xmm9,xmm0
    movdqa      xmm2,xmm7
    movdqa      xmm13,xmm6
    movdqa      xmm4,xmm9
    movdqa      [rsp+10h],xmm9
    punpcklqdq  xmm2,xmm1
    punpckhqdq  xmm7,xmm1
    pxor        xmm1,xmm1
    movsx       ecx,byte [rax+3]
    movsx       edx,byte [rax+2]
    movsx       r8d,byte [rax+1]
    movsx       r9d,byte [rax]
    movdqa      xmm10,xmm1
    movdqa      xmm15,xmm2
    punpckhbw   xmm2,xmm1
    punpckhbw   xmm6,xmm1
    punpcklbw   xmm4,xmm1
    movsx       eax,r11w
    mov         word [rsp+0Eh],cx
    mov         word [rsp+0Ch],cx
    movdqa      xmm3,xmm7
    movdqa      xmm8,xmm7
    movdqa      [rsp+20h],xmm7
    punpcklbw   xmm15,xmm1
    punpcklbw   xmm13,xmm1
    punpcklbw   xmm3,xmm1
    mov         word [rsp+0Ah],dx
    mov         word [rsp+8],dx
    mov         word [rsp+6],r8w
    movd        xmm0,eax
    movdqa      [rsp+30h],xmm6
    punpckhbw   xmm9,xmm1
    punpckhbw   xmm8,xmm1
    punpcklwd   xmm0,xmm0
    movsx       eax,word [rsp+1C0h+160]   ; iBeta
    mov         word [rsp+4],r8w
    mov         word [rsp+2],r9w
    pshufd      xmm12,xmm0,0
    mov         word [rsp],r9w
    movd        xmm0,eax
    mov         eax,4
    cwde
    movdqa      xmm14, [rsp]
    movdqa      [rsp],xmm2
    movdqa      xmm2,xmm12
    punpcklwd   xmm0,xmm0
    pshufd      xmm11,xmm0,0
    psubw       xmm10,xmm14
    movd        xmm0,eax
    movdqa      xmm7,xmm14
    movdqa      xmm6,xmm14
    pcmpgtw     xmm7,xmm1
    punpcklwd   xmm0,xmm0
    pshufd      xmm5,xmm0,0
    movdqa      xmm0,xmm4
    movdqa      xmm1,xmm15
    psubw       xmm4,xmm13
    psubw       xmm0,xmm3
    psubw       xmm1,xmm13
    psubw       xmm3,xmm15
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm10
    psraw       xmm1,3
    pmaxsw      xmm0,xmm1
    pminsw      xmm6,xmm0
    movdqa      xmm1,xmm11
    movdqa      xmm0,xmm13
    psubw       xmm0,xmm15
    pabsw       xmm0,xmm0
    pcmpgtw     xmm2,xmm0
    pabsw       xmm0,xmm4
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm3
    pand        xmm2,xmm1
    movdqa      xmm1,xmm11
    movdqa      xmm3,[rsp+30h]
    pcmpgtw     xmm1,xmm0
    movdqa      xmm0,xmm9
    pand        xmm2,xmm1
    psubw       xmm0,xmm8
    psubw       xmm9,xmm3
    pand        xmm2,xmm7
    pand        xmm6,xmm2
    psubw       xmm15,xmm6
    paddw       xmm13,xmm6
    movdqa      xmm2,[rsp]
    movdqa      xmm1,xmm2
    psubw       xmm1,xmm3
    psubw       xmm8,xmm2
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm3
    movdqa      xmm5,[rsp+10h]
    psubw       xmm0,xmm2
    psraw       xmm1,3
    movdqa      xmm4,xmm5
    pabsw       xmm0,xmm0
    pmaxsw      xmm10,xmm1
    movdqa      xmm1,xmm11
    pcmpgtw     xmm12,xmm0
    pabsw       xmm0,xmm9
    pminsw      xmm14,xmm10
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm8
    pcmpgtw     xmm11,xmm0
    pand        xmm12,xmm1
    movdqa      xmm1,[rsp+20h]
    pand        xmm12,xmm11
    pand        xmm12,xmm7
    pand        xmm14,xmm12
    paddw       xmm3,xmm14
    psubw       xmm2,xmm14
    packuswb    xmm13,xmm3
    packuswb    xmm15,xmm2
    punpcklbw   xmm4,xmm13
    punpckhbw   xmm5,xmm13
    movdqa      xmm0,xmm15
    punpcklbw   xmm0,xmm1
    punpckhbw   xmm15,xmm1
    movdqa      xmm3,xmm4
    punpcklwd   xmm3,xmm0
    punpckhwd   xmm4,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm2,xmm3
    movdqa      xmm1,xmm4
    punpcklwd   xmm0,xmm15
    punpckhwd   xmm5,xmm15
    punpckldq   xmm2,xmm0
    punpckhdq   xmm3,xmm0
    punpckldq   xmm1,xmm5
    movdqa      xmm0,xmm2
    punpcklqdq  xmm0,xmm1
    punpckhdq   xmm4,xmm5
    punpckhqdq  xmm2,xmm1
    movdqa      [rsp+40h],xmm0
    movdqa      xmm0,xmm3
    movdqa      [rsp+90h],xmm2
    mov         eax,[rsp+40h]
    mov         [rdi-2],eax
    mov         eax, [rsp+90h]
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm3,xmm4
    mov         [rsi+rdi-2],eax
    movdqa      [rsp+50h],xmm0
    mov         eax,[rsp+50h]
    movdqa      [rsp+0A0h],xmm3
    mov         [rdi+rsi*2-2],eax
    mov         eax,[rsp+0A0h]
    mov         [r10+rdi-2],eax
    mov         eax,[rsp+48h]
    mov         [rbx],eax
    mov         eax,[rsp+98h]
    mov         [rsi+rbx],eax
    mov         eax,[rsp+58h]
    mov         [rbx+rsi*2],eax
    mov         eax, [rsp+0A8h]
    mov         [r10+rbx],eax
    mov         eax, [rsp+44h]
    mov         [r12-2],eax
    mov         eax,[rsp+94h]
    mov         [rsi+r12-2],eax
    mov         eax,[rsp+54h]
    mov         [r12+rsi*2-2],eax
    mov         eax, [rsp+0A4h]
    mov         [r10+r12-2],eax
    mov         eax,[rsp+4Ch]
    mov         [rbp],eax
    mov         eax,[rsp+9Ch]
    mov         [rsi+rbp],eax
    mov         eax, [rsp+5Ch]
    mov         [rbp+rsi*2],eax
    mov         eax,[rsp+0ACh]
    mov         [r10+rbp],eax
    lea         r11,[rsp+170h]
    mov         rsp,r11
    POP_XMM
    pop         r12
    pop         rdi
    pop         rsi
    pop         rbp
    pop         rbx
    ret



%elifdef  UNIX64


WELS_EXTERN DeblockChromaEq4H_ssse3
    mov         rax,rsp
    push        rbx
    push        rbp
    push        r12

    mov         rbp,   r8
    mov         r8,    rdx
    mov         r9,    rcx
    mov         rcx,   rdi
    mov         rdx,   rsi
    mov         rdi,   rdx

    sub         rsp,140h
    lea         eax,[r8*4]
    movsxd      r10,eax
    mov         eax,[rcx-2]
    mov         [rsp+10h],eax
    lea         rbx,[r10+rdx-2]
    lea         r11,[r10+rcx-2]

    movdqa      xmm5,[rsp+10h]
    movsxd      r10,r8d
    mov         eax,[r10+rcx-2]
    lea         rdx,[r10+r10*2]
    mov         [rsp+20h],eax
    mov         eax,[rcx+r10*2-2]
    mov         [rsp+30h],eax
    mov         eax,[rdx+rcx-2]
    movdqa      xmm2,[rsp+20h]
    mov         [rsp+40h],eax
    mov         eax, [rdi-2]
    movdqa      xmm4,[rsp+30h]
    mov         [rsp+50h],eax
    mov         eax,[r10+rdi-2]
    movdqa      xmm3,[rsp+40h]
    mov         [rsp+60h],eax
    mov         eax,[rdi+r10*2-2]
    punpckldq   xmm5,[rsp+50h]
    mov         [rsp+70h],eax
    mov         eax, [rdx+rdi-2]
    punpckldq   xmm2, [rsp+60h]
    mov          [rsp+80h],eax
    mov         eax,[r11]
    punpckldq   xmm4, [rsp+70h]
    mov         [rsp+50h],eax
    mov         eax,[rbx]
    punpckldq   xmm3,[rsp+80h]
    mov         [rsp+60h],eax
    mov         eax,[r10+r11]
    movdqa      xmm0, [rsp+50h]
    punpckldq   xmm0, [rsp+60h]
    punpcklqdq  xmm5,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[r10+rbx]
    movdqa      xmm0,[rsp+50h]
    movdqa      xmm1,xmm5
    mov         [rsp+60h],eax
    mov         eax,[r11+r10*2]
    punpckldq   xmm0, [rsp+60h]
    punpcklqdq  xmm2,xmm0
    punpcklbw   xmm1,xmm2
    punpckhbw   xmm5,xmm2
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[rbx+r10*2]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    mov         eax, [rdx+r11]
    movdqa      xmm15,xmm1
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm4,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax, [rdx+rbx]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    punpckldq   xmm0, [rsp+60h]
    punpcklqdq  xmm3,xmm0
    movdqa      xmm0,xmm4
    punpcklbw   xmm0,xmm3
    punpckhbw   xmm4,xmm3
    punpcklwd   xmm15,xmm0
    punpckhwd   xmm1,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm12,xmm15
    punpcklwd   xmm0,xmm4
    punpckhwd   xmm5,xmm4
    punpckldq   xmm12,xmm0
    punpckhdq   xmm15,xmm0
    movdqa      xmm0,xmm1
    movdqa      xmm11,xmm12
    punpckldq   xmm0,xmm5
    punpckhdq   xmm1,xmm5
    punpcklqdq  xmm11,xmm0
    punpckhqdq  xmm12,xmm0
    movsx       eax,r9w
    movdqa      xmm14,xmm15
    punpcklqdq  xmm14,xmm1
    punpckhqdq  xmm15,xmm1
    pxor        xmm1,xmm1
    movd        xmm0,eax
    movdqa      xmm4,xmm12
    movdqa      xmm8,xmm11
    mov         eax, ebp ; iBeta
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm4,xmm1
    punpckhbw   xmm12,xmm1
    movdqa      xmm9,xmm14
    movdqa      xmm7,xmm15
    movdqa      xmm10,xmm15
    pshufd      xmm13,xmm0,0
    punpcklbw   xmm9,xmm1
    punpckhbw   xmm14,xmm1
    movdqa      xmm6,xmm13
    movd        xmm0,eax
    movdqa      [rsp],xmm11
    mov         eax,2
    cwde
    punpckhbw   xmm11,xmm1
    punpckhbw   xmm10,xmm1
    punpcklbw   xmm7,xmm1
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm8,xmm1
    pshufd      xmm3,xmm0,0
    movdqa      xmm1,xmm8
    movdqa      xmm0,xmm4
    psubw       xmm0,xmm9
    psubw       xmm1,xmm4
    movdqa      xmm2,xmm3
    pabsw       xmm0,xmm0
    pcmpgtw     xmm6,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm3
    pcmpgtw     xmm2,xmm0
    pand        xmm6,xmm2
    movdqa      xmm0,xmm7
    movdqa      xmm2,xmm3
    psubw       xmm0,xmm9
    pabsw       xmm0,xmm0
    pcmpgtw     xmm1,xmm0
    pand        xmm6,xmm1
    movdqa      xmm0,xmm12
    movdqa      xmm1,xmm11
    psubw       xmm0,xmm14
    psubw       xmm1,xmm12
    movdqa      xmm5,xmm6
    pabsw       xmm0,xmm0
    pcmpgtw     xmm13,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm8
    pcmpgtw     xmm2,xmm0
    paddw       xmm1,xmm8
    movdqa      xmm0,xmm10
    pand        xmm13,xmm2
    psubw       xmm0,xmm14
    paddw       xmm1,xmm4
    movdqa      xmm2,xmm11
    pabsw       xmm0,xmm0
    paddw       xmm2,xmm11
    paddw       xmm1,xmm7
    pcmpgtw     xmm3,xmm0
    paddw       xmm2,xmm12
    movd        xmm0,eax
    pand        xmm13,xmm3
    paddw       xmm2,xmm10
    punpcklwd   xmm0,xmm0
    pshufd      xmm3,xmm0,0
    movdqa      xmm0,xmm6
    paddw       xmm1,xmm3
    pandn       xmm0,xmm4
    paddw       xmm2,xmm3
    psraw       xmm1,2
    pand        xmm5,xmm1
    por         xmm5,xmm0
    paddw       xmm7,xmm7
    paddw       xmm10,xmm10
    psraw       xmm2,2
    movdqa      xmm1,xmm13
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm12
    pand        xmm1,xmm2
    paddw       xmm7,xmm9
    por         xmm1,xmm0
    paddw       xmm10,xmm14
    paddw       xmm7,xmm8
    movdqa      xmm0,xmm13
    packuswb    xmm5,xmm1
    paddw       xmm7,xmm3
    paddw       xmm10,xmm11
    movdqa      xmm1,xmm6
    paddw       xmm10,xmm3
    pandn       xmm6,xmm9
    psraw       xmm7,2
    pand        xmm1,xmm7
    psraw       xmm10,2
    pandn       xmm13,xmm14
    pand        xmm0,xmm10
    por         xmm1,xmm6
    movdqa      xmm6,[rsp]
    movdqa      xmm4,xmm6
    por         xmm0,xmm13
    punpcklbw   xmm4,xmm5
    punpckhbw   xmm6,xmm5
    movdqa      xmm3,xmm4
    packuswb    xmm1,xmm0
    movdqa      xmm0,xmm1
    punpckhbw   xmm1,xmm15
    punpcklbw   xmm0,xmm15
    punpcklwd   xmm3,xmm0
    punpckhwd   xmm4,xmm0
    movdqa      xmm0,xmm6
    movdqa      xmm2,xmm3
    punpcklwd   xmm0,xmm1
    punpckhwd   xmm6,xmm1
    movdqa      xmm1,xmm4
    punpckldq   xmm2,xmm0
    punpckhdq   xmm3,xmm0
    punpckldq   xmm1,xmm6
    movdqa      xmm0,xmm2
    punpcklqdq  xmm0,xmm1
    punpckhdq   xmm4,xmm6
    punpckhqdq  xmm2,xmm1
    movdqa      [rsp+10h],xmm0
    movdqa      [rsp+60h],xmm2
    movdqa      xmm0,xmm3
    mov         eax,[rsp+10h]
    mov         [rcx-2],eax
    mov         eax,[rsp+60h]
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm3,xmm4
    mov         [r10+rcx-2],eax
    movdqa      [rsp+20h],xmm0
    mov         eax, [rsp+20h]
    movdqa      [rsp+70h],xmm3
    mov         [rcx+r10*2-2],eax
    mov         eax,[rsp+70h]
    mov         [rdx+rcx-2],eax
    mov         eax,[rsp+18h]
    mov         [r11],eax
    mov         eax,[rsp+68h]
    mov         [r10+r11],eax
    mov         eax,[rsp+28h]
    mov         [r11+r10*2],eax
    mov         eax,[rsp+78h]
    mov         [rdx+r11],eax
    mov         eax,[rsp+14h]
    mov         [rdi-2],eax
    mov         eax,[rsp+64h]
    mov         [r10+rdi-2],eax
    mov         eax,[rsp+24h]
    mov         [rdi+r10*2-2],eax
    mov         eax, [rsp+74h]
    mov         [rdx+rdi-2],eax
    mov         eax, [rsp+1Ch]
    mov         [rbx],eax
    mov         eax, [rsp+6Ch]
    mov         [r10+rbx],eax
    mov         eax,[rsp+2Ch]
    mov         [rbx+r10*2],eax
    mov         eax,[rsp+7Ch]
    mov         [rdx+rbx],eax
    lea         r11,[rsp+140h]
    mov         rbx, [r11+28h]
    mov         rsp,r11
    pop         r12
    pop         rbp
    pop         rbx
    ret


WELS_EXTERN DeblockChromaLt4H_ssse3
    mov         rax,rsp
    push        rbx
    push        rbp
    push        r12
    push        r13
    push        r14
    sub         rsp,170h

    mov         r13,   r8
    mov         r14,   r9
    mov         r8,    rdx
    mov         r9,    rcx
    mov         rdx,   rdi
    mov         rcx,   rsi

    movsxd      rsi,r8d
    lea         eax,[r8*4]
    mov         r11d,r9d
    movsxd      r10,eax
    mov         eax, [rcx-2]
    mov         r12,rdx
    mov         [rsp+40h],eax
    mov         eax, [rsi+rcx-2]
    lea         rbx,[r10+rcx-2]
    movdqa      xmm5,[rsp+40h]
    mov         [rsp+50h],eax
    mov         eax, [rcx+rsi*2-2]
    lea         rbp,[r10+rdx-2]
    movdqa      xmm2, [rsp+50h]
    mov         [rsp+60h],eax
    lea         r10,[rsi+rsi*2]
    mov         rdi,rcx
    mov         eax,[r10+rcx-2]
    movdqa      xmm4,[rsp+60h]
    mov         [rsp+70h],eax
    mov         eax,[rdx-2]
    mov         [rsp+80h],eax
    mov         eax, [rsi+rdx-2]
    movdqa      xmm3,[rsp+70h]
    mov         [rsp+90h],eax
    mov         eax,[rdx+rsi*2-2]
    punpckldq   xmm5,[rsp+80h]
    mov         [rsp+0A0h],eax
    mov         eax, [r10+rdx-2]
    punpckldq   xmm2,[rsp+90h]
    mov         [rsp+0B0h],eax
    mov         eax, [rbx]
    punpckldq   xmm4,[rsp+0A0h]
    mov         [rsp+80h],eax
    mov         eax,[rbp]
    punpckldq   xmm3,[rsp+0B0h]
    mov         [rsp+90h],eax
    mov         eax,[rsi+rbx]
    movdqa      xmm0,[rsp+80h]
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm5,xmm0
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax,[rsi+rbp]
    movdqa      xmm0,[rsp+80h]
    movdqa      xmm1,xmm5
    mov         [rsp+90h],eax
    mov         eax,[rbx+rsi*2]
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm2,xmm0
    punpcklbw   xmm1,xmm2
    punpckhbw   xmm5,xmm2
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax,[rbp+rsi*2]
    movdqa      xmm0, [rsp+80h]
    mov         [rsp+90h],eax
    mov         eax,[r10+rbx]
    movdqa      xmm7,xmm1
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm4,xmm0
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax, [r10+rbp]
    movdqa      xmm0,[rsp+80h]
    mov         [rsp+90h],eax
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm3,xmm0
    movdqa      xmm0,xmm4
    punpcklbw   xmm0,xmm3
    punpckhbw   xmm4,xmm3
    punpcklwd   xmm7,xmm0
    punpckhwd   xmm1,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm6,xmm7
    punpcklwd   xmm0,xmm4
    punpckhwd   xmm5,xmm4
    punpckldq   xmm6,xmm0
    punpckhdq   xmm7,xmm0
    movdqa      xmm0,xmm1
    punpckldq   xmm0,xmm5
    mov         rax, r14    ; pTC
    punpckhdq   xmm1,xmm5
    movdqa      xmm9,xmm6
    punpckhqdq  xmm6,xmm0
    punpcklqdq  xmm9,xmm0
    movdqa      xmm2,xmm7
    movdqa      xmm13,xmm6
    movdqa      xmm4,xmm9
    movdqa      [rsp+10h],xmm9
    punpcklqdq  xmm2,xmm1
    punpckhqdq  xmm7,xmm1
    pxor        xmm1,xmm1
    movsx       ecx,byte [rax+3]
    movsx       edx,byte [rax+2]
    movsx       r8d,byte [rax+1]
    movsx       r9d,byte [rax]
    movdqa      xmm10,xmm1
    movdqa      xmm15,xmm2
    punpckhbw   xmm2,xmm1
    punpckhbw   xmm6,xmm1
    punpcklbw   xmm4,xmm1
    movsx       eax,r11w
    mov         word [rsp+0Eh],cx
    mov         word [rsp+0Ch],cx
    movdqa      xmm3,xmm7
    movdqa      xmm8,xmm7
    movdqa      [rsp+20h],xmm7
    punpcklbw   xmm15,xmm1
    punpcklbw   xmm13,xmm1
    punpcklbw   xmm3,xmm1
    mov         word [rsp+0Ah],dx
    mov         word [rsp+8],dx
    mov         word [rsp+6],r8w
    movd        xmm0,eax
    movdqa      [rsp+30h],xmm6
    punpckhbw   xmm9,xmm1
    punpckhbw   xmm8,xmm1
    punpcklwd   xmm0,xmm0
    mov         eax, r13d   ; iBeta
    mov         word [rsp+4],r8w
    mov         word [rsp+2],r9w
    pshufd      xmm12,xmm0,0
    mov         word [rsp],r9w
    movd        xmm0,eax
    mov         eax,4
    cwde
    movdqa      xmm14, [rsp]
    movdqa      [rsp],xmm2
    movdqa      xmm2,xmm12
    punpcklwd   xmm0,xmm0
    pshufd      xmm11,xmm0,0
    psubw       xmm10,xmm14
    movd        xmm0,eax
    movdqa      xmm7,xmm14
    movdqa      xmm6,xmm14
    pcmpgtw     xmm7,xmm1
    punpcklwd   xmm0,xmm0
    pshufd      xmm5,xmm0,0
    movdqa      xmm0,xmm4
    movdqa      xmm1,xmm15
    psubw       xmm4,xmm13
    psubw       xmm0,xmm3
    psubw       xmm1,xmm13
    psubw       xmm3,xmm15
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm10
    psraw       xmm1,3
    pmaxsw      xmm0,xmm1
    pminsw      xmm6,xmm0
    movdqa      xmm1,xmm11
    movdqa      xmm0,xmm13
    psubw       xmm0,xmm15
    pabsw       xmm0,xmm0
    pcmpgtw     xmm2,xmm0
    pabsw       xmm0,xmm4
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm3
    pand        xmm2,xmm1
    movdqa      xmm1,xmm11
    movdqa      xmm3,[rsp+30h]
    pcmpgtw     xmm1,xmm0
    movdqa      xmm0,xmm9
    pand        xmm2,xmm1
    psubw       xmm0,xmm8
    psubw       xmm9,xmm3
    pand        xmm2,xmm7
    pand        xmm6,xmm2
    psubw       xmm15,xmm6
    paddw       xmm13,xmm6
    movdqa      xmm2,[rsp]
    movdqa      xmm1,xmm2
    psubw       xmm1,xmm3
    psubw       xmm8,xmm2
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm3
    movdqa      xmm5,[rsp+10h]
    psubw       xmm0,xmm2
    psraw       xmm1,3
    movdqa      xmm4,xmm5
    pabsw       xmm0,xmm0
    pmaxsw      xmm10,xmm1
    movdqa      xmm1,xmm11
    pcmpgtw     xmm12,xmm0
    pabsw       xmm0,xmm9
    pminsw      xmm14,xmm10
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm8
    pcmpgtw     xmm11,xmm0
    pand        xmm12,xmm1
    movdqa      xmm1,[rsp+20h]
    pand        xmm12,xmm11
    pand        xmm12,xmm7
    pand        xmm14,xmm12
    paddw       xmm3,xmm14
    psubw       xmm2,xmm14
    packuswb    xmm13,xmm3
    packuswb    xmm15,xmm2
    punpcklbw   xmm4,xmm13
    punpckhbw   xmm5,xmm13
    movdqa      xmm0,xmm15
    punpcklbw   xmm0,xmm1
    punpckhbw   xmm15,xmm1
    movdqa      xmm3,xmm4
    punpcklwd   xmm3,xmm0
    punpckhwd   xmm4,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm2,xmm3
    movdqa      xmm1,xmm4
    punpcklwd   xmm0,xmm15
    punpckhwd   xmm5,xmm15
    punpckldq   xmm2,xmm0
    punpckhdq   xmm3,xmm0
    punpckldq   xmm1,xmm5
    movdqa      xmm0,xmm2
    punpcklqdq  xmm0,xmm1
    punpckhdq   xmm4,xmm5
    punpckhqdq  xmm2,xmm1
    movdqa      [rsp+40h],xmm0
    movdqa      xmm0,xmm3
    movdqa      [rsp+90h],xmm2
    mov         eax,[rsp+40h]
    mov         [rdi-2],eax
    mov         eax, [rsp+90h]
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm3,xmm4
    mov         [rsi+rdi-2],eax
    movdqa      [rsp+50h],xmm0
    mov         eax,[rsp+50h]
    movdqa      [rsp+0A0h],xmm3
    mov         [rdi+rsi*2-2],eax
    mov         eax,[rsp+0A0h]
    mov         [r10+rdi-2],eax
    mov         eax,[rsp+48h]
    mov         [rbx],eax
    mov         eax,[rsp+98h]
    mov         [rsi+rbx],eax
    mov         eax,[rsp+58h]
    mov         [rbx+rsi*2],eax
    mov         eax, [rsp+0A8h]
    mov         [r10+rbx],eax
    mov         eax, [rsp+44h]
    mov         [r12-2],eax
    mov         eax,[rsp+94h]
    mov         [rsi+r12-2],eax
    mov         eax,[rsp+54h]
    mov         [r12+rsi*2-2],eax
    mov         eax, [rsp+0A4h]
    mov         [r10+r12-2],eax
    mov         eax,[rsp+4Ch]
    mov         [rbp],eax
    mov         eax,[rsp+9Ch]
    mov         [rsi+rbp],eax
    mov         eax, [rsp+5Ch]
    mov         [rbp+rsi*2],eax
    mov         eax,[rsp+0ACh]
    mov         [r10+rbp],eax
    lea         r11,[rsp+170h]
    mov         rsp,r11
    pop         r14
    pop         r13
    pop         r12
    pop         rbp
    pop         rbx
    ret



%elifdef  X86_32

;***************************************************************************
;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;          int32_t iAlpha, int32_t iBeta)
;***************************************************************************

WELS_EXTERN DeblockChromaEq4H_ssse3
    push        ebp
    mov         ebp,esp
    and         esp,0FFFFFFF0h
    sub         esp,0C8h
    mov         ecx,dword [ebp+8]
    mov         edx,dword [ebp+0Ch]
    mov         eax,dword [ebp+10h]
    sub         ecx,2
    sub         edx,2
    push        esi
    lea         esi,[eax+eax*2]
    mov         dword [esp+18h],ecx
    mov         dword [esp+4],edx
    lea         ecx,[ecx+eax*4]
    lea         edx,[edx+eax*4]
    lea         eax,[esp+7Ch]
    push        edi
    mov         dword [esp+14h],esi
    mov         dword [esp+18h],ecx
    mov         dword [esp+0Ch],edx
    mov         dword [esp+10h],eax
    mov         esi,dword [esp+1Ch]
    mov         ecx,dword [ebp+10h]
    mov         edx,dword [esp+14h]
    movd        xmm0,dword [esi]
    movd        xmm1,dword [esi+ecx]
    movd        xmm2,dword [esi+ecx*2]
    movd        xmm3,dword [esi+edx]
    mov         esi,dword  [esp+8]
    movd        xmm4,dword [esi]
    movd        xmm5,dword [esi+ecx]
    movd        xmm6,dword [esi+ecx*2]
    movd        xmm7,dword [esi+edx]
    punpckldq   xmm0,xmm4
    punpckldq   xmm1,xmm5
    punpckldq   xmm2,xmm6
    punpckldq   xmm3,xmm7
    mov         esi,dword [esp+18h]
    mov         edi,dword [esp+0Ch]
    movd        xmm4,dword [esi]
    movd        xmm5,dword [edi]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm0,xmm4
    movd        xmm4,dword [esi+ecx]
    movd        xmm5,dword [edi+ecx]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm1,xmm4
    movd        xmm4,dword [esi+ecx*2]
    movd        xmm5,dword [edi+ecx*2]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm2,xmm4
    movd        xmm4,dword [esi+edx]
    movd        xmm5,dword [edi+edx]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm3,xmm4
    movdqa      xmm6,xmm0
    punpcklbw   xmm0,xmm1
    punpckhbw   xmm6,xmm1
    movdqa      xmm7,xmm2
    punpcklbw   xmm2,xmm3
    punpckhbw   xmm7,xmm3
    movdqa      xmm4,xmm0
    movdqa      xmm5,xmm6
    punpcklwd   xmm0,xmm2
    punpckhwd   xmm4,xmm2
    punpcklwd   xmm6,xmm7
    punpckhwd   xmm5,xmm7
    movdqa      xmm1,xmm0
    movdqa      xmm2,xmm4
    punpckldq   xmm0,xmm6
    punpckhdq   xmm1,xmm6
    punpckldq   xmm4,xmm5
    punpckhdq   xmm2,xmm5
    movdqa      xmm5,xmm0
    movdqa      xmm6,xmm1
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm5,xmm4
    punpcklqdq  xmm1,xmm2
    punpckhqdq  xmm6,xmm2
    mov         edi,dword [esp+10h]
    movdqa      [edi],xmm0
    movdqa      [edi+10h],xmm5
    movdqa      [edi+20h],xmm1
    movdqa      [edi+30h],xmm6
    movsx       ecx,word [ebp+14h]
    movsx       edx,word [ebp+18h]
    movdqa      xmm6,[esp+80h]
    movdqa      xmm4,[esp+90h]
    movdqa      xmm5,[esp+0A0h]
    movdqa      xmm7,[esp+0B0h]
    pxor        xmm0,xmm0
    movd        xmm1,ecx
    movdqa      xmm2,xmm1
    punpcklwd   xmm2,xmm1
    pshufd      xmm1,xmm2,0
    movd        xmm2,edx
    movdqa      xmm3,xmm2
    punpcklwd   xmm3,xmm2
    pshufd      xmm2,xmm3,0
    movdqa      xmm3,xmm6
    punpckhbw   xmm6,xmm0
    movdqa      [esp+60h],xmm6
    movdqa      xmm6,[esp+90h]
    punpckhbw   xmm6,xmm0
    movdqa      [esp+30h],xmm6
    movdqa      xmm6,[esp+0A0h]
    punpckhbw   xmm6,xmm0
    movdqa      [esp+40h],xmm6
    movdqa      xmm6,[esp+0B0h]
    punpckhbw   xmm6,xmm0
    movdqa      [esp+70h],xmm6
    punpcklbw   xmm7,xmm0
    punpcklbw   xmm4,xmm0
    punpcklbw   xmm5,xmm0
    punpcklbw   xmm3,xmm0
    movdqa      [esp+50h],xmm7
    movdqa      xmm6,xmm4
    psubw       xmm6,xmm5
    pabsw       xmm6,xmm6
    movdqa      xmm0,xmm1
    pcmpgtw     xmm0,xmm6
    movdqa      xmm6,xmm3
    psubw       xmm6,xmm4
    pabsw       xmm6,xmm6
    movdqa      xmm7,xmm2
    pcmpgtw     xmm7,xmm6
    movdqa      xmm6,[esp+50h]
    psubw       xmm6,xmm5
    pabsw       xmm6,xmm6
    pand        xmm0,xmm7
    movdqa      xmm7,xmm2
    pcmpgtw     xmm7,xmm6
    movdqa      xmm6,[esp+30h]
    psubw       xmm6,[esp+40h]
    pabsw       xmm6,xmm6
    pcmpgtw     xmm1,xmm6
    movdqa      xmm6,[esp+60h]
    psubw       xmm6,[esp+30h]
    pabsw       xmm6,xmm6
    pand        xmm0,xmm7
    movdqa      xmm7,xmm2
    pcmpgtw     xmm7,xmm6
    movdqa      xmm6,[esp+70h]
    psubw       xmm6,[esp+40h]
    pabsw       xmm6,xmm6
    pand        xmm1,xmm7
    pcmpgtw     xmm2,xmm6
    pand        xmm1,xmm2
    mov         eax,2
    movsx       ecx,ax
    movd        xmm2,ecx
    movdqa      xmm6,xmm2
    punpcklwd   xmm6,xmm2
    pshufd      xmm2,xmm6,0
    movdqa      [esp+20h],xmm2
    movdqa      xmm2,xmm3
    paddw       xmm2,xmm3
    paddw       xmm2,xmm4
    paddw       xmm2,[esp+50h]
    paddw       xmm2,[esp+20h]
    psraw       xmm2,2
    movdqa      xmm6,xmm0
    pand        xmm6,xmm2
    movdqa      xmm2,xmm0
    pandn       xmm2,xmm4
    por         xmm6,xmm2
    movdqa      xmm2,[esp+60h]
    movdqa      xmm7,xmm2
    paddw       xmm7,xmm2
    paddw       xmm7,[esp+30h]
    paddw       xmm7,[esp+70h]
    paddw       xmm7,[esp+20h]
    movdqa      xmm4,xmm1
    movdqa      xmm2,xmm1
    pandn       xmm2,[esp+30h]
    psraw       xmm7,2
    pand        xmm4,xmm7
    por         xmm4,xmm2
    movdqa      xmm2,[esp+50h]
    packuswb    xmm6,xmm4
    movdqa      [esp+90h],xmm6
    movdqa      xmm6,xmm2
    paddw       xmm6,xmm2
    movdqa      xmm2,[esp+20h]
    paddw       xmm6,xmm5
    paddw       xmm6,xmm3
    movdqa      xmm4,xmm0
    pandn       xmm0,xmm5
    paddw       xmm6,xmm2
    psraw       xmm6,2
    pand        xmm4,xmm6
    por         xmm4,xmm0
    movdqa      xmm0,[esp+70h]
    movdqa      xmm5,xmm0
    paddw       xmm5,xmm0
    movdqa      xmm0,[esp+40h]
    paddw       xmm5,xmm0
    paddw       xmm5,[esp+60h]
    movdqa      xmm3,xmm1
    paddw       xmm5,xmm2
    psraw       xmm5,2
    pand        xmm3,xmm5
    pandn       xmm1,xmm0
    por         xmm3,xmm1
    packuswb    xmm4,xmm3
    movdqa      [esp+0A0h],xmm4
    mov         esi,dword [esp+10h]
    movdqa      xmm0,[esi]
    movdqa      xmm1,[esi+10h]
    movdqa      xmm2,[esi+20h]
    movdqa      xmm3,[esi+30h]
    movdqa      xmm6,xmm0
    punpcklbw   xmm0,xmm1
    punpckhbw   xmm6,xmm1
    movdqa      xmm7,xmm2
    punpcklbw   xmm2,xmm3
    punpckhbw   xmm7,xmm3
    movdqa      xmm4,xmm0
    movdqa      xmm5,xmm6
    punpcklwd   xmm0,xmm2
    punpckhwd   xmm4,xmm2
    punpcklwd   xmm6,xmm7
    punpckhwd   xmm5,xmm7
    movdqa      xmm1,xmm0
    movdqa      xmm2,xmm4
    punpckldq   xmm0,xmm6
    punpckhdq   xmm1,xmm6
    punpckldq   xmm4,xmm5
    punpckhdq   xmm2,xmm5
    movdqa      xmm5,xmm0
    movdqa      xmm6,xmm1
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm5,xmm4
    punpcklqdq  xmm1,xmm2
    punpckhqdq  xmm6,xmm2
    mov         esi,dword [esp+1Ch]
    mov         ecx,dword [ebp+10h]
    mov         edx,dword [esp+14h]
    mov         edi,dword [esp+8]
    movd        dword [esi],xmm0
    movd        dword [esi+ecx],xmm5
    movd        dword [esi+ecx*2],xmm1
    movd        dword [esi+edx],xmm6
    psrldq      xmm0,4
    psrldq      xmm5,4
    psrldq      xmm1,4
    psrldq      xmm6,4
    mov         esi,dword [esp+18h]
    movd        dword [edi],xmm0
    movd        dword [edi+ecx],xmm5
    movd        dword [edi+ecx*2],xmm1
    movd        dword [edi+edx],xmm6
    psrldq      xmm0,4
    psrldq      xmm5,4
    psrldq      xmm1,4
    psrldq      xmm6,4
    movd        dword [esi],xmm0
    movd        dword [esi+ecx],xmm5
    movd        dword [esi+ecx*2],xmm1
    movd        dword [esi+edx],xmm6
    psrldq      xmm0,4
    psrldq      xmm5,4
    psrldq      xmm1,4
    psrldq      xmm6,4
    mov         edi,dword [esp+0Ch]
    movd        dword [edi],xmm0
    movd        dword [edi+ecx],xmm5
    movd        dword [edi+ecx*2],xmm1
    movd        dword [edi+edx],xmm6
    pop         edi
    pop         esi
    mov         esp,ebp
    pop         ebp
    ret

;*******************************************************************************
;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4H_ssse3
    push        ebp
    mov         ebp,esp
    and         esp,0FFFFFFF0h
    sub         esp,108h
    mov         ecx,dword [ebp+8]
    mov         edx,dword [ebp+0Ch]
    mov         eax,dword [ebp+10h]
    sub         ecx,2
    sub         edx,2
    push        esi
    lea         esi,[eax+eax*2]
    mov         dword [esp+10h],ecx
    mov         dword [esp+4],edx
    lea         ecx,[ecx+eax*4]
    lea         edx,[edx+eax*4]
    lea         eax,[esp+6Ch]
    push        edi
    mov         dword [esp+0Ch],esi
    mov         dword [esp+18h],ecx
    mov         dword [esp+10h],edx
    mov         dword [esp+1Ch],eax
    mov         esi,dword [esp+14h]
    mov         ecx,dword [ebp+10h]
    mov         edx,dword [esp+0Ch]
    movd        xmm0,dword [esi]
    movd        xmm1,dword [esi+ecx]
    movd        xmm2,dword [esi+ecx*2]
    movd        xmm3,dword [esi+edx]
    mov         esi,dword [esp+8]
    movd        xmm4,dword [esi]
    movd        xmm5,dword [esi+ecx]
    movd        xmm6,dword [esi+ecx*2]
    movd        xmm7,dword [esi+edx]
    punpckldq   xmm0,xmm4
    punpckldq   xmm1,xmm5
    punpckldq   xmm2,xmm6
    punpckldq   xmm3,xmm7
    mov         esi,dword [esp+18h]
    mov         edi,dword [esp+10h]
    movd        xmm4,dword [esi]
    movd        xmm5,dword [edi]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm0,xmm4
    movd        xmm4,dword [esi+ecx]
    movd        xmm5,dword [edi+ecx]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm1,xmm4
    movd        xmm4,dword [esi+ecx*2]
    movd        xmm5,dword [edi+ecx*2]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm2,xmm4
    movd        xmm4,dword [esi+edx]
    movd        xmm5,dword [edi+edx]
    punpckldq   xmm4,xmm5
    punpcklqdq  xmm3,xmm4
    movdqa      xmm6,xmm0
    punpcklbw   xmm0,xmm1
    punpckhbw   xmm6,xmm1
    movdqa      xmm7,xmm2
    punpcklbw   xmm2,xmm3
    punpckhbw   xmm7,xmm3
    movdqa      xmm4,xmm0
    movdqa      xmm5,xmm6
    punpcklwd   xmm0,xmm2
    punpckhwd   xmm4,xmm2
    punpcklwd   xmm6,xmm7
    punpckhwd   xmm5,xmm7
    movdqa      xmm1,xmm0
    movdqa      xmm2,xmm4
    punpckldq   xmm0,xmm6
    punpckhdq   xmm1,xmm6
    punpckldq   xmm4,xmm5
    punpckhdq   xmm2,xmm5
    movdqa      xmm5,xmm0
    movdqa      xmm6,xmm1
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm5,xmm4
    punpcklqdq  xmm1,xmm2
    punpckhqdq  xmm6,xmm2
    mov         edi,dword [esp+1Ch]
    movdqa      [edi],xmm0
    movdqa      [edi+10h],xmm5
    movdqa      [edi+20h],xmm1
    movdqa      [edi+30h],xmm6
    mov         eax,dword [ebp+1Ch]
    movsx       cx,byte [eax+3]
    movsx       dx,byte [eax+2]
    movsx       si,byte [eax+1]
    movsx       ax,byte [eax]
    movzx       edi,cx
    movzx       ecx,cx
    movd        xmm2,ecx
    movzx       ecx,dx
    movzx       edx,dx
    movd        xmm3,ecx
    movd        xmm4,edx
    movzx       ecx,si
    movzx       edx,si
    movd        xmm5,ecx
    pxor        xmm0,xmm0
    movd        xmm6,edx
    movzx       ecx,ax
    movdqa      [esp+60h],xmm0
    movzx       edx,ax
    movsx       eax,word [ebp+14h]
    punpcklwd   xmm6,xmm2
    movd        xmm1,edi
    movd        xmm7,ecx
    movsx       ecx,word [ebp+18h]
    movd        xmm0,edx
    punpcklwd   xmm7,xmm3
    punpcklwd   xmm5,xmm1
    movdqa      xmm1,[esp+60h]
    punpcklwd   xmm7,xmm5
    movdqa      xmm5,[esp+0A0h]
    punpcklwd   xmm0,xmm4
    punpcklwd   xmm0,xmm6
    movdqa      xmm6, [esp+70h]
    punpcklwd   xmm0,xmm7
    movdqa      xmm7,[esp+80h]
    movdqa      xmm2,xmm1
    psubw       xmm2,xmm0
    movdqa      [esp+0D0h],xmm2
    movd        xmm2,eax
    movdqa      xmm3,xmm2
    punpcklwd   xmm3,xmm2
    pshufd      xmm4,xmm3,0
    movd        xmm2,ecx
    movdqa      xmm3,xmm2
    punpcklwd   xmm3,xmm2
    pshufd      xmm2,xmm3,0
    movdqa      xmm3, [esp+90h]
    movdqa      [esp+50h],xmm2
    movdqa      xmm2,xmm6
    punpcklbw   xmm2,xmm1
    punpckhbw   xmm6,xmm1
    movdqa      [esp+40h],xmm2
    movdqa      [esp+0B0h],xmm6
    movdqa      xmm6,[esp+90h]
    movdqa      xmm2,xmm7
    punpckhbw   xmm7,xmm1
    punpckhbw   xmm6,xmm1
    punpcklbw   xmm2,xmm1
    punpcklbw   xmm3,xmm1
    punpcklbw   xmm5,xmm1
    movdqa      [esp+0F0h],xmm7
    movdqa      [esp+0C0h],xmm6
    movdqa      xmm6, [esp+0A0h]
    punpckhbw   xmm6,xmm1
    movdqa      [esp+0E0h],xmm6
    mov         edx,4
    movsx       eax,dx
    movd        xmm6,eax
    movdqa      xmm7,xmm6
    punpcklwd   xmm7,xmm6
    pshufd      xmm6,xmm7,0
    movdqa      [esp+30h],xmm6
    movdqa      xmm7, [esp+40h]
    psubw       xmm7,xmm5
    movdqa      xmm6,xmm0
    pcmpgtw     xmm6,xmm1
    movdqa      [esp+60h],xmm6
    movdqa      xmm1, [esp+0D0h]
    movdqa      xmm6,xmm3
    psubw       xmm6,xmm2
    psllw       xmm6,2
    paddw       xmm6,xmm7
    paddw       xmm6,[esp+30h]
    psraw       xmm6,3
    pmaxsw      xmm1,xmm6
    movdqa      xmm7,[esp+50h]
    movdqa      [esp+20h],xmm0
    movdqa      xmm6, [esp+20h]
    pminsw      xmm6,xmm1
    movdqa      [esp+20h],xmm6
    movdqa      xmm6,xmm4
    movdqa      xmm1,xmm2
    psubw       xmm1,xmm3
    pabsw       xmm1,xmm1
    pcmpgtw     xmm6,xmm1
    movdqa      xmm1, [esp+40h]
    psubw       xmm1,xmm2
    pabsw       xmm1,xmm1
    pcmpgtw     xmm7,xmm1
    movdqa      xmm1, [esp+50h]
    pand        xmm6,xmm7
    movdqa      xmm7, [esp+50h]
    psubw       xmm5,xmm3
    pabsw       xmm5,xmm5
    pcmpgtw     xmm1,xmm5
    movdqa      xmm5, [esp+0B0h]
    psubw       xmm5,[esp+0E0h]
    pand        xmm6,xmm1
    pand        xmm6, [esp+60h]
    movdqa      xmm1, [esp+20h]
    pand        xmm1,xmm6
    movdqa      xmm6, [esp+0C0h]
    movdqa      [esp+40h],xmm1
    movdqa      xmm1, [esp+0F0h]
    psubw       xmm6,xmm1
    psllw       xmm6,2
    paddw       xmm6,xmm5
    paddw       xmm6, [esp+30h]
    movdqa      xmm5, [esp+0D0h]
    psraw       xmm6,3
    pmaxsw      xmm5,xmm6
    pminsw      xmm0,xmm5
    movdqa      xmm5,[esp+0C0h]
    movdqa      xmm6,xmm1
    psubw       xmm6,xmm5
    pabsw       xmm6,xmm6
    pcmpgtw     xmm4,xmm6
    movdqa      xmm6,[esp+0B0h]
    psubw       xmm6,xmm1
    pabsw       xmm6,xmm6
    pcmpgtw     xmm7,xmm6
    movdqa      xmm6, [esp+0E0h]
    pand        xmm4,xmm7
    movdqa      xmm7, [esp+50h]
    psubw       xmm6,xmm5
    pabsw       xmm6,xmm6
    pcmpgtw     xmm7,xmm6
    pand        xmm4,xmm7
    pand        xmm4,[esp+60h]
    pand        xmm0,xmm4
    movdqa      xmm4, [esp+40h]
    paddw       xmm2,xmm4
    paddw       xmm1,xmm0
    psubw       xmm3,xmm4
    psubw       xmm5,xmm0
    packuswb    xmm2,xmm1
    packuswb    xmm3,xmm5
    movdqa      [esp+80h],xmm2
    movdqa      [esp+90h],xmm3
    mov         esi,dword [esp+1Ch]
    movdqa      xmm0, [esi]
    movdqa      xmm1, [esi+10h]
    movdqa      xmm2, [esi+20h]
    movdqa      xmm3, [esi+30h]
    movdqa      xmm6,xmm0
    punpcklbw   xmm0,xmm1
    punpckhbw   xmm6,xmm1
    movdqa      xmm7,xmm2
    punpcklbw   xmm2,xmm3
    punpckhbw   xmm7,xmm3
    movdqa      xmm4,xmm0
    movdqa      xmm5,xmm6
    punpcklwd   xmm0,xmm2
    punpckhwd   xmm4,xmm2
    punpcklwd   xmm6,xmm7
    punpckhwd   xmm5,xmm7
    movdqa      xmm1,xmm0
    movdqa      xmm2,xmm4
    punpckldq   xmm0,xmm6
    punpckhdq   xmm1,xmm6
    punpckldq   xmm4,xmm5
    punpckhdq   xmm2,xmm5
    movdqa      xmm5,xmm0
    movdqa      xmm6,xmm1
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm5,xmm4
    punpcklqdq  xmm1,xmm2
    punpckhqdq  xmm6,xmm2
    mov         esi,dword [esp+14h]
    mov         ecx,dword [ebp+10h]
    mov         edx,dword [esp+0Ch]
    mov         edi,dword [esp+8]
    movd        dword [esi],xmm0
    movd        dword [esi+ecx],xmm5
    movd        dword [esi+ecx*2],xmm1
    movd        dword [esi+edx],xmm6
    psrldq      xmm0,4
    psrldq      xmm5,4
    psrldq      xmm1,4
    psrldq      xmm6,4
    mov         esi,dword [esp+18h]
    movd        dword [edi],xmm0
    movd        dword [edi+ecx],xmm5
    movd        dword [edi+ecx*2],xmm1
    movd        dword [edi+edx],xmm6
    psrldq      xmm0,4
    psrldq      xmm5,4
    psrldq      xmm1,4
    psrldq      xmm6,4
    movd        dword [esi],xmm0
    movd        dword [esi+ecx],xmm5
    movd        dword [esi+ecx*2],xmm1
    movd        dword [esi+edx],xmm6
    psrldq      xmm0,4
    psrldq      xmm5,4
    psrldq      xmm1,4
    psrldq      xmm6,4
    mov         edi,dword [esp+10h]
    movd        dword [edi],xmm0
    movd        dword [edi+ecx],xmm5
    movd        dword [edi+ecx*2],xmm1
    movd        dword [edi+edx],xmm6
    pop         edi
    pop         esi
    mov         esp,ebp
    pop         ebp
    ret


%endif



;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2
    push     r3
    push     r4
    push     r5

%assign   push_num   3
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    mov      r5,    r7
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3
    sub      r7,    10h

    lea      r3,    [r0 + r1 * 8]
    lea      r4,    [r1 * 3]

    movq    xmm0,  [r0]
    movq    xmm7,  [r3]
    punpcklqdq   xmm0,  xmm7
    movq    xmm1,  [r0 + r1]
    movq    xmm7,  [r3 + r1]
    punpcklqdq   xmm1,  xmm7
    movq    xmm2,  [r0 + r1*2]
    movq    xmm7,  [r3 + r1*2]
    punpcklqdq   xmm2,  xmm7
    movq    xmm3,  [r0 + r4]
    movq    xmm7,  [r3 + r4]
    punpcklqdq   xmm3,  xmm7

    lea     r0,   [r0 + r1 * 4]
    lea     r3,   [r3 + r1 * 4]
    movq    xmm4,  [r0]
    movq    xmm7,  [r3]
    punpcklqdq   xmm4,  xmm7
    movq    xmm5,  [r0 + r1]
    movq    xmm7,  [r3 + r1]
    punpcklqdq   xmm5,  xmm7
    movq    xmm6,  [r0 + r1*2]
    movq    xmm7,  [r3 + r1*2]
    punpcklqdq   xmm6,  xmm7

    movdqa  [r7],   xmm0
    movq    xmm7,  [r0 + r4]
    movq    xmm0,  [r3 + r4]
    punpcklqdq   xmm7,  xmm0
    movdqa  xmm0,   [r7]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    movdqa  [r2],    xmm4
    movdqa  [r2 + 10h],  xmm2
    movdqa  [r2 + 20h],  xmm3
    movdqa  [r2 + 30h],  xmm7
    movdqa  [r2 + 40h],  xmm5
    movdqa  [r2 + 50h],  xmm1
    movdqa  [r2 + 60h],  xmm6
    movdqa  [r2 + 70h],  xmm0

    mov     r7,   r5
    POP_XMM
    pop     r5
    pop     r4
    pop     r3
    ret


;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2
    push     r3
    push     r4

%assign  push_num 2
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    mov      r4,    r7
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3
    sub      r7,    10h

    movdqa   xmm0,   [r2]
    movdqa   xmm1,   [r2 + 10h]
    movdqa   xmm2,   [r2 + 20h]
    movdqa   xmm3,   [r2 + 30h]
    movdqa   xmm4,   [r2 + 40h]
    movdqa   xmm5,   [r2 + 50h]
    movdqa   xmm6,   [r2 + 60h]
    movdqa   xmm7,   [r2 + 70h]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    lea      r2,   [r1 * 3]

    movq     [r0],  xmm4
    movq     [r0 + r1],  xmm2
    movq     [r0 + r1*2],  xmm3
    movq     [r0 + r2],  xmm7

    lea      r0,   [r0 + r1*4]
    movq     [r0],  xmm5
    movq     [r0 + r1],  xmm1
    movq     [r0 + r1*2],  xmm6
    movq     [r0 + r2],  xmm0

    psrldq    xmm4,   8
    psrldq    xmm2,   8
    psrldq    xmm3,   8
    psrldq    xmm7,   8
    psrldq    xmm5,   8
    psrldq    xmm1,   8
    psrldq    xmm6,   8
    psrldq    xmm0,   8

    lea       r0,  [r0 + r1*4]
    movq     [r0],  xmm4
    movq     [r0 + r1],  xmm2
    movq     [r0 + r1*2],  xmm3
    movq     [r0 + r2],  xmm7

    lea      r0,   [r0 + r1*4]
    movq     [r0],  xmm5
    movq     [r0 + r1],  xmm1
    movq     [r0 + r1*2],  xmm6
    movq     [r0 + r2],  xmm0


    mov      r7,   r4
    POP_XMM
    pop      r4
    pop      r3
    ret

WELS_EXTERN WelsNonZeroCount_sse2
    %assign  push_num 0
    LOAD_1_PARA
    movdqu  xmm0, [r0]
    movq    xmm1, [r0+16]
    WELS_DB1 xmm2
    pminub  xmm0, xmm2
    pminub  xmm1, xmm2
    movdqu  [r0], xmm0
    movq    [r0+16], xmm1
    ret