ref: 1f53d38c8b3c5d1a92e4ef36a0c7124a270955e0
dir: /codec/common/expand_picture.asm/
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  expand_picture.asm
;*
;*  Abstract
;*      mmxext/sse for expand_frame
;*
;*  History
;*      09/25/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
;SECTION .rodata pData align=16
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************
;%define PADDING_SIZE_ASM 	32 	; PADDING_LENGTH
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
WELS_EXTERN ExpandPictureLuma_sse2
WELS_EXTERN ExpandPictureChromaAlign_sse2	; for chroma alignment
WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
;;;;;;;expanding result;;;;;;;
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;----------------------------
;aaaa|attttttttttttttttb|bbbb
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;cccc|ceeeeeeeeeeeeeeeed|dddd
;----------------------------
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
%macro mov_line_8x4_mmx		3	; dst, stride, mm?
	; Store the 8-byte pattern %3 to four consecutive rows starting at [%1],
	; stepping by %2 bytes per row. On exit %1 has advanced by 4*%2, i.e. it
	; points at the row following the last one written.
%rep 2
	movq	[%1], %3		; row 2k
	movq	[%1+%2], %3		; row 2k+1
	lea	%1, [%1+2*%2]		; advance two rows (lea keeps flags intact)
%endrep
%endmacro
%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
	; Terminal variant of the 8-byte four-row fill: writes the same four rows,
	; but advances %1 by only 3*%2 in total so that on exit it points AT the
	; last row written rather than one row past it.

	; rows 0 and 1
	movq	[%1], %3
	movq	[%1+%2], %3
	lea	%1, [%1+2*%2]

	; rows 2 and 3, then stop on row 3
	movq	[%1], %3
	movq	[%1+%2], %3
	lea	%1, [%1+%2]
%endmacro
%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
	; Store the 16-byte pattern %3 to four consecutive rows starting at [%1],
	; stepping by %2 bytes per row. %4 selects the store flavour: 'a' expands
	; to movdqa (aligned), 'u' to movdqu (unaligned).
	; On exit %1 has advanced by 4*%2.
%rep 2
	movdq%4	[%1], %3		; row 2k
	movdq%4	[%1+%2], %3		; row 2k+1
	lea	%1, [%1+2*%2]		; advance two rows
%endrep
%endmacro
%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
	; Terminal variant of the 16-byte four-row fill. Same four stores as
	; mov_line_16x4_sse2 (%4 = 'a' -> movdqa, 'u' -> movdqu), but %1 advances
	; by only 3*%2 overall and therefore ends on the last row written.

	; rows 0 and 1
	movdq%4	[%1], %3
	movdq%4	[%1+%2], %3
	lea	%1, [%1+2*%2]

	; rows 2 and 3, then stop on row 3
	movdq%4	[%1], %3
	movdq%4	[%1+%2], %3
	lea	%1, [%1+%2]
%endmacro
%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
	; Fill a 32-byte wide, four-row strip with the 16-byte pattern %3, using
	; aligned stores (two movdqa per row). Rows are %2 bytes apart.
	; On exit %1 has advanced by 4*%2.
%rep 2
	movdqa	[%1], %3		; row 2k,   bytes  0..15
	movdqa	[%1+16], %3		; row 2k,   bytes 16..31
	movdqa	[%1+%2], %3		; row 2k+1, bytes  0..15
	movdqa	[%1+%2+16], %3		; row 2k+1, bytes 16..31
	lea	%1, [%1+2*%2]		; advance two rows
%endrep
%endmacro
%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
	; Terminal variant of the 32-byte wide four-row fill (aligned stores).
	; Writes the same four rows as mov_line_32x4_sse2 but advances %1 by only
	; 3*%2 in total, leaving it on the last row written.

	; rows 0 and 1
	movdqa	[%1], %3
	movdqa	[%1+16], %3
	movdqa	[%1+%2], %3
	movdqa	[%1+%2+16], %3
	lea	%1, [%1+2*%2]

	; rows 2 and 3, then stop on row 3
	movdqa	[%1], %3
	movdqa	[%1+16], %3
	movdqa	[%1+%2], %3
	movdqa	[%1+%2+16], %3
	lea	%1, [%1+%2]
%endmacro
%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
	; Replicates the first picture row into the top border and the last
	; picture row into the bottom border, %1 rows each, one 16-byte column
	; strip per loop iteration.
	;
	; Registers on entry (as prepared by the callers in this file):
	;   r0 = first picture row (top source)
	;   r5 = top border row adjacent to the picture (top destination)
	;   r3 = last picture row (bottom source)
	;   r4 = outermost bottom border row (bottom destination)
	;   r1 = row step used for the first strip (callers pass -stride)
	;   r2 = picture width in bytes
	; Clobbers r0-r5, xmm0, xmm1; the chroma variant also clobbers r6, mm0, mm1.
	;
	; Each strip ends with its destination pointers resting on the last row
	; written (mov_line_end*), so after moving 16 bytes to the right the row
	; step is negated and the next strip is filled in the opposite direction.
	;
	; NOTE(review): the loop tests its counter only after the first pass, so a
	; width below 16 would wrap the counter - callers presumably guarantee
	; width >= 16; confirm.
%if %1 == 32		; luma: 32 border rows
	sar r2, 04h			; r2 = number of 16-byte strips
.top_bottom_loops:
	; top border: 7 * 4 + 4 = 32 rows
	movdqa xmm0, [r0]
%rep 7
	mov_line_16x4_sse2 r5, r1, xmm0, a
%endrep
	mov_line_end16x4_sse2 r5, r1, xmm0, a

	; bottom border: 32 rows
	movdqa xmm1, [r3]
%rep 7
	mov_line_16x4_sse2 r4, r1, xmm1, a
%endrep
	mov_line_end16x4_sse2 r4, r1, xmm1, a

	; step every pointer to the next 16-byte strip
	lea r0, [r0+16]			; top source
	lea r3, [r3+16]			; bottom source
	lea r5, [r5+16]			; top destination
	lea r4, [r4+16]			; bottom destination
	neg r1				; reverse fill direction for the next strip
	dec r2
	jnz near .top_bottom_loops
%elif %1 == 16		; chroma: 16 border rows
	mov r6, r2			; keep the full width for the tail check below
	sar r2, 04h			; r2 = number of 16-byte strips
.top_bottom_loops:
	; top border: 3 * 4 + 4 = 16 rows
	movdqa xmm0, [r0]
%rep 3
	mov_line_16x4_sse2 r5, r1, xmm0, a
%endrep
	mov_line_end16x4_sse2 r5, r1, xmm0, a

	; bottom border: 16 rows
	movdqa xmm1, [r3]
%rep 3
	mov_line_16x4_sse2 r4, r1, xmm1, a
%endrep
	mov_line_end16x4_sse2 r4, r1, xmm1, a

	; step every pointer to the next 16-byte strip
	lea r0, [r0+16]			; top source
	lea r3, [r3+16]			; bottom source
	lea r5, [r5+16]			; top destination
	lea r4, [r4+16]			; bottom destination
	neg r1				; reverse fill direction for the next strip
	dec r2
	jnz near .top_bottom_loops

	; tail: when the width is not a multiple of 16, one final 8-byte strip
	; is filled with MMX stores
	and r6, 0fh
	test r6, r6
	jz near .to_be_continued	; width was a multiple of 16: nothing left

	; top border tail
	movq mm0, [r0]
%rep 3
	mov_line_8x4_mmx r5, r1, mm0
%endrep
	mov_line_end8x4_mmx r5, r1, mm0

	; bottom border tail
	movq mm1, [r3]
%rep 3
	mov_line_8x4_mmx r4, r1, mm1
%endrep
	mov_line_end8x4_mmx r4, r1, mm1

	WELSEMMS			; issued after the MMX tail, before any later code runs
.to_be_continued:
%endif
%endmacro
%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
	; Fills the left and right borders one picture row per iteration by
	; broadcasting the row's first and last pixel across %1 bytes.
	;
	; Registers on entry (as prepared by the callers in this file):
	;   r0 = first pixel of the current row (left source)
	;   r5 = r0 - %1                        (left destination)
	;   r3 = last pixel of the current row  (right source)
	;   r4 = r3 + 1                         (right destination)
	;   r1 = stride, r6 = number of rows
	; Clobbers r0, r2-r6, xmm0, xmm1.
	; %2 ('a'/'u') only affects the chroma right-border store.
%if %1 == 32		; luma: 32 bytes per side
.left_right_loops:
	; left border
	movzx r2d, byte [r0]
	SSE2_Copy16Times	xmm0, r2d	; xmm0 = left edge pixel x16
	movdqa [r5+16], xmm0
	movdqa [r5], xmm0

	; right border
	movzx r2d, byte [r3]
	SSE2_Copy16Times	xmm1, r2d	; xmm1 = right edge pixel x16
	movdqa [r4+16], xmm1
	movdqa [r4], xmm1

	; next row
	lea r0, [r0+r1]			; left source
	lea r3, [r3+r1]			; right source
	lea r5, [r5+r1]			; left destination
	lea r4, [r4+r1]			; right destination
	dec r6
	jnz near .left_right_loops
%elif %1 == 16	; chroma: 16 bytes per side
.left_right_loops:
	; left border
	movzx r2d, byte [r0]
	SSE2_Copy16Times	xmm0, r2d	; xmm0 = left edge pixel x16
	movdqa [r5], xmm0

	; right border: the destination is r0 + width, which is only 16-byte
	; aligned when the chroma width is, hence the caller-selected store
	movzx r2d, byte [r3]
	SSE2_Copy16Times	xmm1, r2d	; xmm1 = right edge pixel x16
	movdq%2 [r4], xmm1

	; next row
	lea r0, [r0+r1]			; left source
	lea r3, [r3+r1]			; right source
	lea r5, [r5+r1]			; left destination
	lea r4, [r4+r1]			; right destination
	dec r6
	jnz near .left_right_loops
%endif
%endmacro
%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
	; Fills the four corner blocks of the border (%1 x %1 bytes each) with a
	; constant pattern per corner.
	;
	; Registers on entry:
	;   r3 = top-left start row      pattern in xmm3
	;   r4 = top-right start row     pattern in xmm4
	;   r5 = bottom-left start row   pattern in xmm5
	;   r6 = bottom-right start row  pattern in xmm6
	;   r1 = row step (callers in this file pass -stride)
	; Each of r3-r6 ends on the last row written for its corner.
	; %2 ('a'/'u') is used only by the chroma right-hand corners; the luma
	; path always stores aligned.
%if %1 == 32		; luma: 32 bytes wide, 7 * 4 + 4 = 32 rows per corner
	; top-left
%rep 7
	mov_line_32x4_sse2	r3, r1, xmm3
%endrep
	mov_line_end32x4_sse2	r3, r1, xmm3

	; top-right
%rep 7
	mov_line_32x4_sse2	r4, r1, xmm4
%endrep
	mov_line_end32x4_sse2	r4, r1, xmm4

	; bottom-left
%rep 7
	mov_line_32x4_sse2	r5, r1, xmm5
%endrep
	mov_line_end32x4_sse2	r5, r1, xmm5

	; bottom-right
%rep 7
	mov_line_32x4_sse2	r6, r1, xmm6
%endrep
	mov_line_end32x4_sse2	r6, r1, xmm6
%elif %1 == 16	; chroma: 16 bytes wide, 3 * 4 + 4 = 16 rows per corner
	; top-left (always aligned stores)
%rep 3
	mov_line_16x4_sse2	r3, r1, xmm3, a
%endrep
	mov_line_end16x4_sse2	r3, r1, xmm3, a

	; top-right (store flavour chosen by the caller)
%rep 3
	mov_line_16x4_sse2	r4, r1, xmm4, %2
%endrep
	mov_line_end16x4_sse2	r4, r1, xmm4, %2

	; bottom-left (always aligned stores)
%rep 3
	mov_line_16x4_sse2	r5, r1, xmm5, a
%endrep
	mov_line_end16x4_sse2	r5, r1, xmm5, a

	; bottom-right (store flavour chosen by the caller)
%rep 3
	mov_line_16x4_sse2	r6, r1, xmm6, %2
%endrep
	mov_line_end16x4_sse2	r6, r1, xmm6, %2
%endif
%endmacro
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureLuma_sse2(	uint8_t *pDst,
;									const int32_t iStride,
;									const int32_t iWidth,
;									const int32_t iHeight	);
;
; Surrounds the iWidth x iHeight luma picture at pDst with a 32-pixel border
; made of replicated edge pixels. Three passes: top/bottom rows, left/right
; columns, then the four corner blocks.
;
; Every vector access in the luma path is movdqa, so pDst, iStride and iWidth
; are expected to keep all touched addresses 16-byte aligned.
;
; Vector registers: xmm0/xmm1 are scratch; xmm3..xmm6 hold the four corner
; patterns. xmm6 is callee-saved under the Microsoft x64 calling convention,
; so the caller's value is parked in xmm2 on entry and restored before ret.
; NOTE(review): this relies on SSE2_Copy16Times (asm_inc.asm) not using xmm2
; as scratch - confirm.
;***********************************************************************----------------
ExpandPictureLuma_sse2:
    push r4
    push r5
    push r6
    %assign push_num 3
    LOAD_4_PARA
    SIGN_EXTENTION r1, r1d
    SIGN_EXTENTION r2, r2d
    SIGN_EXTENTION r3, r3d
    ; r0 = pDst, r1 = stride, r2 = width, r3 = height

    movdqa xmm2, xmm6           ; preserve caller's xmm6 (non-volatile on Win64)

    ; corner pattern, top-left: xmm3 = pDst[0] x16
    movzx r6d,byte[r0]
    SSE2_Copy16Times xmm3,r6d
    neg r1
    lea r5,[r0+r1]              ; r5 = top dst: border row just above the picture
    neg r1
    push r3                     ; keep height for the left/right pass
    dec r3                      ; h-1
    imul r3,r1                  ; (h-1)*stride
    lea  r3,[r0+r3]             ; r3 = bottom src: last picture row
    mov r6,r1
    sal r6,05h                  ; r6 = 32*stride
    lea r4,[r3+r6]              ; r4 = bottom dst: outermost bottom border row
    ; corner patterns, bottom-left: xmm5, bottom-right: xmm6
    movzx r6d,byte [r3]
    SSE2_Copy16Times xmm5,r6d
    lea r6,[r3+r2-1]
    movzx r6d,byte [r6]
    SSE2_Copy16Times xmm6,r6d
    neg r1                      ; r1 = -stride: both borders are first filled upwards
    push r0
    push r1
    push r2
    exp_top_bottom_sse2 32

    ; ---- left and right borders ----
    pop r2
    pop r1
    pop r0
    lea r5,[r0-32]              ; left border dst (32 bytes left of the picture)
    lea r3,[r0+r2-1]            ; right border src: last pixel of the first row
    lea r4,[r3+1]               ; right border dst
    ; corner pattern, top-right: xmm4
    movzx r6d,byte [r3]
    SSE2_Copy16Times xmm4,r6d
    neg r1                      ; r1 = stride
    pop r6                      ; r6 = height (row counter)
    push r0
    push r1
    push r2
    push r6
    exp_left_right_sse2  32,a
    pop r6
    pop r2
    pop r1
    pop r0

    ; ---- four corners (patterns already in xmm3..xmm6) ----
    neg r1                      ; r1 = -stride
    lea r3,[r0-32]
    lea r3,[r3+r1]              ; r3 = top-left corner, row adjacent to the picture
    lea r4,[r0+r2]
    lea r4,[r4+r1]              ; r4 = top-right corner, row adjacent to the picture
    neg r1                      ; r1 = stride
    add r6,32                   ; height + 32 border rows
    imul r6,r1
    lea r5,[r3+r6]              ; r5 = bottom-left corner, outermost row
    lea r6,[r4+r6]              ; r6 = bottom-right corner, outermost row
    neg r1                      ; r1 = -stride: every corner is filled upwards
    exp_cross_sse2 32,a

    movdqa xmm6, xmm2           ; restore caller's xmm6
    LOAD_4_PARA_POP
    pop r6
    pop r5
    pop r4
    %assign push_num 0
    ret
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
;										const int32_t iStride,
;										const int32_t iWidth,
;										const int32_t iHeight	);
;
; Surrounds the iWidth x iHeight chroma picture at pDst with a 16-pixel border
; made of replicated edge pixels. Three passes: top/bottom rows, left/right
; columns, then the four corner blocks.
;
; This variant uses aligned stores (movdqa) for the right-hand border and
; corners as well, so pDst + iWidth must be 16-byte aligned.
;
; Vector registers: xmm0/xmm1 (and mm0/mm1 for an 8-byte tail) are scratch;
; xmm3..xmm6 hold the four corner patterns. xmm6 is callee-saved under the
; Microsoft x64 calling convention, so the caller's value is parked in xmm2
; on entry and restored before ret.
; NOTE(review): this relies on SSE2_Copy16Times (asm_inc.asm) not using xmm2
; as scratch - confirm.
;***********************************************************************----------------
ExpandPictureChromaAlign_sse2:
    push r4
    push r5
    push r6
    %assign push_num 3
    LOAD_4_PARA
    SIGN_EXTENTION r1,r1d
    SIGN_EXTENTION r2,r2d
    SIGN_EXTENTION r3,r3d
    ; r0 = pDst, r1 = stride, r2 = width, r3 = height

    movdqa xmm2, xmm6           ; preserve caller's xmm6 (non-volatile on Win64)

    ; corner pattern, top-left: xmm3 = pDst[0] x16
    movzx r6d,byte [r0]
    SSE2_Copy16Times xmm3,r6d
    neg r1
    lea r5,[r0+r1]              ; r5 = top dst: border row just above the picture
    neg r1
    push r3                     ; keep height for the left/right pass
    dec r3                      ; h-1
    imul r3,r1                  ; (h-1)*stride
    lea  r3,[r0+r3]             ; r3 = bottom src: last picture row
    mov r6,r1
    sal r6,04h                  ; r6 = 16*stride
    lea r4,[r3+r6]              ; r4 = bottom dst: outermost bottom border row
    ; corner patterns, bottom-left: xmm5, bottom-right: xmm6
    movzx r6d,byte [r3]
    SSE2_Copy16Times xmm5,r6d
    lea r6,[r3+r2-1]
    movzx r6d,byte [r6]
    SSE2_Copy16Times xmm6,r6d
    neg r1                      ; r1 = -stride: both borders are first filled upwards
    push r0
    push r1
    push r2
    exp_top_bottom_sse2 16

    ; ---- left and right borders ----
    pop r2
    pop r1
    pop r0
    lea r5,[r0-16]              ; left border dst (16 bytes left of the picture)
    lea r3,[r0+r2-1]            ; right border src: last pixel of the first row
    lea r4,[r3+1]               ; right border dst
    ; corner pattern, top-right: xmm4
    movzx r6d,byte [r3]
    SSE2_Copy16Times xmm4,r6d
    neg r1                      ; r1 = stride
    pop r6                      ; r6 = height (row counter)
    push r0
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,a
    pop r6
    pop r2
    pop r1
    pop r0

    ; ---- four corners (patterns already in xmm3..xmm6) ----
    neg r1                      ; r1 = -stride
    lea r3,[r0-16]
    lea r3,[r3+r1]              ; r3 = top-left corner, row adjacent to the picture
    lea r4,[r0+r2]
    lea r4,[r4+r1]              ; r4 = top-right corner, row adjacent to the picture
    neg r1                      ; r1 = stride
    add r6,16                   ; height + 16 border rows
    imul r6,r1
    lea r5,[r3+r6]              ; r5 = bottom-left corner, outermost row
    lea r6,[r4+r6]              ; r6 = bottom-right corner, outermost row
    neg r1                      ; r1 = -stride: every corner is filled upwards
    exp_cross_sse2 16,a

    movdqa xmm6, xmm2           ; restore caller's xmm6
    LOAD_4_PARA_POP
    pop r6
    pop r5
    pop r4
    %assign push_num 0
    ret
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
;										const int32_t iStride,
;										const int32_t iWidth,
;										const int32_t iHeight	);
;
; Surrounds the iWidth x iHeight chroma picture at pDst with a 16-pixel border
; made of replicated edge pixels. Three passes: top/bottom rows, left/right
; columns, then the four corner blocks.
;
; This variant uses unaligned stores (movdqu) for the right-hand border and
; the right-hand corners, so pDst + iWidth need not be 16-byte aligned. All
; other vector accesses are still movdqa.
;
; Vector registers: xmm0/xmm1 (and mm0/mm1 for an 8-byte tail) are scratch;
; xmm3..xmm6 hold the four corner patterns. xmm6 is callee-saved under the
; Microsoft x64 calling convention, so the caller's value is parked in xmm2
; on entry and restored before ret.
; NOTE(review): this relies on SSE2_Copy16Times (asm_inc.asm) not using xmm2
; as scratch - confirm.
;***********************************************************************----------------
ExpandPictureChromaUnalign_sse2:
    push r4
    push r5
    push r6
    %assign push_num 3
    LOAD_4_PARA
    SIGN_EXTENTION r1,r1d
    SIGN_EXTENTION r2,r2d
    SIGN_EXTENTION r3,r3d
    ; r0 = pDst, r1 = stride, r2 = width, r3 = height

    movdqa xmm2, xmm6           ; preserve caller's xmm6 (non-volatile on Win64)

    ; corner pattern, top-left: xmm3 = pDst[0] x16
    movzx r6d,byte [r0]
    SSE2_Copy16Times xmm3,r6d
    neg r1
    lea r5,[r0+r1]              ; r5 = top dst: border row just above the picture
    neg r1
    push r3                     ; keep height for the left/right pass
    dec r3                      ; h-1
    imul r3,r1                  ; (h-1)*stride
    lea  r3,[r0+r3]             ; r3 = bottom src: last picture row
    mov r6,r1
    sal r6,04h                  ; r6 = 16*stride
    lea r4,[r3+r6]              ; r4 = bottom dst: outermost bottom border row
    ; corner patterns, bottom-left: xmm5, bottom-right: xmm6
    movzx r6d,byte [r3]
    SSE2_Copy16Times xmm5,r6d
    lea r6,[r3+r2-1]
    movzx r6d,byte [r6]
    SSE2_Copy16Times xmm6,r6d
    neg r1                      ; r1 = -stride: both borders are first filled upwards
    push r0
    push r1
    push r2
    exp_top_bottom_sse2 16

    ; ---- left and right borders ----
    pop r2
    pop r1
    pop r0
    lea r5,[r0-16]              ; left border dst (16 bytes left of the picture)
    lea r3,[r0+r2-1]            ; right border src: last pixel of the first row
    lea r4,[r3+1]               ; right border dst
    ; corner pattern, top-right: xmm4
    movzx r6d,byte [r3]
    SSE2_Copy16Times xmm4,r6d
    neg r1                      ; r1 = stride
    pop r6                      ; r6 = height (row counter)
    push r0
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,u
    pop r6
    pop r2
    pop r1
    pop r0

    ; ---- four corners (patterns already in xmm3..xmm6) ----
    neg r1                      ; r1 = -stride
    lea r3,[r0-16]
    lea r3,[r3+r1]              ; r3 = top-left corner, row adjacent to the picture
    lea r4,[r0+r2]
    lea r4,[r4+r1]              ; r4 = top-right corner, row adjacent to the picture
    neg r1                      ; r1 = stride
    add r6,16                   ; height + 16 border rows
    imul r6,r1
    lea r5,[r3+r6]              ; r5 = bottom-left corner, outermost row
    lea r6,[r4+r6]              ; r6 = bottom-right corner, outermost row
    neg r1                      ; r1 = -stride: every corner is filled upwards
    exp_cross_sse2 16,u

    movdqa xmm6, xmm2           ; restore caller's xmm6
    LOAD_4_PARA_POP
    pop r6
    pop r5
    pop r4
    %assign push_num 0
    ret