ref: a30f8fe33d92e9c029659a6f702eb8c4e9f9feb8
dir: /codec/common/x86/expand_picture.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  expand_picture.asm
;*
;*  Abstract
;*      mmxext/sse for expand_frame
;*
;*  History
;*      09/25/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;;;;;;;expanding result;;;;;;;

;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;----------------------------
;aaaa|attttttttttttttttb|bbbb
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;cccc|ceeeeeeeeeeeeeeeed|dddd
;----------------------------
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd

;-----------------------------------------------------------------------
; Row-store primitives.
;
; Every macro below stores register %3 to four consecutive rows, starting
; at [%1] and stepping by the stride register %2 (which may hold a
; negative value).  The plain variants leave %1 four rows further on,
; i.e. on the first row of the next group.  The "end" variants advance
; %1 by three rows only, so it is left on the last row actually written.
;-----------------------------------------------------------------------

%macro mov_line_8x4_mmx 3               ; dst, stride, mm?
    movq    [%1], %3
    movq    [%1+%2], %3
    lea     %1, [%1+2*%2]
    movq    [%1], %3
    movq    [%1+%2], %3
    lea     %1, [%1+2*%2]
%endmacro

%macro mov_line_end8x4_mmx 3            ; dst, stride, mm?
    movq    [%1], %3
    movq    [%1+%2], %3
    lea     %1, [%1+2*%2]
    movq    [%1], %3
    movq    [%1+%2], %3
    lea     %1, [%1+%2]
%endmacro

%macro mov_line_16x4_sse2 4             ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3                    ; top(bottom)_0
    movdq%4 [%1+%2], %3                 ; top(bottom)_1
    lea     %1, [%1+2*%2]
    movdq%4 [%1], %3                    ; top(bottom)_2
    movdq%4 [%1+%2], %3                 ; top(bottom)_3
    lea     %1, [%1+2*%2]
%endmacro

%macro mov_line_end16x4_sse2 4          ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3                    ; top(bottom)_0
    movdq%4 [%1+%2], %3                 ; top(bottom)_1
    lea     %1, [%1+2*%2]
    movdq%4 [%1], %3                    ; top(bottom)_2
    movdq%4 [%1+%2], %3                 ; top(bottom)_3
    lea     %1, [%1+%2]
%endmacro

%macro mov_line_32x4_sse2 3             ; dst, stride, xmm?
    movdqa  [%1], %3                    ; top(bottom)_0
    movdqa  [%1+16], %3                 ; top(bottom)_0
    movdqa  [%1+%2], %3                 ; top(bottom)_1
    movdqa  [%1+%2+16], %3              ; top(bottom)_1
    lea     %1, [%1+2*%2]
    movdqa  [%1], %3                    ; top(bottom)_2
    movdqa  [%1+16], %3                 ; top(bottom)_2
    movdqa  [%1+%2], %3                 ; top(bottom)_3
    movdqa  [%1+%2+16], %3              ; top(bottom)_3
    lea     %1, [%1+2*%2]
%endmacro

%macro mov_line_end32x4_sse2 3          ; dst, stride, xmm?
    movdqa  [%1], %3                    ; top(bottom)_0
    movdqa  [%1+16], %3                 ; top(bottom)_0
    movdqa  [%1+%2], %3                 ; top(bottom)_1
    movdqa  [%1+%2+16], %3              ; top(bottom)_1
    lea     %1, [%1+2*%2]
    movdqa  [%1], %3                    ; top(bottom)_2
    movdqa  [%1+16], %3                 ; top(bottom)_2
    movdqa  [%1+%2], %3                 ; top(bottom)_3
    movdqa  [%1+%2+16], %3              ; top(bottom)_3
    lea     %1, [%1+%2]
%endmacro

;-----------------------------------------------------------------------
; Multi-row fills built from the primitives above.
;
; The last parameter is the number of rows to write (a multiple of 4).
; All groups but the last use the plain primitive and the last one uses
; the "end" primitive, so on exit the dst register points at the final
; row written.
;-----------------------------------------------------------------------

%macro fill_lines_8_mmx 4               ; dst, stride, mm?, row count
%rep (%4 / 4) - 1
    mov_line_8x4_mmx %1, %2, %3
%endrep
    mov_line_end8x4_mmx %1, %2, %3
%endmacro

%macro fill_lines_16_sse2 5             ; dst, stride, xmm?, u/a, row count
%rep (%5 / 4) - 1
    mov_line_16x4_sse2 %1, %2, %3, %4
%endrep
    mov_line_end16x4_sse2 %1, %2, %3, %4
%endmacro

%macro fill_lines_32_sse2 4             ; dst, stride, xmm?, row count
%rep (%4 / 4) - 1
    mov_line_32x4_sse2 %1, %2, %3
%endrep
    mov_line_end32x4_sse2 %1, %2, %3
%endmacro

;-----------------------------------------------------------------------
; exp_top_bottom_col16_sse2 iPaddingSize
;
; Replicates one 16-byte column of the first picture row into the top
; border and of the last picture row into the bottom border, then moves
; all four pointers 16 bytes to the right.
;
; In:    r0 = src, first picture row        r5 = dst, top border
;        r3 = src, last picture row         r4 = dst, bottom border
;        r1 = row step used for both dst pointers
; Out:   r0, r3, r4, r5 advanced by 16 bytes; r1 negated
; Clobb: xmm0, xmm1, flags
;
; The fill leaves r5/r4 on the last row they wrote, so the next column
; starts on that row and must walk the rows in the opposite direction.
; That is why r1 changes sign after every column.
;-----------------------------------------------------------------------
%macro exp_top_bottom_col16_sse2 1      ; iPaddingSize
    ; top
    movdqa  xmm0, [r0]                  ; first line of picture data
    fill_lines_16_sse2 r5, r1, xmm0, a, %1
    ; bottom
    movdqa  xmm1, [r3]                  ; last line of picture data
    fill_lines_16_sse2 r4, r1, xmm1, a, %1

    lea     r0, [r0+16]                 ; top src
    lea     r5, [r5+16]                 ; top dst
    lea     r3, [r3+16]                 ; bottom src
    lea     r4, [r4+16]                 ; bottom dst
    neg     r1                          ; reverse row direction for the next column
%endmacro

;-----------------------------------------------------------------------
; exp_top_bottom_sse2 iPaddingSize      [luma(32) / chroma(16)]
;
; In:    r0 = pSrc                          (first picture row)
;        r1 = -stride
;        r2 = width in pixels
;        r3 = pSrc + (h-1)*stride           (last picture row)
;        r4 = r3 + iPaddingSize*stride      (bottom-most border row)
;        r5 = pSrc - stride                 (border row just above the picture)
; Clobb: r0-r5, xmm0, xmm1, flags; the chroma variant also r6, mm0, mm1
;
; Luma (32):   processes width/16 columns of 16 bytes.  The loop tests its
;              counter only at the bottom, so the width must be at least 16.
; Chroma (16): processes width/16 columns of 16 bytes, then, if
;              (width & 15) != 0, one further 8-byte column with MMX
;              stores.  When the width is below 16 the 16-byte loop is
;              skipped, so its counter never starts at zero.
;-----------------------------------------------------------------------
%macro exp_top_bottom_sse2 1            ; iPaddingSize [luma(32)/chroma(16)]
%if %1 == 32                            ; luma
    sar     r2, 04h                     ; r2 = number of 16-pixel columns
.top_bottom_loops:
    exp_top_bottom_col16_sse2 32
    dec     r2
    jnz near .top_bottom_loops
%elif %1 == 16                          ; chroma
    mov     r6, r2                      ; keep the full width for the tail test
    sar     r2, 04h                     ; r2 = number of 16-pixel columns
    test    r2, r2
    jz near .top_bottom_tail            ; width < 16: only the 8-byte tail remains
.top_bottom_loops:
    exp_top_bottom_col16_sse2 16
    dec     r2
    jnz near .top_bottom_loops

.top_bottom_tail:
    ; remaining 8 bytes
    and     r6, 0fh                     ; any pixels beyond the last full column?
    test    r6, r6
    jz near .to_be_continued            ; nothing left
    ; top
    movq    mm0, [r0]                   ; remaining 8 bytes of the first row
    fill_lines_8_mmx r5, r1, mm0, 16
    ; bottom
    movq    mm1, [r3]                   ; remaining 8 bytes of the last row
    fill_lines_8_mmx r4, r1, mm1, 16
    WELSEMMS                            ; MMX registers were used above
.to_be_continued:
%endif
%endmacro

;-----------------------------------------------------------------------
; exp_left_right_sse2 iPaddingSize, u/a [luma(32) / chroma(16)]
;
; For every picture row, replicates the left-most pixel into the left
; border and the right-most pixel into the right border.
;
; In:    r0 = pSrc                          (left src, first row)
;        r1 = stride
;        r3 = pSrc + (w-1)                  (right src, first row)
;        r4 = pSrc + w                      (right dst, first row)
;        r5 = pSrc - iPaddingSize           (left dst, first row)
;        r6 = height (must be >= 1; the counter is tested after the body)
; Clobb: r0, r2-r6, xmm0, xmm1, flags (plus whatever SSE2_Copy16Times uses)
;
; %2 selects movdqu/movdqa for the chroma right-border store only; all
; other stores are movdqa.  SSE2_Copy16Times comes from asm_inc.asm and
; is not visible here; it is presumed to broadcast the byte held in the
; general register across the xmm register.
;-----------------------------------------------------------------------
%macro exp_left_right_sse2 2            ; iPaddingSize [luma(32)/chroma(16)], u/a
%if %1 == 32                            ; luma
.left_right_loops:
    ; left
    movzx   r2d, byte [r0]              ; pixel for the left border
    SSE2_Copy16Times xmm0, r2d
    movdqa  [r5], xmm0
    movdqa  [r5+16], xmm0

    ; right
    movzx   r2d, byte [r3]              ; pixel for the right border
    SSE2_Copy16Times xmm1, r2d
    movdqa  [r4], xmm1
    movdqa  [r4+16], xmm1

    lea     r0, [r0+r1]                 ; left src
    lea     r5, [r5+r1]                 ; left dst
    lea     r3, [r3+r1]                 ; right src
    lea     r4, [r4+r1]                 ; right dst

    dec     r6
    jnz near .left_right_loops
%elif %1 == 16                          ; chroma
.left_right_loops:
    ; left
    movzx   r2d, byte [r0]              ; pixel for the left border
    SSE2_Copy16Times xmm0, r2d
    movdqa  [r5], xmm0

    ; right
    movzx   r2d, byte [r3]              ; pixel for the right border
    SSE2_Copy16Times xmm1, r2d
    movdq%2 [r4], xmm1                  ; may not be 16-byte aligned for chroma planes

    lea     r0, [r0+r1]                 ; left src
    lea     r5, [r5+r1]                 ; left dst
    lea     r3, [r3+r1]                 ; right src
    lea     r4, [r4+r1]                 ; right dst

    dec     r6
    jnz near .left_right_loops
%endif
%endmacro

;-----------------------------------------------------------------------
; exp_cross_sse2 iPaddingSize, u/a      [luma(32) / chroma(16)]
;
; Fills the four iPaddingSize x iPaddingSize corner blocks.
;
; In:    r1 = -stride
;        r3 = top-left dst,     fill value in xmm3
;        r4 = top-right dst,    fill value in xmm4
;        r5 = bottom-left dst,  fill value in xmm5
;        r6 = bottom-right dst, fill value in xmm6
;        Each dst is the bottom row of its corner; rows are written upwards.
; Clobb: r3-r6
;
; %2 selects movdqu/movdqa for the two right-hand chroma corners; the
; left-hand corners and all luma corners use movdqa.
;-----------------------------------------------------------------------
%macro exp_cross_sse2 2                 ; iPaddingSize [luma(32)/chroma(16)], u/a
%if %1 == 32                            ; luma
    fill_lines_32_sse2 r3, r1, xmm3, 32         ; TL
    fill_lines_32_sse2 r4, r1, xmm4, 32         ; TR
    fill_lines_32_sse2 r5, r1, xmm5, 32         ; BL
    fill_lines_32_sse2 r6, r1, xmm6, 32         ; BR
%elif %1 == 16                          ; chroma
    fill_lines_16_sse2 r3, r1, xmm3, a, 16      ; TL
    fill_lines_16_sse2 r4, r1, xmm4, %2, 16     ; TR
    fill_lines_16_sse2 r5, r1, xmm5, a, 16      ; BL
    fill_lines_16_sse2 r6, r1, xmm6, %2, 16     ; BR
%endif
%endmacro

;-----------------------------------------------------------------------
; exp_picture_body_sse2 iPaddingSize, log2(iPaddingSize), u/a
;
; Shared body of the three exported expand functions; expand it directly
; after the WELS_EXTERN line that defines the function label.
;
; Arguments are fetched with LOAD_4_PARA (asm_inc.asm, not visible here).
; The code below treats r0 as the picture pointer, r1 as the stride,
; r2 as the width and r3 as the height, in the order of the C prototypes.
;
; Stack use inside the body is balanced: the height is pushed once and
; popped into r6 before the left/right pass; r0-r2 (and r6) are saved
; around the top/bottom and left/right passes because those clobber them.
;-----------------------------------------------------------------------
%macro exp_picture_body_sse2 3          ; iPaddingSize, log2(iPaddingSize), u/a
    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 7

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d

    ; corner fill value, top-left: xmm3
    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d          ; xmm3: pSrc[0]

    neg     r1
    lea     r5, [r0+r1]                 ; r5 = top dst, pSrc[-stride]
    neg     r1

    push    r3                          ; save height

    dec     r3                          ; h-1
    imul    r3, r1                      ; (h-1)*stride
    lea     r3, [r0+r3]                 ; r3 = bottom src, pSrc[(h-1)*stride]

    mov     r6, r1                      ; r6 = stride
    sal     r6, %2                      ; r6 = iPaddingSize*stride
    lea     r4, [r3+r6]                 ; r4 = bottom dst (bottom-most border row)

    ; corner fill values, bottom-left: xmm5, bottom-right: xmm6
    movzx   r6d, byte [r3]              ; bottom-left pixel
    SSE2_Copy16Times xmm5, r6d

    lea     r6, [r3+r2-1]
    movzx   r6d, byte [r6]              ; bottom-right pixel
    SSE2_Copy16Times xmm6, r6d

    neg     r1                          ; r1 = -stride

    push    r0
    push    r1
    push    r2

    exp_top_bottom_sse2 %1

    ; left and right borders
    pop     r2
    pop     r1
    pop     r0

    lea     r5, [r0-%1]                 ; left border dst = pSrc - iPaddingSize
    lea     r3, [r0+r2-1]               ; right border src
    lea     r4, [r3+1]                  ; right border dst

    ; corner fill value, top-right: xmm4
    movzx   r6d, byte [r3]              ; top-right pixel
    SSE2_Copy16Times xmm4, r6d

    neg     r1                          ; r1 = stride

    pop     r6                          ; r6 = height

    push    r0
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 %1, %3

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; corners [top-left, top-right, bottom-left, bottom-right];
    ; xmm3..xmm6 were initialised above
    neg     r1                          ; r1 = -stride
    lea     r3, [r0-%1]
    lea     r3, [r3+r1]                 ; bottom row of the top-left corner

    lea     r4, [r0+r2]                 ; pSrc + width
    lea     r4, [r4+r1]                 ; pSrc + width - stride

    neg     r1                          ; r1 = stride
    add     r6, %1                      ; height + iPaddingSize
    imul    r6, r1

    lea     r5, [r3+r6]                 ; bottom row of the bottom-left corner
    lea     r6, [r4+r6]                 ; bottom row of the bottom-right corner

    neg     r1                          ; r1 = -stride

    exp_cross_sse2 %1, %3

    POP_XMM
    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0

    ret
%endmacro

;***********************************************************************----------------
; void ExpandPictureLuma_sse2(  uint8_t *pDst,
;                               const int32_t iStride,
;                               const int32_t iWidth,
;                               const int32_t iHeight   );
;
; 32-pixel border.  Every store is movdqa and there is no tail column,
; so iWidth is expected to be a non-zero multiple of 16.
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2
    exp_picture_body_sse2 32, 5, a

;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2(   uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight   );
;
; 16-pixel border; right-hand border and corners are written with movdqa.
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2
    exp_picture_body_sse2 16, 4, a

;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight   );
;
; 16-pixel border; right-hand border and corners are written with movdqu.
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2
    exp_picture_body_sse2 16, 4, u