shithub: openh264

ref: 283b5e8f5628f120b735199725ee9f9e7573d145
dir: /codec/processing/src/x86/downsample_bilinear.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  downsample_bilinear.asm
;*
;*  Abstract
;*      SIMD for pixel domain down sampling
;*
;*  History
;*      10/22/2009  Created
;*
;*************************************************************************/
%include "asm_inc.asm"

%ifdef __NASM_VER__
    %use smartalign
%endif

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************


;***********************************************************************
; Some constants
;***********************************************************************

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=32

;***********************************************************************
; Read-only byte/dword tables.
; The shufb_* names suggest they are pshufb control masks (for pshufb an
; index byte with bit 7 set, e.g. 80h, produces a zero output byte); the
; code that consumes them is not in this part of the file, so the notes
; below describe only the byte patterns themselves.
;***********************************************************************

ALIGN 32
; 32 bytes, each 80h.
db80h_256:
    times 32 db 80h
; 8 bytes of index 0 followed by 8 bytes of index 8.
shufb_0000000088888888:
    times 8 db 0
    times 8 db 8
; 4 bytes each of index 0, 4, 8 and 12.
shufb_000044448888CCCC:
    times 4 db 0
    times 4 db 4
    times 4 db 8
    times 4 db 12
; Even indices 0,2,..,14, each followed by 80h.
shufb_mask_low:
    db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
; Odd indices 1,3,..,15, each followed by 80h.
shufb_mask_high:
    db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
; Dword 16384 followed by three zero dwords.
add_extra_half:
    dd 16384,0,0,0

; Indices 0,4,8,12 in bytes 0-3 and 1,5,9,13 in bytes 8-11; the rest 80h.
shufb_mask_quarter:
db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h

; Three masks whose selected indices occupy disjoint output positions
; (0-5, 6-10, 11-15). If applied to three consecutive 16-byte vectors they
; would pick offsets 0,3,6,..,45 of the 48-byte run -- TODO confirm against
; the consumer.
shufb_mask_onethird_low_1:
db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_low_2:
db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_low_3:
db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh

; Same layout idea with output positions 0-4, 5-10, 11-15; over three
; consecutive 16-byte vectors the indices would be offsets 1,4,7,..,46
; -- TODO confirm against the consumer.
shufb_mask_onethird_high_1:
db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_high_2:
db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_high_3:
db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   2:1 downsample in both directions using MMX registers (pshufw/pavgb).
;   With A,a two horizontally adjacent bytes of one source row and I,i the
;   bytes directly below them in the next row, each destination byte is
;       pavgb( pavgb(A, a), pavgb(I, i) ),   pavgb(x, y) = (x + y + 1) >> 1.
;
;   Register roles (rN aliases, LOAD_6_PARA, SIGN_EXTENSION and WELSEMMS
;   are macros from asm_inc.asm):
;       r0 = pDst cursor          r1 = iDstStride
;       r2 = pSrc cursor          r3 = iSrcStride
;       r4 = inner loop counter   r5 = outer loop counter (iSrcHeight >> 1)
;       r6 = -(iSrcWidth >> 1), used to rewind both cursors after a row pair
;
;   One inner iteration reads 32 bytes from each of two source rows and
;   writes 16 destination bytes. Both loops test their counter at the
;   bottom, so each body runs at least once.
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
%ifdef X86_32
    push r6                 ; r6 is saved by hand on 32-bit builds
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4             ; keep iSrcWidth: r4 is consumed as a counter below
%endif
    sar r5, $01            ; outer counter = iSrcHeight >> 1

.yloops1:
    ; reload iSrcWidth for this row pair (stack argument on 32-bit, r12 otherwise)
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1 (destination width)
    mov r6, r4        ; copy of the destination width, negated below
    sar r4, $04            ; (iSrcWidth >> 1) / 16 = number of inner iterations
    neg r6             ; r6 = -(iSrcWidth >> 1)
    ; each inner iteration consumes 32 source bytes per row
.xloops1:
    ; 1st half: source bytes 0..15 of both rows
    ; Register pictures below are written high byte first (hi <- -> lo).
    ; Capital letter = even-offset byte, lowercase = the odd-offset byte after it.
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;=> target (even bytes and odd bytes separated):
    ;: H G F E D C B A, P O N M L K J I
    ;: h g f e d c b a, p o n m l k j i
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line, bytes 0..7
    movq mm1, [r2+8]       ; 1st pSrc line, bytes 8..15
    movq mm2, [r2+r3]     ; 2nd pSrc line, bytes 0..7
    movq mm3, [r2+r3+8]   ; 2nd pSrc line, bytes 8..15

    ; de-interleave each of mm0..mm3: even bytes to the low dword, odd bytes to the high dword
    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B (dwords swapped)
    punpcklbw mm4, mm5      ; d c D C b a B A
    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: result in mm4

    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm5, mm6      ; h g H G f e F E
    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: result in mm5

    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
    punpcklbw mm6, mm7      ; l k L K j i J I
    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: result in mm6

    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B (mm0 is free: its load was consumed above)
    punpcklbw mm7, mm0      ; p o P O n m N M
    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: result in mm7

    ; combine mm4..mm7 into full even-byte / odd-byte registers per row
    movq mm0, mm4       ; copy so both halves of mm4 can be used
    punpckldq mm0, mm5  ; H G F E D C B A  (row 1, even bytes)
    punpckhdq mm4, mm5  ; h g f e d c b a  (row 1, odd bytes)

    movq mm1, mm6
    punpckldq mm1, mm7  ; P O N M L K J I  (row 2, even bytes)
    punpckhdq mm6, mm7  ; p o n m l k j i  (row 2, odd bytes)

    ; horizontal average inside each row, then vertical average of the two rows
    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1; kept in mm0 until the 2nd half is done, then both are stored

    ; 2nd half: source bytes 16..31 of both rows, same scheme with other registers
    ; (letters below are reused for the second 16 bytes)
    ;1st Line Src:  mm1: d D c C b B a A    mm2: h H g G f F e E
    ;2nd Line Src:  mm3: l L k K j J i I    mm4: p P o O n N m M
    ;=> target:
    ;: H G F E D C B A, P O N M L K J I
    ;: h g f e d c b a, p o n m l k j i
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm1, [r2+16]      ; 1st pSrc line, bytes 16..23
    movq mm2, [r2+24]      ; 1st pSrc line, bytes 24..31
    movq mm3, [r2+r3+16]  ; 2nd pSrc line, bytes 16..23
    movq mm4, [r2+r3+24]  ; 2nd pSrc line, bytes 24..31

    ; de-interleave mm1, mm2, mm3, mm4 (mm0 must stay untouched: it holds the 1st-half result)
    pshufw mm5, mm1, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm6, mm5, 04eh   ; c C a A d D b B ; 01001110 B
    punpcklbw mm5, mm6      ; d c D C b a B A
    pshufw mm5, mm5, 0d8h   ; d c b a D C B A ; 11011000 B: result in mm5

    pshufw mm6, mm2, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm7, mm6, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm6, mm7      ; h g H G f e F E
    pshufw mm6, mm6, 0d8h   ; h g f e H G F E ; 11011000 B: result in mm6

    pshufw mm7, mm3, 0d8h   ; l L j J k K i I ; 11011000 B
    pshufw mm1, mm7, 04eh   ; k K i I l L j J ; 01001110 B (mm1 reused as scratch)
    punpcklbw mm7, mm1      ; l k L K j i J I
    pshufw mm7, mm7, 0d8h   ; l k j i L K J I ; 11011000 B: result in mm7

    pshufw mm1, mm4, 0d8h   ; p P n N o O m M ; 11011000 B
    pshufw mm2, mm1, 04eh   ; o O m M p P n N ; 01001110 B (mm2 reused as scratch)
    punpcklbw mm1, mm2      ; p o P O n m N M
    pshufw mm1, mm1, 0d8h   ; p o n m P O N M ; 11011000 B: result in mm1

    ; combine mm5, mm6, mm7, mm1
    movq mm2, mm5
    punpckldq mm2, mm6  ; H G F E D C B A  (row 1, even bytes)
    punpckhdq mm5, mm6  ; h g f e d c b a  (row 1, odd bytes)

    movq mm3, mm7
    punpckldq mm3, mm1  ; P O N M L K J I  (row 2, even bytes)
    punpckhdq mm7, mm1  ; p o n m l k j i  (row 2, odd bytes)

    ; horizontal average inside each row, then vertical average
    pavgb mm2, mm5      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm3, mm7      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, result of the 2nd half

    movq [r0  ], mm0        ; destination bytes 0..7
    movq [r0+8], mm2        ; destination bytes 8..15

    ; advance to the next 32 source / 16 destination bytes
    lea r2, [r2+32]
    lea r0, [r0+16]

    dec r4
    jg near .xloops1

    ; move both cursors to the start of the next row pair / destination row
    lea r2, [r2+2*r3]    ; skip two source rows
    lea r2, [r2+2*r6]    ; r6 is negative: subtract 2 * (iSrcWidth >> 1)
    lea r0, [r0+r1]      ; next destination row
    lea r0, [r0+r6]      ; subtract iSrcWidth >> 1

    dec r5
    jg near .yloops1

    WELSEMMS                ; macro from asm_inc.asm; presumably issues emms after the MMX code -- confirm there
%ifndef X86_32
    pop r12
%endif
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
;                     unsigned char* pSrc, const int iSrcStride,
;                     const int iSrcWidth, const int iSrcHeight );
;
;   2:1 downsample in both directions using MMX registers (pshufw/pavgb).
;   With A,a two horizontally adjacent bytes of one source row and I,i the
;   bytes directly below them in the next row, each destination byte is
;       pavgb( pavgb(A, a), pavgb(I, i) ),   pavgb(x, y) = (x + y + 1) >> 1.
;
;   Register roles (rN aliases, LOAD_6_PARA, SIGN_EXTENSION and WELSEMMS
;   are macros from asm_inc.asm):
;       r0 = pDst cursor          r1 = iDstStride
;       r2 = pSrc cursor          r3 = iSrcStride
;       r4 = inner loop counter   r5 = outer loop counter (iSrcHeight >> 1)
;       r6 = -(iSrcWidth >> 1), used to rewind both cursors after a row pair
;
;   One inner iteration reads 16 bytes from each of two source rows and
;   writes 8 destination bytes. Both loops test their counter at the
;   bottom, so each body runs at least once.
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
%ifdef X86_32
    push r6                 ; r6 is saved by hand on 32-bit builds
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4             ; keep iSrcWidth: r4 is consumed as a counter below
%endif
    sar r5, $01            ; outer counter = iSrcHeight >> 1

.yloops2:
    ; reload iSrcWidth for this row pair (stack argument on 32-bit, r12 otherwise)
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1 (destination width)
    mov r6, r4        ; copy of the destination width, negated below
    sar r4, $03            ; (iSrcWidth >> 1) / 8 = number of inner iterations
    neg r6             ; r6 = -(iSrcWidth >> 1)
    ; each inner iteration consumes 16 source bytes per row
.xloops2:
    ; source bytes 0..15 of both rows
    ; Register pictures below are written high byte first (hi <- -> lo).
    ; Capital letter = even-offset byte, lowercase = the odd-offset byte after it.
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;=> target (even bytes and odd bytes separated):
    ;: H G F E D C B A, P O N M L K J I
    ;: h g f e d c b a, p o n m l k j i
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line, bytes 0..7
    movq mm1, [r2+8]       ; 1st pSrc line, bytes 8..15
    movq mm2, [r2+r3]     ; 2nd pSrc line, bytes 0..7
    movq mm3, [r2+r3+8]   ; 2nd pSrc line, bytes 8..15

    ; de-interleave each of mm0..mm3: even bytes to the low dword, odd bytes to the high dword
    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B (dwords swapped)
    punpcklbw mm4, mm5      ; d c D C b a B A
    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: result in mm4

    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm5, mm6      ; h g H G f e F E
    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: result in mm5

    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
    punpcklbw mm6, mm7      ; l k L K j i J I
    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: result in mm6

    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B (mm0 is free: its load was consumed above)
    punpcklbw mm7, mm0      ; p o P O n m N M
    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: result in mm7

    ; combine mm4..mm7 into full even-byte / odd-byte registers per row
    movq mm0, mm4       ; copy so both halves of mm4 can be used
    punpckldq mm0, mm5  ; H G F E D C B A  (row 1, even bytes)
    punpckhdq mm4, mm5  ; h g f e d c b a  (row 1, odd bytes)

    movq mm1, mm6
    punpckldq mm1, mm7  ; P O N M L K J I  (row 2, even bytes)
    punpckhdq mm6, mm7  ; p o n m l k j i  (row 2, odd bytes)

    ; horizontal average inside each row, then vertical average of the two rows
    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, the 8 output bytes

    movq [r0  ], mm0        ; store 8 destination bytes

    ; advance to the next 16 source / 8 destination bytes
    lea r2, [r2+16]
    lea r0, [r0+8]

    dec r4
    jg near .xloops2

    ; move both cursors to the start of the next row pair / destination row
    lea r2, [r2+2*r3]    ; skip two source rows
    lea r2, [r2+2*r6]    ; r6 is negative: subtract 2 * (iSrcWidth >> 1)
    lea r0, [r0+r1]      ; next destination row
    lea r0, [r0+r6]      ; subtract iSrcWidth >> 1

    dec r5
    jg near .yloops2

    WELSEMMS                ; macro from asm_inc.asm; presumably issues emms after the MMX code -- confirm there
%ifndef X86_32
    pop r12
%endif
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
;                     unsigned char* pSrc, const int iSrcStride,
;                     const int iSrcWidth, const int iSrcHeight );
;
;   2:1 downsample in both directions using MMX registers (pshufw/pavgb).
;   With A,a two horizontally adjacent bytes of one source row and E,e the
;   bytes directly below them in the next row, each destination byte is
;       pavgb( pavgb(A, a), pavgb(E, e) ),   pavgb(x, y) = (x + y + 1) >> 1.
;
;   Register roles (rN aliases, LOAD_6_PARA, SIGN_EXTENSION and WELSEMMS
;   are macros from asm_inc.asm):
;       r0 = pDst cursor          r1 = iDstStride
;       r2 = pSrc cursor          r3 = iSrcStride
;       r4 = inner loop counter   r5 = outer loop counter (iSrcHeight >> 1)
;       r6 = -(iSrcWidth >> 1), used to rewind both cursors after a row pair
;
;   One inner iteration reads 8 bytes from each of two source rows and
;   writes 4 destination bytes. Both loops test their counter at the
;   bottom, so each body runs at least once.
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
%ifdef X86_32
    push r6                 ; r6 is saved by hand on 32-bit builds
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4             ; keep iSrcWidth: r4 is consumed as a counter below
%endif
    sar r5, $01            ; outer counter = iSrcHeight >> 1

.yloops3:
    ; reload iSrcWidth for this row pair (stack argument on 32-bit, r12 otherwise)
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1 (destination width)
    mov r6, r4        ; copy of the destination width, negated below
    sar r4, $02            ; (iSrcWidth >> 1) / 4 = number of inner iterations
    neg r6             ; r6 = -(iSrcWidth >> 1)
    ; each inner iteration consumes 8 source bytes per row
.xloops3:
    ; source bytes 0..7 of both rows
    ; Register pictures below are written high byte first (hi <- -> lo).
    ; Capital letter = even-offset byte, lowercase = the odd-offset byte after it.
    ;1st Line Src:  mm0: d D c C b B a A
    ;2nd Line Src:  mm1: h H g G f F e E
    ;=> target (even bytes and odd bytes separated):
    ;: H G F E D C B A
    ;: h g f e d c b a
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line
    movq mm1, [r2+r3]     ; 2nd pSrc line

    ; de-interleave mm0 and mm1: even bytes to the low dword, odd bytes to the high dword
    pshufw mm2, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm3, mm2, 04eh   ; c C a A d D b B ; 01001110 B (dwords swapped)
    punpcklbw mm2, mm3      ; d c D C b a B A
    pshufw mm2, mm2, 0d8h   ; d c b a D C B A ; 11011000 B: result in mm2

    pshufw mm4, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm5, mm4, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm4, mm5      ; h g H G f e F E
    pshufw mm4, mm4, 0d8h   ; h g f e H G F E ; 11011000 B: result in mm4

    ; combine mm2 and mm4: even bytes of both rows in mm0, odd bytes in mm2
    movq mm0, mm2       ; copy so both halves of mm2 can be used
    punpckldq mm0, mm4  ; H G F E D C B A  (row 2 evens : row 1 evens)
    punpckhdq mm2, mm4  ; h g f e d c b a  (row 2 odds  : row 1 odds)

    ; horizontal average for both rows at once, then vertical average
    pavgb mm0, mm2      ; (H+h+1)>>1, .., (A+a+1)>>1: row 2 in the high dword, row 1 in the low dword
    pshufw mm1, mm0, 04eh   ; 01001110 B: swap the dwords so row 2 lines up with row 1
    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1 in each dword

    movd [r0], mm0          ; store the low 4 bytes

    ; advance to the next 8 source / 4 destination bytes
    lea r2, [r2+8]
    lea r0, [r0+4]

    dec r4
    jg near .xloops3

    ; move both cursors to the start of the next row pair / destination row
    lea r2, [r2+2*r3]    ; skip two source rows
    lea r2, [r2+2*r6]    ; r6 is negative: subtract 2 * (iSrcWidth >> 1)
    lea r0, [r0+r1]      ; next destination row
    lea r0, [r0+r6]      ; subtract iSrcWidth >> 1

    dec r5
    jg near .yloops3

    WELSEMMS                ; macro from asm_inc.asm; presumably issues emms after the MMX code -- confirm there
%ifndef X86_32
    pop r12
%endif
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret



;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   2:1 downsample in both directions. Per output byte: the two source rows
;   are averaged with pavgb ((x + y + 1) >> 1), horizontally adjacent
;   results are added by pmaddubsw against xmm3, and pavgw against xmm2
;   halves that sum with rounding.
;   NOTE(review): this relies on WELS_DB1 filling xmm3 with byte 1s and
;   WELS_Zero clearing xmm2; both macros live in asm_inc.asm -- confirm.
;
;   Register roles (rN aliases and LOAD_6_PARA/PUSH_XMM/SIGN_EXTENSION are
;   macros from asm_inc.asm):
;       r0 = end of the current destination row    r1 = iDstStride
;       r2 = pSrc cursor                           r3 = iSrcStride
;       r4 = negative column offset, counts up towards 0
;       r5 = remaining row pairs (iSrcHeight >> 1)
;       r6 = -(iSrcWidth >> 1), constant for the whole call
;
;   One inner iteration reads 32 bytes from each of two source rows and
;   writes 16 destination bytes. movdqa and the pavgb memory operands need
;   16-byte aligned addresses. Both loops test at the bottom, so each body
;   runs at least once.
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
%ifdef X86_32
    push r6                 ; r6 is saved by hand on 32-bit builds
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

    sar r5, $01            ; row pairs = iSrcHeight >> 1

    WELS_DB1 xmm3
    WELS_Zero xmm2
    sar r4, $01            ; iSrcWidth >> 1 (destination width)
    add r0, r4             ; pDst += iSrcWidth >> 1, so [r0 + negative offset] walks the row
    neg r4
    mov r6, r4             ; r6 = -(iSrcWidth >> 1); computed once, it never changes per row

.yloops4:
    mov r4, r6             ; restart the column offset at -(iSrcWidth >> 1)
    align 16
    ; each inner iteration consumes 32 source bytes per row
.xloops4:
    movdqa xmm0, [r2+r3]
    movdqa xmm1, [r2+r3+16]
    pavgb  xmm0, [r2]          ; avg vertical pixels 0-15
    pavgb  xmm1, [r2+16]       ; avg vertical pixels 16-31
    add r2, 32                 ; pSrc += 32
    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels 0-15
    pmaddubsw xmm1, xmm3       ; pairwise horizontal sum neighboring pixels 16-31
    pavgw xmm0, xmm2           ; (sum + 1) >> 1
    pavgw xmm1, xmm2           ; (sum + 1) >> 1
    packuswb xmm0, xmm1        ; pack words to bytes
    movdqa [r0+r4], xmm0       ; store 16 results
    add r4, 16
    jl .xloops4                ; until the offset reaches 0 (end of row)

    ; next row pair
    lea r2, [r2+2*r3]    ; skip two source rows
    lea r2, [r2+2*r6]    ; r6 is negative: subtract 2 * (iSrcWidth >> 1)
    lea r0, [r0+r1]      ; next destination row (r0 itself was not advanced in the loop)

    sub r5, 1
    jg .yloops4

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
;                     unsigned char* pSrc, const int iSrcStride,
;                     const int iSrcWidth, const int iSrcHeight );
;
;   2:1 downsample in both directions, 16 source bytes (8 output bytes)
;   per inner step: vertical pavgb of the two rows, pmaddubsw against xmm3
;   to add horizontal neighbours, pavgw against xmm2 for the rounded halving.
;   NOTE(review): this relies on WELS_DB1 filling xmm3 with byte 1s and
;   WELS_Zero clearing xmm2; both macros live in asm_inc.asm -- confirm.
;
;   r0 / r2 point at the END of the current destination / first source row
;   and are indexed with the negative offset in r4 (2*r4 on the source side);
;   r6 = r2 + iSrcStride addresses the second source row.
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                ; remember iSrcWidth for the per-row reload
%endif
    sar r5, 1                  ; number of row pairs = iSrcHeight >> 1
    WELS_DB1 xmm3
    WELS_Zero xmm2
    add r2, r4                 ; r2 = pSrc + iSrcWidth
    sar r4, 1
    add r0, r4                 ; r0 = pDst + (iSrcWidth >> 1)

.next_row_pair:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, 1
    neg r4                     ; r4 = -(iSrcWidth >> 1), climbs to 0
    lea r6, [r2+r3]            ; second source row of this pair
    align 16
.next_8_pixels:
    movdqa xmm0, [r2+2*r4]     ; 16 bytes of the first row
    pavgb  xmm0, [r6+2*r4]     ; rounded average with the row below
    pmaddubsw xmm0, xmm3       ; add each pair of horizontal neighbours
    pavgw xmm0, xmm2           ; (sum + 1) >> 1
    packuswb xmm0, xmm0        ; words -> bytes
    movlps [r0+r4], xmm0       ; write the low 8 bytes
    add r4, 8
    jl .next_8_pixels

    add r0, r1                 ; down one destination row
    lea r2, [r2+2*r3]          ; down two source rows

    dec r5
    jg .next_row_pair

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret


%ifdef X86_32
;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;                           unsigned char* pSrc, const int iSrcStride,
;                           unsigned int uiScaleX, unsigned int uiScaleY );
;
; 32-bit build, all arguments on the stack.
; Bilinear downsampler with 15 fractional bits per axis. Source positions
; start at 16384 (0.5) and advance by uiScaleX / uiScaleY per destination
; pixel / row; the integer source index is position >> 15.
;   * Destination rows 0 .. iDstHeight-2, columns 0 .. iDstWidth-2 are
;     interpolated from the 2x2 neighbourhood a b / c d with 30-bit weights:
;         (a*(1-u)(1-v) + b*u(1-v) + c*(1-u)v + d*uv + 2^29) >> 30
;   * The last column of each of those rows and the whole last row are
;     copied from the nearest source sample (no interpolation).
; iDstWidth == 1 / iDstHeight == 1 skip the interpolated part entirely
; instead of running the bottom-tested loops once too often.
; eax is not set to a meaningful return value.
;
; Register use: esi = current source row, ebp = row below it, edi = dst
; cursor, ecx = column counter, xmm3 = u weights, xmm4 = v weights,
; xmm5 = initial weights, xmm6/xmm7 = per-step v/u increments.
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
    push    ebp
    push    esi
    push    edi
    push    ebx
; The argument/local offsets below are valid only after "sub esp, localsize".
%define     pushsize    16
%define     localsize   16
%define     pDstData        esp + pushsize + localsize + 4
%define     dwDstStride     esp + pushsize + localsize + 8
%define     dwDstWidth      esp + pushsize + localsize + 12
%define     dwDstHeight     esp + pushsize + localsize + 16
%define     pSrcData        esp + pushsize + localsize + 20
%define     dwSrcStride     esp + pushsize + localsize + 24
%define     uiScaleX            esp + pushsize + localsize + 28
%define     uiScaleY            esp + pushsize + localsize + 32
%define     tmpHeight       esp + 0
%define     yInverse        esp + 4
%define     xInverse        esp + 8
%define     dstStep         esp + 12
    sub     esp,            localsize

    pxor    xmm0,   xmm0
    mov     eax,    [uiScaleX]
    and     eax,    32767
    mov     ebx,    eax
    neg     ebx
    and     ebx,    32767
    movd    xmm1,       eax                     ; uinc  = uiScaleX mod 32768
    movd    xmm2,       ebx                     ; -uinc mod 32768
    psllq   xmm1,       32
    por     xmm1,       xmm2                    ; 0 0  uinc  -uinc   (dword)
    pshufd  xmm7,       xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

    mov     eax,    [uiScaleY]
    and     eax,    32767
    mov     ebx,    eax
    neg     ebx
    and     ebx,    32767
    movd    xmm6,       eax                     ; vinc  = uiScaleY mod 32768
    movd    xmm2,       ebx                     ; -vinc mod 32768
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

    mov     edx,        40003fffh
    movd    xmm5,       edx
    punpcklwd   xmm5,   xmm0                    ; 16384 16383
    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383


DOWNSAMPLE:

    mov     eax,            [dwDstHeight]
    mov     edi,            [pDstData]
    mov     edx,            [dwDstStride]
    mov     ecx,            [dwDstWidth]
    sub     edx,            ecx
    mov     [dstStep],  edx             ; stride - width
    dec     eax
    mov     [tmpHeight],    eax         ; number of interpolated rows = height - 1
    mov     eax,            16384
    mov     [yInverse],     eax         ; vertical position starts at 0.5

    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

    ; HEIGHT is tested at the bottom; with a single destination row it would
    ; emit an interpolated row AND the last row, i.e. one row too many.
    cmp     dword [tmpHeight],  0
    jle     near LAST_ROW

HEIGHT:
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address
    mov     ebp,    esi
    add     ebp,    [dwSrcStride]       ; ebp = row below

    mov     eax,        16384
    mov     [xInverse],     eax         ; horizontal position starts at 0.5

    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

    mov     ecx,            [dwDstWidth]
    dec     ecx                         ; interpolated pixels per row = width - 1
    ; WIDTH is tested at the bottom; entering it with ecx == 0 would wrap the
    ; counter, so a one-pixel-wide row goes straight to the nearest-sample copy.
    jle     near WIDTH_END

WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        15

    movd    xmm1,       [esi+eax]       ; xxxxxxba
    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
    pxor    xmm0,       xmm0
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a

    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2 -> four 30-bit weights
    movdqa  xmm0,   xmm2
    pmuludq xmm2,   xmm1    ; weights 0 and 2 times pixels a and c
    psrlq   xmm0,   32
    psrlq   xmm1,   32
    pmuludq xmm0,   xmm1    ; weights 1 and 3 times pixels b and d
    paddq   xmm2,   xmm0
    pshufd  xmm1,   xmm2,   00001110b
    paddq   xmm2,   xmm1    ; low qword = sum of the four products
    psrlq   xmm2,   29

    movd    eax,    xmm2
    inc     eax
    shr     eax,    1       ; together with psrlq 29: (sum + 2^29) >> 30
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax

    paddw   xmm3,       xmm7            ; inc u
    psllw   xmm3,       1
    psrlw   xmm3,       1               ; clear bit 15: keep u mod 32768

    dec     ecx
    jg      near WIDTH

WIDTH_END:
    ; last pixel of the row: nearest source sample of the upper row
    mov     eax,        [xInverse]
    shr     eax,        15
    mov     cl,         [esi+eax]
    mov     [edi],      cl
    inc     edi

    mov     eax,        [uiScaleY]
    add     [yInverse], eax
    add     edi,        [dstStep]

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1                   ; clear bit 15: keep v mod 32768

    dec     dword [tmpHeight]
    jg      near HEIGHT


LAST_ROW:
    ; last destination row: nearest source sample only
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address

    mov     eax,        16384
    mov     [xInverse],     eax
    mov     ecx,            [dwDstWidth]

LAST_ROW_WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        15

    mov     al,         [esi+eax]
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax

    dec     ecx
    jg      near LAST_ROW_WIDTH         ; jg rather than jnz: a non-positive width cannot spin

LAST_ROW_END:

    add     esp,            localsize
    pop     ebx
    pop     edi
    pop     esi
    pop     ebp
%undef      pushsize
%undef      localsize
%undef      pSrcData
%undef      dwSrcWidth
%undef      dwSrcHeight
%undef      dwSrcStride
%undef      pDstData
%undef      dwDstWidth
%undef      dwDstHeight
%undef      dwDstStride
%undef      uiScaleX
%undef      uiScaleY
%undef      tmpHeight
%undef      yInverse
%undef      xInverse
%undef      dstStep
    ret




;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;               unsigned char* pSrc, const int iSrcStride,
;               unsigned int uiScaleX, unsigned int uiScaleY );
;
; 32-bit build, all arguments on the stack.
; Bilinear downsampler using 16-bit word arithmetic. The horizontal
; position has 16 fractional bits (starts at 32768, index = position >> 16),
; the vertical position has 15 (starts at 16384, index = position >> 15).
;   * Destination rows 0 .. iDstHeight-2, columns 0 .. iDstWidth-2 are
;     interpolated from the 2x2 neighbourhood a b / c d. The four weights
;     are (v_weight * u_weight) >> 16 (pmulhuw), and the pixel is
;         (a*w0 + b*w1 + c*w2 + d*w3 + 16384) >> 15
;   * The last column of each of those rows and the whole last row are
;     copied from the nearest source sample (no interpolation).
; iDstWidth == 1 / iDstHeight == 1 skip the interpolated part entirely
; instead of running the bottom-tested loops once too often.
; eax is not set to a meaningful return value.
;
; Register use: esi = current source row, ebp = row below it, edi = dst
; cursor, ecx = column counter, ebx = rounding constant 16384, xmm0 = zero,
; xmm3 = u weights, xmm4 = v weights, xmm5 = initial u weights,
; xmm6/xmm7 = per-step v/u increments.
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push    ebp
    push    esi
    push    edi
    push    ebx
; The argument/local offsets below are valid only after "sub esp, localsize".
%define     pushsize    16
%define     localsize   16
%define     pDstData        esp + pushsize + localsize + 4
%define     dwDstStride     esp + pushsize + localsize + 8
%define     dwDstWidth      esp + pushsize + localsize + 12
%define     dwDstHeight     esp + pushsize + localsize + 16
%define     pSrcData        esp + pushsize + localsize + 20
%define     dwSrcStride     esp + pushsize + localsize + 24
%define     uiScaleX            esp + pushsize + localsize + 28
%define     uiScaleY            esp + pushsize + localsize + 32
%define     tmpHeight       esp + 0
%define     yInverse        esp + 4
%define     xInverse        esp + 8
%define     dstStep         esp + 12
    sub     esp,            localsize

    pxor    xmm0,   xmm0                        ; stays zero for the whole routine
    mov     edx,    65535
    mov     eax,    [uiScaleX]
    and     eax,    edx
    mov     ebx,    eax
    neg     ebx
    and     ebx,    65535
    movd    xmm1,       eax                     ; uinc  = uiScaleX mod 65536
    movd    xmm2,       ebx                     ; -uinc mod 65536
    psllq   xmm1,       32
    por     xmm1,       xmm2                    ; 0 uinc 0 -uinc
    pshuflw xmm7,       xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

    mov     eax,    [uiScaleY]
    and     eax,    32767
    mov     ebx,    eax
    neg     ebx
    and     ebx,    32767
    movd    xmm6,       eax                     ; vinc  = uiScaleY mod 32768
    movd    xmm2,       ebx                     ; -vinc mod 32768
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

    mov     edx,        80007fffh               ; 32768 32767
    movd    xmm5,       edx
    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767
    mov     ebx,        16384                   ; rounding term added before the final >> 15


FAST_DOWNSAMPLE:

    mov     eax,            [dwDstHeight]
    mov     edi,            [pDstData]
    mov     edx,            [dwDstStride]
    mov     ecx,            [dwDstWidth]
    sub     edx,            ecx
    mov     [dstStep],  edx             ; stride - width
    dec     eax
    mov     [tmpHeight],    eax         ; number of interpolated rows = height - 1
    mov     eax,        16384
    mov     [yInverse],     eax         ; vertical position starts at 0.5

    pshuflw xmm4,       xmm5,   01010000b
    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

    ; FAST_HEIGHT is tested at the bottom; with a single destination row it
    ; would emit an interpolated row AND the last row, i.e. one row too many.
    cmp     dword [tmpHeight],  0
    jle     near FAST_LAST_ROW

FAST_HEIGHT:
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address
    mov     ebp,    esi
    add     ebp,    [dwSrcStride]       ; ebp = row below

    mov     eax,        32768
    mov     [xInverse],     eax         ; horizontal position starts at 0.5

    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

    mov     ecx,            [dwDstWidth]
    dec     ecx                         ; interpolated pixels per row = width - 1
    ; FAST_WIDTH is tested at the bottom; entering it with ecx == 0 would wrap
    ; the counter, so a one-pixel-wide row goes straight to the nearest-sample copy.
    jle     near FAST_WIDTH_END

FAST_WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        16

    movd    xmm1,       [esi+eax]       ; xxxxxxba
    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2, keeping the high 16 bits
    pmaddwd     xmm2,   xmm1            ; dword0 = a*w0 + b*w1, dword1 = c*w2 + d*w3
    pshufd  xmm1,   xmm2,   00000001b
    paddd   xmm2,   xmm1                ; dword0 = sum of all four products
    movd    xmm1,   ebx
    paddd   xmm2,   xmm1                ; + 16384 for rounding
    psrld   xmm2,   15

    packuswb    xmm2,   xmm0
    movd    eax,    xmm2
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax

    paddw   xmm3,       xmm7            ; inc u (16-bit wrap-around is the modulo)

    dec     ecx
    jg      near FAST_WIDTH

FAST_WIDTH_END:
    ; last pixel of the row: nearest source sample of the upper row
    mov     eax,        [xInverse]
    shr     eax,        16
    mov     cl,         [esi+eax]
    mov     [edi],      cl
    inc     edi

    mov     eax,        [uiScaleY]
    add     [yInverse], eax
    add     edi,        [dstStep]

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1                   ; clear bit 15: keep v mod 32768

    dec     dword [tmpHeight]
    jg      near FAST_HEIGHT


FAST_LAST_ROW:
    ; last destination row: nearest source sample only
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address

    mov     eax,        32768
    mov     [xInverse],     eax
    mov     ecx,            [dwDstWidth]

FAST_LAST_ROW_WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        16

    mov     al,         [esi+eax]
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax

    dec     ecx
    jg      near FAST_LAST_ROW_WIDTH    ; jg rather than jnz: a non-positive width cannot spin

FAST_LAST_ROW_END:

    add     esp,            localsize
    pop     ebx
    pop     edi
    pop     esi
    pop     ebp
%undef      pushsize
%undef      localsize
%undef      pSrcData
%undef      dwSrcWidth
%undef      dwSrcHeight
%undef      dwSrcStride
%undef      pDstData
%undef      dwDstWidth
%undef      dwDstHeight
%undef      dwDstStride
%undef      uiScaleX
%undef      uiScaleY
%undef      tmpHeight
%undef      yInverse
%undef      xInverse
%undef      dstStep
    ret

%elifdef  WIN64

;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;                           unsigned char* pSrc, const int iSrcStride,
;                           unsigned int uiScaleX, unsigned int uiScaleY );
;
; WIN64 build of the "accurate" general bilinear downsampler.
;
; Every destination pixel except the last pixel of each row and the whole last
; row is a bilinear blend of a 2x2 source neighbourhood (a b / c d) using 15-bit
; horizontal (u) and vertical (v) weights; the 30-bit weighted sum is rounded to
; 8 bits.  The last pixel of each row and every pixel of the last row are plain
; copies of the nearest source sample, so no 2x2 fetch is done for them.
;
; Register roles after the prologue (the rN aliases and the LOAD_7_PARA /
; PUSH_XMM / POP_XMM / SIGN_EXTENSION macros come from asm_inc.asm):
;   r0  = destination write pointer          r4  = pSrc
;   r1  = iDstStride - iDstWidth             r5  = iSrcStride
;   r2  = iDstWidth                          r6  = uiScaleX
;   r3  = rows left for the bilinear loop    rbp = uiScaleY
;   r14 = y position, r15 = x position (fixed point, 15 fractional bits, start at 0.5)
;   r12 / r13 = addresses of the upper / lower source row being blended
;   rsi = pixels left in the bilinear part of a row, rdi = integer x, bl = output byte
;   xmm3 = u weights, xmm4 = v weights, xmm5 = initial weights,
;   xmm6 = v increment, xmm7 = u increment, xmm0-xmm2 = scratch
;
; A destination that is one pixel wide and/or one row high skips the matching
; bilinear loop entirely, so exactly iDstWidth x iDstHeight bytes are written.
;
; NOTE(review): the prototype says "int", but the routine never sets up a return
; value -- confirm that callers treat it as void.
;{
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rsi
    push    rdi
    push    rbx
    push    rbp
    %assign push_num 8
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    ; ---- horizontal weight increment -------------------------------------
    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm1,   r12d                     ; uinc  = uiScaleX mod 32768
    movd    xmm2,   r13d                     ; -uinc (mod 32768)
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)
    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

    ; ---- vertical weight increment (uiScaleY is the 8th, stack-passed argument)
    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                     ; keep the full uiScaleY for the y position
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc  = uiScaleY mod 32768
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

    ; ---- initial weights: position 0.5 => 16384 and its complement 16383 ----
    mov     r12d,        40003fffh
    movd    xmm5,       r12d
    punpcklwd   xmm5,   xmm0                    ; 16384 16383
    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383

DOWNSAMPLE:
    sub     r1, r2                   ; stride - width
    dec     r3                       ; the last row is handled by LAST_ROW
    mov     r14,16384                ; y position = 0.5
    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

    test    r3, r3
    jle     near LAST_ROW            ; single-row destination: no bilinear rows at all

HEIGHT:
    mov     r12, r14
    shr     r12,    15               ; integer source row
    imul    r12,    r5
    add     r12,    r4                 ; get current row address
    mov     r13,    r12
    add     r13,    r5                 ; row below

    mov     r15, 16384               ; x position = 0.5
    mov     rsi, r2
    dec     rsi                      ; the last pixel of the row is handled by WIDTH_END
    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

    test    rsi, rsi
    jle     near WIDTH_END           ; one-pixel-wide destination: no bilinear pixels

WIDTH:
    mov     rdi,        r15
    shr     rdi,        15           ; integer source column

    ; each movd fetches 4 bytes although only the low 2 are used
    movd    xmm1,       [r12+rdi]       ; xxxxxxba
    movd    xmm2,       [r13+rdi]       ; xxxxxxdc
    pxor    xmm0,       xmm0            ; xmm0 is clobbered below, so re-zero it per pixel
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a

    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2 -> four 30-bit weights
    movdqa  xmm0,   xmm2
    pmuludq xmm2,   xmm1    ; weights 0 and 2 times pixels a and c
    psrlq   xmm0,   32
    psrlq   xmm1,   32
    pmuludq xmm0,   xmm1    ; weights 1 and 3 times pixels b and d
    paddq   xmm2,   xmm0
    pshufd  xmm1,   xmm2,   00001110b
    paddq   xmm2,   xmm1    ; low qword = full weighted sum
    psrlq   xmm2,   29

    movd    ebx,    xmm2
    inc     ebx
    shr     ebx,    1       ; together with the shift by 29: round(sum / 2^30)
    mov     [r0],   bl
    inc     r0

    add      r15, r6
    paddw   xmm3,       xmm7            ; inc u
    psllw   xmm3,       1               ; drop bit 15 of every word:
    psrlw   xmm3,       1               ; weights stay modulo 32768

    dec     rsi
    jg      WIDTH

WIDTH_END:
    ; last pixel of the row: nearest-sample copy from the upper row
    shr     r15, 15
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; advance y position by uiScaleY
    add     r0,  r1                     ; skip the destination row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1

    dec     r3
    jg      HEIGHT

LAST_ROW:
    ; last destination row: nearest-sample copy of all iDstWidth pixels
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 16384

LAST_ROW_WIDTH:
    mov     rdi, r15
    shr     rdi, 15
    mov     bl,  [r4+rdi]
    mov     [r0],bl
    inc     r0

    add     r15, r6
    dec     r2
    jg    LAST_ROW_WIDTH

LAST_ROW_END:

    POP_XMM
    pop     rbp
    pop     rbx
    pop     rdi
    pop     rsi
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;               unsigned char* pSrc, const int iSrcStride,
;               unsigned int uiScaleX, unsigned int uiScaleY );
;
; WIN64 build of the "fast" general bilinear downsampler.
;
; Same structure as the accurate variant, but the weights are combined in
; 16-bit lanes: u has 16 fractional bits, v has 15, pmulhuw keeps the upper 16
; bits of u*v as a 15-bit weight per pixel and pmaddwd applies the four weights
; to the 2x2 neighbourhood (a b / c d).  The sum is rounded with add_extra_half
; and shifted right by 15.  The last pixel of each row and every pixel of the
; last row are plain copies of the nearest source sample.
;
; Register roles after the prologue (the rN aliases and the LOAD_7_PARA /
; PUSH_XMM / POP_XMM / SIGN_EXTENSION macros come from asm_inc.asm):
;   r0  = destination write pointer          r4  = pSrc
;   r1  = iDstStride - iDstWidth             r5  = iSrcStride
;   r2  = iDstWidth                          r6  = uiScaleX
;   r3  = rows left for the bilinear loop    rbp = uiScaleY
;   r14 = y position (15 fractional bits), r15 = x position (16 fractional bits)
;   r12 / r13 = addresses of the upper / lower source row being blended
;   rsi = pixels left in the bilinear part of a row, rdi = integer x, bl = output byte
;   xmm0 = zero for the whole routine, xmm3 = u weights, xmm4 = v weights,
;   xmm5 = initial u weights, xmm6 = v increment, xmm7 = u increment
;
; A destination that is one pixel wide and/or one row high skips the matching
; bilinear loop entirely, so exactly iDstWidth x iDstHeight bytes are written.
;
; NOTE(review): the prototype says "int", but the routine never sets up a return
; value -- confirm that callers treat it as void.
;{
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rsi
    push    rdi
    push    rbx
    push    rbp
    %assign push_num 8
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    ; ---- horizontal weight increment (16-bit words) -------------------------
    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   65535
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   65535
    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 65536)
    movd    xmm2,   r13d                     ; -uinc
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc
    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

    ; ---- vertical weight increment (uiScaleY is the 8th, stack-passed argument)
    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                     ; keep the full uiScaleY for the y position
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc  = uiScaleY mod 32768
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

    mov     r12d,       80007fffh               ; 32768 32767
    movd    xmm5,       r12d
    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767

FAST_DOWNSAMPLE:
    sub     r1, r2                   ; stride - width
    dec     r3                       ; the last row is handled by FAST_LAST_ROW
    mov     r14,16384                ; y position = 0.5 (15 fractional bits)

    pshuflw xmm4,       xmm5,   01010000b
    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

    test    r3, r3
    jle     near FAST_LAST_ROW       ; single-row destination: no bilinear rows at all

FAST_HEIGHT:
    mov     r12, r14
    shr     r12,    15               ; integer source row
    imul    r12,    r5
    add     r12,    r4                 ; get current row address
    mov     r13,    r12
    add     r13,    r5                 ; row below

    mov     r15, 32768               ; x position = 0.5 (16 fractional bits)
    mov     rsi, r2
    dec     rsi                      ; the last pixel of the row is handled by FAST_WIDTH_END

    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

    test    rsi, rsi
    jle     near FAST_WIDTH_END      ; one-pixel-wide destination: no bilinear pixels

FAST_WIDTH:
    mov     rdi,        r15
    shr     rdi,        16           ; integer source column

    ; each movd fetches 4 bytes although only the low 2 are used
    movd    xmm1,       [r12+rdi]       ; xxxxxxba
    movd    xmm2,       [r13+rdi]       ; xxxxxxdc
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
    pmaddwd     xmm2,   xmm1            ; dword0 = upper-row part, dword1 = lower-row part
    pshufd  xmm1,   xmm2,   00000001b
    paddd   xmm2,   xmm1                ; dword0 = full weighted sum
    ; The X86_32_PICASM alternative that used to sit here built the constant on
    ; the 32-bit stack through esp; it cannot be assembled in this 64-bit
    ; function, so only the direct load remains.
    movdqa  xmm1,   [add_extra_half]    ; rounding term, defined elsewhere in this file
    paddd   xmm2,   xmm1
    psrld   xmm2,   15

    packuswb    xmm2,   xmm0
    movd    ebx,    xmm2
    mov     [r0],  bl
    inc     r0

    add     r15, r6

    paddw   xmm3,       xmm7            ; inc u (wraps modulo 65536)
    dec     rsi
    jg      FAST_WIDTH

FAST_WIDTH_END:
    ; last pixel of the row: nearest-sample copy from the upper row
    shr     r15, 16
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; advance y position by uiScaleY
    add     r0,  r1                     ; skip the destination row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1                   ; drop bit 15 of every word:
    psrlw   xmm4,   1                   ; v weights stay modulo 32768

    dec     r3
    jg      FAST_HEIGHT


FAST_LAST_ROW:
    ; last destination row: nearest-sample copy of all iDstWidth pixels
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 32768

FAST_LAST_ROW_WIDTH:
    mov     rdi, r15
    shr     rdi, 16
    mov     bl,  [r4+rdi]
    mov     [r0],bl
    inc     r0

    add     r15, r6
    dec     r2
    jg      FAST_LAST_ROW_WIDTH

FAST_LAST_ROW_END:

    POP_XMM
    pop     rbp
    pop     rbx
    pop     rdi
    pop     rsi
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

%elifdef  UNIX64

;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;                           unsigned char* pSrc, const int iSrcStride,
;                           unsigned int uiScaleX, unsigned int uiScaleY );
;
; UNIX64 build of the "accurate" general bilinear downsampler.
;
; Every destination pixel except the last pixel of each row and the whole last
; row is a bilinear blend of a 2x2 source neighbourhood (a b / c d) using 15-bit
; horizontal (u) and vertical (v) weights; the 30-bit weighted sum is rounded to
; 8 bits.  The last pixel of each row and every pixel of the last row are plain
; copies of the nearest source sample, so no 2x2 fetch is done for them.
;
; Register roles after the prologue (the rN aliases and the LOAD_7_PARA /
; SIGN_EXTENSION macros come from asm_inc.asm):
;   r0  = destination write pointer          r4  = pSrc
;   r1  = iDstStride - iDstWidth             r5  = iSrcStride
;   r2  = iDstWidth                          r6  = uiScaleX
;   r3  = rows left for the bilinear loop    rbp = uiScaleY
;   r14 = y position, r15 = x position (fixed point, 15 fractional bits, start at 0.5)
;   r12 / r13 = addresses of the upper / lower source row being blended
;   rax = pixels left in the bilinear part of a row, r11 = integer x, bl = output byte
;   xmm3 = u weights, xmm4 = v weights, xmm5 = initial weights,
;   xmm6 = v increment, xmm7 = u increment, xmm0-xmm2 = scratch
;
; A destination that is one pixel wide and/or one row high skips the matching
; bilinear loop entirely, so exactly iDstWidth x iDstHeight bytes are written.
;
; NOTE(review): the prototype says "int", but rax is only used as a loop counter
; here and no return value is set up -- confirm that callers treat it as void.
;{
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    push    rbp
    %assign push_num 6
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    ; ---- horizontal weight increment -------------------------------------
    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm1,   r12d                     ; uinc  = uiScaleX mod 32768
    movd    xmm2,   r13d                     ; -uinc (mod 32768)
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)
    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

    ; ---- vertical weight increment (uiScaleY is the 8th, stack-passed argument)
    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                     ; keep the full uiScaleY for the y position
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc  = uiScaleY mod 32768
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

    ; ---- initial weights: position 0.5 => 16384 and its complement 16383 ----
    mov     r12d,        40003fffh
    movd    xmm5,       r12d
    punpcklwd   xmm5,   xmm0                    ; 16384 16383
    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383

DOWNSAMPLE:
    sub     r1, r2                   ; stride - width
    dec     r3                       ; the last row is handled by LAST_ROW
    mov     r14,16384                ; y position = 0.5
    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

    test    r3, r3
    jle     near LAST_ROW            ; single-row destination: no bilinear rows at all

HEIGHT:
    mov     r12, r14
    shr     r12,    15               ; integer source row
    imul    r12,    r5
    add     r12,    r4                 ; get current row address
    mov     r13,    r12
    add     r13,    r5                 ; row below

    mov     r15, 16384               ; x position = 0.5
    mov     rax, r2
    dec     rax                      ; the last pixel of the row is handled by WIDTH_END
    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

    test    rax, rax
    jle     near WIDTH_END           ; one-pixel-wide destination: no bilinear pixels

WIDTH:
    mov     r11,        r15
    shr     r11,        15           ; integer source column

    ; each movd fetches 4 bytes although only the low 2 are used
    movd    xmm1,       [r12+r11]       ; xxxxxxba
    movd    xmm2,       [r13+r11]       ; xxxxxxdc
    pxor    xmm0,       xmm0            ; xmm0 is clobbered below, so re-zero it per pixel
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a

    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2 -> four 30-bit weights
    movdqa  xmm0,   xmm2
    pmuludq xmm2,   xmm1    ; weights 0 and 2 times pixels a and c
    psrlq   xmm0,   32
    psrlq   xmm1,   32
    pmuludq xmm0,   xmm1    ; weights 1 and 3 times pixels b and d
    paddq   xmm2,   xmm0
    pshufd  xmm1,   xmm2,   00001110b
    paddq   xmm2,   xmm1    ; low qword = full weighted sum
    psrlq   xmm2,   29

    movd    ebx,    xmm2
    inc     ebx
    shr     ebx,    1       ; together with the shift by 29: round(sum / 2^30)
    mov     [r0],   bl
    inc     r0

    add      r15, r6
    paddw   xmm3,       xmm7            ; inc u
    psllw   xmm3,       1               ; drop bit 15 of every word:
    psrlw   xmm3,       1               ; weights stay modulo 32768

    dec     rax
    jg      WIDTH

WIDTH_END:
    ; last pixel of the row: nearest-sample copy from the upper row
    shr     r15, 15
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; advance y position by uiScaleY
    add     r0,  r1                     ; skip the destination row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1

    dec     r3
    jg      HEIGHT

LAST_ROW:
    ; last destination row: nearest-sample copy of all iDstWidth pixels
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 16384

LAST_ROW_WIDTH:
    mov     r11, r15
    shr     r11, 15
    mov     bl,  [r4+r11]
    mov     [r0],bl
    inc     r0

    add     r15, r6
    dec     r2
    jg    LAST_ROW_WIDTH

LAST_ROW_END:

    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;               unsigned char* pSrc, const int iSrcStride,
;               unsigned int uiScaleX, unsigned int uiScaleY );
;
; UNIX64 build of the "fast" general bilinear downsampler.
;
; Same structure as the accurate variant, but the weights are combined in
; 16-bit lanes: u has 16 fractional bits, v has 15, pmulhuw keeps the upper 16
; bits of u*v as a 15-bit weight per pixel and pmaddwd applies the four weights
; to the 2x2 neighbourhood (a b / c d).  The sum is rounded with add_extra_half
; and shifted right by 15.  The last pixel of each row and every pixel of the
; last row are plain copies of the nearest source sample.
;
; Register roles after the prologue (the rN aliases and the LOAD_7_PARA /
; SIGN_EXTENSION macros come from asm_inc.asm):
;   r0  = destination write pointer          r4  = pSrc
;   r1  = iDstStride - iDstWidth             r5  = iSrcStride
;   r2  = iDstWidth                          r6  = uiScaleX
;   r3  = rows left for the bilinear loop    rbp = uiScaleY
;   r14 = y position (15 fractional bits), r15 = x position (16 fractional bits)
;   r12 / r13 = addresses of the upper / lower source row being blended
;   rax = pixels left in the bilinear part of a row, r11 = integer x, bl = output byte
;   xmm0 = zero for the whole routine, xmm3 = u weights, xmm4 = v weights,
;   xmm5 = initial u weights, xmm6 = v increment, xmm7 = u increment
;
; A destination that is one pixel wide and/or one row high skips the matching
; bilinear loop entirely, so exactly iDstWidth x iDstHeight bytes are written.
;
; NOTE(review): the prototype says "int", but rax is only used as a loop counter
; here and no return value is set up -- confirm that callers treat it as void.
;{
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    push    rbp
    %assign push_num 6
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    ; ---- horizontal weight increment (16-bit words) -------------------------
    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   65535
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   65535
    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 65536)
    movd    xmm2,   r13d                     ; -uinc
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc
    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

    ; ---- vertical weight increment (uiScaleY is the 8th, stack-passed argument)
    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                     ; keep the full uiScaleY for the y position
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc  = uiScaleY mod 32768
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

    mov     r12d,       80007fffh               ; 32768 32767
    movd    xmm5,       r12d
    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767

FAST_DOWNSAMPLE:
    sub     r1, r2                   ; stride - width
    dec     r3                       ; the last row is handled by FAST_LAST_ROW
    mov     r14,16384                ; y position = 0.5 (15 fractional bits)

    pshuflw xmm4,       xmm5,   01010000b
    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

    test    r3, r3
    jle     near FAST_LAST_ROW       ; single-row destination: no bilinear rows at all

FAST_HEIGHT:
    mov     r12, r14
    shr     r12,    15               ; integer source row
    imul    r12,    r5
    add     r12,    r4                 ; get current row address
    mov     r13,    r12
    add     r13,    r5                 ; row below

    mov     r15, 32768               ; x position = 0.5 (16 fractional bits)
    mov     rax, r2
    dec     rax                      ; the last pixel of the row is handled by FAST_WIDTH_END

    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

    test    rax, rax
    jle     near FAST_WIDTH_END      ; one-pixel-wide destination: no bilinear pixels

FAST_WIDTH:
    mov     r11,        r15
    shr     r11,        16           ; integer source column

    ; each movd fetches 4 bytes although only the low 2 are used
    movd    xmm1,       [r12+r11]       ; xxxxxxba
    movd    xmm2,       [r13+r11]       ; xxxxxxdc
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
    pmaddwd     xmm2,   xmm1            ; dword0 = upper-row part, dword1 = lower-row part
    pshufd  xmm1,   xmm2,   00000001b
    paddd   xmm2,   xmm1                ; dword0 = full weighted sum
    ; The X86_32_PICASM alternative that used to sit here built the constant on
    ; the 32-bit stack through esp; it cannot be assembled in this 64-bit
    ; function, so only the direct load remains.
    movdqa  xmm1,   [add_extra_half]    ; rounding term, defined elsewhere in this file
    paddd   xmm2,   xmm1
    psrld   xmm2,   15

    packuswb    xmm2,   xmm0
    movd    ebx,    xmm2
    mov     [r0],  bl
    inc     r0

    add     r15, r6

    paddw   xmm3,       xmm7            ; inc u (wraps modulo 65536)
    dec     rax
    jg      FAST_WIDTH

FAST_WIDTH_END:
    ; last pixel of the row: nearest-sample copy from the upper row
    shr     r15, 16
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; advance y position by uiScaleY
    add     r0,  r1                     ; skip the destination row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1                   ; drop bit 15 of every word:
    psrlw   xmm4,   1                   ; v weights stay modulo 32768

    dec     r3
    jg      FAST_HEIGHT


FAST_LAST_ROW:
    ; last destination row: nearest-sample copy of all iDstWidth pixels
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 32768

FAST_LAST_ROW_WIDTH:
    mov     r11, r15
    shr     r11, 16
    mov     bl,  [r4+r11]
    mov     [r0],bl
    inc     r0

    add     r15, r6
    dec     r2
    jg      FAST_LAST_ROW_WIDTH

FAST_LAST_ROW_END:

    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret
%endif

;***********************************************************************
;   void DyadicBilinearOneThirdDownsampler_ssse3(    unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1/3 downsampler: every inner iteration reads 48 source bytes from each of
;   two adjacent source lines and stores 16 destination bytes.  From every
;   group of three source bytes the first two are kept (pshufb) and the third
;   is dropped; the kept pairs are averaged horizontally, then the two lines
;   are averaged (pavgb).  After each destination row the source pointer moves
;   down three lines.
;
;   Register use (rN aliases and the LOAD_6_PARA / PUSH_XMM / POP_XMM /
;   SIGN_EXTENSION macros come from asm_inc.asm):
;     r0 = pDst cursor, r1 = iDstStride, r2 = pSrc cursor, r3 = iSrcStride,
;     r4 = source bytes left in the current row, r5 = rows left,
;     r6 = scratch / row base address, xmm7 = saved destination tail.
;
;   All source loads and destination stores use movdqa, so the addresses
;   involved must be 16-byte aligned.
;
;   NOTE(review): the 6th argument is used directly as the number of
;   destination rows to produce (the row loop runs r5 times and each pass
;   consumes three source lines) although the prototype names it iSrcHeight
;   -- confirm what callers pass.
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    ; keep the row width in r12 so r4 can be reloaded for every row
    push r12
    mov r12, r4
%endif

    ; Save the 16 bytes at pDst + iDstStride * rows (just past the last
    ; destination row); they are written back at the very end.
    mov r6, r1             ;save the tail for the unassigned size
    imul r6, r5
    add r6, r0
    movdqa xmm7, [r6]

.yloops_onethird_sse3:
    ; reload the per-row byte counter
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif

    mov r6, r0        ;save base address
    ; each loop = source bandwidth: 48 bytes
.xloops_onethird_sse3:
    ; 1st part horizontal loop: x48 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
    ;               xmm2: k K * j J * i I * h H * g G * f
    ;               xmm2: * p P * o O * n N * m M * l L *
    ;
    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    ;=> target:
    ;: P O N M L K J I H G F E D C B A
    ;: p o n m l k j i h g f e d c b a
    ;: P' ..                          A'
    ;: p' ..                          a'
    ;
    ; ('*' marks the third byte of each triple, which is discarded)

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;1st line
    movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
    movdqa xmm1, xmm0
%ifdef X86_32_PICASM
    ; PIC build: materialise the six pshufb masks on a 16-byte aligned stack
    ; area instead of loading them from the named constants.  r0 holds the
    ; original esp until it is restored after the last mask load below.
    ; Each mask is pushed highest dword first; resulting layout:
    ;   [esp+80] low_1   [esp+64] high_1
    ;   [esp+48] low_2   [esp+32] high_2
    ;   [esp+16] low_3   [esp]    high_3
    push   r0
    mov    r0, esp
    and    esp, 0xfffffff0
    push   0x80808080    ;shufb_mask_onethird_low_1
    push   0x80808080
    push   0x80800f0c
    push   0x09060300
    push   0x80808080    ;shufb_mask_onethird_high_1
    push   0x80808080
    push   0x8080800d
    push   0x0a070401
    push   0x80808080    ;shufb_mask_onethird_low_2
    push   0x800e0b08
    push   0x05028080
    push   0x80808080
    push   0x80808080    ;shufb_mask_onethird_high_2
    push   0x800f0c09
    push   0x06030080
    push   0x80808080
    push   0x0d0a0704    ;shufb_mask_onethird_low_3
    push   0x01808080
    push   0x80808080
    push   0x80808080
    push   0x0e0b0805    ;shufb_mask_onethird_high_3
    push   0x02808080
    push   0x80808080
    push   0x80808080
    movdqa xmm5, [esp+80]
    movdqa xmm6, [esp+64]
%else
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
%endif
    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

    movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
    movdqa xmm3, xmm2
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+48]
    movdqa xmm6, [esp+32]
%else
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
%endif
    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

    ; the shuffled pieces occupy disjoint byte lanes (the rest is zero),
    ; so the saturating add simply merges them
    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

    movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
    movdqa xmm3, xmm2
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+16]
    movdqa xmm6, [esp]
%else
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
%endif
    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
    pavgb xmm0, xmm1                            ;1st line average                -> xmm0

    ;2nd line
    movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    movdqa xmm3, xmm2
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+80]
    movdqa xmm6, [esp+64]
%else
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
%endif
    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    movdqa xmm4, xmm1
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+48]
    movdqa xmm6, [esp+32]
%else
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
%endif
    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    movdqa xmm4, xmm1
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+16]
    movdqa xmm6, [esp]
    ; last use of the on-stack masks: release them and restore r0
    mov    esp, r0
    pop    r0
%else
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
%endif
    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2

    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )

    ; write pDst
    movdqa [r0], xmm0                           ;write result in dst

    ; next SMB
    lea r2, [r2+48]                             ;current src address
    lea r0, [r0+16]                             ;current dst address

    sub r4, 48                                  ;xloops counter
    cmp r4, 0
    jg near .xloops_onethird_sse3

    ; r6 becomes -(destination bytes written in this row); three times that
    ; rewinds the source pointer to the start of the row.
    sub r6, r0                                  ;offset = base address - current address
    lea r2, [r2+2*r3]                           ;
    lea r2, [r2+r3]                             ;
    lea r2, [r2+2*r6]                           ;current line + 3 lines
    lea r2, [r2+r6]
    lea r0, [r0+r1]
    lea r0, [r0+r6]                             ;current dst line + 1 line

    dec r5
    jg near .yloops_onethird_sse3

    ; r0 now equals pDst + iDstStride * rows, the address saved into xmm7 above
    movdqa [r0], xmm7                           ;restore the tail for the unassigned size

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearOneThirdDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1/3 downsampler: every inner iteration reads 48 source bytes from each of
;   two adjacent source lines and stores 16 destination bytes.  From every
;   group of three source bytes the first two are kept (pshufb) and the third
;   is dropped; the kept pairs are averaged horizontally, then the two lines
;   are averaged (pavgb).  After each destination row the source pointer moves
;   down three lines.  The only difference from the _ssse3 routine visible
;   here is that the source is fetched with movntdqa instead of movdqa.
;
;   Register use (rN aliases and the LOAD_6_PARA / PUSH_XMM / POP_XMM /
;   SIGN_EXTENSION macros come from asm_inc.asm):
;     r0 = pDst cursor, r1 = iDstStride, r2 = pSrc cursor, r3 = iSrcStride,
;     r4 = source bytes left in the current row, r5 = rows left,
;     r6 = scratch / row base address, xmm7 = saved destination tail.
;
;   Source loads (movntdqa) and destination stores (movdqa) both require
;   16-byte aligned addresses.
;
;   NOTE(review): the 6th argument is used directly as the number of
;   destination rows to produce (the row loop runs r5 times and each pass
;   consumes three source lines) although the prototype names it iSrcHeight
;   -- confirm what callers pass.
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    ; keep the row width in r12 so r4 can be reloaded for every row
    push r12
    mov r12, r4
%endif

    ; Save the 16 bytes at pDst + iDstStride * rows (just past the last
    ; destination row); they are written back at the very end.
    mov r6, r1             ;save the tail for the unassigned size
    imul r6, r5
    add r6, r0
    movdqa xmm7, [r6]

.yloops_onethird_sse4:
    ; reload the per-row byte counter
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif

    mov r6, r0        ;save base address
    ; each loop = source bandwidth: 48 bytes
.xloops_onethird_sse4:
    ; 1st part horizontal loop: x48 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
    ;               xmm2: k K * j J * i I * h H * g G * f
    ;               xmm2: * p P * o O * n N * m M * l L *
    ;
    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    ;=> target:
    ;: P O N M L K J I H G F E D C B A
    ;: p o n m l k j i h g f e d c b a
    ;: P' ..                          A'
    ;: p' ..                          a'
    ;
    ; ('*' marks the third byte of each triple, which is discarded)

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;1st line
    movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
    movdqa xmm1, xmm0
%ifdef X86_32_PICASM
    ; PIC build: materialise the six pshufb masks on a 16-byte aligned stack
    ; area instead of loading them from the named constants.  r0 holds the
    ; original esp until it is restored after the last mask load below.
    ; Each mask is pushed highest dword first; resulting layout:
    ;   [esp+80] low_1   [esp+64] high_1
    ;   [esp+48] low_2   [esp+32] high_2
    ;   [esp+16] low_3   [esp]    high_3
    push   r0
    mov    r0, esp
    and    esp, 0xfffffff0
    push   0x80808080    ;shufb_mask_onethird_low_1
    push   0x80808080
    push   0x80800f0c
    push   0x09060300
    push   0x80808080    ;shufb_mask_onethird_high_1
    push   0x80808080
    push   0x8080800d
    push   0x0a070401
    push   0x80808080    ;shufb_mask_onethird_low_2
    push   0x800e0b08
    push   0x05028080
    push   0x80808080
    push   0x80808080    ;shufb_mask_onethird_high_2
    push   0x800f0c09
    push   0x06030080
    push   0x80808080
    push   0x0d0a0704    ;shufb_mask_onethird_low_3
    push   0x01808080
    push   0x80808080
    push   0x80808080
    push   0x0e0b0805    ;shufb_mask_onethird_high_3
    push   0x02808080
    push   0x80808080
    push   0x80808080
    movdqa xmm5, [esp+80]
    movdqa xmm6, [esp+64]
%else
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
%endif
    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

    movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
    movdqa xmm3, xmm2
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+48]
    movdqa xmm6, [esp+32]
%else
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
%endif
    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

    ; the shuffled pieces occupy disjoint byte lanes (the rest is zero),
    ; so the saturating add simply merges them
    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

    movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
    movdqa xmm3, xmm2
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+16]
    movdqa xmm6, [esp]
%else
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
%endif
    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
    pavgb xmm0, xmm1                            ;1st line average                -> xmm0

    ;2nd line
    movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    movdqa xmm3, xmm2
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+80]
    movdqa xmm6, [esp+64]
%else
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
%endif
    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3

    movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    movdqa xmm4, xmm1
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+48]
    movdqa xmm6, [esp+32]
%else
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
%endif
    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

    movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    movdqa xmm4, xmm1
%ifdef X86_32_PICASM
    movdqa xmm5, [esp+16]
    movdqa xmm6, [esp]
    ; last use of the on-stack masks: release them and restore r0
    mov    esp, r0
    pop    r0
%else
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
%endif
    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2

    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )

    ; write pDst
    movdqa [r0], xmm0                           ;write result in dst

    ; next SMB
    lea r2, [r2+48]                             ;current src address
    lea r0, [r0+16]                             ;current dst address

    sub r4, 48                                  ;xloops counter
    cmp r4, 0
    jg near .xloops_onethird_sse4

    ; r6 becomes -(destination bytes written in this row); three times that
    ; rewinds the source pointer to the start of the row.
    sub r6, r0                                  ;offset = base address - current address
    lea r2, [r2+2*r3]                           ;
    lea r2, [r2+r3]                             ;
    lea r2, [r2+2*r6]                           ;current line + 3 lines
    lea r2, [r2+r6]
    lea r0, [r0+r1]
    lea r0, [r0+r6]                             ;current dst line + 1 line

    dec r5
    jg near .yloops_onethird_sse4

    ; r0 now equals pDst + iDstStride * rows, the address saved into xmm7 above
    movdqa [r0], xmm7                           ;restore the tail for the unassigned size

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   Quarter-size downsampler (MMX registers + pshufw/pavgb).
;   For every group of 4 source rows only rows 0 and 1 are read, and from
;   every group of 4 source bytes only bytes 0 and 1 are used:
;       dst = avg( avg(row0[4x], row0[4x+1]), avg(row1[4x], row1[4x+1]) )
;   where avg(a, b) = (a + b + 1) >> 1 (pavgb).
;
;   Register roles after LOAD_6_PARA:
;       r0 = pDst, r1 = iDstStride, r2 = pSrc, r3 = iSrcStride,
;       r4 = remaining source width of the current row,
;       r5 = remaining output rows (iSrcHeight >> 2),
;       r6 = scratch (tail address, then row base address),
;       r12 (64-bit only) = saved iSrcWidth.
;
;   Each inner iteration consumes 32 source bytes and stores 8 destination
;   bytes, so a row may be written past its nominal width.  The 8 bytes at
;   pDst + (iSrcHeight >> 2) * iDstStride are saved in xmm7 on entry and
;   written back on exit.
;
;   xmm7 is bracketed by PUSH_XMM 8 / POP_XMM exactly like the _ssse3 and
;   _sse4 variants, because xmm6-xmm15 are callee-saved in the Microsoft
;   x64 calling convention.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8             ; xmm7 is used below; keep it intact for the caller
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ; keep iSrcWidth; r4 is consumed by the inner loop
%endif
    sar r5, $02            ; iSrcHeight >> 2 = number of output rows

    mov r6, r1             ; save the 8 bytes that follow the last output row
    imul r6, r5
    add r6, r0             ; r6 = pDst + iDstStride * (iSrcHeight >> 2)
    movq xmm7, [r6]

.yloops_quarter_sse:
%ifdef X86_32
    mov r4, arg5           ; reload iSrcWidth from the stack
%else
    mov r4, r12
%endif

    mov r6, r0             ; remember the start of the current dst row
    ; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse:
    ; 1st part horizontal loop: x16 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;
    ;=> target:
    ;: G E C A,
    ;:
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line
    movq mm1, [r2+8]       ; 1st pSrc line + 8
    movq mm2, [r2+r3]     ; 2nd pSrc line
    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8

    ; keep words 0 and 2 (bytes 0,1 and 4,5) in the low dword
    pshufw mm0, mm0, 0d8h    ; x X x X c C a A
    pshufw mm1, mm1, 0d8h    ; x X x X g G e E
    pshufw mm2, mm2, 0d8h    ; x X x X k K i I
    pshufw mm3, mm3, 0d8h    ; x X x X o O m M

    punpckldq mm0, mm1       ; g G e E c C a A
    punpckldq mm2, mm3       ; o O m M k K i I

    ; to handle mm0,mm2: separate even bytes (low dword) from odd bytes (high dword)
    pshufw mm4, mm0, 0d8h       ;g G c C e E a A
    pshufw mm5, mm4, 04eh       ;e E a A g G c C
    punpcklbw mm4, mm5          ;g e G E c a C A  -> mm4
    pshufw mm4, mm4, 0d8h       ;g e c a G E C A  -> mm4

    pshufw mm5, mm2, 0d8h       ;o O k K m M i I
    pshufw mm6, mm5, 04eh       ;m M i I o O k K
    punpcklbw mm5, mm6          ;o m O M k i K I
    pshufw mm5, mm5, 0d8h       ;o m k i O M K I  -> mm5

    ; to handle mm4, mm5 (mm6 only supplies don't-care upper dwords)
    movq mm0, mm4
    punpckldq mm0, mm6          ;x x x x G E C A
    punpckhdq mm4, mm6          ;x x x x g e c a

    movq mm1, mm5
    punpckldq mm1, mm6          ;x x x x O M K I
    punpckhdq mm5, mm6          ;x x x x o m k i

    ; horizontal average per line, then vertical average of the two lines
    pavgb mm0, mm4      ; (A+a+1)>>1, .., temp_row1
    pavgb mm1, mm5      ; (I+i+1)>>1, .., temp_row2
    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, kept until the 2nd half is done

    ; 2nd part horizontal loop: x16 bytes (same sequence on source bytes 16..31)
    movq mm1, [r2+16]      ; 1st pSrc line + 16
    movq mm2, [r2+24]      ; 1st pSrc line + 24
    movq mm3, [r2+r3+16]  ; 2nd pSrc line + 16
    movq mm4, [r2+r3+24]  ; 2nd pSrc line + 24

    pshufw mm1, mm1, 0d8h
    pshufw mm2, mm2, 0d8h
    pshufw mm3, mm3, 0d8h
    pshufw mm4, mm4, 0d8h

    punpckldq mm1, mm2
    punpckldq mm3, mm4

    ; to handle mm1, mm3
    pshufw mm4, mm1, 0d8h
    pshufw mm5, mm4, 04eh
    punpcklbw mm4, mm5
    pshufw mm4, mm4, 0d8h

    pshufw mm5, mm3, 0d8h
    pshufw mm6, mm5, 04eh
    punpcklbw mm5, mm6
    pshufw mm5, mm5, 0d8h

    ; to handle mm4, mm5
    movq mm2, mm4
    punpckldq mm2, mm6
    punpckhdq mm4, mm6

    movq mm3, mm5
    punpckldq mm3, mm6
    punpckhdq mm5, mm6

    ; horizontal average per line, then vertical average of the two lines
    pavgb mm2, mm4      ; temp_row1
    pavgb mm3, mm5      ; temp_row2
    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, second half done

    movd [r0  ], mm0       ; 4 output pixels from source bytes 0..15
    movd [r0+4], mm2       ; 4 output pixels from source bytes 16..31

    ; next 32 source bytes / 8 destination bytes
    lea r2, [r2+32]
    lea r0, [r0+8]

    sub r4, 32
    cmp r4, 0
    jg near .xloops_quarter_sse

    sub  r6, r0          ; r6 = row base - current dst = -(bytes written this row)
    ; next line
    lea r2, [r2+4*r3]    ; advance the source by 4 lines
    lea r2, [r2+4*r6]    ; rewind the source by 4 * (bytes written)
    lea r0, [r0+r1]
    lea r0, [r0+r6]      ; dst = start of the next dst row

    dec r5
    jg near .yloops_quarter_sse

    movq [r0], xmm7      ; restore the 8 bytes saved on entry

    WELSEMMS
%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_ssse3(   unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   Quarter-size downsampler using pshufb.  Of every 4 source rows only
;   rows 0 and 1 are read; of every 4 source bytes only bytes 0 and 1 are
;   used.  Output = pavgb of the two per-row horizontal pavgb results.
;   Source rows are read with movdqa, so every accessed source address
;   must be 16-byte aligned.
;
;   r0 = pDst, r1 = iDstStride, r2 = pSrc, r3 = iSrcStride,
;   r4 = remaining width of the current row, r5 = remaining output rows,
;   r6 = scratch, xmm6 = shuffle mask, xmm7 = saved destination tail.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                 ; r12 = iSrcWidth, reloaded for every row
%endif
    sar r5, 2                   ; output rows = iSrcHeight >> 2

    ; Preserve the 8 bytes located right after the last output row; the
    ; row loop may store beyond the nominal width.
    mov r6, r5
    imul r6, r1
    add r6, r0                  ; r6 = pDst + rows * iDstStride
    movq xmm7, [r6]

    ; xmm6 = byte-gather mask: bytes 0,4,8,12 -> dword 0, bytes 1,5,9,13 -> dword 2,
    ; dwords 1 and 3 are zeroed (0x80 selectors).
%ifdef X86_32_PICASM
    ; build the mask on a 16-byte aligned stack slot instead of reading it from memory
    push   r0
    mov    r0, esp
    and    esp, 0xfffffff0
    push   0x80808080
    push   0x0d090501
    push   0x80808080
    push   0x0c080400
    movdqa xmm6, [esp]
    mov    esp, r0
    pop    r0
%else
    movdqa xmm6, [shufb_mask_quarter]
%endif

.yloops_quarter_sse3:
%ifdef X86_32
    mov r4, arg5                ; width counter <- iSrcWidth
%else
    mov r4, r12                 ; width counter <- iSrcWidth
%endif
    mov r6, r0                  ; r6 = start of this destination row

    ; one pass = 32 source bytes from each of two lines -> 8 destination bytes
.xloops_quarter_sse3:
    ;               mem  hi<-       ->lo
    ; line 0:  xmm0: h H g G f F e E d D c C b B a A
    ;          xmm1: p P o O n N m M l L k K j J i I
    ; line 1:  xmm2, xmm3 laid out the same way
    movdqa xmm0, [r2]
    pshufb xmm0, xmm6           ; line 0: 0 0 0 0 g e c a 0 0 0 0 G E C A
    movdqa xmm1, [r2+16]
    pshufb xmm1, xmm6           ; line 0: 0 0 0 0 o m k i 0 0 0 0 O M K I
    movdqa xmm2, [r2+r3]
    pshufb xmm2, xmm6           ; line 1: 0 0 0 0 g e c a 0 0 0 0 G E C A
    movdqa xmm3, [r2+r3+16]
    pshufb xmm3, xmm6           ; line 1: 0 0 0 0 o m k i 0 0 0 0 O M K I

    ; line 0: gather even-position bytes in xmm0, odd-position bytes in xmm4
    movdqa xmm4, xmm0
    punpckldq xmm0, xmm1        ; 0 0 0 0 0 0 0 0 O M K I G E C A
    punpckhdq xmm4, xmm1        ; 0 0 0 0 0 0 0 0 o m k i g e c a
    pavgb xmm0, xmm4            ; horizontal average, line 0

    ; line 1: same split into xmm2 / xmm5
    movdqa xmm5, xmm2
    punpckldq xmm2, xmm3        ; 0 0 0 0 0 0 0 0 O M K I G E C A
    punpckhdq xmm5, xmm3        ; 0 0 0 0 0 0 0 0 o m k i g e c a
    pavgb xmm2, xmm5            ; horizontal average, line 1

    pavgb xmm0, xmm2            ; vertical average of the two lines
    movq [r0], xmm0             ; store 8 output pixels

    lea r2, [r2+32]             ; source  += 32
    lea r0, [r0+8]              ; dest    += 8

    sub r4, 32
    test r4, r4
    jg near .xloops_quarter_sse3

    ; step to the next output row
    sub r6, r0                  ; r6 = -(bytes written in this row)
    lea r2, [r2+4*r3]           ; source: 4 lines down ...
    lea r2, [r2+4*r6]           ; ... and back to the row start (4 * bytes written)
    lea r0, [r0+r1]             ; dest: one stride down ...
    lea r0, [r0+r6]             ; ... and back to the row start

    dec r5
    jg near .yloops_quarter_sse3

    movq [r0], xmm7             ; put the saved tail bytes back

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   Same algorithm as the pshufb-based quarter downsampler, but the source
;   is fetched with movntdqa.  Of every 4 source rows only rows 0 and 1 are
;   read; of every 4 source bytes only bytes 0 and 1 are used.  Every
;   accessed source address must be 16-byte aligned.
;
;   r0 = pDst, r1 = iDstStride, r2 = pSrc, r3 = iSrcStride,
;   r4 = remaining width of the current row, r5 = remaining output rows,
;   r6 = scratch, xmm6 = shuffle mask, xmm7 = saved destination tail.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                     ; r12 = iSrcWidth, reloaded for every row
%endif
    sar r5, 2                       ; output rows = iSrcHeight >> 2

    ; Preserve the 8 bytes located right after the last output row; the
    ; row loop may store beyond the nominal width.
    mov r6, r5
    imul r6, r1
    add r6, r0                      ; r6 = pDst + rows * iDstStride
    movq xmm7, [r6]

    ; xmm6 = byte-gather mask: bytes 0,4,8,12 -> dword 0, bytes 1,5,9,13 -> dword 2,
    ; dwords 1 and 3 are zeroed (0x80 selectors).
%ifdef X86_32_PICASM
    ; build the mask on a 16-byte aligned stack slot instead of reading it from memory
    push   r0
    mov    r0, esp
    and    esp, 0xfffffff0
    push   0x80808080
    push   0x0d090501
    push   0x80808080
    push   0x0c080400
    movdqa xmm6, [esp]
    mov    esp, r0
    pop    r0
%else
    movdqa xmm6, [shufb_mask_quarter]
%endif

.yloops_quarter_sse4:
%ifdef X86_32
    mov r4, arg5                    ; width counter <- iSrcWidth
%else
    mov r4, r12                     ; width counter <- iSrcWidth
%endif
    mov r6, r0                      ; r6 = start of this destination row

    ; one pass = 32 source bytes from each of two lines -> 8 destination bytes
.xloops_quarter_sse4:
    ;               mem  hi<-       ->lo
    ; line 0:  xmm0: h H g G f F e E d D c C b B a A
    ;          xmm1: p P o O n N m M l L k K j J i I
    ; line 1:  xmm2, xmm3 laid out the same way
    movntdqa xmm0, [r2]
    pshufb xmm0, xmm6               ; line 0: 0 0 0 0 g e c a 0 0 0 0 G E C A
    movntdqa xmm1, [r2+16]
    pshufb xmm1, xmm6               ; line 0: 0 0 0 0 o m k i 0 0 0 0 O M K I
    movntdqa xmm2, [r2+r3]
    pshufb xmm2, xmm6               ; line 1: 0 0 0 0 g e c a 0 0 0 0 G E C A
    movntdqa xmm3, [r2+r3+16]
    pshufb xmm3, xmm6               ; line 1: 0 0 0 0 o m k i 0 0 0 0 O M K I

    ; line 0: gather even-position bytes in xmm0, odd-position bytes in xmm4
    movdqa xmm4, xmm0
    punpckldq xmm0, xmm1            ; 0 0 0 0 0 0 0 0 O M K I G E C A
    punpckhdq xmm4, xmm1            ; 0 0 0 0 0 0 0 0 o m k i g e c a
    pavgb xmm0, xmm4                ; horizontal average, line 0

    ; line 1: same split into xmm2 / xmm5
    movdqa xmm5, xmm2
    punpckldq xmm2, xmm3            ; 0 0 0 0 0 0 0 0 O M K I G E C A
    punpckhdq xmm5, xmm3            ; 0 0 0 0 0 0 0 0 o m k i g e c a
    pavgb xmm2, xmm5                ; horizontal average, line 1

    pavgb xmm0, xmm2                ; vertical average of the two lines
    movq [r0], xmm0                 ; store 8 output pixels

    lea r2, [r2+32]                 ; source  += 32
    lea r0, [r0+8]                  ; dest    += 8

    sub r4, 32
    test r4, r4
    jg near .xloops_quarter_sse4

    ; step to the next output row
    sub r6, r0                      ; r6 = -(bytes written in this row)
    lea r2, [r2+4*r3]               ; source: 4 lines down ...
    lea r2, [r2+4*r6]               ; ... and back to the row start (4 * bytes written)
    lea r0, [r0+r1]                 ; dest: one stride down ...
    lea r0, [r0+r6]                 ; ... and back to the row start

    dec r5
    jg near .yloops_quarter_sse4

    movq [r0], xmm7                 ; put the saved tail bytes back

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
; Advances packed x positions whose fraction is a full unsigned 16-bit word.
; %1 (bytes) receives inc_int+1, then -1 again in every word lane whose
; fraction add did NOT wrap, i.e. the net integer step is inc_int + carry.
; All five operands must be distinct; %5 is clobbered.
%macro SSE2_BilinearIncXposuw 5
    paddb           %1, %3  ; optimistic step: integer part += inc_int + 1
    movdqa          %5, %2
    paddusw         %5, %4  ; saturating sum of old fraction and increment
    paddw           %2, %4  ; wrapping sum = new fraction
    pcmpeqw         %5, %2  ; all-ones where both sums agree (no carry)
    paddb           %1, %5  ; add 0xFF (= -1) to both bytes of carry-free lanes
%endmacro

; outl=%1 outh=%2 in=%3
; Expands 8 unsigned 16-bit fractions into interleaved weight pairs
; (~frac, frac): %1 gets the pairs for words 0..3, %2 for words 4..7.
; %1, %2 and %3 must be distinct; %3 is left unchanged.
%macro SSE2_UnpckXFracuw 3
    pcmpeqw         %1, %1  ; all ones
    pxor            %1, %3  ; %1 = bitwise complement of the fractions
    movdqa          %2, %1
    punpckhwd       %2, %3  ; upper four (~frac, frac) pairs
    punpcklwd       %1, %3  ; lower four (~frac, frac) pairs
%endmacro

; [in:xfrac out:xyfrac0]=%1 [out:xyfrac1]=%2 yfrac0=%3 yfrac1=%4
; Combines horizontal weights with the two vertical weights, keeping the
; high 16 bits of each unsigned 16x16 product:
;   %2 = (xfrac * yfrac1) >> 16,   %1 = (xfrac * yfrac0) >> 16
; %1 and %2 must be distinct registers.
%macro SSE2_BilinearFastCalcXYFrac 4
    movdqa          %2, %1  ; duplicate xfrac before %1 is overwritten
    pmulhuw         %2, %4  ; weights for the second row
    pmulhuw         %1, %3  ; weights for the first row
%endmacro

; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
; Converts eight 32-bit accumulators (%1 = first four, %2 = last four) into
; eight bytes in the low half of %1: each value is shifted right by 14,
; narrowed to a word, halved with rounding (pavgw against %3, which the
; callers supply as zero) and finally saturated to an unsigned byte.
; %2 is clobbered.
%macro SSE2_BilinearFastPackDwordsToBytes 3
    psrld           %2, 14
    psrld           %1, 14
    packssdw        %1, %2  ; 8 words: %1 dwords first, then %2 dwords
    pavgw           %1, %3  ; (x + zero + 1) >> 1
    packuswb        %1, %1  ; 8 bytes, duplicated in both halves
%endmacro

; Produces 8 destination pixels of one row with the "fast" bilinear filter.
; A single 16-byte load per source row supplies all taps for the 8 pixels
; (hence the 2x-or-less restriction in the name).
; Relies on symbols %define'd by the enclosing function: xmm_xpos_int,
; xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_yfrac0/1, xmm_0,
; xmm_tmp0..5, r_tmp0, i_xpos, i_scalex, p_src_row0/1, p_dst.
; Side effects: p_dst += 8, i_xpos += 8 * i_scalex, xmm_xpos_int/frac advanced.
%macro SSSE3_BilinearFastDownsample2xOrLess_8px 0
    ; make the byte offsets in xmm_xpos_int relative to its first entry
    movdqa          xmm_tmp0, xmm_xpos_int
    pshufb          xmm_tmp0, xmm_0                         ; broadcast byte 0 (all-zero selector)
    psubb           xmm_xpos_int, xmm_tmp0
    ; xmm_tmp0 / xmm_tmp1 = interleaved (~frac, frac) horizontal weights
    SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 8 * i_scalex]         ; advance the scalar position by 8 pixels
    shr             r_tmp0, 16                              ; integer source x of the first pixel
    ; first source row: gather the tap bytes and widen them to words
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    pshufb          xmm_tmp4, xmm_xpos_int
    movdqa          xmm_tmp5, xmm_tmp4
    punpcklbw       xmm_tmp4, xmm_0
    punpckhbw       xmm_tmp5, xmm_0
    ; fold the vertical weights into the horizontal ones:
    ; tmp0/tmp1 -> weights for the first row, tmp2/tmp3 -> weights for the second row
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp3, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp0, xmm_tmp4                      ; weighted pair sums, first row
    pmaddwd         xmm_tmp1, xmm_tmp5
    ; second source row: same gather with the same offsets
    lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
    pshufb          xmm_tmp4, xmm_xpos_int
    movdqa          xmm_tmp5, xmm_tmp4
    punpcklbw       xmm_tmp4, xmm_0
    punpckhbw       xmm_tmp5, xmm_0
    pmaddwd         xmm_tmp2, xmm_tmp4                      ; weighted pair sums, second row
    pmaddwd         xmm_tmp3, xmm_tmp5
    paddd           xmm_tmp0, xmm_tmp2                      ; combine both rows
    paddd           xmm_tmp1, xmm_tmp3
    ; scale, round and narrow the 8 dword sums to bytes, then store them
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    ; step the vector x positions to the next group of 8 pixels
    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
%endmacro

; Produces 8 destination pixels of one row with the "fast" bilinear filter,
; using two 16-byte loads per source row: one for pixels 0..3 (at i_xpos)
; and one for pixels 4..7 (at i_xpos + 4 * i_scalex).
; Relies on symbols %define'd by the enclosing function (xmm_xpos_*,
; xmm_yfrac0/1, xmm_0, xmm_tmp0..4, r_tmp0, i_xpos, i_scalex, p_src_row0/1,
; p_dst).
; Side effects: p_dst += 8, i_xpos += 8 * i_scalex, xmm_xpos_int/frac advanced.
; In X86_32_PICASM builds the constants are materialised on a temporarily
; re-aligned stack instead of being read from named data (presumably to avoid
; absolute data references in position-independent 32-bit code -- confirm).
%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
    ; make the byte offsets relative per half: bytes 0..7 to byte 0, bytes 8..15 to byte 8
    movdqa          xmm_tmp0, xmm_xpos_int
%ifdef X86_32_PICASM
    push            r0
    mov             r0, esp
    and             esp, 0xfffffff0
    push            0x08080808
    push            0x08080808
    push            0x00000000
    push            0x00000000
    pshufb          xmm_tmp0, [esp]
    mov             esp, r0
    pop             r0
%else
    pshufb          xmm_tmp0, [shufb_0000000088888888]
%endif
    psubb           xmm_xpos_int, xmm_tmp0
    ; xmm_tmp0 / xmm_tmp1 = interleaved (~frac, frac) weights for pixels 0..3 / 4..7
    SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
    ; ---- pixels 0..3 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                              ; integer source x of pixel 0
    lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
    ; build a pshufb selector from the low 8 offsets: each offset byte is
    ; followed by 0x80, so the shuffle yields zero-extended words directly
    movdqa          xmm_tmp2, xmm_xpos_int
%ifdef X86_32_PICASM
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x80808080    ;db80h_256
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    punpcklbw       xmm_tmp2, [esp]
    mov             esp, r5
    pop             r5
%else
    punpcklbw       xmm_tmp2, [db80h_256]
%endif
    pshufb          xmm_tmp3, xmm_tmp2
    pshufb          xmm_tmp4, xmm_tmp2
    ; xmm_tmp0 = weights for the first row, xmm_tmp2 = weights for the second row
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp0, xmm_tmp3
    pmaddwd         xmm_tmp2, xmm_tmp4
    paddd           xmm_tmp0, xmm_tmp2                      ; dword sums for pixels 0..3
    ; ---- pixels 4..7 ----
    lea             r_tmp0, [i_xpos + 4 * i_scalex]
    lea             i_xpos, [i_xpos + 8 * i_scalex]         ; advance the scalar position by 8 pixels
    shr             r_tmp0, 16                              ; integer source x of pixel 4
    lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
    ; selector from the high 8 offsets, again paired with 0x80 bytes
    movdqa          xmm_tmp2, xmm_xpos_int
%ifdef X86_32_PICASM
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x80808080    ;db80h_256
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    punpckhbw       xmm_tmp2, [esp]
    mov             esp, r5
    pop             r5
%else
    punpckhbw       xmm_tmp2, [db80h_256]
%endif
    pshufb          xmm_tmp3, xmm_tmp2
    pshufb          xmm_tmp4, xmm_tmp2
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp1, xmm_tmp3
    pmaddwd         xmm_tmp2, xmm_tmp4
    paddd           xmm_tmp1, xmm_tmp2                      ; dword sums for pixels 4..7
    ; scale, round and narrow the 8 dword sums to bytes, then store them
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    ; step the vector x positions to the next group of 8 pixels
    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
%endmacro

; Produces 8 destination pixels of one row with the "fast" bilinear filter
; for arbitrary scale factors.  Every output pixel fetches its own pair of
; adjacent source bytes from each row (movd for the first pixel of a group
; of four, pinsrw for the other three).
; Relies on symbols %define'd by the enclosing function: xmm_xfrac0,
; xmm_xfrac1, xmm_xfrac_inc, xmm_yfrac0/1, xmm_0, xmm_tmp0..4, r_tmp0,
; i_xpos, i_scalex, p_src_row0/1, p_dst.
; Side effects: p_dst += 8, i_xpos += 8 * i_scalex, xmm_xfrac0/1 += xmm_xfrac_inc.
%macro SSE2_GeneralBilinearFastDownsample_8px 0
    ; ---- pixels 0..3 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                              ; source x of pixel 0
    movd            xmm_tmp3, [p_src_row0 + r_tmp0]
    movd            xmm_tmp4, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16                              ; source x of pixel 1
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 1
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]         ; i_xpos now points at pixel 4
    shr             r_tmp0, 16                              ; source x of pixel 2
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 2
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex                        ; position of pixel 3 = pixel 4 - one step
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp3, xmm_0                         ; widen the 4 byte pairs to words
    punpcklbw       xmm_tmp4, xmm_0
    movdqa          xmm_tmp0, xmm_xfrac0
    ; xmm_tmp0 = weights for the first row, xmm_tmp2 = weights for the second row
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp0, xmm_tmp3
    pmaddwd         xmm_tmp2, xmm_tmp4
    paddd           xmm_tmp0, xmm_tmp2                      ; dword sums for pixels 0..3
    ; ---- pixels 4..7 (same sequence, weights taken from xmm_xfrac1) ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movd            xmm_tmp3, [p_src_row0 + r_tmp0]
    movd            xmm_tmp4, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 1
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]         ; i_xpos now points at the next group of 8
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 2
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp3, xmm_0
    punpcklbw       xmm_tmp4, xmm_0
    movdqa          xmm_tmp1, xmm_xfrac1
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp1, xmm_tmp3
    pmaddwd         xmm_tmp2, xmm_tmp4
    paddd           xmm_tmp1, xmm_tmp2                      ; dword sums for pixels 4..7
    ; scale, round and narrow the 8 dword sums to bytes, then store them
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    ; advance the horizontal weights for the next group of 8 pixels
    paddw           xmm_xfrac0, xmm_xfrac_inc
    paddw           xmm_xfrac1, xmm_xfrac_inc
%endmacro

; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
; Advances packed x positions whose fraction occupies the low 15 bits of
; each word.  A fraction sum that sets bit 15 counts as a carry: the byte
; positions in that word lane get an extra +1 and the fraction is masked
; back to 15 bits.  Operands must be distinct; %6 is clobbered.
%macro SSE2_BilinearIncXposw 6
    paddb           %1, %3  ; integer part += inc_int
    paddw           %2, %4  ; fraction += inc_frac (bit 15 = carry out)
    pxor            %6, %6
    pcmpgtw         %6, %2  ; all-ones where the sum is negative, i.e. carried
    pand            %2, %5  ; keep the 15-bit fraction
    psubb           %1, %6  ; subtracting 0xFF adds 1 to both bytes of carry lanes
%endmacro

; outl=%1 outh=%2 in=%3 7FFFh=%4
; Expands 8 fractions into interleaved weight pairs (frac ^ %4, frac):
; %1 gets the pairs for words 0..3, %2 for words 4..7.  With %4 = 7FFFh in
; every word the first weight is the 15-bit complement of the fraction.
; %1, %2 and %3 must be distinct; %3 is left unchanged.
%macro SSE2_UnpckXFracw 4
    movdqa          %1, %3
    pxor            %1, %4  ; complementary weight
    movdqa          %2, %1
    punpckhwd       %2, %3  ; upper four (complement, frac) pairs
    punpcklwd       %1, %3  ; lower four (complement, frac) pairs
%endmacro

; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
; For each of the four dword lanes computes
;     (data0 * frac0 + data1 * frac1) >> 29
; with 64-bit intermediate products (pmuludq only multiplies dwords 0 and 2,
; so the odd lanes are first swapped into even positions).
; Result in %1; %2, %3 and %6 are clobbered.  %1, %2, %3 and %6 must be
; distinct registers.
%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
    ; odd lanes (dwords 1 and 3)
    pshufd          %1, %2, 10110001b   ; swap neighbouring dwords of data0
    pmuludq         %1, %4
    pshufd          %6, %3, 10110001b   ; swap neighbouring dwords of data1
    pmuludq         %6, %5
    paddq           %1, %6              ; 64-bit sums of the odd lanes
    psllq           %1,  3              ; bits 29.. of each sum land in dwords 1 / 3
    ; even lanes (dwords 0 and 2)
    pmuludq         %2, %4
    pmuludq         %3, %5
    paddq           %2, %3              ; 64-bit sums of the even lanes
    psrlq           %2, 29              ; bits 29.. of each sum land in dwords 0 / 2
    blendps         %1, %2, 0101b       ; even dwords from %2, odd dwords stay from %1
%endmacro

; Produces 8 destination pixels of one row with the "accurate" bilinear
; filter.  A single 16-byte load per source row supplies all taps for the
; 8 pixels (hence the 2x-or-less restriction in the name).  Horizontal
; filtering uses 15-bit word weights; vertical filtering is done on dwords
; with 64-bit products.
; Relies on symbols %define'd by the enclosing function: xmm_xpos_int,
; xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff,
; xmm_yfrac0/1, xmm_0, xmm_tmp0..5, r_tmp0, i_xpos, i_scalex,
; p_src_row0/1, p_dst.
; Side effects: p_dst += 8, i_xpos += 8 * i_scalex, xmm_xpos_int/frac advanced.
%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
    ; make the byte offsets in xmm_xpos_int relative to its first entry
    movdqa          xmm_tmp0, xmm_xpos_int
    pshufb          xmm_tmp0, xmm_0                         ; broadcast byte 0 (all-zero selector)
    psubb           xmm_xpos_int, xmm_tmp0
    ; xmm_tmp0 / xmm_tmp1 = interleaved (frac ^ 7FFFh, frac) horizontal weights
    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 8 * i_scalex]         ; advance the scalar position by 8 pixels
    shr             r_tmp0, 16                              ; integer source x of the first pixel
    ; first source row: gather taps, widen to words, filter horizontally
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    pshufb          xmm_tmp4, xmm_xpos_int
    movdqa          xmm_tmp5, xmm_tmp4
    punpcklbw       xmm_tmp4, xmm_0
    punpckhbw       xmm_tmp5, xmm_0
    pmaddwd         xmm_tmp4, xmm_tmp0
    pmaddwd         xmm_tmp5, xmm_tmp1
    ; second source row: same gather and horizontal filter
    lddqu           xmm_tmp2, [p_src_row1 + r_tmp0]
    pshufb          xmm_tmp2, xmm_xpos_int
    movdqa          xmm_tmp3, xmm_tmp2
    punpcklbw       xmm_tmp2, xmm_0
    punpckhbw       xmm_tmp3, xmm_0
    pmaddwd         xmm_tmp2, xmm_tmp0
    pmaddwd         xmm_tmp3, xmm_tmp1
    ; vertical filter: (row0 * yfrac0 + row1 * yfrac1) >> 29 per dword
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    ; narrow to words, halve with rounding (pavgw against zero), narrow to bytes
    packssdw        xmm_tmp0, xmm_tmp1
    pavgw           xmm_tmp0, xmm_0
    packuswb        xmm_tmp0, xmm_tmp0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    ; step the vector x positions to the next group of 8 pixels
    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro

; Produces 8 destination pixels of one row with the "accurate" bilinear
; filter, using two 16-byte loads per source row: one for pixels 0..3 (at
; i_xpos) and one for pixels 4..7 (at i_xpos + 4 * i_scalex).
; Relies on symbols %define'd by the enclosing function (xmm_xpos_*,
; xmm_7fff, xmm_yfrac0/1, xmm_0, xmm_tmp0..4, r_tmp0, i_xpos, i_scalex,
; p_src_row0/1, p_dst).
; Side effects: p_dst += 8, i_xpos += 8 * i_scalex, xmm_xpos_int/frac advanced.
; In X86_32_PICASM builds the constants are materialised on a temporarily
; re-aligned stack instead of being read from named data (presumably to avoid
; absolute data references in position-independent 32-bit code -- confirm).
%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
    ; make the byte offsets relative per half: bytes 0..7 to byte 0, bytes 8..15 to byte 8
    movdqa          xmm_tmp0, xmm_xpos_int
%ifdef X86_32_PICASM
    push            r0
    mov             r0, esp
    and             esp, 0xfffffff0
    push            0x08080808
    push            0x08080808
    push            0x00000000
    push            0x00000000
    pshufb          xmm_tmp0, [esp]
    mov             esp, r0
    pop             r0
%else
    pshufb          xmm_tmp0, [shufb_0000000088888888]
%endif
    psubb           xmm_xpos_int, xmm_tmp0
    ; xmm_tmp0 / xmm_tmp1 = interleaved (frac ^ 7FFFh, frac) weights for pixels 0..3 / 4..7
    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
    ; ---- pixels 0..3 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                              ; integer source x of pixel 0
    ; pshufb selector from the low 8 offsets: each offset byte is followed
    ; by 0x80, so the shuffle yields zero-extended words directly
    movdqa          xmm_tmp3, xmm_xpos_int
%ifdef X86_32_PICASM
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x80808080    ;db80h_256
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    punpcklbw       xmm_tmp3, [esp]
    mov             esp, r5
    pop             r5
%else
    punpcklbw       xmm_tmp3, [db80h_256]
%endif
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp2, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex]
    lea             i_xpos, [i_xpos + 8 * i_scalex]         ; advance the scalar position by 8 pixels
    shr             r_tmp0, 16                              ; integer source x of pixel 4 (used below)
    pshufb          xmm_tmp4, xmm_tmp3
    pshufb          xmm_tmp2, xmm_tmp3
    pmaddwd         xmm_tmp4, xmm_tmp0                      ; horizontal filter, first row
    pmaddwd         xmm_tmp2, xmm_tmp0                      ; horizontal filter, second row
    ; vertical filter: (row0 * yfrac0 + row1 * yfrac1) >> 29 per dword
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
    ; ---- pixels 4..7 ----
    ; selector from the high 8 offsets, again paired with 0x80 bytes
    movdqa          xmm_tmp2, xmm_xpos_int
%ifdef X86_32_PICASM
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x80808080    ;db80h_256
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    punpckhbw       xmm_tmp2, [esp]
    mov             esp, r5
    pop             r5
%else
    punpckhbw       xmm_tmp2, [db80h_256]
%endif
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp3, [p_src_row1 + r_tmp0]
    pshufb          xmm_tmp4, xmm_tmp2
    pshufb          xmm_tmp3, xmm_tmp2
    pmaddwd         xmm_tmp4, xmm_tmp1                      ; horizontal filter, first row
    pmaddwd         xmm_tmp3, xmm_tmp1                      ; horizontal filter, second row
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    ; narrow to words, halve with rounding (pavgw against zero), narrow to bytes
    packssdw        xmm_tmp0, xmm_tmp1
    pavgw           xmm_tmp0, xmm_0
    packuswb        xmm_tmp0, xmm_tmp0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    ; step the vector x positions to the next group of 8 pixels
    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro

; Produces 8 destination pixels of one row with the "accurate" bilinear
; filter for arbitrary scale factors.  Every output pixel fetches its own
; pair of adjacent source bytes from each row (movd for the first pixel of
; a group of four, pinsrw for the other three).
; Relies on symbols %define'd by the enclosing function: xmm_xfrac0,
; xmm_xfrac1, xmm_xfrac_inc, xmm_7fff, xmm_yfrac0/1, xmm_0, xmm_tmp0..4,
; r_tmp0, i_xpos, i_scalex, p_src_row0/1, p_dst.
; Side effects: p_dst += 8, i_xpos += 8 * i_scalex, xmm_xfrac0/1 advanced
; by xmm_xfrac_inc and masked with xmm_7fff.
%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
    ; ---- pixels 0..3 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                              ; source x of pixel 0
    movd            xmm_tmp4, [p_src_row0 + r_tmp0]
    movd            xmm_tmp2, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16                              ; source x of pixel 1
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1
    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]         ; i_xpos now points at pixel 4
    shr             r_tmp0, 16                              ; source x of pixel 2
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2
    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex                        ; position of pixel 3 = pixel 4 - one step
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp4, xmm_0                         ; widen the 4 byte pairs to words
    punpcklbw       xmm_tmp2, xmm_0
    pmaddwd         xmm_tmp4, xmm_xfrac0                    ; horizontal filter, first row
    pmaddwd         xmm_tmp2, xmm_xfrac0                    ; horizontal filter, second row
    ; vertical filter: (row0 * yfrac0 + row1 * yfrac1) >> 29 per dword
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
    ; ---- pixels 4..7 (same sequence, weights taken from xmm_xfrac1) ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movd            xmm_tmp4, [p_src_row0 + r_tmp0]
    movd            xmm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1
    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]         ; i_xpos now points at the next group of 8
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2
    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp4, xmm_0
    punpcklbw       xmm_tmp3, xmm_0
    pmaddwd         xmm_tmp4, xmm_xfrac1
    pmaddwd         xmm_tmp3, xmm_xfrac1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    ; narrow to words, halve with rounding (pavgw against zero), narrow to bytes
    packssdw        xmm_tmp0, xmm_tmp1
    pavgw           xmm_tmp0, xmm_0
    packuswb        xmm_tmp0, xmm_tmp0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    ; advance the horizontal weights and keep them within 15 bits
    paddw           xmm_xfrac0, xmm_xfrac_inc
    paddw           xmm_xfrac1, xmm_xfrac_inc
    pand            xmm_xfrac0, xmm_7fff
    pand            xmm_xfrac1, xmm_7fff
%endmacro

; downsample_8px_macro=%1 b_fast=%2
; Row/column driver shared by the general bilinear downsamplers.
;   %1 = name of a zero-argument macro that emits 8 destination pixels and
;        advances p_dst / i_xpos (one of the *_8px macros).
;   %2 = non-zero for the "fast" variants (vertical weights as 15-bit words),
;        zero for the "accurate" variants (vertical weights as 15-bit dwords).
; For every destination row it selects the two source rows from i_ypos,
; derives the vertical weights, runs %1 across the row in steps of 8 pixels,
; and finally overwrites the last pixel of the row with a plain copy of one
; source byte from the first source row.
; All operands are symbols %define'd by the enclosing function; several of
; them may be memory operands on X86_32.
%macro SSE2_GeneralBilinearDownsampler_loop 2
%%height:
    ; p_src_row0 = p_src + (i_ypos >> 15) * i_src_stride, p_src_row1 = next line
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             p_src_row1, p_src_row0
    add             p_src_row1, i_src_stride
    ; vertical fraction = low 15 bits of i_ypos
    movd            xmm_tmp1, i_yposd
%if %2
    pshuflw         xmm_tmp1, xmm_tmp1, 0       ; replicate the low word ...
    psllw           xmm_tmp1, 1                 ; ... and clear bit 15 of each word
    psrlw           xmm_tmp1, 1
%else
    pslld           xmm_tmp1, 17                ; keep only the low 15 bits of the dword
    psrld           xmm_tmp1, 17
%endif
%ifdef X86_32
    ; xmm_yfrac0/1 may be memory here, so the weights are built in temporaries first
    pshufd          xmm_tmp1, xmm_tmp1, 0       ; broadcast -> yfrac1
    pcmpeqw         xmm_tmp0, xmm_tmp0
%if %2
    psrlw           xmm_tmp0, 1                 ; 7FFFh in every word
%else
    psrld           xmm_tmp0, 17                ; 7FFFh in every dword
%endif
    pxor            xmm_tmp0, xmm_tmp1          ; yfrac0 = 7FFFh ^ yfrac1
    movdqa          xmm_yfrac0, xmm_tmp0
    movdqa          xmm_yfrac1, xmm_tmp1
%else
    pshufd          xmm_yfrac1, xmm_tmp1, 0     ; broadcast -> yfrac1
    pcmpeqw         xmm_yfrac0, xmm_yfrac0
%if %2
    psrlw           xmm_yfrac0, 1               ; 7FFFh in every word
%else
    psrld           xmm_yfrac0, 17              ; 7FFFh in every dword
%endif
    pxor            xmm_yfrac0, xmm_yfrac1      ; yfrac0 = 7FFFh ^ yfrac1
%endif

    ; start of row: x position = 1 << 15, counter = i_dst_width - 1
    ; (the last pixel of the row is written separately below)
    mov             i_xpos, 1 << 15
    mov             i_width_cnt, i_dst_width
    sub             i_width_cnt, 1

    ; reset the per-row vector x state from the precomputed start values
%ifdef xmm_xpos_int
    movdqa          xmm_xpos_int, xmm_xpos_int_begin
    movdqa          xmm_xpos_frac, xmm_xpos_frac_begin
%else
    movdqa          xmm_xfrac0, xmm_xfrac0_begin
    movdqa          xmm_xfrac1, xmm_xfrac1_begin
%endif

%%width:
    %1                                          ; emit 8 pixels
    sub             i_width_cnt, 8
    jg              %%width

    ; i_width_cnt is now <= 0: the loop overshot by -i_width_cnt pixels.
    ; Move p_dst to one past the last pixel of the row and pull i_xpos back
    ; by the same number of steps.
    lea             p_dst, [p_dst + i_width_cnt + 1]
    imul            i_width_cnt, i_scalex
    add             i_xpos, i_width_cnt
    shr             i_xpos, 16
    ; last pixel of the row: copied unfiltered from the first source row
    movzx           r_tmp0, byte [p_src_row0 + i_xpos]
    mov             [p_dst - 1], r_tmp0b
    ; next destination row
%ifdef X86_32
    mov             r_tmp0, i_scaleyd
    add             i_yposd, r_tmp0
%else
    add             i_yposd, i_scaleyd
%endif
    add             p_dst, i_dst_stride_less_width
    sub             i_dst_height, 1
    jg              %%height
%endmacro

;**************************************************************************************************************
;void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;    uint32_t uiScaleY);
;
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_ssse3
    %assign push_num 0
%ifndef X86_32
    push            r12
    push            r13
    push            rbx
    push            rbp
    %assign push_num 4
%ifdef WIN64
    push            rdi
    push            rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    ZERO_EXTENSION  r6d
    sub             r1, r2                                            ; dst_stride - dst_width
%ifdef X86_32
    movd            xmm0, arg8
    movd            xmm1, esp
    and             esp, -16
    sub             esp, 8 * 4 + 7 * 16
    movd            [esp], xmm1
    %define p_dst                   r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width             [esp + 2 * 4]
    %define i_dst_height            dword [esp + 3 * 4]
    %define p_src                   [esp + 4 * 4]
    %define i_src_stride            [esp + 5 * 4]
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               [esp + 6 * 4]
    %define i_xpos                  r2
    %define i_ypos                  dword [esp + 7 * 4]
    %define i_yposd                 dword [esp + 7 * 4]
    %define p_src_row0              r3
    %define p_src_row1              r4
    %define i_width_cnt             r5
    %define r_tmp0                  r1
    %define r_tmp0b                 r1b
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       [esp + 8 * 4]
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        [esp + 8 * 4 + 1 * 16]
    %define xmm_yfrac0              [esp + 8 * 4 + 2 * 16]
    %define xmm_yfrac1              [esp + 8 * 4 + 3 * 16]
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm0
    %define xmm_tmp2                xmm2
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_0                   [esp + 8 * 4 + 4 * 16]
    %define xmm_xpos_int_begin      [esp + 8 * 4 + 5 * 16]
    %define xmm_xpos_frac_begin     [esp + 8 * 4 + 6 * 16]
    mov             i_dst_stride_less_width, r1
    mov             i_dst_width, r2
    mov             i_dst_height, r3
    mov             p_src, r4
    mov             i_src_stride, r5
    movd            i_scaleyd, xmm0
    pxor            xmm_tmp0, xmm_tmp0
    movdqa          xmm_0, xmm_tmp0
%else
    %define p_dst                   r0
    %define i_dst_stride_less_width r1
    %define i_dst_width             r2
    %define i_dst_height            r3
    %define p_src                   r4
    %define i_src_stride            r5
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               dword arg8d
    %define i_xpos                  r12
    %define i_ypos                  r13
    %define i_yposd                 r13d
    %define p_src_row0              rbp
%ifdef WIN64
    %define p_src_row1              rsi
    %define i_width_cnt             rdi
%else
    %define p_src_row1              r11
    %define i_width_cnt             rax
%endif
    %define r_tmp0                  rbx
    %define r_tmp0b                 bl
    %define xmm_0                   xmm0
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       xmm8
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        xmm10
    %define xmm_yfrac0              xmm11
    %define xmm_yfrac1              xmm12
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm2
    %define xmm_tmp2                xmm9
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_xpos_int_begin      xmm14
    %define xmm_xpos_frac_begin     xmm15
    pxor            xmm_0, xmm_0
%endif

    sub             i_dst_height, 1
    je              .final_row
    jl              .done

    mov             i_ypos, 1 << 14
    movd            xmm_xpos_frac, i_scalexd
    pshufd          xmm_xpos_frac, xmm_xpos_frac, 0
    movdqa          xmm_tmp0, xmm_xpos_frac
    pslld           xmm_tmp0, 2
    pslldq          xmm_xpos_frac, 4
    paddd           xmm_tmp0, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_xpos_frac
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    pcmpeqw         xmm_tmp1, xmm_tmp1
    psrld           xmm_tmp1, 31
    pslld           xmm_tmp1, 15
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    movdqa          xmm_xpos_int, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_tmp0
    psrld           xmm_xpos_int, 16
    psrld           xmm_tmp1, 16
    packssdw        xmm_xpos_int, xmm_tmp1
    packuswb        xmm_xpos_int, xmm_xpos_int
    movdqa          xmm_tmp1, xmm_xpos_int
    pcmpeqw         xmm_tmp2, xmm_tmp2
    psubb           xmm_tmp1, xmm_tmp2
    punpcklbw       xmm_xpos_int, xmm_tmp1
    pslld           xmm_xpos_frac, 16
    pslld           xmm_tmp0, 16
    psrad           xmm_xpos_frac, 16
    psrad           xmm_tmp0, 16
    packssdw        xmm_xpos_frac, xmm_tmp0
    movd            xmm_tmp0, i_scalexd
    pslld           xmm_tmp0, 3
    movdqa          xmm_tmp1, xmm_tmp0
    punpcklwd       xmm_tmp0, xmm_tmp0
    pshufd          xmm_tmp0, xmm_tmp0, 0
    movdqa          xmm_xpos_frac_inc, xmm_tmp0
    psrld           xmm_tmp1, 16
    psubw           xmm_tmp1, xmm_tmp2
    pxor            xmm_tmp2, xmm_tmp2
    pshufb          xmm_tmp1, xmm_tmp2
    movdqa          xmm_xpos_int_inc, xmm_tmp1
    movdqa          xmm_xpos_int_begin, xmm_xpos_int
    movdqa          xmm_xpos_frac_begin, xmm_xpos_frac

    cmp             i_scalex, 4 << 16
    ja              .scalex_above4
    cmp             i_scalex, 2 << 16
    ja              .scalex_above2_beloweq4
    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample2xOrLess_8px, 1
    jmp             .final_row
%ifdef X86_32
    %undef xmm_yfrac0
    %xdefine xmm_yfrac0 xmm_tmp5
    %undef xmm_tmp5
%endif
.scalex_above2_beloweq4:
    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample4xOrLess_8px, 1
    jmp             .final_row
.scalex_above4:
%xdefine xmm_xfrac0 xmm_xpos_frac
%xdefine xmm_xfrac1 xmm_xpos_int
%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_frac
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xpos_int_inc
%undef xmm_xpos_frac_inc
    SSE2_UnpckXFracuw xmm_tmp0, xmm_xfrac1, xmm_xfrac0
    movdqa          xmm_xfrac0, xmm_tmp0
    movdqa          xmm_xfrac0_begin, xmm_xfrac0
    movdqa          xmm_xfrac1_begin, xmm_xfrac1
    pcmpeqw         xmm_tmp0, xmm_tmp0
    pmullw          xmm_tmp0, xmm_xfrac_inc
    punpcklwd       xmm_tmp0, xmm_xfrac_inc
    movdqa          xmm_xfrac_inc, xmm_tmp0
    SSE2_GeneralBilinearDownsampler_loop SSE2_GeneralBilinearFastDownsample_8px, 1

.final_row:
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             i_xpos, 1 << 15
    mov             i_width_cnt, i_dst_width

.final_row_width:
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
    mov             [p_dst], r_tmp0b
    add             p_dst, 1
    add             i_xpos, i_scalex
    sub             i_width_cnt, 1
    jg              .final_row_width

.done:
%ifdef X86_32
    mov             esp, [esp]
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop             rsi
    pop             rdi
%endif
    pop             rbp
    pop             rbx
    pop             r13
    pop             r12
%endif
    ret
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef xmm_0
%undef xmm_xpos_frac
%undef xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_int_inc
%undef xmm_yfrac0
%undef xmm_yfrac1
%undef xmm_tmp0
%undef xmm_tmp1
%undef xmm_tmp2
%undef xmm_tmp3
%undef xmm_tmp4
%undef xmm_tmp5
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xfrac0
%undef xmm_xfrac1
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc

;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;    uint32_t uiScaleY);
;
; Bilinear downsampler ("accurate" variant, 15-bit x fractions).
;  * All rows but the last are produced by SSE2_GeneralBilinearDownsampler_loop, instantiated with one of
;    three 8-pixel kernels chosen from the horizontal scale factor.
;  * The last row is produced by a scalar loop that copies one source byte per output byte (no blending).
;  * iDstHeight < 1 returns without writing anything.
; Register and stack roles are given by the %define lists below; on 32-bit targets most of the state lives
; in a 16-byte aligned scratch area on the stack.
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
    %assign push_num 0
%ifndef X86_32
    ; 64-bit: save the callee-saved registers used below (restored in reverse order at .done).
    push            r12
    push            r13
    push            rbx
    push            rbp
    %assign push_num 4
%ifdef WIN64
    push            rdi
    push            rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    ZERO_EXTENSION  r6d
    sub             r1, r2                                            ; dst_stride - dst_width
    add             r6, r6                                            ; 2 * scalex
%ifdef X86_32
    ; 32-bit: fetch the 8th argument before esp moves, then build an aligned scratch area of
    ; 8 dwords + 8 xmm-sized slots. The original esp is kept at [esp] and restored at .done.
    movd            xmm0, arg8
    movd            xmm1, esp
    and             esp, -16
    sub             esp, 8 * 4 + 8 * 16
    movd            [esp], xmm1
    %define p_dst                   r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width             [esp + 2 * 4]
    %define i_dst_height            dword [esp + 3 * 4]
    %define p_src                   [esp + 4 * 4]
    %define i_src_stride            [esp + 5 * 4]
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               [esp + 6 * 4]
    %define i_xpos                  r2
    %define i_ypos                  dword [esp + 7 * 4]
    %define i_yposd                 dword [esp + 7 * 4]
    %define p_src_row0              r3
    %define p_src_row1              r4
    %define i_width_cnt             r5
    %define r_tmp0                  r1
    %define r_tmp0b                 r1b
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       [esp + 8 * 4]
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        [esp + 8 * 4 + 1 * 16]
    %define xmm_yfrac0              [esp + 8 * 4 + 2 * 16]
    %define xmm_yfrac1              [esp + 8 * 4 + 3 * 16]
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm0
    %define xmm_tmp2                xmm2
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_0                   [esp + 8 * 4 + 4 * 16]
    %define xmm_7fff                [esp + 8 * 4 + 5 * 16]
    %define xmm_xpos_int_begin      [esp + 8 * 4 + 6 * 16]
    %define xmm_xpos_frac_begin     [esp + 8 * 4 + 7 * 16]
    ; Spill the arguments whose registers are reused as scratch below.
    mov             i_dst_stride_less_width, r1
    mov             i_dst_width, r2
    mov             i_dst_height, r3
    mov             p_src, r4
    mov             i_src_stride, r5
    movd            i_scaleyd, xmm0
    ; Constants: all-zero vector and 7FFFh in every word.
    pxor            xmm_tmp5, xmm_tmp5
    movdqa          xmm_0, xmm_tmp5
    pcmpeqw         xmm_tmp5, xmm_tmp5
    psrlw           xmm_tmp5, 1
    movdqa          xmm_7fff, xmm_tmp5
%else
    %define p_dst                   r0
    %define i_dst_stride_less_width r1
    %define i_dst_width             r2
    %define i_dst_height            r3
    %define p_src                   r4
    %define i_src_stride            r5
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               dword arg8d
    %define i_xpos                  r12
    %define i_ypos                  r13
    %define i_yposd                 r13d
    %define p_src_row0              rbp
%ifdef WIN64
    %define p_src_row1              rsi
    %define i_width_cnt             rdi
%else
    %define p_src_row1              r11
    %define i_width_cnt             rax
%endif
    %define r_tmp0                  rbx
    %define r_tmp0b                 bl
    %define xmm_0                   xmm0
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       xmm8
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        xmm10
    %define xmm_yfrac0              xmm11
    %define xmm_yfrac1              xmm12
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm2
    %define xmm_tmp2                xmm9
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_7fff                xmm13
    %define xmm_xpos_int_begin      xmm14
    %define xmm_xpos_frac_begin     xmm15
    ; Constants: all-zero vector and 7FFFh in every word.
    pxor            xmm_0, xmm_0
    pcmpeqw         xmm_7fff, xmm_7fff
    psrlw           xmm_7fff, 1
%endif

    ; Initial y position. This must be set before the height checks: .final_row reads i_ypos
    ; and is entered directly when iDstHeight == 1, which previously left it uninitialised.
    mov             i_ypos, 1 << 14
    sub             i_dst_height, 1
    je              .final_row                                        ; exactly one row: only the final-row copy
    jl              .done                                             ; iDstHeight < 1: nothing to do

    ; Build the x positions of the first 8 output pixels as dwords:
    ; xmm_xpos_frac = {0,1,2,3} * i_scalex, xmm_tmp0 = {4,5,6,7} * i_scalex.
    movd            xmm_xpos_frac, i_scalexd
    pshufd          xmm_xpos_frac, xmm_xpos_frac, 0
    movdqa          xmm_tmp0, xmm_xpos_frac
    pslld           xmm_tmp0, 2
    pslldq          xmm_xpos_frac, 4
    paddd           xmm_tmp0, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_xpos_frac
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    ; Add the initial offset 1 << 15 to every position.
    pcmpeqw         xmm_tmp1, xmm_tmp1
    psrld           xmm_tmp1, 31
    pslld           xmm_tmp1, 15
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    ; Integer parts (position >> 16) packed to bytes, then interleaved with (integer part + 1)
    ; so every pixel gets the byte pair (n, n + 1).
    movdqa          xmm_xpos_int, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_tmp0
    psrld           xmm_xpos_int, 16
    psrld           xmm_tmp1, 16
    packssdw        xmm_xpos_int, xmm_tmp1
    packuswb        xmm_xpos_int, xmm_xpos_int
    movdqa          xmm_tmp1, xmm_xpos_int
    pcmpeqw         xmm_tmp2, xmm_tmp2
    psubb           xmm_tmp1, xmm_tmp2                                ; x - (-1) = x + 1
    punpcklbw       xmm_xpos_int, xmm_tmp1
    ; Fractional parts: keep the low 16 bits of each position as words (the shift pair
    ; sign-extends so the signed pack does not saturate), then halve to 15 bits.
    pslld           xmm_xpos_frac, 16
    pslld           xmm_tmp0, 16
    psrad           xmm_xpos_frac, 16
    psrad           xmm_tmp0, 16
    packssdw        xmm_xpos_frac, xmm_tmp0
    psrlw           xmm_xpos_frac, 1
    ; Per-iteration increments for an 8-pixel step (8 * i_scalex):
    ; fraction = low word >> 1 in every word, integer = low byte of the high word in every byte.
    movd            xmm_tmp0, i_scalexd
    pslld           xmm_tmp0, 3
    movdqa          xmm_tmp1, xmm_tmp0
    punpcklwd       xmm_tmp0, xmm_tmp0
    pshufd          xmm_tmp0, xmm_tmp0, 0
    psrlw           xmm_tmp0, 1
    movdqa          xmm_xpos_frac_inc, xmm_tmp0
    psrld           xmm_tmp1, 16
    pxor            xmm_tmp2, xmm_tmp2
    pshufb          xmm_tmp1, xmm_tmp2                                ; zero mask: broadcast byte 0
    movdqa          xmm_xpos_int_inc, xmm_tmp1
    ; Keep the start-of-row values in the *_begin slots.
    movdqa          xmm_xpos_int_begin, xmm_xpos_int
    movdqa          xmm_xpos_frac_begin, xmm_xpos_frac

    ; Pick the 8-pixel kernel from the horizontal step (i_scalex holds 2 * uiScaleX here);
    ; the comparisons are unsigned.
    cmp             i_scalex, 4 << 16
    ja              .scalex_above4
    cmp             i_scalex, 2 << 16
    ja              .scalex_above2_beloweq4
    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0
    jmp             .final_row
%ifdef X86_32
    ; 32-bit only: from here on xmm_yfrac0 lives in the register that was xmm_tmp5,
    ; and xmm_tmp5 is no longer defined.
    %undef xmm_yfrac0
    %xdefine xmm_yfrac0 xmm_tmp5
    %undef xmm_tmp5
%endif
.scalex_above2_beloweq4:
    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0
    jmp             .final_row
.scalex_above4:
; The general kernel uses two registers of x weights instead of integer/fraction positions.
; The xfrac names take over the storage of the xpos names, which are then undefined.
%xdefine xmm_xfrac0 xmm_xpos_frac
%xdefine xmm_xfrac1 xmm_xpos_int
%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_frac
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xpos_int_inc
%undef xmm_xpos_frac_inc
    ; NOTE(review): SSE2_UnpckXFracw is defined outside this section; presumably it expands the
    ; fractions in xmm_xfrac0 into weight pairs in xmm_tmp0 / xmm_xfrac1 - confirm against the macro.
    SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff
    movdqa          xmm_xfrac0, xmm_tmp0
    movdqa          xmm_xfrac0_begin, xmm_xfrac0
    movdqa          xmm_xfrac1_begin, xmm_xfrac1
    ; Turn the increment into (-inc, +inc) word pairs.
    pcmpeqw         xmm_tmp0, xmm_tmp0
    pmullw          xmm_tmp0, xmm_xfrac_inc
    punpcklwd       xmm_tmp0, xmm_xfrac_inc
    movdqa          xmm_xfrac_inc, xmm_tmp0
    SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0

.final_row:
    ; Last output row: p_src_row0 = p_src + (i_ypos >> 15) * i_src_stride, then copy
    ; source byte (i_xpos >> 16) for each output pixel, starting at i_xpos = 1 << 15.
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             i_xpos, 1 << 15
    mov             i_width_cnt, i_dst_width

.final_row_width:
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
    mov             [p_dst], r_tmp0b
    add             p_dst, 1
    add             i_xpos, i_scalex
    sub             i_width_cnt, 1
    jg              .final_row_width

.done:
%ifdef X86_32
    mov             esp, [esp]                                        ; restore the esp saved in the prologue
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop             rsi
    pop             rdi
%endif
    pop             rbp
    pop             rbx
    pop             r13
    pop             r12
%endif
    ret
; Drop the function-local names so later code can define its own.
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef xmm_0
%undef xmm_xpos_frac
%undef xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_int_inc
%undef xmm_yfrac0
%undef xmm_yfrac1
%undef xmm_tmp0
%undef xmm_tmp1
%undef xmm_tmp2
%undef xmm_tmp3
%undef xmm_tmp4
%undef xmm_tmp5
%undef xmm_7fff
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xfrac0
%undef xmm_xfrac1
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc

%ifdef HAVE_AVX2
; AVX2_BilinearIncXposuw: advance packed x positions by one step, with the fraction
; held as full unsigned 16-bit words and the integer part as bytes.
; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
; %1 and %2 are updated in place; %5 is clobbered.
%macro AVX2_BilinearIncXposuw 5
    vpaddusw        %5, %2, %4  ; saturating sum; differs from the wrapping sum only on overflow
    vpaddw          %2, %2, %4  ; wrapping sum is the new fraction
    vpcmpeqw        %5, %5, %2  ; FFFFh where the word did not overflow, 0 where it carried
    vpaddb          %1, %1, %3
    vpaddb          %1, %1, %5  ; subtract 1 if no carry
%endmacro

; AVX2_UnpckXFrac: expand each fraction word into the word pair (in XOR mask, in).
; outl=%1 outh=%2 in=%3 FFFFh/7FFFh=%4
; outl receives the pairs built from the low four words of each 128-bit lane, outh those
; from the high four words. %1 must differ from %3, because %3 is read after %1 is written.
%macro AVX2_UnpckXFrac 4
    vpxor           %1, %3, %4  ; complement of the fraction within the mask bits
    vpunpckhwd      %2, %1, %3
    vpunpcklwd      %1, %1, %3
%endmacro

; AVX2_BilinearFastCalcXYFrac: multiply the x weights by each of the two y weights,
; keeping the high 16 bits of every unsigned 16x16 product.
; out0=%1 out1=%2 xfrac=%3 yfrac0=%4 yfrac1=%5
; out1 is written first, so out0 may be the same register as xfrac (callers rely on this).
%macro AVX2_BilinearFastCalcXYFrac 5
    vpmulhuw        %2, %3, %5
    vpmulhuw        %1, %3, %4
%endmacro

; AVX2_BilinearFastPackDwordsToBytes: reduce two registers of dword sums to bytes,
; computing ((x >> 14) + 1) >> 1 for each dword.
; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
; %2 is clobbered. The packs work per 128-bit lane, and the final pack uses %1 for both
; halves, so each lane ends up with its 8 result bytes repeated twice.
%macro AVX2_BilinearFastPackDwordsToBytes 3
    vpsrld          %1, %1, 14
    vpsrld          %2, %2, 14
    vpackssdw       %1, %1, %2
    vpavgw          %1, %1, %3  ; average with zero = (x + 1) >> 1
    vpackuswb       %1, %1, %1
%endmacro

; AVX2_BilinearFastDownsample2xOrLess_16px: produce 16 output pixels from two source rows.
; Operates on names defined by the enclosing function (ymm_xpos_int, ymm_xpos_frac, ymm_yfrac0/1,
; ymm_0, ymm_ffff, p_src_row0/1, i_xpos, i_scalex2, p_dst, r_tmp0, ymm_tmp0..5).
; i_xpos advances by 8 * i_scalex2 and p_dst by 16, so i_scalex2 appears to be the x step
; for two output pixels.
; Each 128-bit lane takes all of its source bytes from a single 16-byte load.
%macro AVX2_BilinearFastDownsample2xOrLess_16px 0
    ; Make the byte indices relative to the first index of each lane (zero mask = broadcast byte 0).
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
    ; Lower lane: 16 source bytes at i_xpos >> 16; upper lane: at (i_xpos + 4 * i_scalex2) >> 16.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    lea             i_xpos, [i_xpos + 8 * i_scalex2]
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    ; Gather the source bytes selected by the relative indices.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
    ; Low half of each lane: weights * pixels, rows 0 and 1 summed into ymm_tmp0.
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
    vpunpcklbw      ymm_tmp3, ymm_tmp4, ymm_0
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpunpcklbw      ymm_tmp3, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
    ; High half of each lane, summed into ymm_tmp1.
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
    vpunpckhbw      ymm_tmp2, ymm_tmp4, ymm_0
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp2
    vpunpckhbw      ymm_tmp2, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp2
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    ; Store 8 bytes from the lower lane, then the whole upper lane at p_dst + 8. The second
    ; store is 16 bytes wide, so p_dst + 16 .. p_dst + 23 are written as well.
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro

; AVX2_BilinearFastDownsample4xOrLess_16px: produce 16 output pixels from two source rows.
; Same contract as the 2x kernel, but each 128-bit lane uses two 16-byte loads per row:
; one for its low 8 index bytes and one for its high 8.
; Operates on names defined by the enclosing function; i_xpos advances by 8 * i_scalex2 in total.
%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
%ifdef X86_32_PICASM
    ; Position-independent 32-bit build: build the 16-byte shuffle constant
    ; (bytes 0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8) on an aligned stack area, then restore esp.
    push            r0
    mov             r0, esp
    and             esp, 0xfffffff0
    push            0x08080808
    push            0x08080808
    push            0x00000000
    push            0x00000000
    vbroadcasti128  ymm_tmp0, [esp]
    mov             esp, r0
    pop             r0
%else
    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
%endif
    ; Make the byte indices relative to byte 0 (low half) or byte 8 (high half) of each lane.
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
    ; First half: loads at i_xpos (lower lane) and i_xpos + 4 * i_scalex2 (upper lane).
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    ; Prepare the offsets for the second half: r_tmp0 = +2 * i_scalex2, i_xpos = +6 * i_scalex2.
    lea             r_tmp0, [i_xpos + 2 * i_scalex2]
    lea             i_xpos, [r_tmp0 + 4 * i_scalex2]
    shr             r_tmp0, 16
    ; Interleave the indices with FFh: vpshufb writes zero for those, so the gather yields
    ; zero-extended words directly.
    vpunpcklbw      ymm_tmp2, ymm_xpos_int, ymm_ffff
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp2
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_tmp2
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
    ; Second half: loads at the two offsets prepared above; i_xpos then reaches +8 * i_scalex2.
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 2 * i_scalex2]
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    vpunpckhbw      ymm_tmp2, ymm_xpos_int, ymm_ffff
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp2
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_tmp2
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp2
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    ; Store 8 bytes from the lower lane, then the whole upper lane at p_dst + 8 (a 16-byte
    ; store, so p_dst + 16 .. p_dst + 23 are written as well).
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro

; AVX2_BilinearFastDownsample8xOrLess_16px: produce 16 output pixels from two source rows.
; Same contract as the 2x/4x kernels, but each group of four index bytes in a lane gets its
; own 16-byte load per row; the gathered groups are merged with vpblendd.
; Operates on names defined by the enclosing function; i_xpos advances by 8 * i_scalex2 in total.
%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
%ifdef X86_32_PICASM
    ; Position-independent 32-bit build: build the 16-byte shuffle constant
    ; (bytes 0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12) on an aligned stack area, then restore esp.
    push            r0
    mov             r0, esp
    and             esp, 0xffffffe0
    push            0x0c0c0c0c
    push            0x08080808
    push            0x04040404
    push            0x00000000
    vbroadcasti128  ymm_tmp0, [esp]
    mov             esp, r0
    pop             r0
%else
    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
%endif
    ; Make the byte indices relative to the first byte of their 4-byte group.
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; Group 0 of each lane: offsets +0 (lower lane) and +4 * i_scalex2 (upper lane).
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    ; Group 1 of each lane: one i_scalex2 further on.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
    ; Gather as zero-extended words (indices interleaved with FFh), then take dwords 2-3 of
    ; each lane from the group-1 loads.
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_ffff
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
    vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpshufb         ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp5
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
    ; Group 2 of each lane.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    ; Group 3 of each lane; after this i_xpos has advanced by 8 * i_scalex2 in total.
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 4 * i_scalex2]
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp2, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    ; Gather as bytes, take dword 3 of each lane from the group-3 loads, then widen the
    ; high 8 bytes of each lane to words.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_xpos_int
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_xpos_int
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp5
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    ; Store 8 bytes from the lower lane, then the whole upper lane at p_dst + 8 (a 16-byte
    ; store, so p_dst + 16 .. p_dst + 23 are written as well).
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro

; AVX2_GeneralBilinearFastDownsample_16px: produce 16 output pixels from two source rows,
; fetching every pixel's two source bytes with its own scalar-addressed load.
; Operates on names defined by the enclosing function (ymm_xfrac0/1, ymm_xfrac_inc, ymm_yfrac0/1,
; ymm_0, p_src_row0/1, i_xpos, i_scalex, p_dst, r_tmp0, ymm_tmp0..5).
; i_xpos advances by 4 * i_scalex per group of four pixels (16 * i_scalex in total).
; Data layout: pixels 0-3 and 4-7 are broadcast into ymm_tmp4/5 and ymm_tmp2/3 and survive only
; in the upper lane; pixels 8-11 and 12-15 are then blended into the lower lane. The final
; vpermq restores memory order.
%macro AVX2_GeneralBilinearFastDownsample_16px 0
    ; Pixels 0-3 -> low four words of ymm_tmp4 (row 0) / ymm_tmp5 (row 1), in both lanes.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp4, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp5, ymm_tmp5, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
    ; Fourth pixel of the group (i_xpos - i_scalex): loading from 2 bytes earlier puts its
    ; byte pair in the odd word of the dword, which is the word selected by the blend mask.
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
    ; Pixels 4-7 -> ymm_tmp2 (row 0) / ymm_tmp3 (row 1), same scheme.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp2, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp3, ymm_tmp3, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
    ; Pixels 8-11: assembled word by word in xmm_tmp0/1, then blended into the lower lane
    ; of ymm_tmp4/5.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
    ; Pixels 12-15: same, blended into the lower lane of ymm_tmp2/3.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
    ; Widen the byte pairs to words and apply the weights: x weights scaled by the y weights,
    ; rows 0 and 1 summed.
    vpunpcklbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpcklbw      ymm_tmp5, ymm_tmp5, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp1, ymm_xfrac0, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp5
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpunpcklbw      ymm_tmp4, ymm_tmp2, ymm_0
    vpunpcklbw      ymm_tmp5, ymm_tmp3, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_xfrac1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp5
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp2
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    ; qword 0 <- qword 2 (upper lane: pixels 0-7), qword 1 <- qword 0 (lower lane: pixels 8-15),
    ; so the low 128 bits hold the 16 output bytes in order.
    vpermq          ymm_tmp0, ymm_tmp0, 0010b
    vmovdqu         [p_dst], xmm_tmp0
    add             p_dst, 16
    vpaddw          ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
    vpaddw          ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
%endmacro

; AVX2_BilinearIncXposw: advance packed x positions by one step, with the fraction held
; in the bits selected by the 7FFFh mask of each word and the integer part as bytes.
; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6,%7
; %1 and %2 are updated in place; %6 and %7 are clobbered.
%macro AVX2_BilinearIncXposw 7
    vpaddb          %1, %1, %3
    vpaddw          %6, %2, %4
    vpcmpgtw        %7, %2, %6  ; signed compare: FFFFh where the sum dropped below the old fraction
    vpsubb          %1, %1, %7  ; add carry
    vpand           %2, %6, %5  ; keep only the masked fraction bits
%endmacro

; AVX2_LinearAccurateInterpolateVerticalDwords: per dword, compute
; (data0 * frac0 + data1 * frac1) >> 29 using 32x32 -> 64-bit unsigned multiplies.
; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
; %2, %3 and %6 are clobbered. vpmuludq reads only the even dwords of its sources, so the
; odd dwords of data0/data1 are first swapped into even positions; frac0/frac1 must
; therefore carry the factor in their even dwords.
%macro AVX2_LinearAccurateInterpolateVerticalDwords 6
    vpshufd         %1, %2, 10110001b  ; swap dwords within each pair: odd elements -> even slots
    vpshufd         %6, %3, 10110001b
    vpmuludq        %1, %1, %4
    vpmuludq        %6, %6, %5
    vpaddq          %1, %1, %6         ; 64-bit sums for the odd elements
    vpmuludq        %2, %2, %4
    vpmuludq        %3, %3, %5
    vpaddq          %2, %2, %3         ; 64-bit sums for the even elements
    vpsllq          %1, %1,  3         ; odd results: bits 29.. of the sum land in the high dword
    vpsrlq          %2, %2, 29         ; even results: bits 29.. of the sum land in the low dword
    vpblendd        %1, %1, %2, 01010101b
%endmacro

; AVX2_BilinearAccurateDownsample2xOrLess_16px: produce 16 output pixels from two source rows
; ("accurate" variant: horizontal blend with vpmaddwd, vertical blend in 64-bit arithmetic).
; Operates on names defined by the enclosing function (ymm_xpos_int, ymm_xpos_frac, ymm_yfrac0/1,
; ymm_0, ymm_7fff, p_src_row0/1, i_xpos, i_scalex2, p_dst, r_tmp0, ymm_tmp0..5).
; i_xpos advances by 8 * i_scalex2 and p_dst by 16.
%macro AVX2_BilinearAccurateDownsample2xOrLess_16px 0
    ; Make the byte indices relative to the first index of each lane (zero mask = broadcast byte 0).
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    ; Lower lane: 16 source bytes at i_xpos >> 16; upper lane: at (i_xpos + 4 * i_scalex2) >> 16.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    lea             i_xpos, [i_xpos + 8 * i_scalex2]
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    ; Gather the source bytes and widen them to words.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
    vpunpcklbw      ymm_tmp2, ymm_tmp4, ymm_0
    vpunpcklbw      ymm_tmp3, ymm_tmp5, ymm_0
    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
    ; Horizontal blend: one dword per output pixel and row.
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp1
    ; Vertical blend of row 0 and row 1, result >> 29.
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp2, ymm_tmp3, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
    ; Pack to words, apply the final rounding shift ((x + 1) >> 1), pack to bytes.
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    ; Store 8 bytes from the lower lane, then the whole upper lane at p_dst + 8 (a 16-byte
    ; store, so p_dst + 16 .. p_dst + 23 are written as well).
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro

; Produces one batch of 16 output pixels for the accurate bilinear downsampler
; using 16-byte source loads, each of which serves four consecutive output
; pixels (hence the "4x or less" restriction in the name).
;
; Takes no macro arguments; every operand is a symbol %define'd at the
; expansion site:
;   p_src_row0 / p_src_row1   pointers to the two source rows being blended
;   p_dst                     destination pointer
;   i_xpos                    horizontal position; integer part is i_xpos >> 16
;   i_scalex2                 horizontal step used by the address arithmetic
;                             (NOTE(review): presumably 2 * scalex - confirm
;                             against the caller)
;   ymm_xpos_int / _frac      per-pixel byte offsets / fractions for the batch
;   ymm_yfrac0 / ymm_yfrac1   vertical weights handed to the interpolation macro
;   ymm_0, ymm_7fff           constants
;   r_tmp0, ymm_tmp0..4       scratch (xmm_tmpN aliases the low half of ymm_tmpN)
;
; Effects: i_xpos advances by 8 * i_scalex2 in total, p_dst advances by 16,
; ymm_xpos_int is rewritten in place and then passed to AVX2_BilinearIncXposw.
; The last store writes a whole 128-bit lane at [p_dst + 8], so 24 bytes
; starting at the incoming p_dst are touched.
%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
%ifdef X86_32_PICASM
    ; Position-independent 32-bit build: the shuffle constant is materialized
    ; on a temporarily 32-byte-aligned stack instead of being read from a data
    ; symbol. r5 is saved first and then used to hold the original esp.
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x08080808    ;shufb_0000000088888888
    push            0x08080808
    push            0x00000000
    push            0x00000000
    vbroadcasti128  ymm_tmp0, [esp]
    mov             esp, r5
    pop             r5
%else
    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
%endif
    ; Replicate byte 0 over bytes 0-7 and byte 8 over bytes 8-15 of each lane,
    ; then subtract, turning the offsets into offsets relative to the first
    ; pixel of each group of four.
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; NOTE(review): AVX2_UnpckXFrac is defined outside this view; ymm_tmp0 and
    ; ymm_tmp1 are consumed below as the vpmaddwd weights for the low and high
    ; half of each lane respectively.
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    ; Low lane: 16 source bytes at i_xpos; high lane: at i_xpos + 4 * i_scalex2.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp2, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
    ; Pre-compute the address of the second group of four pixels
    ; (i_xpos + 2 * i_scalex2) and move i_xpos on by 6 * i_scalex2. r_tmp0 is
    ; only consumed after the first interpolation below.
    lea             r_tmp0, [i_xpos + 2 * i_scalex2]
    lea             i_xpos, [r_tmp0 + 4 * i_scalex2]
    shr             r_tmp0, 16
%ifdef X86_32_PICASM
    ; Same stack trick as above, this time for 32 bytes of 0x80.
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x80808080    ;db80h_256
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]
    mov             esp, r5
    pop             r5
%else
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
%endif
    ; ymm_tmp3 interleaves the low 8 offsets of each lane with 0x80. vpshufb
    ; yields zero for control bytes with the top bit set, so the shuffle both
    ; gathers the source bytes and zero-extends them to words.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
    ; Horizontal blend of each pixel pair for both rows.
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp0
    ; NOTE(review): macro body not in view; ymm_tmp0 receives its result and
    ; ymm_tmp3 is passed as scratch.
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; Second group: low lane from the address computed earlier, high lane from
    ; the current i_xpos, which then advances by the remaining 2 * i_scalex2.
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp2, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 2 * i_scalex2]
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
%ifdef X86_32_PICASM
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x80808080    ;db80h_256
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [esp]
    mov             esp, r5
    pop             r5
%else
    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
%endif
    ; Same gather + widen as above, using the high 8 offsets of each lane.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; Narrow dwords -> words, average with zero (computes (x + 1) >> 1 per
    ; word), then narrow to bytes with unsigned saturation.
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    ; Low lane carries the first 8 result bytes, high lane the next 8.
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    ; NOTE(review): macro body not in view; by name and operands it steps the
    ; x position registers to the next batch.
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro

; Produces one batch of 16 output pixels for the accurate bilinear downsampler
; using 16-byte source loads, each of which serves only two consecutive output
; pixels (hence the "8x or less" restriction in the name). Four loads per row
; are needed for each half of the batch and are merged with vpblendd.
;
; Takes no macro arguments; every operand is a symbol %define'd at the
; expansion site:
;   p_src_row0 / p_src_row1   pointers to the two source rows being blended
;   p_dst                     destination pointer
;   i_xpos                    horizontal position; integer part is i_xpos >> 16
;   i_scalex2                 horizontal step used by the address arithmetic
;                             (NOTE(review): presumably 2 * scalex - confirm
;                             against the caller)
;   ymm_xpos_int / _frac      per-pixel byte offsets / fractions for the batch
;   ymm_yfrac0 / ymm_yfrac1   vertical weights handed to the interpolation macro
;   ymm_0, ymm_7fff           constants
;   r_tmp0, ymm_tmp0..5       scratch (xmm_tmpN aliases the low half of ymm_tmpN)
;
; Effects: i_xpos advances by 8 * i_scalex2 in total (1 + 1 + 1 + 4 + 1),
; p_dst advances by 16, ymm_xpos_int is rewritten in place and then passed to
; AVX2_BilinearIncXposw. The last store writes a whole 128-bit lane at
; [p_dst + 8], so 24 bytes starting at the incoming p_dst are touched.
%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
%ifdef X86_32_PICASM
    ; Position-independent 32-bit build: the shuffle constant is materialized
    ; on a temporarily 32-byte-aligned stack instead of being read from a data
    ; symbol. r5 is saved first and then used to hold the original esp.
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x0c0c0c0c    ;shufb_000044448888cccc
    push            0x08080808
    push            0x04040404
    push            0x00000000
    vbroadcasti128  ymm_tmp0, [esp]
    mov             esp, r5
    pop             r5
%else
    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
%endif
    ; Replicate bytes 0, 4, 8 and 12 of each lane over their 4-byte groups and
    ; subtract, turning the offsets into offsets relative to the first pixel of
    ; each pair of pixels.
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; Load A: low lane at i_xpos, high lane at i_xpos + 4 * i_scalex2.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    ; Load B: same pattern, one i_scalex2 step further on.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
%ifdef X86_32_PICASM
    ; Same stack trick as above, this time for 32 bytes of 0x80.
    push            r5
    mov             r5, esp
    and             esp, 0xffffffe0
    push            0x80808080    ;db80h_256
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    push            0x80808080
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]
    mov             esp, r5
    pop             r5
%else
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
%endif
    ; ymm_tmp3 interleaves the low 8 offsets of each lane with 0x80. vpshufb
    ; yields zero for control bytes with the top bit set, so the shuffle both
    ; gathers the source bytes and zero-extends them to words.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
    vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpshufb         ymm_tmp1, ymm_tmp1, ymm_tmp3
    ; Keep dwords 0-1 of each lane from load A and take dwords 2-3 from load B.
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
    ; NOTE(review): AVX2_UnpckXFrac is defined outside this view; ymm_tmp0 and
    ; ymm_tmp1 are consumed below as the vpmaddwd weights for the low and high
    ; half of each lane respectively.
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp0
    ; NOTE(review): macro body not in view; ymm_tmp0 receives its result and
    ; ymm_tmp3 is passed as scratch.
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; Second half of each lane. Load C: low lane at i_xpos, high lane at
    ; i_xpos + 4 * i_scalex2.
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    ; Load D: low lane at the current i_xpos; i_xpos then jumps ahead by
    ; 4 * i_scalex2 and the high lane is loaded from that new position.
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 4 * i_scalex2]
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp2, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    ; Here the bytes are gathered first (byte-wise shuffle with the raw
    ; offsets) and widened afterwards by vpunpckhbw against zero.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_xpos_int
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_xpos_int
    ; Only the high 8 bytes of each lane are used below: dword 2 stays from
    ; load C, dword 3 is taken from load D.
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; Narrow dwords -> words, average with zero (computes (x + 1) >> 1 per
    ; word), then narrow to bytes with unsigned saturation.
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    ; Low lane carries the first 8 result bytes, high lane the next 8.
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    ; NOTE(review): macro body not in view; by name and operands it steps the
    ; x position registers to the next batch.
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro

; Produces one batch of 16 output pixels for the accurate bilinear downsampler
; without any assumption on the horizontal ratio: the two source bytes of
; every output pixel are fetched individually from each row.
;
; Takes no macro arguments; every operand is a symbol %define'd at the
; expansion site:
;   p_src_row0 / p_src_row1   pointers to the two source rows being blended
;   p_dst                     destination pointer
;   i_xpos                    horizontal position; integer part is i_xpos >> 16
;   i_scalex                  horizontal step per output pixel
;   ymm_xfrac0 / ymm_xfrac1   horizontal weights for the two halves of the batch
;   ymm_xfrac_inc             per-batch increment added to both weight registers
;   ymm_yfrac0 / ymm_yfrac1   vertical weights handed to the interpolation macro
;   ymm_0, ymm_7fff           constants
;   r_tmp0, ymm_tmp0..5       scratch (xmm_tmpN aliases the low half of ymm_tmpN)
;
; Register layout built below (one word = the byte pair of one pixel):
;   ymm_tmp4 / ymm_tmp5 : high lane = pixels 0-3,  low lane = pixels 8-11
;   ymm_tmp2 / ymm_tmp3 : high lane = pixels 4-7,  low lane = pixels 12-15
; which is why the high lane is stored first at the end.
;
; Effects: i_xpos advances by 16 * i_scalex (four steps of 4 * i_scalex),
; p_dst advances by 16, ymm_xfrac0/1 are incremented and masked with ymm_7fff.
; Loads are 4 bytes wide (vpbroadcastd / vmovd) or 2 bytes wide (vpinsrw); the
; "- 2" loads start two bytes before the pixel position.
%macro AVX2_GeneralBilinearAccurateDownsample_16px 0
    ; ---- pixels 0-3, broadcast into both lanes of ymm_tmp4 / ymm_tmp5 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp4, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    ; Interleave words so that word 0 = pixel 0 pair, word 1 = pixel 1 pair.
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp5, ymm_tmp5, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    ; Pixel 2 replaces dword 1 of each lane (its low word becomes word 2).
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
    ; Pixel 3 sits at (new i_xpos - i_scalex). Loading from two bytes earlier
    ; places its byte pair in the high word of the dword, which is the word
    ; slot (word 3) selected by the vpblendw mask.
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
    ; ---- pixels 4-7, same scheme into ymm_tmp2 / ymm_tmp3 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp2, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp3, ymm_tmp3, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
    ; ---- pixels 8-11, assembled word by word in xmm_tmp0 / xmm_tmp1 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    ; Overwrite the low lane only; the high lane keeps pixels 0-3.
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
    ; ---- pixels 12-15, same scheme into the low lane of ymm_tmp2/3 ----
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
    ; Widen the gathered bytes to words and blend horizontally per pixel.
    vpunpcklbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpcklbw      ymm_tmp5, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_xfrac0
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_xfrac0
    ; NOTE(review): macro body not in view; ymm_tmp0 receives its result and
    ; ymm_tmp1 is passed as scratch.
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
    vpunpcklbw      ymm_tmp4, ymm_tmp2, ymm_0
    vpunpcklbw      ymm_tmp5, ymm_tmp3, ymm_0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_xfrac1
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_xfrac1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
    ; Narrow dwords -> words, average with zero (computes (x + 1) >> 1 per
    ; word), then narrow to bytes with unsigned saturation.
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    ; High lane (pixels 0-7) goes out first as a 16-byte store; the following
    ; 8-byte store of the low lane (pixels 8-15) overwrites its upper half.
    vextracti128    [p_dst], ymm_tmp0, 1
    vmovlps         [p_dst + 8], xmm_tmp0
    add             p_dst, 16
    ; Step the horizontal weights to the next batch and clear bit 15.
    vpaddw          ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
    vpaddw          ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
    vpand           ymm_xfrac0, ymm_xfrac0, ymm_7fff
    vpand           ymm_xfrac1, ymm_xfrac1, ymm_7fff
%endmacro

; downsample_16px_macro=%1 b_fast=%2
; Row/column driver shared by the AVX2 bilinear downsamplers.
;
;   %1  name of a zero-argument macro that emits 16 output pixels and advances
;       p_dst and i_xpos itself
;   %2  non-zero: vertical weights are built as 16-bit words (15-bit fraction,
;       0x7fff complement); zero: they are built as 32-bit dwords
;
; All other operands are symbols %define'd at the expansion site. Whether
; ymm_xpos_int and i_scalex2 are defined at that point selects which position
; registers are reloaded per row and how the tail position is computed.
;
; Per row:
;   1. p_src_row0 = p_src + (i_ypos >> 15) * i_src_stride,
;      p_src_row1 = p_src_row0 + i_src_stride
;   2. ymm_yfrac1 = low 15 bits of i_ypos, ymm_yfrac0 = ymm_yfrac1 ^ 0x7fff
;   3. i_xpos restarts at 1 << 15 and %1 is expanded in a loop until the
;      counter (i_dst_width - 1, minus 16 per batch) is no longer positive
;   4. p_dst is rewound to row start + i_dst_width and the last pixel of the
;      row is overwritten with the un-interpolated source byte at the last
;      pixel's x position in row 0
;   5. i_ypos += i_scaleyd, p_dst += i_dst_stride_less_width
; The outer loop repeats while i_dst_height, decremented once per row, stays
; positive. Clobbers r_tmp0, i_width_cnt, i_xpos and the ymm temporaries.
%macro AVX2_GeneralBilinearDownsampler_loop 2
%%height:
    ; Source row pair for the current ypos (15 fractional bits).
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             p_src_row1, p_src_row0
    add             p_src_row1, i_src_stride
%ifdef X86_32
    ; 32-bit build: ymm_yfrac0/1 are memory operands here, so the weights are
    ; computed in temporaries and stored afterwards.
%if %2
    vpbroadcastw    ymm_tmp1, i_ypos
    ; Shift left then right by one to clear bit 15 of every word.
    vpsllw          ymm_tmp1, ymm_tmp1, 1
    vpsrlw          ymm_tmp1, ymm_tmp1, 1
    ; All-ones shifted right by one = 0x7fff per word.
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpsrlw          ymm_tmp0, ymm_tmp0, 1
%else
    vpbroadcastd    ymm_tmp1, i_ypos
    ; Keep only the low 15 bits of every dword.
    vpslld          ymm_tmp1, ymm_tmp1, 17
    vpsrld          ymm_tmp1, ymm_tmp1, 17
    ; All-ones shifted right by 17 = 0x00007fff per dword.
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpsrld          ymm_tmp0, ymm_tmp0, 17
%endif
    ; Complement of the fraction within 15 bits.
    vpxor           ymm_tmp0, ymm_tmp0, ymm_tmp1
    vmovdqa         ymm_yfrac0, ymm_tmp0
    vmovdqa         ymm_yfrac1, ymm_tmp1
%else
    ; 64-bit build: the weights are computed directly in their registers.
    vmovd           xmm_tmp0, i_yposd
    vpbroadcastw    ymm_yfrac1, xmm_tmp0
%if %2
    vpsllw          ymm_yfrac1, ymm_yfrac1, 1
    vpsrlw          ymm_yfrac1, ymm_yfrac1, 1
    vpcmpeqw        ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
    vpsrlw          ymm_yfrac0, ymm_yfrac0, 1
%else
    vpslld          ymm_yfrac1, ymm_yfrac1, 17
    vpsrld          ymm_yfrac1, ymm_yfrac1, 17
    vpcmpeqw        ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
    vpsrld          ymm_yfrac0, ymm_yfrac0, 17
%endif
    vpxor           ymm_yfrac0, ymm_yfrac0, ymm_yfrac1
%endif

    ; Start of row: x = one half (16 fractional bits). The counter is
    ; width - 1 because the last pixel of the row is handled separately below.
    mov             i_xpos, 1 << 15
    mov             i_width_cnt, i_dst_width
    sub             i_width_cnt, 1

    ; Reload the per-row starting values of whichever x registers the caller
    ; has defined.
%ifdef ymm_xpos_int
    vmovdqa         ymm_xpos_int, ymm_xpos_int_begin
    vmovdqa         ymm_xpos_frac, ymm_xpos_frac_begin
%else
    vmovdqa         ymm_xfrac0, ymm_xfrac0_begin
    vmovdqa         ymm_xfrac1, ymm_xfrac1_begin
%endif

%%width:
    %1
    sub             i_width_cnt, 16
    jg              %%width

    ; i_width_cnt is now <= 0: p_dst + i_width_cnt + 1 is the address one past
    ; the last pixel of this row.
    lea             p_dst, [p_dst + i_width_cnt + 1]
    ; Scale the (non-positive) remainder by the per-pixel step so that i_xpos
    ; becomes the position of the last pixel of the row.
%ifdef i_scalex2
    mov             r_tmp0, i_scalex2
    shr             r_tmp0, 1
    imul            i_width_cnt, r_tmp0
%else
    imul            i_width_cnt, i_scalex
%endif
    add             i_xpos, i_width_cnt
    shr             i_xpos, 16
    ; Last pixel of the row: plain copy of the source byte from row 0.
    movzx           r_tmp0, byte [p_src_row0 + i_xpos]
    mov             [p_dst - 1], r_tmp0b
    ; Advance ypos; on 32-bit both operands live in memory, so the step goes
    ; through r_tmp0.
%ifdef X86_32
    mov             r_tmp0, i_scaleyd
    add             i_yposd, r_tmp0
%else
    add             i_yposd, i_scaleyd
%endif
    add             p_dst, i_dst_stride_less_width
    sub             i_dst_height, 1
    jg              %%height
%endmacro

;**************************************************************************************************************
;void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;    uint32_t uiScaleY);
;
; Bilinear downsampling of an 8-bit plane, "fast" (16-bit weight) variant.
;
;   pDst / iDstStride        destination plane and its row pitch in bytes
;   iDstWidth / iDstHeight   destination size in pixels
;   pSrc / iSrcStride        source plane and its row pitch in bytes
;   uiScaleX                 horizontal step per output pixel, 16 fractional bits
;                            (it is compared against 2 << 16, 4 << 16, 8 << 16)
;   uiScaleY                 vertical step per output row, 15 fractional bits
;                            (the row index is ypos >> 15)
;
; Returns immediately when iDstHeight <= 0. Otherwise every row except the
; last is produced by AVX2_GeneralBilinearDownsampler_loop with a 16-pixel
; kernel chosen from uiScaleX (<= 2x, <= 4x, <= 8x, general), and the last row
; is produced by the scalar loop at .final_row, which copies the source byte
; at each pixel position without interpolation.
;
; ypos is initialized before the height test so that the iDstHeight == 1 case,
; which goes straight to .final_row, reads a defined value.
;
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_avx2
    %assign push_num 0
%ifndef X86_32
    ; Callee-saved registers used below as i_xpos, i_ypos, r_tmp0, p_src_row0
    ; (and, on WIN64, i_width_cnt / p_src_row1).
    push            r12
    push            r13
    push            rbx
    push            rbp
    %assign push_num 4
%ifdef WIN64
    push            rdi
    push            rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    ZERO_EXTENSION  r6d
    sub             r1, r2                                            ; dst_stride - dst_width
%ifdef X86_32
    ; 32-bit build: too few registers, so scalars and several vectors live in a
    ; 32-byte-aligned stack frame. arg8 is fetched before esp moves, and the
    ; original esp is kept in the first frame slot for the epilogue.
    vmovd           xmm0, arg8
    vmovd           xmm1, esp
    and             esp, -32
    sub             esp, 8 * 4 + 8 * 32
    vmovd           [esp], xmm1
    %define p_dst                   r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width             [esp + 2 * 4]
    %define i_dst_height            dword [esp + 3 * 4]
    %define p_src                   [esp + 4 * 4]
    %define i_src_stride            [esp + 5 * 4]
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               [esp + 6 * 4]
    %define i_xpos                  r2
    %define i_ypos                  [esp + 7 * 4]
    %define i_yposd                 dword [esp + 7 * 4]
    %define p_src_row0              r3
    %define p_src_row1              r4
    %define i_width_cnt             r5
    %define r_tmp0                  r1
    %define r_tmp0b                 r1b
    %define ymm_xpos_frac           ymm1
    %define ymm_xpos_frac_inc       [esp + 8 * 4]
    %define ymm_xpos_int            ymm3
    %define ymm_xpos_int_inc        [esp + 8 * 4 + 1 * 32]
    %define ymm_yfrac0              [esp + 8 * 4 + 2 * 32]
    %define ymm_yfrac1              [esp + 8 * 4 + 3 * 32]
    %define xmm_tmp0                xmm7
    %define ymm_tmp0                ymm7
    %define xmm_tmp1                xmm0
    %define ymm_tmp1                ymm0
    %define xmm_tmp2                xmm2
    %define ymm_tmp2                ymm2
    %define xmm_tmp3                xmm4
    %define ymm_tmp3                ymm4
    %define xmm_tmp4                xmm5
    %define ymm_tmp4                ymm5
    %define xmm_tmp5                xmm6
    %define ymm_tmp5                ymm6
    %define ymm_0                   [esp + 8 * 4 + 4 * 32]
    %define ymm_ffff                [esp + 8 * 4 + 5 * 32]
    %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
    %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
    ; Spill the scalar arguments so that r1..r5 can be reused as temporaries.
    mov             i_dst_stride_less_width, r1
    mov             i_dst_width, r2
    mov             i_dst_height, r3
    mov             p_src, r4
    mov             i_src_stride, r5
    vmovd           i_scaleyd, xmm0
    vpxor           xmm0, xmm0, xmm0
    vmovdqa         ymm_0, ymm0
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovdqa         ymm_ffff, ymm_tmp0
%else
    %define p_dst                   r0
    %define i_dst_stride_less_width r1
    %define i_dst_width             r2
    %define i_dst_height            r3
    %define p_src                   r4
    %define i_src_stride            r5
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               dword arg8d
    %define i_xpos                  r12
    %define i_ypos                  r13
    %define i_yposd                 r13d
    %define p_src_row0              rbp
%ifdef WIN64
    %define p_src_row1              rsi
    %define i_width_cnt             rdi
%else
    %define p_src_row1              r11
    %define i_width_cnt             rax
%endif
    %define r_tmp0                  rbx
    %define r_tmp0b                 bl
    %define ymm_0                   ymm0
    %define ymm_xpos_frac           ymm1
    %define ymm_xpos_frac_inc       ymm2
    %define ymm_xpos_int            ymm3
    %define ymm_xpos_int_inc        ymm4
    %define ymm_yfrac0              ymm5
    %define ymm_yfrac1              ymm6
    %define xmm_tmp0                xmm7
    %define ymm_tmp0                ymm7
    %define xmm_tmp1                xmm8
    %define ymm_tmp1                ymm8
    %define xmm_tmp2                xmm9
    %define ymm_tmp2                ymm9
    %define xmm_tmp3                xmm10
    %define ymm_tmp3                ymm10
    %define xmm_tmp4                xmm11
    %define ymm_tmp4                ymm11
    %define xmm_tmp5                xmm12
    %define ymm_tmp5                ymm12
    %define ymm_ffff                ymm13
    %define ymm_xpos_int_begin      ymm14
    %define ymm_xpos_frac_begin     ymm15
    vpxor           ymm_0, ymm_0, ymm_0
    vpcmpeqw        ymm_ffff, ymm_ffff, ymm_ffff
%endif

    ; ypos starts at one half (15 fractional bits). This must happen before the
    ; height test: with iDstHeight == 1 control goes straight to .final_row,
    ; which derives the source row from i_ypos.
    mov             i_yposd, 1 << 14
    sub             i_dst_height, 1
    je              .final_row
    jl              .done

    ; Build the x positions of the first 16 output pixels:
    ;   ymm_tmp1 = (0..7) * scalex + (1 << 15), ymm_tmp2 = (8..15) * scalex + (1 << 15)
    vmovd           xmm_tmp0, i_scalexd
    vpbroadcastd    ymm_tmp0, xmm_tmp0
    vpslld          ymm_tmp1, ymm_tmp0, 2
    vpslld          ymm_tmp2, ymm_tmp0, 3
    vpaddd          ymm_tmp3, ymm_tmp1, ymm_tmp2
    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
    vpblendd        ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b           ; [0 x4 | 4s x4]
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b           ; [8s x4 | 12s x4]
    vpaddd          ymm_tmp3, ymm_tmp0, ymm_tmp0
    vpblendd        ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b           ; [0, 0, 2s, 2s] per lane
    vpblendd        ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b           ; [0, s, 0, s] per lane
    vpaddd          ymm_tmp0, ymm_tmp3, ymm_tmp0                      ; [0, s, 2s, 3s] per lane
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp0
    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
    vpsrld          ymm_tmp3, ymm_tmp3, 31
    vpslld          ymm_tmp3, ymm_tmp3, 15                            ; 1 << 15 per dword
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp3
    ; Integer parts -> bytes, in pixel order, then paired with (offset + 1) so
    ; that each pixel has the byte offsets of its two source samples.
    vpsrld          ymm_xpos_int, ymm_tmp1, 16
    vpsrld          ymm_tmp0, ymm_tmp2, 16
    vpackssdw       ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    vpermq          ymm_xpos_int, ymm_xpos_int, 11011000b
    vpackuswb       ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
    vpsubb          ymm_tmp0, ymm_xpos_int, ymm_tmp3                  ; x - (-1) = x + 1
    vpunpcklbw      ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; Fractional parts (low 16 bits) -> words, in pixel order.
    vpslld          ymm_tmp1, ymm_tmp1, 16
    vpsrld          ymm_tmp1, ymm_tmp1, 16
    vpslld          ymm_tmp2, ymm_tmp2, 16
    vpsrld          ymm_tmp2, ymm_tmp2, 16
    vpackusdw       ymm_xpos_frac, ymm_tmp1, ymm_tmp2
    vpermq          ymm_xpos_frac, ymm_xpos_frac, 11011000b
    ; Per-batch increments derived from 16 * scalex: the low 16 bits become the
    ; fraction increment, the next byte (plus one) the integer increment.
    vmovd           xmm_tmp0, i_scalexd
    vpslld          xmm_tmp0, xmm_tmp0, 4
    vpbroadcastw    ymm_tmp1, xmm_tmp0
    vmovdqa         ymm_xpos_frac_inc, ymm_tmp1
    vpsrld          xmm_tmp0, xmm_tmp0, 16
    ; ymm_tmp3 is still all-ones, so this adds 1 to every word.
    ; NOTE(review): the +1 is presumably compensated by the carry handling in
    ; the position-increment macro, which is not in view.
    vpsubw          ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpbroadcastb    ymm_tmp0, xmm_tmp0
    vmovdqa         ymm_xpos_int_inc, ymm_tmp0
    vmovdqa         ymm_xpos_int_begin, ymm_xpos_int
    vmovdqa         ymm_xpos_frac_begin, ymm_xpos_frac

    ; Pick the 16-pixel kernel from the horizontal ratio. The three shuffle
    ; based kernels expect twice the step under the name i_scalex2, so scalex
    ; is doubled for the loop and halved again before the final row.
    cmp             i_scalex, 4 << 16
    ja              .scalex_above4
    cmp             i_scalex, 2 << 16
    ja              .scalex_above2_beloweq4
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample2xOrLess_16px, 1
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above2_beloweq4:
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample4xOrLess_16px, 1
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above4:
    cmp             i_scalex, 8 << 16
    ja              .scalex_above8
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample8xOrLess_16px, 1
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above8:
    ; General kernel: it keeps no integer offsets, so the registers and slots
    ; that held the xpos_* values are re-purposed for two weight vectors and
    ; their per-row starting values. Undefining ymm_xpos_int also switches the
    ; loop macro to its ymm_xfrac* branch.
%xdefine ymm_xfrac0 ymm_xpos_frac
%xdefine ymm_xfrac1 ymm_xpos_int
%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_frac
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xpos_int_inc
%undef ymm_xpos_frac_inc
    AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_ffff
    ; Swap the 128-bit lanes of both weight vectors.
    ; NOTE(review): presumably matches the lane order in which the general
    ; kernel gathers pixels; that kernel is not in view.
    vpermq          ymm_xfrac0, ymm_tmp0,   01001110b
    vpermq          ymm_xfrac1, ymm_xfrac1, 01001110b
    vmovdqa         ymm_xfrac0_begin, ymm_xfrac0
    vmovdqa         ymm_xfrac1_begin, ymm_xfrac1
    ; Interleave (-inc, +inc) so that one add updates both weights of a pair.
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpmullw         ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
    vpunpcklwd      ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
    vmovdqa         ymm_xfrac_inc, ymm_tmp0
    AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearFastDownsample_16px, 1

.final_row:
    ; Last destination row: scalar copy of the source byte at each position
    ; from the row selected by the current ypos.
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             i_xpos, 1 << 15
    mov             i_width_cnt, i_dst_width

.final_row_width:
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
    mov             [p_dst], r_tmp0b
    add             p_dst, 1
    add             i_xpos, i_scalex
    sub             i_width_cnt, 1
    jg              .final_row_width

.done:
    vzeroupper
%ifdef X86_32
    ; Restore the stack pointer saved in the first frame slot.
    mov             esp, [esp]
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop             rsi
    pop             rdi
%endif
    pop             rbp
    pop             rbx
    pop             r13
    pop             r12
%endif
    ret
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef ymm_xpos_frac
%undef ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_int_inc
%undef ymm_yfrac0
%undef ymm_yfrac1
%undef xmm_tmp0
%undef ymm_tmp0
%undef xmm_tmp1
%undef ymm_tmp1
%undef xmm_tmp2
%undef ymm_tmp2
%undef xmm_tmp3
%undef ymm_tmp3
%undef xmm_tmp4
%undef ymm_tmp4
%undef xmm_tmp5
%undef ymm_tmp5
%undef ymm_ffff
%undef ymm_0
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xfrac0
%undef ymm_xfrac1
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc

;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;    uint32_t uiScaleY);
;
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2
    %assign push_num 0
%ifndef X86_32
    push            r12
    push            r13
    push            rbx
    push            rbp
    %assign push_num 4
%ifdef WIN64
    push            rdi
    push            rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    ZERO_EXTENSION  r6d
    sub             r1, r2                                            ; dst_stride - dst_width
    add             r6, r6                                            ; 2 * scalex
%ifdef X86_32
    vmovd           xmm0, arg8
    vmovd           xmm1, esp
    and             esp, -32
    sub             esp, 8 * 4 + 8 * 32
    vmovd           [esp], xmm1
    %define p_dst                   r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width             [esp + 2 * 4]
    %define i_dst_height            dword [esp + 3 * 4]
    %define p_src                   [esp + 4 * 4]
    %define i_src_stride            [esp + 5 * 4]
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               [esp + 6 * 4]
    %define i_xpos                  r2
    %define i_ypos                  [esp + 7 * 4]
    %define i_yposd                 dword [esp + 7 * 4]
    %define p_src_row0              r3
    %define p_src_row1              r4
    %define i_width_cnt             r5
    %define r_tmp0                  r1
    %define r_tmp0b                 r1b
    %define ymm_xpos_frac           ymm1
    %define ymm_xpos_frac_inc       [esp + 8 * 4]
    %define ymm_xpos_int            ymm3
    %define ymm_xpos_int_inc        [esp + 8 * 4 + 1 * 32]
    %define ymm_yfrac0              [esp + 8 * 4 + 2 * 32]
    %define ymm_yfrac1              [esp + 8 * 4 + 3 * 32]
    %define xmm_tmp0                xmm7
    %define ymm_tmp0                ymm7
    %define xmm_tmp1                xmm0
    %define ymm_tmp1                ymm0
    %define xmm_tmp2                xmm2
    %define ymm_tmp2                ymm2
    %define xmm_tmp3                xmm4
    %define ymm_tmp3                ymm4
    %define xmm_tmp4                xmm5
    %define ymm_tmp4                ymm5
    %define xmm_tmp5                xmm6
    %define ymm_tmp5                ymm6
    %define ymm_0                   [esp + 8 * 4 + 4 * 32]
    %define ymm_7fff                [esp + 8 * 4 + 5 * 32]
    %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
    %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
    mov             i_dst_stride_less_width, r1
    mov             i_dst_width, r2
    mov             i_dst_height, r3
    mov             p_src, r4
    mov             i_src_stride, r5
    vmovd           i_scaleyd, xmm0
    vpxor           xmm0, xmm0, xmm0
    vmovdqa         ymm_0, ymm0
    vpcmpeqw        ymm0, ymm0, ymm0
    vpsrlw          ymm0, ymm0, 1
    vmovdqa         ymm_7fff, ymm0
%else
    %define p_dst                   r0
    %define i_dst_stride_less_width r1
    %define i_dst_width             r2
    %define i_dst_height            r3
    %define p_src                   r4
    %define i_src_stride            r5
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               dword arg8d
    %define i_xpos                  r12
    %define i_ypos                  r13
    %define i_yposd                 r13d
    %define p_src_row0              rbp
%ifdef WIN64
    %define p_src_row1              rsi
    %define i_width_cnt             rdi
%else
    %define p_src_row1              r11
    %define i_width_cnt             rax
%endif
    %define r_tmp0                  rbx
    %define r_tmp0b                 bl
    %define ymm_0                   ymm0
    %define ymm_xpos_frac           ymm1
    %define ymm_xpos_int            ymm3
    %define ymm_xpos_frac_inc       ymm2
    %define ymm_xpos_int_inc        ymm4
    %define ymm_yfrac0              ymm5
    %define ymm_yfrac1              ymm6
    %define xmm_tmp0                xmm7
    %define ymm_tmp0                ymm7
    %define xmm_tmp1                xmm8
    %define ymm_tmp1                ymm8
    %define xmm_tmp2                xmm9
    %define ymm_tmp2                ymm9
    %define xmm_tmp3                xmm10
    %define ymm_tmp3                ymm10
    %define xmm_tmp4                xmm11
    %define ymm_tmp4                ymm11
    %define xmm_tmp5                xmm12
    %define ymm_tmp5                ymm12
    %define ymm_7fff                ymm13
    %define ymm_xpos_int_begin      ymm14
    %define ymm_xpos_frac_begin     ymm15
    vpxor           ymm_0, ymm_0, ymm_0
    vpcmpeqw        ymm_7fff, ymm_7fff, ymm_7fff
    vpsrlw          ymm_7fff, ymm_7fff, 1
%endif

    ; Seed the vertical position before branching. .final_row reads i_ypos and
    ; is reached directly from here when exactly one row is requested, so the
    ; seed must not sit behind that jump. 1 << 14 is one half in the
    ; 15-fractional-bit format that .final_row decodes with a shift by 15.
    mov             i_yposd, 1 << 14

    ; Height 1: only the final row is produced. Height below 1: nothing to do.
    sub             i_dst_height, 1
    je              .final_row
    jl              .done

    ; Build the horizontal sample positions for 16 consecutive output pixels.
    ; Below, s = i_scalexd, and positions carry 16 fractional bits.
    vmovd           xmm_tmp0, i_scalexd
    vpbroadcastd    ymm_tmp0, xmm_tmp0                    ; tmp0 = {s x 8}
    vpslld          ymm_tmp1, ymm_tmp0, 2                 ; tmp1 = {4s x 8}
    vpslld          ymm_tmp2, ymm_tmp0, 3                 ; tmp2 = {8s x 8}
    vpaddd          ymm_tmp3, ymm_tmp1, ymm_tmp2          ; tmp3 = {12s x 8}
    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4          ; tmp4 = 0
    vpblendd        ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b ; {0,0,0,0,4s,4s,4s,4s}
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b ; {8s x 4, 12s x 4}
    vpaddd          ymm_tmp3, ymm_tmp0, ymm_tmp0          ; tmp3 = {2s x 8}
    vpblendd        ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b ; {0,0,2s,2s,0,0,2s,2s}
    vpblendd        ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b ; {0,s,0,s,0,s,0,s}
    vpaddd          ymm_tmp0, ymm_tmp3, ymm_tmp0          ; {0,s,2s,3s,0,s,2s,3s}
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp0          ; tmp1[k] = k*s,     k = 0..7
    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp0          ; tmp2[k] = (8+k)*s, k = 0..7
    ; Add the 1 << 15 starting offset (the same start value .final_row uses
    ; for i_xpos) to every position.
    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
    vpsrld          ymm_tmp3, ymm_tmp3, 31
    vpslld          ymm_tmp3, ymm_tmp3, 15
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp3
    ; Integer parts: narrow the 16 dwords to words; vpermq undoes the per-lane
    ; interleave of the pack so the words are in pixel order 0..15.
    vpsrld          ymm_xpos_int, ymm_tmp1, 16
    vpsrld          ymm_tmp0, ymm_tmp2, 16
    vpackssdw       ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    vpermq          ymm_xpos_int, ymm_xpos_int, 11011000b
    ; Narrow to bytes (each lane now holds its 8 indices twice), then
    ; interleave every index with index + 1. Result per lane: byte pairs
    ; (idx, idx + 1); low lane is pixels 0..7, high lane is pixels 8..15.
    vpackuswb       ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3          ; tmp3 = all ones (-1)
    vpsubb          ymm_tmp0, ymm_xpos_int, ymm_tmp3      ; idx - (-1) = idx + 1
    vpunpcklbw      ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; Fractional parts: keep the low 16 bits of each position, pack to words
    ; in pixel order, and drop one bit so each fraction has 15 bits.
    vpslld          ymm_tmp1, ymm_tmp1, 16
    vpsrld          ymm_tmp1, ymm_tmp1, 16
    vpslld          ymm_tmp2, ymm_tmp2, 16
    vpsrld          ymm_tmp2, ymm_tmp2, 16
    vpackusdw       ymm_xpos_frac, ymm_tmp1, ymm_tmp2
    vpermq          ymm_xpos_frac, ymm_xpos_frac, 11011000b
    vpsrlw          ymm_xpos_frac, ymm_xpos_frac, 1
    ; Per-iteration increments derived from 16 * s (one 16-pixel step).
    vmovd           xmm_tmp0, i_scalexd
    vpslld          xmm_tmp0, xmm_tmp0, 4                 ; dword 0 = 16 * s
    vpbroadcastw    ymm_tmp1, xmm_tmp0                    ; low 16 bits of 16 * s in every word
    vpsrlw          ymm_tmp1, ymm_tmp1, 1                 ; reduce to 15 bits
    vmovdqa         ymm_xpos_frac_inc, ymm_tmp1
    vpsrld          xmm_tmp0, xmm_tmp0, 16                ; integer part of 16 * s
    ; ymm_tmp3 is still all ones, so this adds 1; the low byte of the result
    ; is then replicated to all 32 bytes.
    vpsubw          ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpbroadcastb    ymm_tmp0, xmm_tmp0
    vmovdqa         ymm_xpos_int_inc, ymm_tmp0
    ; Keep copies of the initial position vectors.
    vmovdqa         ymm_xpos_int_begin, ymm_xpos_int
    vmovdqa         ymm_xpos_frac_begin, ymm_xpos_frac

    ; Pick a row-loop kernel from the horizontal scale. i_scalex is compared
    ; (unsigned) against 2 << 16, 4 << 16 and 8 << 16. The loop and kernel
    ; macros are defined elsewhere in the file.
    cmp             i_scalex, 4 << 16
    ja              .scalex_above4
    cmp             i_scalex, 2 << 16
    ja              .scalex_above2_beloweq4
    ; i_scalex <= 2 << 16.
    ; The register is doubled for the duration of the loop and halved again
    ; afterwards. While it holds the doubled value the alias is renamed to
    ; i_scalex2, so the loop expansion cannot refer to it as i_scalex.
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample2xOrLess_16px, 0
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above2_beloweq4:
    ; 2 << 16 < i_scalex <= 4 << 16: same doubling/renaming, 4x-or-less kernel.
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample4xOrLess_16px, 0
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above4:
    cmp             i_scalex, 8 << 16
    ja              .scalex_above8
    ; 4 << 16 < i_scalex <= 8 << 16: same doubling/renaming, 8x-or-less kernel.
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample8xOrLess_16px, 0
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above8:
    ; i_scalex > 8 << 16: general kernel. The position registers are reused
    ; under new names; the old names are removed so they cannot be used by
    ; mistake. Note that ymm_xfrac0_begin takes over the register that was
    ; ymm_xpos_int_begin and ymm_xfrac1_begin the one that was
    ; ymm_xpos_frac_begin; both are overwritten below before any use here.
%xdefine ymm_xfrac0 ymm_xpos_frac
%xdefine ymm_xfrac1 ymm_xpos_int
%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_frac
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xpos_int_inc
%undef ymm_xpos_frac_inc
    ; NOTE(review): AVX2_UnpckXFrac is defined outside this excerpt; from the
    ; operands it presumably expands the fraction words in ymm_xfrac0 into
    ; paired weights using ymm_7fff, writing ymm_tmp0 and ymm_xfrac1 - confirm
    ; against the macro.
    AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_7fff
    ; Both vpermq instructions swap the two 128-bit halves.
    vpermq          ymm_xfrac0, ymm_tmp0,   01001110b
    vpermq          ymm_xfrac1, ymm_xfrac1, 01001110b
    vmovdqa         ymm_xfrac0_begin, ymm_xfrac0
    vmovdqa         ymm_xfrac1_begin, ymm_xfrac1
    ; Turn the increment into alternating (-inc, +inc) word pairs:
    ; all-ones words are -1, so the multiply yields -inc, which is then
    ; interleaved with +inc.
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpmullw         ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
    vpunpcklwd      ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
    vmovdqa         ymm_xfrac_inc, ymm_tmp0
    ; No jump after this loop: execution falls through into .final_row.
    AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearAccurateDownsample_16px, 0

; Last output row: copied sample by sample without interpolation.
.final_row:
    ; p_src_row0 = p_src + (i_ypos >> 15) * i_src_stride
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    ; The horizontal position starts at 1 << 15 and has 16 fractional bits.
    mov             i_xpos, 1 << 15
    mov             i_width_cnt, i_dst_width

.final_row_width:
    ; *p_dst++ = p_src_row0[i_xpos >> 16]
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
    mov             [p_dst], r_tmp0b
    add             p_dst, 1
    add             i_xpos, i_scalex
    ; The count is tested after the body, so one pixel is always written.
    sub             i_width_cnt, 1
    jg              .final_row_width

; Common exit path.
.done:
    ; Clear the upper halves of the ymm registers before returning.
    vzeroupper
%ifdef X86_32
    ; Reload esp from the value stored at [esp].
    ; NOTE(review): presumably the caller's esp, saved there by the prologue
    ; (outside this excerpt) - confirm.
    mov             esp, [esp]
%endif
    ; POP_XMM and LOAD_7_PARA_POP are macros defined outside this excerpt;
    ; presumably they undo the matching prologue macros - confirm.
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
    ; Restore the general-purpose registers used above (rsi/rdi only on WIN64).
    ; NOTE(review): the order must be the reverse of the prologue pushes,
    ; which are outside this excerpt.
%ifdef WIN64
    pop             rsi
    pop             rdi
%endif
    pop             rbp
    pop             rbx
    pop             r13
    pop             r12
%endif
    ret
; Remove every alias used by the routine above so that none of these names
; stays defined for code assembled after this point. The ymm_xfrac* names are
; the ones introduced at .scalex_above8.
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef ymm_xpos_frac
%undef ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_int_inc
%undef ymm_yfrac0
%undef ymm_yfrac1
%undef xmm_tmp0
%undef ymm_tmp0
%undef xmm_tmp1
%undef ymm_tmp1
%undef xmm_tmp2
%undef ymm_tmp2
%undef xmm_tmp3
%undef ymm_tmp3
%undef xmm_tmp4
%undef ymm_tmp4
%undef xmm_tmp5
%undef ymm_tmp5
%undef ymm_0
%undef ymm_7fff
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xfrac0
%undef ymm_xfrac1
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
%endif