shithub: openh264

--- a/codec/processing/src/downsample/downsample.cpp

+++ b/codec/processing/src/downsample/downsample.cpp

@@ -57,21 +57,21 @@

   sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearFastDownsampler_c;

 #if defined(X86_ASM)

   if (iCpuFlag & WELS_CPU_SSE) {

-    /*  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse;

-      sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse;

-      sDownsampleFunc.pfHalfAverage[2]	= DyadicBilinearDownsamplerWidthx8_sse;*/

+    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse;

+    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse;

+    sDownsampleFunc.pfHalfAverage[2]	= DyadicBilinearDownsamplerWidthx8_sse;

   if (iCpuFlag & WELS_CPU_SSE2) {

-    //  sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;

-    //  sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;

+    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;

+    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;

   if (iCpuFlag & WELS_CPU_SSSE3) {

-    //  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_ssse3;

-    //  sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_ssse3;

+      sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_ssse3;

+      sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_ssse3;

   if (iCpuFlag & WELS_CPU_SSE41) {

-    //  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse4;

-    //  sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse4;

+    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse4;

+    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse4;

 #endif//X86_ASM

--- a/codec/processing/src/downsample/downsamplefuncs.cpp

+++ b/codec/processing/src/downsample/downsamplefuncs.cpp

@@ -202,31 +202,31 @@

 #ifdef X86_ASM

-//void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

-//    const int32_t kiDstHeight,

-//    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

-//  const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;

-//  const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);

-//

-//  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);

-//  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);

-//

-//  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,

-//                                       pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);

-//}

-//

-//void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

-//    const int32_t kiDstHeight,

-//    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

-//  const int32_t kiScaleBit = 15;

-//  const uint32_t kuiScale = (1 << kiScaleBit);

-//

-//  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);

-//  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);

-//

-//  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,

-//      pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);

-//}

+void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

+    const int32_t kiDstHeight,

+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

+  const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;

+  const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);

+  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);

+  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);

+  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,

+                                       pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);

+}

+void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

+    const int32_t kiDstHeight,

+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

+  const int32_t kiScaleBit = 15;

+  const uint32_t kuiScale = (1 << kiScaleBit);

+  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);

+  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);

+  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,

+      pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);

+}

 #endif //X86_ASM

 #ifdef HAVE_NEON

--- a/codec/processing/src/x86/downsample_bilinear.asm

+++ b/codec/processing/src/x86/downsample_bilinear.asm

@@ -39,7 +39,7 @@

;*

 ;*************************************************************************/

 %include "asm_inc.asm"

-%ifdef X86_32

 ;***********************************************************************

 ; Macros and other preprocessor constants

 ;***********************************************************************

@@ -64,6 +64,8 @@

     db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h

 shufb_mask_high:

     db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h

+add_extra_half:

+    dd 16384,0,0,0

 ;***********************************************************************

@@ -78,28 +80,36 @@

 ;                   const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse

-    push ebx

-    push edx

-    push esi

-    push edi

-    push ebp

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

-    mov edi, [esp+24]   ; pDst

-    mov edx, [esp+28]   ; iDstStride

-    mov esi, [esp+32]   ; pSrc

-    mov ecx, [esp+36]   ; iSrcStride

-    mov ebp, [esp+44]   ; iSrcHeight

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    sar r5, $01            ; iSrcHeight >> 1

-    sar ebp, $01            ; iSrcHeight >> 1

-.yloops:

-    mov eax, [esp+40]   ; iSrcWidth

-    sar eax, $01            ; iSrcWidth >> 1

-    mov ebx, eax        ; iDstWidth restored at ebx

-    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    neg ebx             ; - (iSrcWidth >> 1)

+.yloops1:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    sar r4, $01            ; iSrcWidth >> 1

+    mov r6, r4        ; iDstWidth restored at ebx

+    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

+    neg r6             ; - (iSrcWidth >> 1)

     ; each loop = source bandwidth: 32 bytes

-.xloops:

+.xloops1:

     ; 1st part horizonal loop: x16 bytes

     ;               mem  hi<-       ->lo

     ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E

@@ -108,10 +118,10 @@

     ;: H G F E D C B A, P O N M L K J I

     ;: h g f e d c b a, p o n m l k j i

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movq mm0, [esi]         ; 1st pSrc line

-    movq mm1, [esi+8]       ; 1st pSrc line + 8

-    movq mm2, [esi+ecx]     ; 2nd pSrc line

-    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8

+    movq mm0, [r2]         ; 1st pSrc line

+    movq mm1, [r2+8]       ; 1st pSrc line + 8

+    movq mm2, [r2+r3]     ; 2nd pSrc line

+    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8

     ; to handle mm0, mm1, mm2, mm3

     pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B

@@ -156,10 +166,10 @@

     ;: H G F E D C B A, P O N M L K J I

     ;: h g f e d c b a, p o n m l k j i

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movq mm1, [esi+16]      ; 1st pSrc line + 16

-    movq mm2, [esi+24]      ; 1st pSrc line + 24

-    movq mm3, [esi+ecx+16]  ; 2nd pSrc line + 16

-    movq mm4, [esi+ecx+24]  ; 2nd pSrc line + 24

+    movq mm1, [r2+16]      ; 1st pSrc line + 16

+    movq mm2, [r2+24]      ; 1st pSrc line + 24

+    movq mm3, [r2+r3+16]  ; 2nd pSrc line + 16

+    movq mm4, [r2+r3+24]  ; 2nd pSrc line + 24

     ; to handle mm1, mm2, mm3, mm4

     pshufw mm5, mm1, 0d8h   ; d D b B c C a A ; 11011000 B

@@ -196,31 +206,33 @@

     pavgb mm3, mm7      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2

     pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part

-    movq [edi  ], mm0

-    movq [edi+8], mm2

+    movq [r0  ], mm0

+    movq [r0+8], mm2

     ; next SMB

-    lea esi, [esi+32]

-    lea edi, [edi+16]

+    lea r2, [r2+32]

+    lea r0, [r0+16]

-    dec eax

-    jg near .xloops

+    dec r4

+    jg near .xloops1

     ; next line

-    lea esi, [esi+2*ecx]    ; next end of lines

-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]

-    lea edi, [edi+edx]

-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]

+    lea r2, [r2+2*r3]    ; next end of lines

+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec ebp

-    jg near .yloops

+    dec r5

+    jg near .yloops1

     WELSEMMS

-    pop ebp

-    pop edi

-    pop esi

-    pop edx

-    pop ebx

+%ifndef X86_32

+    pop r12

+%endif

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

ret

 ;***********************************************************************

@@ -229,28 +241,36 @@

 ;                     const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse

-    push ebx

-    push edx

-    push esi

-    push edi

-    push ebp

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

-    mov edi, [esp+24]   ; pDst

-    mov edx, [esp+28]   ; iDstStride

-    mov esi, [esp+32]   ; pSrc

-    mov ecx, [esp+36]   ; iSrcStride

-    mov ebp, [esp+44]   ; iSrcHeight

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    sar r5, $01            ; iSrcHeight >> 1

-    sar ebp, $01        ; iSrcHeight >> 1

-.yloops:

-    mov eax, [esp+40]   ; iSrcWidth

-    sar eax, $01        ; iSrcWidth >> 1

-    mov ebx, eax        ; iDstWidth restored at ebx

-    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb

-    neg ebx         ; - (iSrcWidth >> 1)

+.yloops2:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    sar r4, $01            ; iSrcWidth >> 1

+    mov r6, r4        ; iDstWidth restored at ebx

+    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb

+    neg r6             ; - (iSrcWidth >> 1)

     ; each loop = source bandwidth: 16 bytes

-.xloops:

+.xloops2:

     ; 1st part horizonal loop: x16 bytes

     ;               mem  hi<-       ->lo

     ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E

@@ -259,10 +279,10 @@

     ;: H G F E D C B A, P O N M L K J I

     ;: h g f e d c b a, p o n m l k j i

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movq mm0, [esi]         ; 1st pSrc line

-    movq mm1, [esi+8]       ; 1st pSrc line + 8

-    movq mm2, [esi+ecx]     ; 2nd pSrc line

-    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8

+    movq mm0, [r2]         ; 1st pSrc line

+    movq mm1, [r2+8]       ; 1st pSrc line + 8

+    movq mm2, [r2+r3]     ; 2nd pSrc line

+    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8

     ; to handle mm0, mm1, mm2, mm3

     pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B

@@ -299,30 +319,32 @@

     pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2

     pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once

-    movq [edi  ], mm0

+    movq [r0  ], mm0

     ; next SMB

-    lea esi, [esi+16]

-    lea edi, [edi+8]

+    lea r2, [r2+16]

+    lea r0, [r0+8]

-    dec eax

-    jg near .xloops

+    dec r4

+    jg near .xloops2

     ; next line

-    lea esi, [esi+2*ecx]    ; next end of lines

-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]

-    lea edi, [edi+edx]

-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]

+    lea r2, [r2+2*r3]    ; next end of lines

+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec ebp

-    jg near .yloops

+    dec r5

+    jg near .yloops2

     WELSEMMS

-    pop ebp

-    pop edi

-    pop esi

-    pop edx

-    pop ebx

+%ifndef X86_32

+    pop r12

+%endif

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

ret

 ;***********************************************************************

@@ -331,28 +353,36 @@

 ;                     const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse

-    push ebx

-    push edx

-    push esi

-    push edi

-    push ebp

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

-    mov edi, [esp+24]   ; pDst

-    mov edx, [esp+28]   ; iDstStride

-    mov esi, [esp+32]   ; pSrc

-    mov ecx, [esp+36]   ; iSrcStride

-    mov ebp, [esp+44]   ; iSrcHeight

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    sar r5, $01            ; iSrcHeight >> 1

-    sar ebp, $01        ; iSrcHeight >> 1

-.yloops:

-    mov eax, [esp+40]   ; iSrcWidth

-    sar eax, $01        ; iSrcWidth >> 1

-    mov ebx, eax        ; iDstWidth restored at ebx

-    sar eax, $02        ; (iSrcWidth >> 1) / 4      ; loop count = num_of_mb

-    neg ebx         ; - (iSrcWidth >> 1)

+.yloops3:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    sar r4, $01            ; iSrcWidth >> 1

+    mov r6, r4        ; iDstWidth restored at ebx

+    sar r4, $02            ; (iSrcWidth >> 1) / 4     ; loop count = num_of_mb

+    neg r6             ; - (iSrcWidth >> 1)

     ; each loop = source bandwidth: 8 bytes

-.xloops:

+.xloops3:

     ; 1st part horizonal loop: x8 bytes

     ;               mem  hi<-       ->lo

     ;1st Line Src:  mm0: d D c C b B a A

@@ -361,8 +391,8 @@

     ;: H G F E D C B A

     ;: h g f e d c b a

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movq mm0, [esi]         ; 1st pSrc line

-    movq mm1, [esi+ecx]     ; 2nd pSrc line

+    movq mm0, [r2]         ; 1st pSrc line

+    movq mm1, [r2+r3]     ; 2nd pSrc line

     ; to handle mm0, mm1, mm2, mm3

     pshufw mm2, mm0, 0d8h   ; d D b B c C a A ; 11011000 B

@@ -385,30 +415,32 @@

     pshufw mm1, mm0, 04eh   ; 01001110 B

     pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once

-    movd [edi], mm0

+    movd [r0], mm0

     ; next unit

-    lea esi, [esi+8]

-    lea edi, [edi+4]

+    lea r2, [r2+8]

+    lea r0, [r0+4]

-    dec eax

-    jg near .xloops

+    dec r4

+    jg near .xloops3

     ; next line

-    lea esi, [esi+2*ecx]    ; next end of lines

-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]

-    lea edi, [edi+edx]

-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]

+    lea r2, [r2+2*r3]    ; next end of lines

+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec ebp

-    jg near .yloops

+    dec r5

+    jg near .yloops3

     WELSEMMS

-    pop ebp

-    pop edi

-    pop esi

-    pop edx

-    pop ebx

+%ifndef X86_32

+    pop r12

+%endif

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

ret

@@ -420,31 +452,56 @@

 ;                   const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3

-    push ebx

-    push edx

-    push esi

-    push edi

-    push ebp

+    ;push ebx

+    ;push edx

+    ;push esi

+    ;push edi

+    ;push ebp

-    mov edi, [esp+24]   ; pDst

-    mov edx, [esp+28]   ; iDstStride

-    mov esi, [esp+32]   ; pSrc

-    mov ecx, [esp+36]   ; iSrcStride

-    mov ebp, [esp+44]   ; iSrcHeight

+    ;mov edi, [esp+24]   ; pDst

+    ;mov edx, [esp+28]   ; iDstStride

+    ;mov esi, [esp+32]   ; pSrc

+    ;mov ecx, [esp+36]   ; iSrcStride

+    ;mov ebp, [esp+44]   ; iSrcHeight

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

-    sar ebp, $01            ; iSrcHeight >> 1

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    sar r5, $01            ; iSrcHeight >> 1

     movdqa xmm7, [shufb_mask_low]   ; mask low

     movdqa xmm6, [shufb_mask_high]  ; mask high

-.yloops:

-    mov eax, [esp+40]   ; iSrcWidth

-    sar eax, $01            ; iSrcWidth >> 1

-    mov ebx, eax        ; iDstWidth restored at ebx

-    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    neg ebx             ; - (iSrcWidth >> 1)

+.yloops4:

+    ;mov eax, [esp+40]   ; iSrcWidth

+    ;sar eax, $01            ; iSrcWidth >> 1

+    ;mov ebx, eax        ; iDstWidth restored at ebx

+    ;sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

+    ;neg ebx             ; - (iSrcWidth >> 1)

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    sar r4, $01            ; iSrcWidth >> 1

+    mov r6, r4        ; iDstWidth restored at ebx

+    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

+    neg r6             ; - (iSrcWidth >> 1)

     ; each loop = source bandwidth: 32 bytes

-.xloops:

+.xloops4:

     ; 1st part horizonal loop: x16 bytes

     ;               mem  hi<-       ->lo

     ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

@@ -458,10 +515,10 @@

     ;: p ..                          a

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movdqa xmm0, [esi]          ; 1st_src_line

-    movdqa xmm1, [esi+16]       ; 1st_src_line + 16

-    movdqa xmm2, [esi+ecx]      ; 2nd_src_line

-    movdqa xmm3, [esi+ecx+16]   ; 2nd_src_line + 16

+    movdqa xmm0, [r2]          ; 1st_src_line

+    movdqa xmm1, [r2+16]       ; 1st_src_line + 16

+    movdqa xmm2, [r2+r3]      ; 2nd_src_line

+    movdqa xmm3, [r2+r3+16]   ; 2nd_src_line + 16

     ; packing & avg

     movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A

@@ -498,29 +555,33 @@

     pavgb xmm0, xmm2

     ; write pDst

-    movdqa [edi], xmm0

+    movdqa [r0], xmm0

     ; next SMB

-    lea esi, [esi+32]

-    lea edi, [edi+16]

+    lea r2, [r2+32]

+    lea r0, [r0+16]

-    dec eax

-    jg near .xloops

+    dec r4

+    jg near .xloops4

     ; next line

-    lea esi, [esi+2*ecx]    ; next end of lines

-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]

-    lea edi, [edi+edx]

-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]

+    lea r2, [r2+2*r3]    ; next end of lines

+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec ebp

-    jg near .yloops

+    dec r5

+    jg near .yloops4

-    pop ebp

-    pop edi

-    pop esi

-    pop edx

-    pop ebx

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

ret

 ;***********************************************************************

@@ -529,30 +590,39 @@

 ;                     const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3

-    push ebx

-    push edx

-    push esi

-    push edi

-    push ebp

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 6

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

-    mov edi, [esp+24]   ; pDst

-    mov edx, [esp+28]   ; iDstStride

-    mov esi, [esp+32]   ; pSrc

-    mov ecx, [esp+36]   ; iSrcStride

-    mov ebp, [esp+44]   ; iSrcHeight

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    sar r5, $01            ; iSrcHeight >> 1

+    movdqa xmm5, [shufb_mask_low]   ; mask low

+    movdqa xmm4, [shufb_mask_high]  ; mask high

-    sar ebp, $01        ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low

-    movdqa xmm6, [shufb_mask_high]  ; mask high

-.yloops:

-    mov eax, [esp+40]   ; iSrcWidth

-    sar eax, $01        ; iSrcWidth >> 1

-    mov ebx, eax        ; iDstWidth restored at ebx

-    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb

-    neg ebx         ; - (iSrcWidth >> 1)

+.yloops5:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    sar r4, $01            ; iSrcWidth >> 1

+    mov r6, r4        ; iDstWidth restored at ebx

+    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb

+    neg r6             ; - (iSrcWidth >> 1)

     ; each loop = source bandwidth: 16 bytes

-.xloops:

+.xloops5:

     ; horizonal loop: x16 bytes by source

     ;               mem  hi<-       ->lo

     ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A

@@ -562,13 +632,13 @@

     ;: h g f e d c b a, p o n m l k j i

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movdqa xmm0, [esi]          ; 1st_src_line

-    movdqa xmm1, [esi+ecx]      ; 2nd_src_line

+    movdqa xmm0, [r2]          ; 1st_src_line

+    movdqa xmm1, [r2+r3]      ; 2nd_src_line

     ; packing & avg

     movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

+    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

+    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

     ; another implementation for xmm2 high bits

 ;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

 ;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

@@ -575,8 +645,8 @@

     pavgb xmm0, xmm2

     movdqa xmm3, xmm1

-    pshufb xmm1, xmm7

-    pshufb xmm3, xmm6

+    pshufb xmm1, xmm5

+    pshufb xmm3, xmm4

 ;   psubb xmm3, xmm1

 ;   psrlw xmm3, 8

     pavgb xmm1, xmm3

@@ -585,29 +655,32 @@

     packuswb xmm0, xmm1

     ; write pDst

-    movq [edi], xmm0

+    movq [r0], xmm0

     ; next SMB

-    lea esi, [esi+16]

-    lea edi, [edi+8]

+    lea r2, [r2+16]

+    lea r0, [r0+8]

-    dec eax

-    jg near .xloops

+    dec r4

+    jg near .xloops5

-    ; next line

-    lea esi, [esi+2*ecx]    ; next end of lines

-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]

-    lea edi, [edi+edx]

-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]

+    lea r2, [r2+2*r3]    ; next end of lines

+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec ebp

-    jg near .yloops

+    dec r5

+    jg near .yloops5

-    pop ebp

-    pop edi

-    pop esi

-    pop edx

-    pop ebx

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

ret

 ; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse

@@ -617,31 +690,40 @@

 ;                   const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4

-    push ebx

-    push edx

-    push esi

-    push edi

-    push ebp

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

-    mov edi, [esp+24]   ; pDst

-    mov edx, [esp+28]   ; iDstStride

-    mov esi, [esp+32]   ; pSrc

-    mov ecx, [esp+36]   ; iSrcStride

-    mov ebp, [esp+44]   ; iSrcHeight

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    sar r5, $01            ; iSrcHeight >> 1

-    sar ebp, $01            ; iSrcHeight >> 1

     movdqa xmm7, [shufb_mask_low]   ; mask low

     movdqa xmm6, [shufb_mask_high]  ; mask high

-.yloops:

-    mov eax, [esp+40]   ; iSrcWidth

-    sar eax, $01            ; iSrcWidth >> 1

-    mov ebx, eax        ; iDstWidth restored at ebx

-    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    neg ebx             ; - (iSrcWidth >> 1)

+.yloops6:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    sar r4, $01            ; iSrcWidth >> 1

+    mov r6, r4        ; iDstWidth restored at ebx

+    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

+    neg r6             ; - (iSrcWidth >> 1)

     ; each loop = source bandwidth: 32 bytes

-.xloops:

+.xloops6:

     ; 1st part horizonal loop: x16 bytes

     ;               mem  hi<-       ->lo

     ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

@@ -655,10 +737,10 @@

     ;: p ..                          a

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movntdqa xmm0, [esi]            ; 1st_src_line

-    movntdqa xmm1, [esi+16]     ; 1st_src_line + 16

-    movntdqa xmm2, [esi+ecx]        ; 2nd_src_line

-    movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16

+    movntdqa xmm0, [r2]            ; 1st_src_line

+    movntdqa xmm1, [r2+16]     ; 1st_src_line + 16

+    movntdqa xmm2, [r2+r3]        ; 2nd_src_line

+    movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16

     ; packing & avg

     movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A

@@ -694,29 +776,32 @@

     pavgb xmm0, xmm2

     ; write pDst

-    movdqa [edi], xmm0

+    movdqa [r0], xmm0

     ; next SMB

-    lea esi, [esi+32]

-    lea edi, [edi+16]

+    lea r2, [r2+32]

+    lea r0, [r0+16]

-    dec eax

-    jg near .xloops

+    dec r4

+    jg near .xloops6

-    ; next line

-    lea esi, [esi+2*ecx]    ; next end of lines

-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]

-    lea edi, [edi+edx]

-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]

+    lea r2, [r2+2*r3]    ; next end of lines

+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec ebp

-    jg near .yloops

+    dec r5

+    jg near .yloops6

-    pop ebp

-    pop edi

-    pop esi

-    pop edx

-    pop ebx

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

ret

 ;***********************************************************************

@@ -725,30 +810,39 @@

 ;                     const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4

-    push ebx

-    push edx

-    push esi

-    push edi

-    push ebp

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 6

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

-    mov edi, [esp+24]   ; pDst

-    mov edx, [esp+28]   ; iDstStride

-    mov esi, [esp+32]   ; pSrc

-    mov ecx, [esp+36]   ; iSrcStride

-    mov ebp, [esp+44]   ; iSrcHeight

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    sar r5, $01            ; iSrcHeight >> 1

+    movdqa xmm5, [shufb_mask_low]   ; mask low

+    movdqa xmm4, [shufb_mask_high]  ; mask high

-    sar ebp, $01        ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low

-    movdqa xmm6, [shufb_mask_high]  ; mask high

-.yloops:

-    mov eax, [esp+40]   ; iSrcWidth

-    sar eax, $01        ; iSrcWidth >> 1

-    mov ebx, eax        ; iDstWidth restored at ebx

-    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb

-    neg ebx         ; - (iSrcWidth >> 1)

+.yloops7:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    sar r4, $01            ; iSrcWidth >> 1

+    mov r6, r4        ; iDstWidth restored at ebx

+    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb

+    neg r6             ; - (iSrcWidth >> 1)

     ; each loop = source bandwidth: 16 bytes

-.xloops:

+.xloops7:

     ; horizonal loop: x16 bytes by source

     ;               mem  hi<-       ->lo

     ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A

@@ -758,20 +852,20 @@

     ;: h g f e d c b a, p o n m l k j i

     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movntdqa xmm0, [esi]            ; 1st_src_line

-    movntdqa xmm1, [esi+ecx]        ; 2nd_src_line

+    movntdqa xmm0, [r2]            ; 1st_src_line

+    movntdqa xmm1, [r2+r3]        ; 2nd_src_line

     ; packing & avg

     movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

+    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

+    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

 ;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

 ;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

     pavgb xmm0, xmm2

     movdqa xmm3, xmm1

-    pshufb xmm1, xmm7

-    pshufb xmm3, xmm6

+    pshufb xmm1, xmm5

+    pshufb xmm3, xmm4

 ;   psubb xmm3, xmm1

 ;   psrlw xmm3, 8

     pavgb xmm1, xmm3

@@ -780,38 +874,40 @@

     packuswb xmm0, xmm1

     ; write pDst

-    movq [edi], xmm0

+    movq [r0], xmm0

     ; next SMB

-    lea esi, [esi+16]

-    lea edi, [edi+8]

+    lea r2, [r2+16]

+    lea r0, [r0+8]

-    dec eax

-    jg near .xloops

+    dec r4

+    jg near .xloops7

     ; next line

-    lea esi, [esi+2*ecx]    ; next end of lines

-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]

-    lea edi, [edi+edx]

-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]

+    lea r2, [r2+2*r3]    ; next end of lines

+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec ebp

-    jg near .yloops

+    dec r5

+    jg near .yloops7

-    pop ebp

-    pop edi

-    pop esi

-    pop edx

-    pop ebx

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

ret

+%ifdef X86_32

 ;**************************************************************************************************************

 ;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,

-;                           unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,

+;                           unsigned char* pSrc, const int iSrcStride,

 ;                           unsigned int uiScaleX, unsigned int uiScaleY );

;{

 ;**************************************************************************************************************

@@ -822,7 +918,7 @@

     push    edi

     push    ebx

 %define     pushsize    16

-%define     localsize   28

+%define     localsize   16

 %define     pDstData        esp + pushsize + localsize + 4

 %define     dwDstStride     esp + pushsize + localsize + 8

 %define     dwDstWidth      esp + pushsize + localsize + 12

@@ -829,19 +925,15 @@

 %define     dwDstHeight     esp + pushsize + localsize + 16

 %define     pSrcData        esp + pushsize + localsize + 20

 %define     dwSrcStride     esp + pushsize + localsize + 24

-%define     dwSrcWidth      esp + pushsize + localsize + 28

-%define     dwSrcHeight     esp + pushsize + localsize + 32

-%define     scale           esp + 0

-%define     uiScaleX            esp + pushsize + localsize + 36

-%define     uiScaleY            esp + pushsize + localsize + 40

-%define     tmpHeight       esp + 12

-%define     yInverse        esp + 16

-%define     xInverse        esp + 20

-%define     dstStep         esp + 24

+%define     uiScaleX            esp + pushsize + localsize + 28

+%define     uiScaleY            esp + pushsize + localsize + 32

+%define     tmpHeight       esp + 0

+%define     yInverse        esp + 4

+%define     xInverse        esp + 8

+%define     dstStep         esp + 12

     sub     esp,            localsize

     pxor    xmm0,   xmm0

-    mov     edx,    32767

     mov     eax,    [uiScaleX]

     and     eax,    32767

     mov     ebx,    eax

@@ -999,7 +1091,6 @@

 %undef      dwDstWidth

 %undef      dwDstHeight

 %undef      dwDstStride

-%undef      scale

 %undef      uiScaleX

 %undef      uiScaleY

 %undef      tmpHeight

@@ -1013,7 +1104,7 @@

 ;**************************************************************************************************************

 ;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,

-;               unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,

+;               unsigned char* pSrc, const int iSrcStride,

 ;               unsigned int uiScaleX, unsigned int uiScaleY );

;{

 ;**************************************************************************************************************

@@ -1024,7 +1115,7 @@

     push    edi

     push    ebx

 %define     pushsize    16

-%define     localsize   28

+%define     localsize   16

 %define     pDstData        esp + pushsize + localsize + 4

 %define     dwDstStride     esp + pushsize + localsize + 8

 %define     dwDstWidth      esp + pushsize + localsize + 12

@@ -1031,15 +1122,12 @@

 %define     dwDstHeight     esp + pushsize + localsize + 16

 %define     pSrcData        esp + pushsize + localsize + 20

 %define     dwSrcStride     esp + pushsize + localsize + 24

-%define     dwSrcWidth      esp + pushsize + localsize + 28

-%define     dwSrcHeight     esp + pushsize + localsize + 32

-%define     scale           esp + 0

-%define     uiScaleX            esp + pushsize + localsize + 36

-%define     uiScaleY            esp + pushsize + localsize + 40

-%define     tmpHeight       esp + 12

-%define     yInverse        esp + 16

-%define     xInverse        esp + 20

-%define     dstStep         esp + 24

+%define     uiScaleX            esp + pushsize + localsize + 28

+%define     uiScaleY            esp + pushsize + localsize + 32

+%define     tmpHeight       esp + 0

+%define     yInverse        esp + 4

+%define     xInverse        esp + 8

+%define     dstStep         esp + 12

     sub     esp,            localsize

     pxor    xmm0,   xmm0

@@ -1191,10 +1279,7 @@

 %undef      dwSrcHeight

 %undef      dwSrcStride

 %undef      pDstData

-%undef      dwDstWidth

-%undef      dwDstHeight

 %undef      dwDstStride

-%undef      scale

 %undef      uiScaleX

 %undef      uiScaleY

 %undef      tmpHeight

@@ -1201,5 +1286,613 @@

 %undef      yInverse

 %undef      xInverse

 %undef      dstStep

+    ret

+%elifdef  WIN64

+;**************************************************************************************************************

+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,

+;                           unsigned char* pSrc, const int iSrcStride,

+;                           unsigned int uiScaleX, unsigned int uiScaleY );

+;{

+;**************************************************************************************************************

+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2

+    push    r12

+    push    r13

+    push    r14

+    push    r15

+    push    rsi

+    push    rdi

+    push    rbx

+    push    rbp

+    %assign push_num 8

+    LOAD_7_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r2, r2d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r5, r5d

+    SIGN_EXTENSION r6, r6d

+    pxor    xmm0,   xmm0

+    mov     r12d,   r6d

+    and     r12d,   32767

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   32767

+    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 32767)

+    movd    xmm2,   r13d                     ; -uinc

+    psllq   xmm1,   32

+    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)

+    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

+    mov     r12,    arg8

+    SIGN_EXTENSION r12, r12d

+    mov     rbp,    r12

+    and     r12d,   32767

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   32767

+    movd    xmm6,       r12d                     ; vinc(uiScaleY mod 32767)

+    movd    xmm2,       r13d                     ; -vinc

+    psllq   xmm6,       32

+    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)

+    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

+    mov     r12d,        40003fffh

+    movd    xmm5,       r12d

+    punpcklwd   xmm5,   xmm0                    ; 16384 16383

+    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383

+DOWNSAMPLE:

+    sub     r1, r2                   ; stride - width

+    dec     r3

+    mov     r14,16384

+    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

+HEIGHT:

+    ;mov     r12, r4

+    mov     r12, r14

+    shr     r12,    15

+    imul    r12,    r5

+    add     r12,    r4                 ; get current row address

+    mov     r13,    r12

+    add     r13,    r5

+    mov     r15, 16384

+    mov     rsi, r2

+    dec     rsi

+    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

+WIDTH:

+    mov     rdi,        r15

+    shr     rdi,        15

+    movd    xmm1,       [r12+rdi]       ; xxxxxxba

+    movd    xmm2,       [r13+rdi]       ; xxxxxxdc

+    pxor    xmm0,       xmm0

+    punpcklwd   xmm1,   xmm2            ; xxxxdcba

+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

+    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a

+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv

+    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2

+    movdqa  xmm0,   xmm2

+    pmuludq xmm2,   xmm1

+    psrlq   xmm0,   32

+    psrlq   xmm1,   32

+    pmuludq xmm0,   xmm1

+    paddq   xmm2,   xmm0

+    pshufd  xmm1,   xmm2,   00001110b

+    paddq   xmm2,   xmm1

+    psrlq   xmm2,   29

+    movd    ebx,    xmm2

+    inc     ebx

+    shr     ebx,    1

+    mov     [r0],   bl

+    inc     r0

+    add      r15, r6

+    paddw   xmm3,       xmm7            ; inc u

+    psllw   xmm3,       1

+    psrlw   xmm3,       1

+    dec     rsi

+    jg      WIDTH

+WIDTH_END:

+    shr     r15, 15

+    mov     bl,  [r12+r15]

+    mov     [r0],bl

+    inc     r0

+    add     r14, rbp

+    add     r0,  r1

+    paddw   xmm4,   xmm6                ; inc v

+    psllw   xmm4,   1

+    psrlw   xmm4,   1

+    dec     r3

+    jg      HEIGHT

+LAST_ROW:

+    shr     r14, 15

+    imul    r14, r5

+    add     r4, r14

+    mov     r15, 16384

+LAST_ROW_WIDTH:

+    mov     rdi, r15

+    shr     rdi, 15

+    mov     bl,  [r4+rdi]

+    mov     [r0],bl

+    inc     r0

+    add     r15, r6

+    dec     r2

+    jg    LAST_ROW_WIDTH

+LAST_ROW_END:

+    POP_XMM

+    pop     rbp

+    pop     rbx

+    pop     rdi

+    pop     rsi

+    pop     r15

+    pop     r14

+    pop     r13

+    pop     r12

+    ret

+;**************************************************************************************************************

+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,

+;               unsigned char* pSrc, const int iSrcStride,

+;               unsigned int uiScaleX, unsigned int uiScaleY );

+;{

+;**************************************************************************************************************

+WELS_EXTERN GeneralBilinearFastDownsampler_sse2

+    push    r12

+    push    r13

+    push    r14

+    push    r15

+    push    rsi

+    push    rdi

+    push    rbx

+    push    rbp

+    %assign push_num 8

+    LOAD_7_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r2, r2d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r5, r5d

+    SIGN_EXTENSION r6, r6d

+    pxor    xmm0,   xmm0

+    mov     r12d,   r6d

+    and     r12d,   65535

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   65535

+    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 65536)

+    movd    xmm2,   r13d                     ; -uinc

+    psllq   xmm1,   32

+    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc

+    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

+    mov     r12,    arg8

+    SIGN_EXTENSION r12, r12d

+    mov     rbp,    r12

+    and     r12d,   32767

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   32767

+    movd    xmm6,       r12d                     ; vinc(uiScaleY mod 32767)

+    movd    xmm2,       r13d                     ; -vinc

+    psllq   xmm6,       32

+    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc

+    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

+    mov     r12d,       80007fffh               ; 32768 32767

+    movd    xmm5,       r12d

+    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767

+FAST_DOWNSAMPLE:

+    sub     r1, r2                   ; stride - width

+    dec     r3

+    mov     r14,16384

+    pshuflw xmm4,       xmm5,   01010000b

+    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

+FAST_HEIGHT:

+    mov     r12, r14

+    shr     r12,    15

+    imul    r12,    r5

+    add     r12,    r4                 ; get current row address

+    mov     r13,    r12

+    add     r13,    r5

+    mov     r15, 32768

+    mov     rsi, r2

+    dec     rsi

+    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

+FAST_WIDTH:

+    mov     rdi,        r15

+    shr     rdi,        16

+    movd    xmm1,       [r12+rdi]       ; xxxxxxba

+    movd    xmm2,       [r13+rdi]       ; xxxxxxdc

+    punpcklwd   xmm1,   xmm2            ; xxxxdcba

+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv

+    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2

+    pmaddwd     xmm2,   xmm1

+    pshufd  xmm1,   xmm2,   00000001b

+    paddd   xmm2,   xmm1

+    movdqa  xmm1,   [add_extra_half]

+    paddd   xmm2,   xmm1

+    psrld   xmm2,   15

+    packuswb    xmm2,   xmm0

+    movd    ebx,    xmm2

+    mov     [r0],  bl

+    inc     r0

+    add     r15, r6

+    paddw   xmm3,       xmm7            ; inc u

+    dec     rsi

+    jg      FAST_WIDTH

+FAST_WIDTH_END:

+    shr     r15, 16

+    mov     bl,  [r12+r15]

+    mov     [r0],bl

+    inc     r0

+    add     r14, rbp

+    add     r0,  r1

+    paddw   xmm4,   xmm6                ; inc v

+    psllw   xmm4,   1

+    psrlw   xmm4,   1

+    dec     r3

+    jg      FAST_HEIGHT

+FAST_LAST_ROW:

+    shr     r14, 15

+    imul    r14, r5

+    add     r4, r14

+    mov     r15, 32768

+FAST_LAST_ROW_WIDTH:

+    mov     rdi, r15

+    shr     rdi, 16

+    mov     bl,  [r4+rdi]

+    mov     [r0],bl

+    inc     r0

+    add     r15, r6

+    dec     r2

+    jg      FAST_LAST_ROW_WIDTH

+FAST_LAST_ROW_END:

+    POP_XMM

+    pop     rbp

+    pop     rbx

+    pop     rdi

+    pop     rsi

+    pop     r15

+    pop     r14

+    pop     r13

+    pop     r12

+    ret

+%elifdef  UNIX64

+;**************************************************************************************************************

+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,

+;                           unsigned char* pSrc, const int iSrcStride,

+;                           unsigned int uiScaleX, unsigned int uiScaleY );

+;{

+;**************************************************************************************************************

+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2

+    push    r12

+    push    r13

+    push    r14

+    push    r15

+    push    rbx

+    push    rbp

+    %assign push_num 6

+    LOAD_7_PARA

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r2, r2d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r5, r5d

+    SIGN_EXTENSION r6, r6d

+    pxor    xmm0,   xmm0

+    mov     r12d,   r6d

+    and     r12d,   32767

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   32767

+    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 32767)

+    movd    xmm2,   r13d                     ; -uinc

+    psllq   xmm1,   32

+    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)

+    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

+    mov     r12,    arg8

+    SIGN_EXTENSION r12, r12d

+    mov     rbp,    r12

+    and     r12d,   32767

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   32767

+    movd    xmm6,       r12d                     ; vinc(uiScaleY mod 32767)

+    movd    xmm2,       r13d                     ; -vinc

+    psllq   xmm6,       32

+    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)

+    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

+    mov     r12d,        40003fffh

+    movd    xmm5,       r12d

+    punpcklwd   xmm5,   xmm0                    ; 16384 16383

+    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383

+DOWNSAMPLE:

+    sub     r1, r2                   ; stride - width

+    dec     r3

+    mov     r14,16384

+    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

+HEIGHT:

+    ;mov     r12, r4

+    mov     r12, r14

+    shr     r12,    15

+    imul    r12,    r5

+    add     r12,    r4                 ; get current row address

+    mov     r13,    r12

+    add     r13,    r5

+    mov     r15, 16384

+    mov     rax, r2

+    dec     rax

+    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

+WIDTH:

+    mov     r11,        r15

+    shr     r11,        15

+    movd    xmm1,       [r12+r11]       ; xxxxxxba

+    movd    xmm2,       [r13+r11]       ; xxxxxxdc

+    pxor    xmm0,       xmm0

+    punpcklwd   xmm1,   xmm2            ; xxxxdcba

+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

+    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a

+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv

+    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2

+    movdqa  xmm0,   xmm2

+    pmuludq xmm2,   xmm1

+    psrlq   xmm0,   32

+    psrlq   xmm1,   32

+    pmuludq xmm0,   xmm1

+    paddq   xmm2,   xmm0

+    pshufd  xmm1,   xmm2,   00001110b

+    paddq   xmm2,   xmm1

+    psrlq   xmm2,   29

+    movd    ebx,    xmm2

+    inc     ebx

+    shr     ebx,    1

+    mov     [r0],   bl

+    inc     r0

+    add      r15, r6

+    paddw   xmm3,       xmm7            ; inc u

+    psllw   xmm3,       1

+    psrlw   xmm3,       1

+    dec     rax

+    jg      WIDTH

+WIDTH_END:

+    shr     r15, 15

+    mov     bl,  [r12+r15]

+    mov     [r0],bl

+    inc     r0

+    add     r14, rbp

+    add     r0,  r1

+    paddw   xmm4,   xmm6                ; inc v

+    psllw   xmm4,   1

+    psrlw   xmm4,   1

+    dec     r3

+    jg      HEIGHT

+LAST_ROW:

+    shr     r14, 15

+    imul    r14, r5

+    add     r4, r14

+    mov     r15, 16384

+LAST_ROW_WIDTH:

+    mov     r11, r15

+    shr     r11, 15

+    mov     bl,  [r4+r11]

+    mov     [r0],bl

+    inc     r0

+    add     r15, r6

+    dec     r2

+    jg    LAST_ROW_WIDTH

+LAST_ROW_END:

+    pop     rbp

+    pop     rbx

+    pop     r15

+    pop     r14

+    pop     r13

+    pop     r12

+    ret

+;**************************************************************************************************************

+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,

+;               unsigned char* pSrc, const int iSrcStride,

+;               unsigned int uiScaleX, unsigned int uiScaleY );

+;{

+;**************************************************************************************************************

+WELS_EXTERN GeneralBilinearFastDownsampler_sse2

+    push    r12

+    push    r13

+    push    r14

+    push    r15

+    push    rbx

+    push    rbp

+    %assign push_num 6

+    LOAD_7_PARA

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r2, r2d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r5, r5d

+    SIGN_EXTENSION r6, r6d

+    pxor    xmm0,   xmm0

+    mov     r12d,   r6d

+    and     r12d,   65535

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   65535

+    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 65536)

+    movd    xmm2,   r13d                     ; -uinc

+    psllq   xmm1,   32

+    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc

+    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

+    mov     r12,    arg8

+    SIGN_EXTENSION r12, r12d

+    mov     rbp,    r12

+    and     r12d,   32767

+    mov     r13d,   r12d

+    neg     r13d

+    and     r13d,   32767

+    movd    xmm6,       r12d                     ; vinc(uiScaleY mod 32767)

+    movd    xmm2,       r13d                     ; -vinc

+    psllq   xmm6,       32

+    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc

+    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

+    mov     r12d,       80007fffh               ; 32768 32767

+    movd    xmm5,       r12d

+    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767

+FAST_DOWNSAMPLE:

+    sub     r1, r2                   ; stride - width

+    dec     r3

+    mov     r14,16384

+    pshuflw xmm4,       xmm5,   01010000b

+    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

+FAST_HEIGHT:

+    mov     r12, r14

+    shr     r12,    15

+    imul    r12,    r5

+    add     r12,    r4                 ; get current row address

+    mov     r13,    r12

+    add     r13,    r5

+    mov     r15, 32768

+    mov     rax, r2

+    dec     rax

+    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

+FAST_WIDTH:

+    mov     r11,        r15

+    shr     r11,        16

+    movd    xmm1,       [r12+r11]       ; xxxxxxba

+    movd    xmm2,       [r13+r11]       ; xxxxxxdc

+    punpcklwd   xmm1,   xmm2            ; xxxxdcba

+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv

+    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2

+    pmaddwd     xmm2,   xmm1

+    pshufd  xmm1,   xmm2,   00000001b

+    paddd   xmm2,   xmm1

+    movdqa  xmm1,   [add_extra_half]

+    paddd   xmm2,   xmm1

+    psrld   xmm2,   15

+    packuswb    xmm2,   xmm0

+    movd    ebx,    xmm2

+    mov     [r0],  bl

+    inc     r0

+    add     r15, r6

+    paddw   xmm3,       xmm7            ; inc u

+    dec     rax

+    jg      FAST_WIDTH

+FAST_WIDTH_END:

+    shr     r15, 16

+    mov     bl,  [r12+r15]

+    mov     [r0],bl

+    inc     r0

+    add     r14, rbp

+    add     r0,  r1

+    paddw   xmm4,   xmm6                ; inc v

+    psllw   xmm4,   1

+    psrlw   xmm4,   1

+    dec     r3

+    jg      FAST_HEIGHT

+FAST_LAST_ROW:

+    shr     r14, 15

+    imul    r14, r5

+    add     r4, r14

+    mov     r15, 32768

+FAST_LAST_ROW_WIDTH:

+    mov     r11, r15

+    shr     r11, 16

+    mov     bl,  [r4+r11]

+    mov     [r0],bl

+    inc     r0

+    add     r15, r6

+    dec     r2

+    jg      FAST_LAST_ROW_WIDTH

+FAST_LAST_ROW_END:

+    pop     rbp

+    pop     rbx

+    pop     r15

+    pop     r14

+    pop     r13

+    pop     r12

ret

 %endif

--

⑨