shithub: openh264


ref: 39c2fb3d6bb60b45ed5b046839b23e51e5ab23ce
parent: c17a58efdfa8c03e3a1ae8e7f78483d48700499c
parent: 563376df0c45ce1cc26200a36d99526c9943f2ba
author: ruil2 <ruil2@cisco.com>
date: Fri May 27 11:17:31 EDT 2016

Merge pull request #2472 from saamas/processing-x86-general-bilinear-downsample-optimizations

[Processing/x86] GeneralBilinearDownsample optimizations
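
For orientation, the refactor below replaces the per-ISA wrapper functions in downsamplefuncs.cpp with one shared helper plus two defining macros. The helper converts the source-to-destination ratio into fixed-point step values (16 fractional bits horizontally and 15 vertically for the fast kernels, 15/15 for the accurate kernels) and forwards them to the selected SIMD routine. A minimal standalone C++ sketch of that computation follows; it uses std::lround in place of the project's WELS_ROUND macro, and the PixelKernel typedef and function name are hypothetical, chosen only to mirror the kernel signature added by this patch.

#include <cmath>
#include <cstdint>

// Signature shared by the low-level kernels added here
// (GeneralBilinearFastDownsampler_ssse3 / _avx2, etc.).
typedef void (*PixelKernel) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
                             int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride,
                             uint32_t uiScaleX, uint32_t uiScaleY);

// Sketch of the shared wrapper: turn the resize ratio into fixed-point
// increments, then call the chosen kernel. kiScaleBitWidth/Height are
// 16/15 for the "fast" kernels and 15/15 for the "accurate" ones.
static void BilinearDownsampleWrapSketch (uint8_t* pDst, int32_t iDstStride,
    int32_t iDstWidth, int32_t iDstHeight,
    uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight,
    int32_t kiScaleBitWidth, int32_t kiScaleBitHeight, PixelKernel func) {
  const uint32_t kuiScaleWidth  = 1u << kiScaleBitWidth;
  const uint32_t kuiScaleHeight = 1u << kiScaleBitHeight;
  // Example: a 1920 -> 1280 resize with 16 fractional bits gives
  // uiScaleX = round(1.5 * 65536) = 98304.
  uint32_t uiScaleX = (uint32_t) std::lround ((double) iSrcWidth  / iDstWidth  * kuiScaleWidth);
  uint32_t uiScaleY = (uint32_t) std::lround ((double) iSrcHeight / iDstHeight * kuiScaleHeight);
  func (pDst, iDstStride, iDstWidth, iDstHeight, pSrc, iSrcStride, uiScaleX, uiScaleY);
}

In the patch itself, GeneralBilinearDownsamplerWrap plays this role, and the DEFINE_GENERAL_BILINEAR_FAST/ACCURATE_DOWNSAMPLER_WRAP macros instantiate thin per-ISA wrappers (sse2, ssse3, sse41, avx2, neon, AArch64_neon) around it.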

--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -478,6 +478,12 @@
     %endif
 %endmacro
 
+%macro ZERO_EXTENSION 1
+    %ifndef X86_32
+        mov dword %1, %1
+    %endif
+%endmacro
+
 %macro WELS_EXTERN 1
     ALIGN 16
     %ifdef PREFIX
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -100,6 +100,7 @@
     sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_ssse3;
     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_ssse3;
+    sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsamplerWrap_ssse3;
   }
   if (iCpuFlag & WELS_CPU_SSE41) {
     sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse4;
@@ -106,6 +107,11 @@
     sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse4;
     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;
+    sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;
+  }
+  if (iCpuFlag & WELS_CPU_AVX2) {
+    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2;
+    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_avx2;
   }
 #endif//X86_ASM
 
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -101,6 +101,10 @@
 
 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2;
 
 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_ssse3;
 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_sse4;
@@ -114,6 +118,18 @@
 void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
     const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
     const uint32_t kuiScaleY);
+void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+    uint32_t uiScaleY);
+void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+    uint32_t uiScaleY);
+void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+    uint32_t uiScaleY);
+void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+    uint32_t uiScaleY);
 
 WELSVP_EXTERN_C_END
 #endif
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -247,58 +247,52 @@
   }
 }
 
-
-#ifdef X86_ASM
-void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+#if defined(X86_ASM) || defined(HAVE_NEON) || defined(HAVE_NEON_AARCH64)
+static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
     const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+    const int32_t kiScaleBitWidth, const int32_t kiScaleBitHeight,
+    void (*func) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight,
+                  uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY)) {
   const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
 
   uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
   uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
 
-  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
-                                       pSrc, kiSrcStride, uiScalex, uiScaley);
+  func (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
 }
 
-void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const int32_t kiScaleBit = 15;
-  const uint32_t kuiScale = (1 << kiScaleBit);
+#define DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP(suffix) \
+  void GeneralBilinearFastDownsamplerWrap_ ## suffix ( \
+      uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
+      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
+    GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
+        pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 16, 15, GeneralBilinearFastDownsampler_ ## suffix); \
+  }
 
-  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
-  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
+#define DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP(suffix) \
+  void GeneralBilinearAccurateDownsamplerWrap_ ## suffix ( \
+      uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
+      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
+    GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
+        pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 15, 15, GeneralBilinearAccurateDownsampler_ ## suffix); \
+  }
+#endif
 
-  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
-      pSrc, kiSrcStride, uiScalex, uiScaley);
-}
+#ifdef X86_ASM
+DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
+DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
+DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
 #endif //X86_ASM
 
 #ifdef HAVE_NEON
-void GeneralBilinearAccurateDownsamplerWrap_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const int32_t kiScaleBit = 15;
-  const uint32_t kuiScale = (1 << kiScaleBit);
-  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
-  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
-  GeneralBilinearAccurateDownsampler_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex,
-      uiScaley);
-}
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (neon)
 #endif
 
 #ifdef HAVE_NEON_AARCH64
-void GeneralBilinearAccurateDownsamplerWrap_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
-    const int32_t kiDstWidth, const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const int32_t kiScaleBit = 15;
-  const uint32_t kuiScale = (1 << kiScaleBit);
-  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
-  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
-  GeneralBilinearAccurateDownsampler_AArch64_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride,
-      uiScalex, uiScaley);
-}
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (AArch64_neon)
 #endif
 WELSVP_NAMESPACE_END
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -53,13 +53,23 @@
 ; Local Data (Read Only)
 ;***********************************************************************
 
-SECTION .rodata align=16
+SECTION .rodata align=32
 
 ;***********************************************************************
 ; Various memory constants (trigonometric values or rounding values)
 ;***********************************************************************
 
-ALIGN 16
+ALIGN 32
+db80h_256:
+    times 32 db 80h
+shufb_0000000088888888:
+    times 8 db 0
+    times 8 db 8
+shufb_000044448888CCCC:
+    times 4 db 0
+    times 4 db 4
+    times 4 db 8
+    times 4 db 12
 shufb_mask_low:
     db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
 shufb_mask_high:
@@ -2595,3 +2605,2260 @@
 %endif
     ret
 
+; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
+%macro SSE2_BilinearIncXposuw 5
+    movdqa          %5, %2
+    paddw           %2, %4
+    paddusw         %5, %4
+    pcmpeqw         %5, %2
+    paddb           %1, %3
+    paddb           %1, %5  ; subtract 1 if no carry
+%endmacro
+
+; outl=%1 outh=%2 in=%3
+%macro SSE2_UnpckXFracuw 3
+    pcmpeqw         %1, %1
+    pxor            %1, %3
+    movdqa          %2, %1
+    punpcklwd       %1, %3
+    punpckhwd       %2, %3
+%endmacro
+
+; [in:xfrac out:xyfrac0]=%1 [out:xyfrac1]=%2 yfrac0=%3 yfrac1=%4
+%macro SSE2_BilinearFastCalcXYFrac 4
+    movdqa          %2, %1
+    pmulhuw         %1, %3
+    pmulhuw         %2, %4
+%endmacro
+
+; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
+%macro SSE2_BilinearFastPackDwordsToBytes 3
+    psrld           %1, 14
+    psrld           %2, 14
+    packssdw        %1, %2
+    pavgw           %1, %3
+    packuswb        %1, %1
+%endmacro
+
+%macro SSSE3_BilinearFastDownsample2xOrLess_8px 0
+    movdqa          xmm_tmp0, xmm_xpos_int
+    pshufb          xmm_tmp0, xmm_0
+    psubb           xmm_xpos_int, xmm_tmp0
+    SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
+    mov             r_tmp0, i_xpos
+    lea             i_xpos, [i_xpos + 8 * i_scalex]
+    shr             r_tmp0, 16
+    movdqu          xmm_tmp4, [p_src_row0 + r_tmp0]
+    pshufb          xmm_tmp4, xmm_xpos_int
+    movdqa          xmm_tmp5, xmm_tmp4
+    punpcklbw       xmm_tmp4, xmm_0
+    punpckhbw       xmm_tmp5, xmm_0
+    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp3, xmm_yfrac0, xmm_yfrac1
+    pmaddwd         xmm_tmp0, xmm_tmp4
+    pmaddwd         xmm_tmp1, xmm_tmp5
+    movdqu          xmm_tmp4, [p_src_row1 + r_tmp0]
+    pshufb          xmm_tmp4, xmm_xpos_int
+    movdqa          xmm_tmp5, xmm_tmp4
+    punpcklbw       xmm_tmp4, xmm_0
+    punpckhbw       xmm_tmp5, xmm_0
+    pmaddwd         xmm_tmp2, xmm_tmp4
+    pmaddwd         xmm_tmp3, xmm_tmp5
+    paddd           xmm_tmp0, xmm_tmp2
+    paddd           xmm_tmp1, xmm_tmp3
+    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
+%endmacro
+
+%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
+    movdqa          xmm_tmp0, xmm_xpos_int
+    pshufb          xmm_tmp0, [shufb_0000000088888888]
+    psubb           xmm_xpos_int, xmm_tmp0
+    SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movdqu          xmm_tmp3, [p_src_row0 + r_tmp0]
+    movdqu          xmm_tmp4, [p_src_row1 + r_tmp0]
+    movdqa          xmm_tmp2, xmm_xpos_int
+    punpcklbw       xmm_tmp2, [db80h_256]
+    pshufb          xmm_tmp3, xmm_tmp2
+    pshufb          xmm_tmp4, xmm_tmp2
+    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+    pmaddwd         xmm_tmp0, xmm_tmp3
+    pmaddwd         xmm_tmp2, xmm_tmp4
+    paddd           xmm_tmp0, xmm_tmp2
+    lea             r_tmp0, [i_xpos + 4 * i_scalex]
+    lea             i_xpos, [i_xpos + 8 * i_scalex]
+    shr             r_tmp0, 16
+    movdqu          xmm_tmp3, [p_src_row0 + r_tmp0]
+    movdqu          xmm_tmp4, [p_src_row1 + r_tmp0]
+    movdqa          xmm_tmp2, xmm_xpos_int
+    punpckhbw       xmm_tmp2, [db80h_256]
+    pshufb          xmm_tmp3, xmm_tmp2
+    pshufb          xmm_tmp4, xmm_tmp2
+    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+    pmaddwd         xmm_tmp1, xmm_tmp3
+    pmaddwd         xmm_tmp2, xmm_tmp4
+    paddd           xmm_tmp1, xmm_tmp2
+    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
+%endmacro
+
+%macro SSE2_GeneralBilinearFastDownsample_8px 0
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movd            xmm_tmp3, [p_src_row0 + r_tmp0]
+    movd            xmm_tmp4, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 1
+    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 2
+    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 3
+    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 3
+    punpcklbw       xmm_tmp3, xmm_0
+    punpcklbw       xmm_tmp4, xmm_0
+    movdqa          xmm_tmp0, xmm_xfrac0
+    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+    pmaddwd         xmm_tmp0, xmm_tmp3
+    pmaddwd         xmm_tmp2, xmm_tmp4
+    paddd           xmm_tmp0, xmm_tmp2
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movd            xmm_tmp3, [p_src_row0 + r_tmp0]
+    movd            xmm_tmp4, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 1
+    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 2
+    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 3
+    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 3
+    punpcklbw       xmm_tmp3, xmm_0
+    punpcklbw       xmm_tmp4, xmm_0
+    movdqa          xmm_tmp1, xmm_xfrac1
+    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+    pmaddwd         xmm_tmp1, xmm_tmp3
+    pmaddwd         xmm_tmp2, xmm_tmp4
+    paddd           xmm_tmp1, xmm_tmp2
+    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    paddw           xmm_xfrac0, xmm_xfrac_inc
+    paddw           xmm_xfrac1, xmm_xfrac_inc
+%endmacro
+
+; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
+%macro SSE2_BilinearIncXposw 6
+    pxor            %6, %6
+    paddw           %2, %4
+    pcmpgtw         %6, %2
+    paddb           %1, %3
+    psubb           %1, %6  ; add carry
+    pand            %2, %5
+%endmacro
+
+; outl=%1 outh=%2 in=%3 7FFFh=%4
+%macro SSE2_UnpckXFracw 4
+    movdqa          %1, %3
+    pxor            %1, %4
+    movdqa          %2, %1
+    punpcklwd       %1, %3
+    punpckhwd       %2, %3
+%endmacro
+
+; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
+%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
+    pshufd          %1, %2, 10110001b
+    pshufd          %6, %3, 10110001b
+    pmuludq         %1, %4
+    pmuludq         %6, %5
+    paddq           %1, %6
+    pmuludq         %2, %4
+    pmuludq         %3, %5
+    paddq           %2, %3
+    psllq           %1,  3
+    psrlq           %2, 29
+    blendps         %1, %2, 0101b
+%endmacro
+
+%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
+    movdqa          xmm_tmp0, xmm_xpos_int
+    pshufb          xmm_tmp0, xmm_0
+    psubb           xmm_xpos_int, xmm_tmp0
+    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+    mov             r_tmp0, i_xpos
+    lea             i_xpos, [i_xpos + 8 * i_scalex]
+    shr             r_tmp0, 16
+    movdqu          xmm_tmp4, [p_src_row0 + r_tmp0]
+    pshufb          xmm_tmp4, xmm_xpos_int
+    movdqa          xmm_tmp5, xmm_tmp4
+    punpcklbw       xmm_tmp4, xmm_0
+    punpckhbw       xmm_tmp5, xmm_0
+    pmaddwd         xmm_tmp4, xmm_tmp0
+    pmaddwd         xmm_tmp5, xmm_tmp1
+    movdqu          xmm_tmp2, [p_src_row1 + r_tmp0]
+    pshufb          xmm_tmp2, xmm_xpos_int
+    movdqa          xmm_tmp3, xmm_tmp2
+    punpcklbw       xmm_tmp2, xmm_0
+    punpckhbw       xmm_tmp3, xmm_0
+    pmaddwd         xmm_tmp2, xmm_tmp0
+    pmaddwd         xmm_tmp3, xmm_tmp1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+    packssdw        xmm_tmp0, xmm_tmp1
+    pavgw           xmm_tmp0, xmm_0
+    packuswb        xmm_tmp0, xmm_tmp0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
+%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
+    movdqa          xmm_tmp0, xmm_xpos_int
+    pshufb          xmm_tmp0, [shufb_0000000088888888]
+    psubb           xmm_xpos_int, xmm_tmp0
+    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movdqa          xmm_tmp3, xmm_xpos_int
+    punpcklbw       xmm_tmp3, [db80h_256]
+    movdqu          xmm_tmp4, [p_src_row0 + r_tmp0]
+    movdqu          xmm_tmp2, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex]
+    lea             i_xpos, [i_xpos + 8 * i_scalex]
+    shr             r_tmp0, 16
+    pshufb          xmm_tmp4, xmm_tmp3
+    pshufb          xmm_tmp2, xmm_tmp3
+    pmaddwd         xmm_tmp4, xmm_tmp0
+    pmaddwd         xmm_tmp2, xmm_tmp0
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+    movdqa          xmm_tmp2, xmm_xpos_int
+    punpckhbw       xmm_tmp2, [db80h_256]
+    movdqu          xmm_tmp4, [p_src_row0 + r_tmp0]
+    movdqu          xmm_tmp3, [p_src_row1 + r_tmp0]
+    pshufb          xmm_tmp4, xmm_tmp2
+    pshufb          xmm_tmp3, xmm_tmp2
+    pmaddwd         xmm_tmp4, xmm_tmp1
+    pmaddwd         xmm_tmp3, xmm_tmp1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+    packssdw        xmm_tmp0, xmm_tmp1
+    pavgw           xmm_tmp0, xmm_0
+    packuswb        xmm_tmp0, xmm_tmp0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
+%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movd            xmm_tmp4, [p_src_row0 + r_tmp0]
+    movd            xmm_tmp2, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1
+    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2
+    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
+    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 3
+    punpcklbw       xmm_tmp4, xmm_0
+    punpcklbw       xmm_tmp2, xmm_0
+    pmaddwd         xmm_tmp4, xmm_xfrac0
+    pmaddwd         xmm_tmp2, xmm_xfrac0
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movd            xmm_tmp4, [p_src_row0 + r_tmp0]
+    movd            xmm_tmp3, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1
+    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2
+    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
+    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 3
+    punpcklbw       xmm_tmp4, xmm_0
+    punpcklbw       xmm_tmp3, xmm_0
+    pmaddwd         xmm_tmp4, xmm_xfrac1
+    pmaddwd         xmm_tmp3, xmm_xfrac1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+    packssdw        xmm_tmp0, xmm_tmp1
+    pavgw           xmm_tmp0, xmm_0
+    packuswb        xmm_tmp0, xmm_tmp0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    paddw           xmm_xfrac0, xmm_xfrac_inc
+    paddw           xmm_xfrac1, xmm_xfrac_inc
+    pand            xmm_xfrac0, xmm_7fff
+    pand            xmm_xfrac1, xmm_7fff
+%endmacro
+
+; downsample_8px_macro=%1 b_fast=%2
+%macro SSE2_GeneralBilinearDownsampler_loop 2
+%%height:
+    mov             p_src_row0, i_ypos
+    shr             p_src_row0, 15
+    imul            p_src_row0, i_src_stride
+    add             p_src_row0, p_src
+    mov             p_src_row1, p_src_row0
+    add             p_src_row1, i_src_stride
+    movd            xmm_tmp1, i_yposd
+%if %2
+    pshuflw         xmm_tmp1, xmm_tmp1, 0
+    psllw           xmm_tmp1, 1
+    psrlw           xmm_tmp1, 1
+%else
+    pslld           xmm_tmp1, 17
+    psrld           xmm_tmp1, 17
+%endif
+%ifdef X86_32
+    pshufd          xmm_tmp1, xmm_tmp1, 0
+    pcmpeqw         xmm_tmp0, xmm_tmp0
+%if %2
+    psrlw           xmm_tmp0, 1
+%else
+    psrld           xmm_tmp0, 17
+%endif
+    pxor            xmm_tmp0, xmm_tmp1
+    movdqa          xmm_yfrac0, xmm_tmp0
+    movdqa          xmm_yfrac1, xmm_tmp1
+%else
+    pshufd          xmm_yfrac1, xmm_tmp1, 0
+    pcmpeqw         xmm_yfrac0, xmm_yfrac0
+%if %2
+    psrlw           xmm_yfrac0, 1
+%else
+    psrld           xmm_yfrac0, 17
+%endif
+    pxor            xmm_yfrac0, xmm_yfrac1
+%endif
+
+    mov             i_xpos, 1 << 15
+    mov             i_width_cnt, i_dst_width
+    sub             i_width_cnt, 1
+
+%ifdef xmm_xpos_int
+    movdqa          xmm_xpos_int, xmm_xpos_int_begin
+    movdqa          xmm_xpos_frac, xmm_xpos_frac_begin
+%else
+    movdqa          xmm_xfrac0, xmm_xfrac0_begin
+    movdqa          xmm_xfrac1, xmm_xfrac1_begin
+%endif
+
+%%width:
+    %1
+    sub             i_width_cnt, 8
+    jg              %%width
+
+    lea             p_dst, [p_dst + i_width_cnt + 1]
+    imul            i_width_cnt, i_scalex
+    add             i_xpos, i_width_cnt
+    shr             i_xpos, 16
+    movzx           r_tmp0, byte [p_src_row0 + i_xpos]
+    mov             [p_dst - 1], r_tmp0b
+%ifdef X86_32
+    mov             r_tmp0, i_scaleyd
+    add             i_yposd, r_tmp0
+%else
+    add             i_yposd, i_scaleyd
+%endif
+    add             p_dst, i_dst_stride_less_width
+    sub             i_dst_height, 1
+    jg              %%height
+%endmacro
+
+;**************************************************************************************************************
+;void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+;    uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearFastDownsampler_ssse3
+    %assign push_num 0
+%ifndef X86_32
+    push            r12
+    push            r13
+    push            rbx
+    push            rbp
+    %assign push_num 4
+%ifdef WIN64
+    push            rdi
+    push            rsi
+    %assign push_num push_num + 2
+%endif
+%endif
+    LOAD_7_PARA
+    PUSH_XMM 16
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    ZERO_EXTENSION  r6d
+    sub             r1, r2                                            ; dst_stride - dst_width
+%ifdef X86_32
+    movd            xmm0, arg8
+    movd            xmm1, esp
+    and             esp, -16
+    sub             esp, 8 * 4 + 7 * 16
+    movd            [esp], xmm1
+    %define p_dst                   r0
+    %define i_dst_stride_less_width [esp + 1 * 4]
+    %define i_dst_width             [esp + 2 * 4]
+    %define i_dst_height            dword [esp + 3 * 4]
+    %define p_src                   [esp + 4 * 4]
+    %define i_src_stride            [esp + 5 * 4]
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               [esp + 6 * 4]
+    %define i_xpos                  r2
+    %define i_ypos                  dword [esp + 7 * 4]
+    %define i_yposd                 dword [esp + 7 * 4]
+    %define p_src_row0              r3
+    %define p_src_row1              r4
+    %define i_width_cnt             r5
+    %define r_tmp0                  r1
+    %define r_tmp0b                 r1b
+    %define xmm_xpos_frac           xmm1
+    %define xmm_xpos_frac_inc       [esp + 8 * 4]
+    %define xmm_xpos_int            xmm3
+    %define xmm_xpos_int_inc        [esp + 8 * 4 + 1 * 16]
+    %define xmm_yfrac0              [esp + 8 * 4 + 2 * 16]
+    %define xmm_yfrac1              [esp + 8 * 4 + 3 * 16]
+    %define xmm_tmp0                xmm7
+    %define xmm_tmp1                xmm0
+    %define xmm_tmp2                xmm2
+    %define xmm_tmp3                xmm4
+    %define xmm_tmp4                xmm5
+    %define xmm_tmp5                xmm6
+    %define xmm_0                   [esp + 8 * 4 + 4 * 16]
+    %define xmm_xpos_int_begin      [esp + 8 * 4 + 5 * 16]
+    %define xmm_xpos_frac_begin     [esp + 8 * 4 + 6 * 16]
+    mov             i_dst_stride_less_width, r1
+    mov             i_dst_width, r2
+    mov             i_dst_height, r3
+    mov             p_src, r4
+    mov             i_src_stride, r5
+    movd            i_scaleyd, xmm0
+    pxor            xmm_tmp0, xmm_tmp0
+    movdqa          xmm_0, xmm_tmp0
+%else
+    %define p_dst                   r0
+    %define i_dst_stride_less_width r1
+    %define i_dst_width             r2
+    %define i_dst_height            r3
+    %define p_src                   r4
+    %define i_src_stride            r5
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               dword arg8d
+    %define i_xpos                  r12
+    %define i_ypos                  r13
+    %define i_yposd                 r13d
+    %define p_src_row0              rbp
+%ifdef WIN64
+    %define p_src_row1              rsi
+    %define i_width_cnt             rdi
+%else
+    %define p_src_row1              r11
+    %define i_width_cnt             rax
+%endif
+    %define r_tmp0                  rbx
+    %define r_tmp0b                 bl
+    %define xmm_0                   xmm0
+    %define xmm_xpos_frac           xmm1
+    %define xmm_xpos_frac_inc       xmm8
+    %define xmm_xpos_int            xmm3
+    %define xmm_xpos_int_inc        xmm10
+    %define xmm_yfrac0              xmm11
+    %define xmm_yfrac1              xmm12
+    %define xmm_tmp0                xmm7
+    %define xmm_tmp1                xmm2
+    %define xmm_tmp2                xmm9
+    %define xmm_tmp3                xmm4
+    %define xmm_tmp4                xmm5
+    %define xmm_tmp5                xmm6
+    %define xmm_xpos_int_begin      xmm14
+    %define xmm_xpos_frac_begin     xmm15
+    pxor            xmm_0, xmm_0
+%endif
+
+    sub             i_dst_height, 1
+    je              .final_row
+    jl              .done
+
+    mov             i_ypos, 1 << 14
+    movd            xmm_xpos_frac, i_scalexd
+    pshufd          xmm_xpos_frac, xmm_xpos_frac, 0
+    movdqa          xmm_tmp0, xmm_xpos_frac
+    pslld           xmm_tmp0, 2
+    pslldq          xmm_xpos_frac, 4
+    paddd           xmm_tmp0, xmm_xpos_frac
+    movdqa          xmm_tmp1, xmm_xpos_frac
+    pslldq          xmm_tmp1, 4
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    pslldq          xmm_tmp1, 4
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    pcmpeqw         xmm_tmp1, xmm_tmp1
+    psrld           xmm_tmp1, 31
+    pslld           xmm_tmp1, 15
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    movdqa          xmm_xpos_int, xmm_xpos_frac
+    movdqa          xmm_tmp1, xmm_tmp0
+    psrld           xmm_xpos_int, 16
+    psrld           xmm_tmp1, 16
+    packssdw        xmm_xpos_int, xmm_tmp1
+    packuswb        xmm_xpos_int, xmm_xpos_int
+    movdqa          xmm_tmp1, xmm_xpos_int
+    pcmpeqw         xmm_tmp2, xmm_tmp2
+    psubb           xmm_tmp1, xmm_tmp2
+    punpcklbw       xmm_xpos_int, xmm_tmp1
+    pslld           xmm_xpos_frac, 16
+    pslld           xmm_tmp0, 16
+    psrad           xmm_xpos_frac, 16
+    psrad           xmm_tmp0, 16
+    packssdw        xmm_xpos_frac, xmm_tmp0
+    movd            xmm_tmp0, i_scalexd
+    pslld           xmm_tmp0, 3
+    movdqa          xmm_tmp1, xmm_tmp0
+    punpcklwd       xmm_tmp0, xmm_tmp0
+    pshufd          xmm_tmp0, xmm_tmp0, 0
+    movdqa          xmm_xpos_frac_inc, xmm_tmp0
+    psrld           xmm_tmp1, 16
+    psubw           xmm_tmp1, xmm_tmp2
+    pxor            xmm_tmp2, xmm_tmp2
+    pshufb          xmm_tmp1, xmm_tmp2
+    movdqa          xmm_xpos_int_inc, xmm_tmp1
+    movdqa          xmm_xpos_int_begin, xmm_xpos_int
+    movdqa          xmm_xpos_frac_begin, xmm_xpos_frac
+
+    cmp             i_scalex, 4 << 16
+    ja              .scalex_above4
+    cmp             i_scalex, 2 << 16
+    ja              .scalex_above2_beloweq4
+    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample2xOrLess_8px, 1
+    jmp             .final_row
+%ifdef X86_32
+    %undef xmm_yfrac0
+    %xdefine xmm_yfrac0 xmm_tmp5
+    %undef xmm_tmp5
+%endif
+.scalex_above2_beloweq4:
+    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample4xOrLess_8px, 1
+    jmp             .final_row
+.scalex_above4:
+%xdefine xmm_xfrac0 xmm_xpos_frac
+%xdefine xmm_xfrac1 xmm_xpos_int
+%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
+%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
+%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_frac
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xpos_int_inc
+%undef xmm_xpos_frac_inc
+    SSE2_UnpckXFracuw xmm_tmp0, xmm_xfrac1, xmm_xfrac0
+    movdqa          xmm_xfrac0, xmm_tmp0
+    movdqa          xmm_xfrac0_begin, xmm_xfrac0
+    movdqa          xmm_xfrac1_begin, xmm_xfrac1
+    pcmpeqw         xmm_tmp0, xmm_tmp0
+    pmullw          xmm_tmp0, xmm_xfrac_inc
+    punpcklwd       xmm_tmp0, xmm_xfrac_inc
+    movdqa          xmm_xfrac_inc, xmm_tmp0
+    SSE2_GeneralBilinearDownsampler_loop SSE2_GeneralBilinearFastDownsample_8px, 1
+
+.final_row:
+    mov             p_src_row0, i_ypos
+    shr             p_src_row0, 15
+    imul            p_src_row0, i_src_stride
+    add             p_src_row0, p_src
+    mov             i_xpos, 1 << 15
+    mov             i_width_cnt, i_dst_width
+
+.final_row_width:
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
+    mov             [p_dst], r_tmp0b
+    add             p_dst, 1
+    add             i_xpos, i_scalex
+    sub             i_width_cnt, 1
+    jg              .final_row_width
+
+.done:
+%ifdef X86_32
+    mov             esp, [esp]
+%endif
+    POP_XMM
+    LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+    pop             rsi
+    pop             rdi
+%endif
+    pop             rbp
+    pop             rbx
+    pop             r13
+    pop             r12
+%endif
+    ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef xmm_0
+%undef xmm_xpos_frac
+%undef xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_int_inc
+%undef xmm_yfrac0
+%undef xmm_yfrac1
+%undef xmm_tmp0
+%undef xmm_tmp1
+%undef xmm_tmp2
+%undef xmm_tmp3
+%undef xmm_tmp4
+%undef xmm_tmp5
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xfrac0
+%undef xmm_xfrac1
+%undef xmm_xfrac0_begin
+%undef xmm_xfrac1_begin
+%undef xmm_xfrac_inc
+
+;**************************************************************************************************************
+;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+;    uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
+    %assign push_num 0
+%ifndef X86_32
+    push            r12
+    push            r13
+    push            rbx
+    push            rbp
+    %assign push_num 4
+%ifdef WIN64
+    push            rdi
+    push            rsi
+    %assign push_num push_num + 2
+%endif
+%endif
+    LOAD_7_PARA
+    PUSH_XMM 16
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    ZERO_EXTENSION  r6d
+    sub             r1, r2                                            ; dst_stride - dst_width
+    add             r6, r6                                            ; 2 * scalex
+%ifdef X86_32
+    movd            xmm0, arg8
+    movd            xmm1, esp
+    and             esp, -16
+    sub             esp, 8 * 4 + 8 * 16
+    movd            [esp], xmm1
+    %define p_dst                   r0
+    %define i_dst_stride_less_width [esp + 1 * 4]
+    %define i_dst_width             [esp + 2 * 4]
+    %define i_dst_height            dword [esp + 3 * 4]
+    %define p_src                   [esp + 4 * 4]
+    %define i_src_stride            [esp + 5 * 4]
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               [esp + 6 * 4]
+    %define i_xpos                  r2
+    %define i_ypos                  dword [esp + 7 * 4]
+    %define i_yposd                 dword [esp + 7 * 4]
+    %define p_src_row0              r3
+    %define p_src_row1              r4
+    %define i_width_cnt             r5
+    %define r_tmp0                  r1
+    %define r_tmp0b                 r1b
+    %define xmm_xpos_frac           xmm1
+    %define xmm_xpos_frac_inc       [esp + 8 * 4]
+    %define xmm_xpos_int            xmm3
+    %define xmm_xpos_int_inc        [esp + 8 * 4 + 1 * 16]
+    %define xmm_yfrac0              [esp + 8 * 4 + 2 * 16]
+    %define xmm_yfrac1              [esp + 8 * 4 + 3 * 16]
+    %define xmm_tmp0                xmm7
+    %define xmm_tmp1                xmm0
+    %define xmm_tmp2                xmm2
+    %define xmm_tmp3                xmm4
+    %define xmm_tmp4                xmm5
+    %define xmm_tmp5                xmm6
+    %define xmm_0                   [esp + 8 * 4 + 4 * 16]
+    %define xmm_7fff                [esp + 8 * 4 + 5 * 16]
+    %define xmm_xpos_int_begin      [esp + 8 * 4 + 6 * 16]
+    %define xmm_xpos_frac_begin     [esp + 8 * 4 + 7 * 16]
+    mov             i_dst_stride_less_width, r1
+    mov             i_dst_width, r2
+    mov             i_dst_height, r3
+    mov             p_src, r4
+    mov             i_src_stride, r5
+    movd            i_scaleyd, xmm0
+    pxor            xmm_tmp5, xmm_tmp5
+    movdqa          xmm_0, xmm_tmp5
+    pcmpeqw         xmm_tmp5, xmm_tmp5
+    psrlw           xmm_tmp5, 1
+    movdqa          xmm_7fff, xmm_tmp5
+%else
+    %define p_dst                   r0
+    %define i_dst_stride_less_width r1
+    %define i_dst_width             r2
+    %define i_dst_height            r3
+    %define p_src                   r4
+    %define i_src_stride            r5
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               dword arg8d
+    %define i_xpos                  r12
+    %define i_ypos                  r13
+    %define i_yposd                 r13d
+    %define p_src_row0              rbp
+%ifdef WIN64
+    %define p_src_row1              rsi
+    %define i_width_cnt             rdi
+%else
+    %define p_src_row1              r11
+    %define i_width_cnt             rax
+%endif
+    %define r_tmp0                  rbx
+    %define r_tmp0b                 bl
+    %define xmm_0                   xmm0
+    %define xmm_xpos_frac           xmm1
+    %define xmm_xpos_frac_inc       xmm8
+    %define xmm_xpos_int            xmm3
+    %define xmm_xpos_int_inc        xmm10
+    %define xmm_yfrac0              xmm11
+    %define xmm_yfrac1              xmm12
+    %define xmm_tmp0                xmm7
+    %define xmm_tmp1                xmm2
+    %define xmm_tmp2                xmm9
+    %define xmm_tmp3                xmm4
+    %define xmm_tmp4                xmm5
+    %define xmm_tmp5                xmm6
+    %define xmm_7fff                xmm13
+    %define xmm_xpos_int_begin      xmm14
+    %define xmm_xpos_frac_begin     xmm15
+    pxor            xmm_0, xmm_0
+    pcmpeqw         xmm_7fff, xmm_7fff
+    psrlw           xmm_7fff, 1
+%endif
+
+    sub             i_dst_height, 1
+    je              .final_row
+    jl              .done
+
+    mov             i_ypos, 1 << 14
+    movd            xmm_xpos_frac, i_scalexd
+    pshufd          xmm_xpos_frac, xmm_xpos_frac, 0
+    movdqa          xmm_tmp0, xmm_xpos_frac
+    pslld           xmm_tmp0, 2
+    pslldq          xmm_xpos_frac, 4
+    paddd           xmm_tmp0, xmm_xpos_frac
+    movdqa          xmm_tmp1, xmm_xpos_frac
+    pslldq          xmm_tmp1, 4
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    pslldq          xmm_tmp1, 4
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    pcmpeqw         xmm_tmp1, xmm_tmp1
+    psrld           xmm_tmp1, 31
+    pslld           xmm_tmp1, 15
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    movdqa          xmm_xpos_int, xmm_xpos_frac
+    movdqa          xmm_tmp1, xmm_tmp0
+    psrld           xmm_xpos_int, 16
+    psrld           xmm_tmp1, 16
+    packssdw        xmm_xpos_int, xmm_tmp1
+    packuswb        xmm_xpos_int, xmm_xpos_int
+    movdqa          xmm_tmp1, xmm_xpos_int
+    pcmpeqw         xmm_tmp2, xmm_tmp2
+    psubb           xmm_tmp1, xmm_tmp2
+    punpcklbw       xmm_xpos_int, xmm_tmp1
+    pslld           xmm_xpos_frac, 16
+    pslld           xmm_tmp0, 16
+    psrad           xmm_xpos_frac, 16
+    psrad           xmm_tmp0, 16
+    packssdw        xmm_xpos_frac, xmm_tmp0
+    psrlw           xmm_xpos_frac, 1
+    movd            xmm_tmp0, i_scalexd
+    pslld           xmm_tmp0, 3
+    movdqa          xmm_tmp1, xmm_tmp0
+    punpcklwd       xmm_tmp0, xmm_tmp0
+    pshufd          xmm_tmp0, xmm_tmp0, 0
+    psrlw           xmm_tmp0, 1
+    movdqa          xmm_xpos_frac_inc, xmm_tmp0
+    psrld           xmm_tmp1, 16
+    pxor            xmm_tmp2, xmm_tmp2
+    pshufb          xmm_tmp1, xmm_tmp2
+    movdqa          xmm_xpos_int_inc, xmm_tmp1
+    movdqa          xmm_xpos_int_begin, xmm_xpos_int
+    movdqa          xmm_xpos_frac_begin, xmm_xpos_frac
+
+    cmp             i_scalex, 4 << 16
+    ja              .scalex_above4
+    cmp             i_scalex, 2 << 16
+    ja              .scalex_above2_beloweq4
+    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0
+    jmp             .final_row
+%ifdef X86_32
+    %undef xmm_yfrac0
+    %xdefine xmm_yfrac0 xmm_tmp5
+    %undef xmm_tmp5
+%endif
+.scalex_above2_beloweq4:
+    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0
+    jmp             .final_row
+.scalex_above4:
+%xdefine xmm_xfrac0 xmm_xpos_frac
+%xdefine xmm_xfrac1 xmm_xpos_int
+%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
+%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
+%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_frac
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xpos_int_inc
+%undef xmm_xpos_frac_inc
+    SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff
+    movdqa          xmm_xfrac0, xmm_tmp0
+    movdqa          xmm_xfrac0_begin, xmm_xfrac0
+    movdqa          xmm_xfrac1_begin, xmm_xfrac1
+    pcmpeqw         xmm_tmp0, xmm_tmp0
+    pmullw          xmm_tmp0, xmm_xfrac_inc
+    punpcklwd       xmm_tmp0, xmm_xfrac_inc
+    movdqa          xmm_xfrac_inc, xmm_tmp0
+    SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0
+
+.final_row:
+    mov             p_src_row0, i_ypos
+    shr             p_src_row0, 15
+    imul            p_src_row0, i_src_stride
+    add             p_src_row0, p_src
+    mov             i_xpos, 1 << 15
+    mov             i_width_cnt, i_dst_width
+
+.final_row_width:
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
+    mov             [p_dst], r_tmp0b
+    add             p_dst, 1
+    add             i_xpos, i_scalex
+    sub             i_width_cnt, 1
+    jg              .final_row_width
+
+.done:
+%ifdef X86_32
+    mov             esp, [esp]
+%endif
+    POP_XMM
+    LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+    pop             rsi
+    pop             rdi
+%endif
+    pop             rbp
+    pop             rbx
+    pop             r13
+    pop             r12
+%endif
+    ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef xmm_0
+%undef xmm_xpos_frac
+%undef xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_int_inc
+%undef xmm_yfrac0
+%undef xmm_yfrac1
+%undef xmm_tmp0
+%undef xmm_tmp1
+%undef xmm_tmp2
+%undef xmm_tmp3
+%undef xmm_tmp4
+%undef xmm_tmp5
+%undef xmm_7fff
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xfrac0
+%undef xmm_xfrac1
+%undef xmm_xfrac0_begin
+%undef xmm_xfrac1_begin
+%undef xmm_xfrac_inc
+
+; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
+%macro AVX2_BilinearIncXposuw 5
+    vpaddusw        %5, %2, %4
+    vpaddw          %2, %2, %4
+    vpcmpeqw        %5, %5, %2
+    vpaddb          %1, %1, %3
+    vpaddb          %1, %1, %5  ; subtract 1 if no carry
+%endmacro
+
+; outl=%1 outh=%2 in=%3 FFFFh/7FFFh=%4
+%macro AVX2_UnpckXFrac 4
+    vpxor           %1, %3, %4
+    vpunpckhwd      %2, %1, %3
+    vpunpcklwd      %1, %1, %3
+%endmacro
+
+; out0=%1 out1=%2 xfrac=%3 yfrac0=%4 yfrac1=%5
+%macro AVX2_BilinearFastCalcXYFrac 5
+    vpmulhuw        %2, %3, %5
+    vpmulhuw        %1, %3, %4
+%endmacro
+
+; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
+%macro AVX2_BilinearFastPackDwordsToBytes 3
+    vpsrld          %1, %1, 14
+    vpsrld          %2, %2, 14
+    vpackssdw       %1, %1, %2
+    vpavgw          %1, %1, %3
+    vpackuswb       %1, %1, %1
+%endmacro
+
+%macro AVX2_BilinearFastDownsample2xOrLess_16px 0
+    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_0
+    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    lea             i_xpos, [i_xpos + 8 * i_scalex2]
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
+    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
+    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
+    vpunpcklbw      ymm_tmp3, ymm_tmp4, ymm_0
+    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp3
+    vpunpcklbw      ymm_tmp3, ymm_tmp5, ymm_0
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
+    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
+    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
+    vpunpckhbw      ymm_tmp2, ymm_tmp4, ymm_0
+    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp2
+    vpunpckhbw      ymm_tmp2, ymm_tmp5, ymm_0
+    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp2
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
+    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+    vmovlps         [p_dst], xmm_tmp0
+    vextracti128    [p_dst + 8], ymm_tmp0, 1
+    add             p_dst, 16
+    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
+%endmacro
+
+%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
+    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
+    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
+    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex2]
+    lea             i_xpos, [r_tmp0 + 4 * i_scalex2]
+    shr             r_tmp0, 16
+    vpunpcklbw      ymm_tmp2, ymm_xpos_int, ymm_ffff
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp2
+    vpshufb         ymm_tmp3, ymm_tmp3, ymm_tmp2
+    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
+    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
+    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
+    mov             r_tmp0, i_xpos
+    lea             i_xpos, [i_xpos + 2 * i_scalex2]
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+    vpunpckhbw      ymm_tmp2, ymm_xpos_int, ymm_ffff
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp2
+    vpshufb         ymm_tmp3, ymm_tmp3, ymm_tmp2
+    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
+    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp2
+    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+    vmovlps         [p_dst], xmm_tmp0
+    vextracti128    [p_dst + 8], ymm_tmp0, 1
+    add             p_dst, 16
+    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
+%endmacro
+
+%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
+    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
+    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
+    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp0, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp1, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_ffff
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
+    vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
+    vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
+    vpshufb         ymm_tmp1, ymm_tmp1, ymm_tmp3
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
+    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
+    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
+    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp5
+    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+    mov             r_tmp0, i_xpos
+    lea             i_xpos, [i_xpos + 4 * i_scalex2]
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp2, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
+    mov             r_tmp0, i_xpos
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
+    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
+    vpshufb         ymm_tmp2, ymm_tmp2, ymm_xpos_int
+    vpshufb         ymm_tmp3, ymm_tmp3, ymm_xpos_int
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
+    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
+    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
+    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
+    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
+    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp5
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
+    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+    vmovlps         [p_dst], xmm_tmp0
+    vextracti128    [p_dst + 8], ymm_tmp0, 1
+    add             p_dst, 16
+    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
+%endmacro
+
+%macro AVX2_GeneralBilinearFastDownsample_16px 0
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp4, [p_src_row0 + r_tmp0]
+    vpbroadcastd    ymm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpunpcklwd      ymm_tmp4, ymm_tmp4, ymm_tmp0
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpunpcklwd      ymm_tmp5, ymm_tmp5, ymm_tmp0
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+    vpblendw        ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+    vpblendw        ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp2, [p_src_row0 + r_tmp0]
+    vpbroadcastd    ymm_tmp3, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpunpcklwd      ymm_tmp2, ymm_tmp2, ymm_tmp0
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpunpcklwd      ymm_tmp3, ymm_tmp3, ymm_tmp0
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+    vpblendw        ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+    vpblendw        ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
+    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
+    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
+    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
+    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
+    vpunpcklbw      ymm_tmp4, ymm_tmp4, ymm_0
+    vpunpcklbw      ymm_tmp5, ymm_tmp5, ymm_0
+    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp1, ymm_xfrac0, ymm_yfrac0, ymm_yfrac1
+    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
+    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp5
+    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp1
+    vpunpcklbw      ymm_tmp4, ymm_tmp2, ymm_0
+    vpunpcklbw      ymm_tmp5, ymm_tmp3, ymm_0
+    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_xfrac1, ymm_yfrac0, ymm_yfrac1
+    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp5
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp2
+    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+    vpermq          ymm_tmp0, ymm_tmp0, 0010b
+    vmovdqu         [p_dst], xmm_tmp0
+    add             p_dst, 16
+    vpaddw          ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
+    vpaddw          ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
+%endmacro
+
+; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6,%7
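+; Steps the per-lane x positions used by the accurate kernels: inc_int is
+; added to the packed integer bytes and inc_frac to the 15-bit fraction
+; words; a fraction that wraps past 15 bits is detected with vpcmpgtw
+; (old > new yields an all-ones word), and subtracting that -1 byte-wise
+; adds the carry to the integer part before the fraction is re-masked
+; with 7FFFh.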
+%macro AVX2_BilinearIncXposw 7
+    vpaddb          %1, %1, %3
+    vpaddw          %6, %2, %4
+    vpcmpgtw        %7, %2, %6
+    vpsubb          %1, %1, %7  ; add carry
+    vpand           %2, %6, %5
+%endmacro
+
+; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
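+; Per 32-bit lane: %1 = (data0 * frac0 + data1 * frac1) >> 29. The odd dword
+; lanes are swapped into even position with vpshufd so that vpmuludq can form
+; 64-bit products; the even-lane sums are shifted right by 29, the odd-lane
+; sums left by 3 (moving bits 29..60 into the upper dword), and the halves
+; are merged with vpblendd. As used below, frac0/frac1 are broadcast values.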
+%macro AVX2_LinearAccurateInterpolateVerticalDwords 6
+    vpshufd         %1, %2, 10110001b
+    vpshufd         %6, %3, 10110001b
+    vpmuludq        %1, %1, %4
+    vpmuludq        %6, %6, %5
+    vpaddq          %1, %1, %6
+    vpmuludq        %2, %2, %4
+    vpmuludq        %3, %3, %5
+    vpaddq          %2, %2, %3
+    vpsllq          %1, %1,  3
+    vpsrlq          %2, %2, 29
+    vpblendd        %1, %1, %2, 01010101b
+%endmacro
+
+%macro AVX2_BilinearAccurateDownsample2xOrLess_16px 0
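+    ; Accurate kernel for ratios <= 2x (i_scalex2 holds 2 * scalex). The 8
+    ; outputs of each 128-bit lane fit inside one 16-byte source load, so a
+    ; single vpshufb per row gathers all left/right byte pairs; ymm_xpos_int
+    ; is first rebased to the per-lane load address.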
+    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_0
+    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    lea             i_xpos, [i_xpos + 8 * i_scalex2]
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
+    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
+    vpunpcklbw      ymm_tmp2, ymm_tmp4, ymm_0
+    vpunpcklbw      ymm_tmp3, ymm_tmp5, ymm_0
+    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
+    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp0
+    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp0
+    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
+    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp1
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp2, ymm_tmp3, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
+    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
+    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
+    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vmovlps         [p_dst], xmm_tmp0
+    vextracti128    [p_dst + 8], ymm_tmp0, 1
+    add             p_dst, 16
+    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
+%endmacro
+
+%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
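+    ; Accurate kernel for ratios in (2x, 4x]: a 16-byte load now covers only
+    ; 4 outputs, so the index pairs are rebased per group of four
+    ; (shufb_0000000088888888) and gathered as zero-extended words by
+    ; vpshufb with the 80h-padded masks, ready for vpmaddwd against the
+    ; x weights.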
+    vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
+    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
+    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp2, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex2]
+    lea             i_xpos, [r_tmp0 + 4 * i_scalex2]
+    shr             r_tmp0, 16
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
+    vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
+    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp0
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp2, [p_src_row1 + r_tmp0]
+    mov             r_tmp0, i_xpos
+    lea             i_xpos, [i_xpos + 2 * i_scalex2]
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
+    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
+    vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
+    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
+    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp1
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
+    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
+    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vmovlps         [p_dst], xmm_tmp0
+    vextracti128    [p_dst + 8], ymm_tmp0, 1
+    add             p_dst, 16
+    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
+%endmacro
+
+%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
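+    ; Accurate kernel for ratios in (4x, 8x]: each 16-byte load covers only
+    ; 2 outputs, so the 16 results are assembled from 8 loads per source row,
+    ; shuffled with the (per-pair rebased) indices and merged with vpblendd
+    ; before the weighted sums.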
+    vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
+    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
+    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp0, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp1, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
+    vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
+    vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
+    vpshufb         ymm_tmp1, ymm_tmp1, ymm_tmp3
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
+    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
+    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0
+    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp0
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+    mov             r_tmp0, i_xpos
+    lea             i_xpos, [i_xpos + 4 * i_scalex2]
+    shr             r_tmp0, 16
+    vmovdqu         xmm_tmp2, [p_src_row0 + r_tmp0]
+    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
+    mov             r_tmp0, i_xpos
+    add             i_xpos, i_scalex2
+    shr             r_tmp0, 16
+    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
+    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
+    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
+    vpshufb         ymm_tmp2, ymm_tmp2, ymm_xpos_int
+    vpshufb         ymm_tmp3, ymm_tmp3, ymm_xpos_int
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
+    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
+    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
+    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
+    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp1
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
+    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
+    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vmovlps         [p_dst], xmm_tmp0
+    vextracti128    [p_dst + 8], ymm_tmp0, 1
+    add             p_dst, 16
+    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
+%endmacro
+
+%macro AVX2_GeneralBilinearAccurateDownsample_16px 0
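+    ; General-ratio accurate kernel: the source byte pairs are gathered one
+    ; position at a time exactly as in the fast kernel above, but the
+    ; weighting uses 15-bit x fractions with vpmaddwd, the 32-bit vertical
+    ; interpolation (>> 29) and a rounding vpavgw, and the x fractions are
+    ; re-masked to 15 bits after each step.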
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp4, [p_src_row0 + r_tmp0]
+    vpbroadcastd    ymm_tmp5, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpunpcklwd      ymm_tmp4, ymm_tmp4, ymm_tmp0
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpunpcklwd      ymm_tmp5, ymm_tmp5, ymm_tmp0
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+    vpblendw        ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+    vpblendw        ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp2, [p_src_row0 + r_tmp0]
+    vpbroadcastd    ymm_tmp3, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpunpcklwd      ymm_tmp2, ymm_tmp2, ymm_tmp0
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpunpcklwd      ymm_tmp3, ymm_tmp3, ymm_tmp0
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
+    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
+    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+    vpblendw        ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
+    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+    vpblendw        ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
+    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
+    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
+    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
+    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
+    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
+    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
+    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
+    vpunpcklbw      ymm_tmp4, ymm_tmp4, ymm_0
+    vpunpcklbw      ymm_tmp5, ymm_tmp5, ymm_0
+    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_xfrac0
+    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_xfrac0
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
+    vpunpcklbw      ymm_tmp4, ymm_tmp2, ymm_0
+    vpunpcklbw      ymm_tmp5, ymm_tmp3, ymm_0
+    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_xfrac1
+    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_xfrac1
+    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
+    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
+    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0
+    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vextracti128    [p_dst], ymm_tmp0, 1
+    vmovlps         [p_dst + 8], xmm_tmp0
+    add             p_dst, 16
+    vpaddw          ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
+    vpaddw          ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
+    vpand           ymm_xfrac0, ymm_xfrac0, ymm_7fff
+    vpand           ymm_xfrac1, ymm_xfrac1, ymm_7fff
+%endmacro
+
+; downsample_16px_macro=%1 b_fast=%2
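+; Row loop shared by all AVX2 kernels. Per output row: derive the two source
+; rows from i_ypos (15 fractional bits) and broadcast the vertical weights
+; (yfrac1 = ypos & 7FFFh, yfrac0 = 7FFFh - yfrac1; per word when b_fast,
+; per dword otherwise); emit i_dst_width - 1 pixels in chunks of 16 (the
+; chunked loop may overshoot, p_dst and i_xpos are corrected afterwards);
+; point-sample the row's last pixel from row 0, avoiding an interpolation
+; read past the end of the source row; then step i_ypos by i_scaleyd and
+; advance p_dst to the next output row.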
+%macro AVX2_GeneralBilinearDownsampler_loop 2
+%%height:
+    mov             p_src_row0, i_ypos
+    shr             p_src_row0, 15
+    imul            p_src_row0, i_src_stride
+    add             p_src_row0, p_src
+    mov             p_src_row1, p_src_row0
+    add             p_src_row1, i_src_stride
+%ifdef X86_32
+%if %2
+    vpbroadcastw    ymm_tmp1, i_ypos
+    vpsllw          ymm_tmp1, ymm_tmp1, 1
+    vpsrlw          ymm_tmp1, ymm_tmp1, 1
+    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vpsrlw          ymm_tmp0, ymm_tmp0, 1
+%else
+    vpbroadcastd    ymm_tmp1, i_ypos
+    vpslld          ymm_tmp1, ymm_tmp1, 17
+    vpsrld          ymm_tmp1, ymm_tmp1, 17
+    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vpsrld          ymm_tmp0, ymm_tmp0, 17
+%endif
+    vpxor           ymm_tmp0, ymm_tmp0, ymm_tmp1
+    vmovdqa         ymm_yfrac0, ymm_tmp0
+    vmovdqa         ymm_yfrac1, ymm_tmp1
+%else
+    vmovd           xmm_tmp0, i_yposd
+    vpbroadcastw    ymm_yfrac1, xmm_tmp0
+%if %2
+    vpsllw          ymm_yfrac1, ymm_yfrac1, 1
+    vpsrlw          ymm_yfrac1, ymm_yfrac1, 1
+    vpcmpeqw        ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
+    vpsrlw          ymm_yfrac0, ymm_yfrac0, 1
+%else
+    vpslld          ymm_yfrac1, ymm_yfrac1, 17
+    vpsrld          ymm_yfrac1, ymm_yfrac1, 17
+    vpcmpeqw        ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
+    vpsrld          ymm_yfrac0, ymm_yfrac0, 17
+%endif
+    vpxor           ymm_yfrac0, ymm_yfrac0, ymm_yfrac1
+%endif
+
+    mov             i_xpos, 1 << 15
+    mov             i_width_cnt, i_dst_width
+    sub             i_width_cnt, 1
+
+%ifdef ymm_xpos_int
+    vmovdqa         ymm_xpos_int, ymm_xpos_int_begin
+    vmovdqa         ymm_xpos_frac, ymm_xpos_frac_begin
+%else
+    vmovdqa         ymm_xfrac0, ymm_xfrac0_begin
+    vmovdqa         ymm_xfrac1, ymm_xfrac1_begin
+%endif
+
+%%width:
+    %1
+    sub             i_width_cnt, 16
+    jg              %%width
+
+    lea             p_dst, [p_dst + i_width_cnt + 1]
+%ifdef i_scalex2
+    mov             r_tmp0, i_scalex2
+    shr             r_tmp0, 1
+    imul            i_width_cnt, r_tmp0
+%else
+    imul            i_width_cnt, i_scalex
+%endif
+    add             i_xpos, i_width_cnt
+    shr             i_xpos, 16
+    movzx           r_tmp0, byte [p_src_row0 + i_xpos]
+    mov             [p_dst - 1], r_tmp0b
+%ifdef X86_32
+    mov             r_tmp0, i_scaleyd
+    add             i_yposd, r_tmp0
+%else
+    add             i_yposd, i_scaleyd
+%endif
+    add             p_dst, i_dst_stride_less_width
+    sub             i_dst_height, 1
+    jg              %%height
+%endmacro
+
+;**************************************************************************************************************
+;void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+;    uint32_t uiScaleY);
+;
+;**************************************************************************************************************
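+; Fast variant: x positions carry 16 fractional bits and y positions 15; the
+; per-pixel x and y weights are pre-combined (AVX2_BilinearFastCalcXYFrac) so
+; a single vpmaddwd per source row applies both, trading some precision for
+; speed relative to the accurate variant further below.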
+
+WELS_EXTERN GeneralBilinearFastDownsampler_avx2
+    %assign push_num 0
+%ifndef X86_32
+    push            r12
+    push            r13
+    push            rbx
+    push            rbp
+    %assign push_num 4
+%ifdef WIN64
+    push            rdi
+    push            rsi
+    %assign push_num push_num + 2
+%endif
+%endif
+    LOAD_7_PARA
+    PUSH_XMM 16
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    ZERO_EXTENSION  r6d
+    sub             r1, r2                                            ; dst_stride - dst_width
+%ifdef X86_32
+    vmovd           xmm0, arg8
+    vmovd           xmm1, esp
+    and             esp, -32
+    sub             esp, 8 * 4 + 8 * 32
+    vmovd           [esp], xmm1
+    %define p_dst                   r0
+    %define i_dst_stride_less_width [esp + 1 * 4]
+    %define i_dst_width             [esp + 2 * 4]
+    %define i_dst_height            dword [esp + 3 * 4]
+    %define p_src                   [esp + 4 * 4]
+    %define i_src_stride            [esp + 5 * 4]
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               [esp + 6 * 4]
+    %define i_xpos                  r2
+    %define i_ypos                  [esp + 7 * 4]
+    %define i_yposd                 dword [esp + 7 * 4]
+    %define p_src_row0              r3
+    %define p_src_row1              r4
+    %define i_width_cnt             r5
+    %define r_tmp0                  r1
+    %define r_tmp0b                 r1b
+    %define ymm_xpos_frac           ymm1
+    %define ymm_xpos_frac_inc       [esp + 8 * 4]
+    %define ymm_xpos_int            ymm3
+    %define ymm_xpos_int_inc        [esp + 8 * 4 + 1 * 32]
+    %define ymm_yfrac0              [esp + 8 * 4 + 2 * 32]
+    %define ymm_yfrac1              [esp + 8 * 4 + 3 * 32]
+    %define xmm_tmp0                xmm7
+    %define ymm_tmp0                ymm7
+    %define xmm_tmp1                xmm0
+    %define ymm_tmp1                ymm0
+    %define xmm_tmp2                xmm2
+    %define ymm_tmp2                ymm2
+    %define xmm_tmp3                xmm4
+    %define ymm_tmp3                ymm4
+    %define xmm_tmp4                xmm5
+    %define ymm_tmp4                ymm5
+    %define xmm_tmp5                xmm6
+    %define ymm_tmp5                ymm6
+    %define ymm_0                   [esp + 8 * 4 + 4 * 32]
+    %define ymm_ffff                [esp + 8 * 4 + 5 * 32]
+    %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
+    %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
+    mov             i_dst_stride_less_width, r1
+    mov             i_dst_width, r2
+    mov             i_dst_height, r3
+    mov             p_src, r4
+    mov             i_src_stride, r5
+    vmovd           i_scaleyd, xmm0
+    vpxor           xmm0, xmm0, xmm0
+    vmovdqa         ymm_0, ymm0
+    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vmovdqa         ymm_ffff, ymm_tmp0
+%else
+    %define p_dst                   r0
+    %define i_dst_stride_less_width r1
+    %define i_dst_width             r2
+    %define i_dst_height            r3
+    %define p_src                   r4
+    %define i_src_stride            r5
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               dword arg8d
+    %define i_xpos                  r12
+    %define i_ypos                  r13
+    %define i_yposd                 r13d
+    %define p_src_row0              rbp
+%ifdef WIN64
+    %define p_src_row1              rsi
+    %define i_width_cnt             rdi
+%else
+    %define p_src_row1              r11
+    %define i_width_cnt             rax
+%endif
+    %define r_tmp0                  rbx
+    %define r_tmp0b                 bl
+    %define ymm_0                   ymm0
+    %define ymm_xpos_frac           ymm1
+    %define ymm_xpos_frac_inc       ymm2
+    %define ymm_xpos_int            ymm3
+    %define ymm_xpos_int_inc        ymm4
+    %define ymm_yfrac0              ymm5
+    %define ymm_yfrac1              ymm6
+    %define xmm_tmp0                xmm7
+    %define ymm_tmp0                ymm7
+    %define xmm_tmp1                xmm8
+    %define ymm_tmp1                ymm8
+    %define xmm_tmp2                xmm9
+    %define ymm_tmp2                ymm9
+    %define xmm_tmp3                xmm10
+    %define ymm_tmp3                ymm10
+    %define xmm_tmp4                xmm11
+    %define ymm_tmp4                ymm11
+    %define xmm_tmp5                xmm12
+    %define ymm_tmp5                ymm12
+    %define ymm_ffff                ymm13
+    %define ymm_xpos_int_begin      ymm14
+    %define ymm_xpos_frac_begin     ymm15
+    vpxor           ymm_0, ymm_0, ymm_0
+    vpcmpeqw        ymm_ffff, ymm_ffff, ymm_ffff
+%endif
+
+    sub             i_dst_height, 1
+    je              .final_row
+    jl              .done
+
+    mov             i_yposd, 1 << 14
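+    ; i_ypos starts at one half (15 fractional bits). The block below builds
+    ; the 16 initial x positions k * scalex + 0.5: ymm_xpos_int receives the
+    ; packed (int, int + 1) byte-pair indices, ymm_xpos_frac the 16-bit
+    ; fractions, and the *_inc vectors advance them by 16 * scalex per
+    ; 16-pixel iteration.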
+    vmovd           xmm_tmp0, i_scalexd
+    vpbroadcastd    ymm_tmp0, xmm_tmp0
+    vpslld          ymm_tmp1, ymm_tmp0, 2
+    vpslld          ymm_tmp2, ymm_tmp0, 3
+    vpaddd          ymm_tmp3, ymm_tmp1, ymm_tmp2
+    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
+    vpblendd        ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
+    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
+    vpaddd          ymm_tmp3, ymm_tmp0, ymm_tmp0
+    vpblendd        ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
+    vpblendd        ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
+    vpaddd          ymm_tmp0, ymm_tmp3, ymm_tmp0
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp0
+    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp0
+    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
+    vpsrld          ymm_tmp3, ymm_tmp3, 31
+    vpslld          ymm_tmp3, ymm_tmp3, 15
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
+    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp3
+    vpsrld          ymm_xpos_int, ymm_tmp1, 16
+    vpsrld          ymm_tmp0, ymm_tmp2, 16
+    vpackssdw       ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    vpermq          ymm_xpos_int, ymm_xpos_int, 11011000b
+    vpackuswb       ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
+    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
+    vpsubb          ymm_tmp0, ymm_xpos_int, ymm_tmp3
+    vpunpcklbw      ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    vpslld          ymm_tmp1, ymm_tmp1, 16
+    vpsrld          ymm_tmp1, ymm_tmp1, 16
+    vpslld          ymm_tmp2, ymm_tmp2, 16
+    vpsrld          ymm_tmp2, ymm_tmp2, 16
+    vpackusdw       ymm_xpos_frac, ymm_tmp1, ymm_tmp2
+    vpermq          ymm_xpos_frac, ymm_xpos_frac, 11011000b
+    vmovd           xmm_tmp0, i_scalexd
+    vpslld          xmm_tmp0, xmm_tmp0, 4
+    vpbroadcastw    ymm_tmp1, xmm_tmp0
+    vmovdqa         ymm_xpos_frac_inc, ymm_tmp1
+    vpsrld          xmm_tmp0, xmm_tmp0, 16
+    vpsubw          ymm_tmp0, ymm_tmp0, ymm_tmp3
+    vpbroadcastb    ymm_tmp0, xmm_tmp0
+    vmovdqa         ymm_xpos_int_inc, ymm_tmp0
+    vmovdqa         ymm_xpos_int_begin, ymm_xpos_int
+    vmovdqa         ymm_xpos_frac_begin, ymm_xpos_frac
+
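+    ; Pick a kernel by horizontal ratio: <= 2x, <= 4x and <= 8x use the
+    ; shuffle-based gather kernels (with i_scalex temporarily doubled and
+    ; renamed i_scalex2); anything larger falls through to the general
+    ; per-pixel gather loop.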
+    cmp             i_scalex, 4 << 16
+    ja              .scalex_above4
+    cmp             i_scalex, 2 << 16
+    ja              .scalex_above2_beloweq4
+    add             i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample2xOrLess_16px, 1
+    shr             i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+    jmp             .final_row
+.scalex_above2_beloweq4:
+    add             i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample4xOrLess_16px, 1
+    shr             i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+    jmp             .final_row
+.scalex_above4:
+    cmp             i_scalex, 8 << 16
+    ja              .scalex_above8
+    add             i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample8xOrLess_16px, 1
+    shr             i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+    jmp             .final_row
+.scalex_above8:
+%xdefine ymm_xfrac0 ymm_xpos_frac
+%xdefine ymm_xfrac1 ymm_xpos_int
+%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
+%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
+%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_frac
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xpos_int_inc
+%undef ymm_xpos_frac_inc
+    AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_ffff
+    vpermq          ymm_xfrac0, ymm_tmp0,   01001110b
+    vpermq          ymm_xfrac1, ymm_xfrac1, 01001110b
+    vmovdqa         ymm_xfrac0_begin, ymm_xfrac0
+    vmovdqa         ymm_xfrac1_begin, ymm_xfrac1
+    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vpmullw         ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+    vpunpcklwd      ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+    vmovdqa         ymm_xfrac_inc, ymm_tmp0
+    AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearFastDownsample_16px, 1
+
+.final_row:
+    mov             p_src_row0, i_ypos
+    shr             p_src_row0, 15
+    imul            p_src_row0, i_src_stride
+    add             p_src_row0, p_src
+    mov             i_xpos, 1 << 15
+    mov             i_width_cnt, i_dst_width
+
+.final_row_width:
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
+    mov             [p_dst], r_tmp0b
+    add             p_dst, 1
+    add             i_xpos, i_scalex
+    sub             i_width_cnt, 1
+    jg              .final_row_width
+
+.done:
+    vzeroupper
+%ifdef X86_32
+    mov             esp, [esp]
+%endif
+    POP_XMM
+    LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+    pop             rsi
+    pop             rdi
+%endif
+    pop             rbp
+    pop             rbx
+    pop             r13
+    pop             r12
+%endif
+    ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef ymm_xpos_frac
+%undef ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_int_inc
+%undef ymm_yfrac0
+%undef ymm_yfrac1
+%undef xmm_tmp0
+%undef ymm_tmp0
+%undef xmm_tmp1
+%undef ymm_tmp1
+%undef xmm_tmp2
+%undef ymm_tmp2
+%undef xmm_tmp3
+%undef ymm_tmp3
+%undef xmm_tmp4
+%undef ymm_tmp4
+%undef xmm_tmp5
+%undef ymm_tmp5
+%undef ymm_ffff
+%undef ymm_0
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xfrac0
+%undef ymm_xfrac1
+%undef ymm_xfrac0_begin
+%undef ymm_xfrac1_begin
+%undef ymm_xfrac_inc
+
+;**************************************************************************************************************
+;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+;    uint32_t uiScaleY);
+;
+;**************************************************************************************************************
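+; Accurate variant: same outer structure as the fast routine, but the weights
+; are kept to 15 bits and the vertical pass accumulates in 32 bits
+; (AVX2_LinearAccurateInterpolateVerticalDwords) with a final rounding vpavgw;
+; the unit tests compare it pixel-for-pixel against
+; GeneralBilinearAccurateDownsampler_ref.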
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2
+    %assign push_num 0
+%ifndef X86_32
+    push            r12
+    push            r13
+    push            rbx
+    push            rbp
+    %assign push_num 4
+%ifdef WIN64
+    push            rdi
+    push            rsi
+    %assign push_num push_num + 2
+%endif
+%endif
+    LOAD_7_PARA
+    PUSH_XMM 16
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    ZERO_EXTENSION  r6d
+    sub             r1, r2                                            ; dst_stride - dst_width
+    add             r6, r6                                            ; 2 * scalex
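+    ; the code below treats i_scalex as a 16.16 value (like the fast routine),
+    ; so the 15-fractional-bit uiScaleX of the accurate wrapper is doubled here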
+%ifdef X86_32
+    vmovd           xmm0, arg8
+    vmovd           xmm1, esp
+    and             esp, -32
+    sub             esp, 8 * 4 + 8 * 32
+    vmovd           [esp], xmm1
+    %define p_dst                   r0
+    %define i_dst_stride_less_width [esp + 1 * 4]
+    %define i_dst_width             [esp + 2 * 4]
+    %define i_dst_height            dword [esp + 3 * 4]
+    %define p_src                   [esp + 4 * 4]
+    %define i_src_stride            [esp + 5 * 4]
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               [esp + 6 * 4]
+    %define i_xpos                  r2
+    %define i_ypos                  [esp + 7 * 4]
+    %define i_yposd                 dword [esp + 7 * 4]
+    %define p_src_row0              r3
+    %define p_src_row1              r4
+    %define i_width_cnt             r5
+    %define r_tmp0                  r1
+    %define r_tmp0b                 r1b
+    %define ymm_xpos_frac           ymm1
+    %define ymm_xpos_frac_inc       [esp + 8 * 4]
+    %define ymm_xpos_int            ymm3
+    %define ymm_xpos_int_inc        [esp + 8 * 4 + 1 * 32]
+    %define ymm_yfrac0              [esp + 8 * 4 + 2 * 32]
+    %define ymm_yfrac1              [esp + 8 * 4 + 3 * 32]
+    %define xmm_tmp0                xmm7
+    %define ymm_tmp0                ymm7
+    %define xmm_tmp1                xmm0
+    %define ymm_tmp1                ymm0
+    %define xmm_tmp2                xmm2
+    %define ymm_tmp2                ymm2
+    %define xmm_tmp3                xmm4
+    %define ymm_tmp3                ymm4
+    %define xmm_tmp4                xmm5
+    %define ymm_tmp4                ymm5
+    %define xmm_tmp5                xmm6
+    %define ymm_tmp5                ymm6
+    %define ymm_0                   [esp + 8 * 4 + 4 * 32]
+    %define ymm_7fff                [esp + 8 * 4 + 5 * 32]
+    %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
+    %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
+    mov             i_dst_stride_less_width, r1
+    mov             i_dst_width, r2
+    mov             i_dst_height, r3
+    mov             p_src, r4
+    mov             i_src_stride, r5
+    vmovd           i_scaleyd, xmm0
+    vpxor           xmm0, xmm0, xmm0
+    vmovdqa         ymm_0, ymm0
+    vpcmpeqw        ymm0, ymm0, ymm0
+    vpsrlw          ymm0, ymm0, 1
+    vmovdqa         ymm_7fff, ymm0
+%else
+    %define p_dst                   r0
+    %define i_dst_stride_less_width r1
+    %define i_dst_width             r2
+    %define i_dst_height            r3
+    %define p_src                   r4
+    %define i_src_stride            r5
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               dword arg8d
+    %define i_xpos                  r12
+    %define i_ypos                  r13
+    %define i_yposd                 r13d
+    %define p_src_row0              rbp
+%ifdef WIN64
+    %define p_src_row1              rsi
+    %define i_width_cnt             rdi
+%else
+    %define p_src_row1              r11
+    %define i_width_cnt             rax
+%endif
+    %define r_tmp0                  rbx
+    %define r_tmp0b                 bl
+    %define ymm_0                   ymm0
+    %define ymm_xpos_frac           ymm1
+    %define ymm_xpos_int            ymm3
+    %define ymm_xpos_frac_inc       ymm2
+    %define ymm_xpos_int_inc        ymm4
+    %define ymm_yfrac0              ymm5
+    %define ymm_yfrac1              ymm6
+    %define xmm_tmp0                xmm7
+    %define ymm_tmp0                ymm7
+    %define xmm_tmp1                xmm8
+    %define ymm_tmp1                ymm8
+    %define xmm_tmp2                xmm9
+    %define ymm_tmp2                ymm9
+    %define xmm_tmp3                xmm10
+    %define ymm_tmp3                ymm10
+    %define xmm_tmp4                xmm11
+    %define ymm_tmp4                ymm11
+    %define xmm_tmp5                xmm12
+    %define ymm_tmp5                ymm12
+    %define ymm_7fff                ymm13
+    %define ymm_xpos_int_begin      ymm14
+    %define ymm_xpos_frac_begin     ymm15
+    vpxor           ymm_0, ymm_0, ymm_0
+    vpcmpeqw        ymm_7fff, ymm_7fff, ymm_7fff
+    vpsrlw          ymm_7fff, ymm_7fff, 1
+%endif
+
+    sub             i_dst_height, 1
+    je              .final_row
+    jl              .done
+
+    mov             i_yposd, 1 << 14
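+    ; Same x-position setup as the fast routine, except the stored fractions
+    ; and their increment are shifted right by one to 15 bits, keeping the
+    ; signed-word weights fed to vpmaddwd non-negative.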
+    vmovd           xmm_tmp0, i_scalexd
+    vpbroadcastd    ymm_tmp0, xmm_tmp0
+    vpslld          ymm_tmp1, ymm_tmp0, 2
+    vpslld          ymm_tmp2, ymm_tmp0, 3
+    vpaddd          ymm_tmp3, ymm_tmp1, ymm_tmp2
+    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
+    vpblendd        ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
+    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
+    vpaddd          ymm_tmp3, ymm_tmp0, ymm_tmp0
+    vpblendd        ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
+    vpblendd        ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
+    vpaddd          ymm_tmp0, ymm_tmp3, ymm_tmp0
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp0
+    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp0
+    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
+    vpsrld          ymm_tmp3, ymm_tmp3, 31
+    vpslld          ymm_tmp3, ymm_tmp3, 15
+    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
+    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp3
+    vpsrld          ymm_xpos_int, ymm_tmp1, 16
+    vpsrld          ymm_tmp0, ymm_tmp2, 16
+    vpackssdw       ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    vpermq          ymm_xpos_int, ymm_xpos_int, 11011000b
+    vpackuswb       ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
+    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
+    vpsubb          ymm_tmp0, ymm_xpos_int, ymm_tmp3
+    vpunpcklbw      ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+    vpslld          ymm_tmp1, ymm_tmp1, 16
+    vpsrld          ymm_tmp1, ymm_tmp1, 16
+    vpslld          ymm_tmp2, ymm_tmp2, 16
+    vpsrld          ymm_tmp2, ymm_tmp2, 16
+    vpackusdw       ymm_xpos_frac, ymm_tmp1, ymm_tmp2
+    vpermq          ymm_xpos_frac, ymm_xpos_frac, 11011000b
+    vpsrlw          ymm_xpos_frac, ymm_xpos_frac, 1
+    vmovd           xmm_tmp0, i_scalexd
+    vpslld          xmm_tmp0, xmm_tmp0, 4
+    vpbroadcastw    ymm_tmp1, xmm_tmp0
+    vpsrlw          ymm_tmp1, ymm_tmp1, 1
+    vmovdqa         ymm_xpos_frac_inc, ymm_tmp1
+    vpsrld          xmm_tmp0, xmm_tmp0, 16
+    vpsubw          ymm_tmp0, ymm_tmp0, ymm_tmp3
+    vpbroadcastb    ymm_tmp0, xmm_tmp0
+    vmovdqa         ymm_xpos_int_inc, ymm_tmp0
+    vmovdqa         ymm_xpos_int_begin, ymm_xpos_int
+    vmovdqa         ymm_xpos_frac_begin, ymm_xpos_frac
+
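+    ; Same ratio-based dispatch as the fast routine, selecting the accurate
+    ; <= 2x / <= 4x / <= 8x shuffle kernels or the general per-pixel path.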
+    cmp             i_scalex, 4 << 16
+    ja              .scalex_above4
+    cmp             i_scalex, 2 << 16
+    ja              .scalex_above2_beloweq4
+    add             i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample2xOrLess_16px, 0
+    shr             i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+    jmp             .final_row
+.scalex_above2_beloweq4:
+    add             i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample4xOrLess_16px, 0
+    shr             i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+    jmp             .final_row
+.scalex_above4:
+    cmp             i_scalex, 8 << 16
+    ja              .scalex_above8
+    add             i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample8xOrLess_16px, 0
+    shr             i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+    jmp             .final_row
+.scalex_above8:
+%xdefine ymm_xfrac0 ymm_xpos_frac
+%xdefine ymm_xfrac1 ymm_xpos_int
+%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
+%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
+%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_frac
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xpos_int_inc
+%undef ymm_xpos_frac_inc
+    AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_7fff
+    vpermq          ymm_xfrac0, ymm_tmp0,   01001110b
+    vpermq          ymm_xfrac1, ymm_xfrac1, 01001110b
+    vmovdqa         ymm_xfrac0_begin, ymm_xfrac0
+    vmovdqa         ymm_xfrac1_begin, ymm_xfrac1
+    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
+    vpmullw         ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+    vpunpcklwd      ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+    vmovdqa         ymm_xfrac_inc, ymm_tmp0
+    AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearAccurateDownsample_16px, 0
+
+.final_row:
+    mov             p_src_row0, i_ypos
+    shr             p_src_row0, 15
+    imul            p_src_row0, i_src_stride
+    add             p_src_row0, p_src
+    mov             i_xpos, 1 << 15
+    mov             i_width_cnt, i_dst_width
+
+.final_row_width:
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
+    mov             [p_dst], r_tmp0b
+    add             p_dst, 1
+    add             i_xpos, i_scalex
+    sub             i_width_cnt, 1
+    jg              .final_row_width
+
+.done:
+    vzeroupper
+%ifdef X86_32
+    mov             esp, [esp]
+%endif
+    POP_XMM
+    LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+    pop             rsi
+    pop             rdi
+%endif
+    pop             rbp
+    pop             rbx
+    pop             r13
+    pop             r12
+%endif
+    ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef ymm_xpos_frac
+%undef ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_int_inc
+%undef ymm_yfrac0
+%undef ymm_yfrac1
+%undef xmm_tmp0
+%undef ymm_tmp0
+%undef xmm_tmp1
+%undef ymm_tmp1
+%undef xmm_tmp2
+%undef ymm_tmp2
+%undef xmm_tmp3
+%undef ymm_tmp3
+%undef xmm_tmp4
+%undef ymm_tmp4
+%undef xmm_tmp5
+%undef ymm_tmp5
+%undef ymm_0
+%undef ymm_7fff
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xfrac0
+%undef ymm_xfrac1
+%undef ymm_xfrac0_begin
+%undef ymm_xfrac1_begin
+%undef ymm_xfrac_inc
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -296,22 +296,24 @@
   int src_stride_a; \
   int src_width_a; \
   int src_height_a; \
-  dst_stride_c = dst_stride_a = 320; \
-  src_stride_c = src_stride_a = 320; \
-  src_width_c = src_width_a = 320; \
-  src_height_c = src_height_a = 180; \
-  dst_width_c = dst_width_a = 300; \
-  dst_height_c = dst_height_a = 160; \
-  for (int j = 0; j < 70000; j++) { \
-    dst_c[j] = dst_a[j] = rand() % 256; \
-    src_c[j] = src_a[j] = rand() % 256; \
-  } \
-  ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
-  func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
-  for (int j = 0; j < dst_height_c; j++) { \
-    for (int m = 0; m < dst_width_c ; m++) { \
-      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+  for (int i = 0; i < 5; i++) { \
+    dst_stride_c = dst_stride_a = 320; \
+    src_stride_c = src_stride_a = 320; \
+    src_width_c = src_width_a = 320; \
+    src_height_c = src_height_a = 180; \
+    dst_width_c = dst_width_a = (src_width_c >> (i + 1)) + rand() % (src_width_c >> (i + 1)); \
+    dst_height_c = dst_height_a = (src_height_c >> (i + 1)) + rand() % (src_height_c >> (i + 1)); \
+    for (int j = 0; j < 70000; j++) { \
+      dst_c[j] = dst_a[j] = rand() % 256; \
+      src_c[j] = src_a[j] = rand() % 256; \
     } \
+    ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
+    func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
+    for (int j = 0; j < dst_height_c; j++) { \
+      for (int m = 0; m < dst_width_c ; m++) { \
+        ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+      } \
+    } \
   } \
 }
 
@@ -343,6 +345,14 @@
                                         WELS_CPU_SSE2)
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE2)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3, GeneralBilinearFastDownsampler_ref, 1,
+                                        WELS_CPU_SSSE3)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
+                                        GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,
+                                        WELS_CPU_AVX2)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2,
+                                        GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2)
 #endif
 
 #if defined(HAVE_NEON)