shithub: openh264

ref: b1013095b1525b5c5ed35fcc42764e06704ee278
parent: 1995e03d91646feb33fec49032c2d4dedc6d1ecc
author: Sindre Aamås <saamas@cisco.com>
date: Mon May 23 09:02:21 EDT 2016

[Processing/x86] Add an SSE4.1 implementation of GeneralBilinearAccurateDownsample

Keep track of relative pixel offsets and use pshufb to efficiently
extract the relevant pixels for horizontal scaling ratios <= 4.
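
For ratios <= 2 this works out as follows: the eight (pixel, pixel + 1)
pairs needed for one 8-pixel output chunk all fall within a single
16-byte load, so one pshufb with the relative offsets as its mask
gathers every pair at once. A minimal intrinsics sketch of the idea
(GatherPixelPairs_ssse3 and its arguments are hypothetical names; the
shipped macro keeps the interleaved offsets in a register across
iterations rather than rebuilding the mask):

    #include <emmintrin.h>   // SSE2
    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8 (pshufb)
    #include <stdint.h>

    // Gather eight (src[x], src[x + 1]) byte pairs from one 16-byte load.
    // relOfs[i] is the offset of output pixel i relative to relOfs[0];
    // for horizontal ratios <= 2, every relOfs[i] + 1 stays below 16.
    static inline __m128i GatherPixelPairs_ssse3 (const uint8_t* pRow, const uint8_t relOfs[8]) {
      uint8_t mask[16];
      for (int i = 0; i < 8; ++i) {
        mask[2 * i]     = relOfs[i];      // left pixel of pair i
        mask[2 * i + 1] = relOfs[i] + 1;  // right pixel of pair i
      }
      __m128i vMask = _mm_loadu_si128 ((const __m128i*) mask);
      __m128i vSrc  = _mm_loadu_si128 ((const __m128i*) pRow);
      return _mm_shuffle_epi8 (vSrc, vMask); // one pshufb extracts all 16 bytes
    }

The 4x-or-less variant splits the gather across two 16-byte loads and,
by interleaving the offsets with 80h bytes (which pshufb turns into
zeros), folds the byte-to-word zero-extension into the same shuffle.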

Fall back to a generic approach for ratios > 4.
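
In the generic path there is no bound on how far apart consecutive
source pixels can be, so each (pixel, pixel + 1) pair is fetched
individually from a 16.16 fixed-point position (movd/pinsrw in the
shipped code). A scalar model of that addressing, with hypothetical
names:

    #include <stdint.h>

    // Scalar model of the generic gather: one (src[x], src[x + 1]) pair per
    // output pixel, taken from a 16.16 fixed-point position stepped by scalex.
    static void GatherPairsGeneric (const uint8_t* pRow, uint32_t xpos, uint32_t scalex,
                                    uint8_t pairs[8][2], int count /* <= 8 */) {
      for (int i = 0; i < count; ++i) {
        uint32_t x  = xpos >> 16;  // integer part selects the pixel
        pairs[i][0] = pRow[x];
        pairs[i][1] = pRow[x + 1];
        xpos += scalex;            // unbounded ratio: no pshufb shortcut possible
      }
    }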

The use of blendps is what makes this require SSE4.1. The pshufb path
can be backported to SSSE3, and the generic path to SSE2, at a minor
cost in performance, by replacing blendps and the preceding
instructions with an equivalent sequence.
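
For illustration, one such equivalent SSE2 sequence for the final
psllq/psrlq/blendps step of the vertical-interpolation macro could
shift each 64-bit sum so that its 32-bit quotient (sum >> 29) lands in
the wanted dword with the other dword zeroed, then OR the two halves
together. An untested sketch with intrinsics
(MergeInterpolatedDwords_sse2 is a hypothetical name):

    #include <emmintrin.h>  // SSE2

    // vOdd holds the 64-bit sums destined for the odd output dwords, vEven
    // those for the even ones; each lane's result is (sum >> 29) as 32 bits.
    static inline __m128i MergeInterpolatedDwords_sse2 (__m128i vOdd, __m128i vEven) {
      vOdd  = _mm_slli_epi64 (_mm_srli_epi64 (vOdd, 29), 32); // quotient -> high dword, low dword zeroed
      vEven = _mm_srli_epi64 (_mm_slli_epi64 (vEven, 3), 32); // quotient -> low dword, high dword zeroed
      return _mm_or_si128 (vOdd, vEven); // [even0, odd0, even1, odd1], as blendps produces
    }

This trades blendps plus two shifts for four shifts and a por, hence
the minor performance cost mentioned above.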

The implementation assumes that data beyond the end of each line,
before the next line begins, can be dirtied, which AFAICT is safe with
the current usage of these routines.

Compared with the current SSE2 implementation, the speedup on Haswell
(when not memory-bound) is ~5.32x/~4.25x (32-bit/64-bit) for horizontal
ratios <= 2, ~5.06x/~3.97x for ratios in (2, 4], and ~3.93x/~3.13x for
ratios > 4.

--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -107,6 +107,7 @@
     sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse4;
     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;
+    sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;
   }
 #endif//X86_ASM
 
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -102,6 +102,7 @@
 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
 
 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_ssse3;
 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_sse4;
@@ -116,6 +117,9 @@
     const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
     const uint32_t kuiScaleY);
 void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+    uint32_t uiScaleY);
+void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
     int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
     uint32_t uiScaleY);
 
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -283,6 +283,7 @@
 DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
 DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
 DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
 #endif //X86_ASM
 
 #ifdef HAVE_NEON
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -2766,6 +2766,166 @@
     paddw           xmm_xfrac1, xmm_xfrac_inc
 %endmacro
 
+; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
+%macro SSE2_BilinearIncXposw 6
+    pxor            %6, %6
+    paddw           %2, %4
+    pcmpgtw         %6, %2
+    paddb           %1, %3
+    psubb           %1, %6  ; add carry
+    pand            %2, %5
+%endmacro
+
+; outl=%1 outh=%2 in=%3 7FFFh=%4
+%macro SSE2_UnpckXFracw 4
+    movdqa          %1, %3
+    pxor            %1, %4
+    movdqa          %2, %1
+    punpcklwd       %1, %3
+    punpckhwd       %2, %3
+%endmacro
+
+; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
+%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
+    pshufd          %1, %2, 10110001b
+    pshufd          %6, %3, 10110001b
+    pmuludq         %1, %4
+    pmuludq         %6, %5
+    paddq           %1, %6
+    pmuludq         %2, %4
+    pmuludq         %3, %5
+    paddq           %2, %3
+    psllq           %1,  3
+    psrlq           %2, 29
+    blendps         %1, %2, 0101b
+%endmacro
+
+%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
+    movdqa          xmm_tmp0, xmm_xpos_int
+    pshufb          xmm_tmp0, xmm_0
+    psubb           xmm_xpos_int, xmm_tmp0
+    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+    mov             r_tmp0, i_xpos
+    lea             i_xpos, [i_xpos + 8 * i_scalex]
+    shr             r_tmp0, 16
+    movdqu          xmm_tmp4, [p_src_row0 + r_tmp0]
+    pshufb          xmm_tmp4, xmm_xpos_int
+    movdqa          xmm_tmp5, xmm_tmp4
+    punpcklbw       xmm_tmp4, xmm_0
+    punpckhbw       xmm_tmp5, xmm_0
+    pmaddwd         xmm_tmp4, xmm_tmp0
+    pmaddwd         xmm_tmp5, xmm_tmp1
+    movdqu          xmm_tmp2, [p_src_row1 + r_tmp0]
+    pshufb          xmm_tmp2, xmm_xpos_int
+    movdqa          xmm_tmp3, xmm_tmp2
+    punpcklbw       xmm_tmp2, xmm_0
+    punpckhbw       xmm_tmp3, xmm_0
+    pmaddwd         xmm_tmp2, xmm_tmp0
+    pmaddwd         xmm_tmp3, xmm_tmp1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+    packssdw        xmm_tmp0, xmm_tmp1
+    pavgw           xmm_tmp0, xmm_0
+    packuswb        xmm_tmp0, xmm_tmp0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
+%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
+    movdqa          xmm_tmp0, xmm_xpos_int
+    pshufb          xmm_tmp0, [shufb_0000000088888888]
+    psubb           xmm_xpos_int, xmm_tmp0
+    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movdqa          xmm_tmp3, xmm_xpos_int
+    punpcklbw       xmm_tmp3, [db80h_128]
+    movdqu          xmm_tmp4, [p_src_row0 + r_tmp0]
+    movdqu          xmm_tmp2, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 4 * i_scalex]
+    lea             i_xpos, [i_xpos + 8 * i_scalex]
+    shr             r_tmp0, 16
+    pshufb          xmm_tmp4, xmm_tmp3
+    pshufb          xmm_tmp2, xmm_tmp3
+    pmaddwd         xmm_tmp4, xmm_tmp0
+    pmaddwd         xmm_tmp2, xmm_tmp0
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+    movdqa          xmm_tmp2, xmm_xpos_int
+    punpckhbw       xmm_tmp2, [db80h_128]
+    movdqu          xmm_tmp4, [p_src_row0 + r_tmp0]
+    movdqu          xmm_tmp3, [p_src_row1 + r_tmp0]
+    pshufb          xmm_tmp4, xmm_tmp2
+    pshufb          xmm_tmp3, xmm_tmp2
+    pmaddwd         xmm_tmp4, xmm_tmp1
+    pmaddwd         xmm_tmp3, xmm_tmp1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+    packssdw        xmm_tmp0, xmm_tmp1
+    pavgw           xmm_tmp0, xmm_0
+    packuswb        xmm_tmp0, xmm_tmp0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
+%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movd            xmm_tmp4, [p_src_row0 + r_tmp0]
+    movd            xmm_tmp2, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1
+    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2
+    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
+    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 3
+    punpcklbw       xmm_tmp4, xmm_0
+    punpcklbw       xmm_tmp2, xmm_0
+    pmaddwd         xmm_tmp4, xmm_xfrac0
+    pmaddwd         xmm_tmp2, xmm_xfrac0
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movd            xmm_tmp4, [p_src_row0 + r_tmp0]
+    movd            xmm_tmp3, [p_src_row1 + r_tmp0]
+    lea             r_tmp0, [i_xpos + 1 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1
+    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 1
+    lea             r_tmp0, [i_xpos + 2 * i_scalex]
+    lea             i_xpos, [i_xpos + 4 * i_scalex]
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2
+    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 2
+    mov             r_tmp0, i_xpos
+    sub             r_tmp0, i_scalex
+    shr             r_tmp0, 16
+    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
+    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 3
+    punpcklbw       xmm_tmp4, xmm_0
+    punpcklbw       xmm_tmp3, xmm_0
+    pmaddwd         xmm_tmp4, xmm_xfrac1
+    pmaddwd         xmm_tmp3, xmm_xfrac1
+    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+    packssdw        xmm_tmp0, xmm_tmp1
+    pavgw           xmm_tmp0, xmm_0
+    packuswb        xmm_tmp0, xmm_tmp0
+    movlps          [p_dst], xmm_tmp0
+    add             p_dst, 8
+    paddw           xmm_xfrac0, xmm_xfrac_inc
+    paddw           xmm_xfrac1, xmm_xfrac_inc
+    pand            xmm_xfrac0, xmm_7fff
+    pand            xmm_xfrac1, xmm_7fff
+%endmacro
+
 ; downsample_8px_macro=%1 b_fast=%2
 %macro SSE2_GeneralBilinearDownsampler_loop 2
 %%height:
@@ -3109,6 +3269,293 @@
 %undef xmm_tmp3
 %undef xmm_tmp4
 %undef xmm_tmp5
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xfrac0
+%undef xmm_xfrac1
+%undef xmm_xfrac0_begin
+%undef xmm_xfrac1_begin
+%undef xmm_xfrac_inc
+
+;**************************************************************************************************************
+;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+;    uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
+    %assign push_num 0
+%ifndef X86_32
+    push            r12
+    push            r13
+    push            rbx
+    push            rbp
+    %assign push_num 4
+%ifdef WIN64
+    push            rdi
+    push            rsi
+    %assign push_num push_num + 2
+%endif
+%endif
+    LOAD_7_PARA
+    PUSH_XMM 16
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    ZERO_EXTENSION  r6d
+    sub             r1, r2                                            ; dst_stride - dst_width
+    add             r6, r6                                            ; 2 * scalex
+%ifdef X86_32
+    movd            xmm0, arg8
+    movd            xmm1, esp
+    and             esp, -16
+    sub             esp, 8 * 4 + 8 * 16
+    movd            [esp], xmm1
+    %define p_dst                   r0
+    %define i_dst_stride_less_width [esp + 1 * 4]
+    %define i_dst_width             [esp + 2 * 4]
+    %define i_dst_height            dword [esp + 3 * 4]
+    %define p_src                   [esp + 4 * 4]
+    %define i_src_stride            [esp + 5 * 4]
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               [esp + 6 * 4]
+    %define i_xpos                  r2
+    %define i_ypos                  dword [esp + 7 * 4]
+    %define i_yposd                 dword [esp + 7 * 4]
+    %define p_src_row0              r3
+    %define p_src_row1              r4
+    %define i_width_cnt             r5
+    %define r_tmp0                  r1
+    %define r_tmp0b                 r1b
+    %define xmm_xpos_frac           xmm1
+    %define xmm_xpos_frac_inc       [esp + 8 * 4]
+    %define xmm_xpos_int            xmm3
+    %define xmm_xpos_int_inc        [esp + 8 * 4 + 1 * 16]
+    %define xmm_yfrac0              [esp + 8 * 4 + 2 * 16]
+    %define xmm_yfrac1              [esp + 8 * 4 + 3 * 16]
+    %define xmm_tmp0                xmm7
+    %define xmm_tmp1                xmm0
+    %define xmm_tmp2                xmm2
+    %define xmm_tmp3                xmm4
+    %define xmm_tmp4                xmm5
+    %define xmm_tmp5                xmm6
+    %define xmm_0                   [esp + 8 * 4 + 4 * 16]
+    %define xmm_7fff                [esp + 8 * 4 + 5 * 16]
+    %define xmm_xpos_int_begin      [esp + 8 * 4 + 6 * 16]
+    %define xmm_xpos_frac_begin     [esp + 8 * 4 + 7 * 16]
+    mov             i_dst_stride_less_width, r1
+    mov             i_dst_width, r2
+    mov             i_dst_height, r3
+    mov             p_src, r4
+    mov             i_src_stride, r5
+    movd            i_scaleyd, xmm0
+    pxor            xmm_tmp5, xmm_tmp5
+    movdqa          xmm_0, xmm_tmp5
+    pcmpeqw         xmm_tmp5, xmm_tmp5
+    psrlw           xmm_tmp5, 1
+    movdqa          xmm_7fff, xmm_tmp5
+%else
+    %define p_dst                   r0
+    %define i_dst_stride_less_width r1
+    %define i_dst_width             r2
+    %define i_dst_height            r3
+    %define p_src                   r4
+    %define i_src_stride            r5
+    %define i_scalex                r6
+    %define i_scalexd               r6d
+    %define i_scaleyd               dword arg8d
+    %define i_xpos                  r12
+    %define i_ypos                  r13
+    %define i_yposd                 r13d
+    %define p_src_row0              rbp
+%ifdef WIN64
+    %define p_src_row1              rsi
+    %define i_width_cnt             rdi
+%else
+    %define p_src_row1              r11
+    %define i_width_cnt             rax
+%endif
+    %define r_tmp0                  rbx
+    %define r_tmp0b                 bl
+    %define xmm_0                   xmm0
+    %define xmm_xpos_frac           xmm1
+    %define xmm_xpos_frac_inc       xmm8
+    %define xmm_xpos_int            xmm3
+    %define xmm_xpos_int_inc        xmm10
+    %define xmm_yfrac0              xmm11
+    %define xmm_yfrac1              xmm12
+    %define xmm_tmp0                xmm7
+    %define xmm_tmp1                xmm2
+    %define xmm_tmp2                xmm9
+    %define xmm_tmp3                xmm4
+    %define xmm_tmp4                xmm5
+    %define xmm_tmp5                xmm6
+    %define xmm_7fff                xmm13
+    %define xmm_xpos_int_begin      xmm14
+    %define xmm_xpos_frac_begin     xmm15
+    pxor            xmm_0, xmm_0
+    pcmpeqw         xmm_7fff, xmm_7fff
+    psrlw           xmm_7fff, 1
+%endif
+
+    sub             i_dst_height, 1
+    je              .final_row
+    jl              .done
+
+    mov             i_ypos, 1 << 14
+    movd            xmm_xpos_frac, i_scalexd
+    pshufd          xmm_xpos_frac, xmm_xpos_frac, 0
+    movdqa          xmm_tmp0, xmm_xpos_frac
+    pslld           xmm_tmp0, 2
+    pslldq          xmm_xpos_frac, 4
+    paddd           xmm_tmp0, xmm_xpos_frac
+    movdqa          xmm_tmp1, xmm_xpos_frac
+    pslldq          xmm_tmp1, 4
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    pslldq          xmm_tmp1, 4
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    pcmpeqw         xmm_tmp1, xmm_tmp1
+    psrld           xmm_tmp1, 31
+    pslld           xmm_tmp1, 15
+    paddd           xmm_xpos_frac, xmm_tmp1
+    paddd           xmm_tmp0, xmm_tmp1
+    movdqa          xmm_xpos_int, xmm_xpos_frac
+    movdqa          xmm_tmp1, xmm_tmp0
+    psrld           xmm_xpos_int, 16
+    psrld           xmm_tmp1, 16
+    packssdw        xmm_xpos_int, xmm_tmp1
+    packuswb        xmm_xpos_int, xmm_xpos_int
+    movdqa          xmm_tmp1, xmm_xpos_int
+    pcmpeqw         xmm_tmp2, xmm_tmp2
+    psubb           xmm_tmp1, xmm_tmp2
+    punpcklbw       xmm_xpos_int, xmm_tmp1
+    pslld           xmm_xpos_frac, 16
+    pslld           xmm_tmp0, 16
+    psrad           xmm_xpos_frac, 16
+    psrad           xmm_tmp0, 16
+    packssdw        xmm_xpos_frac, xmm_tmp0
+    psrlw           xmm_xpos_frac, 1
+    movd            xmm_tmp0, i_scalexd
+    pslld           xmm_tmp0, 3
+    movdqa          xmm_tmp1, xmm_tmp0
+    punpcklwd       xmm_tmp0, xmm_tmp0
+    pshufd          xmm_tmp0, xmm_tmp0, 0
+    psrlw           xmm_tmp0, 1
+    movdqa          xmm_xpos_frac_inc, xmm_tmp0
+    psrld           xmm_tmp1, 16
+    pxor            xmm_tmp2, xmm_tmp2
+    pshufb          xmm_tmp1, xmm_tmp2
+    movdqa          xmm_xpos_int_inc, xmm_tmp1
+    movdqa          xmm_xpos_int_begin, xmm_xpos_int
+    movdqa          xmm_xpos_frac_begin, xmm_xpos_frac
+
+    cmp             i_scalex, 4 << 16
+    ja              .scalex_above4
+    cmp             i_scalex, 2 << 16
+    ja              .scalex_above2_beloweq4
+    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0
+    jmp             .final_row
+%ifdef X86_32
+    %undef xmm_yfrac0
+    %xdefine xmm_yfrac0 xmm_tmp5
+    %undef xmm_tmp5
+%endif
+.scalex_above2_beloweq4:
+    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0
+    jmp             .final_row
+.scalex_above4:
+%xdefine xmm_xfrac0 xmm_xpos_frac
+%xdefine xmm_xfrac1 xmm_xpos_int
+%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
+%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
+%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_frac
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xpos_int_inc
+%undef xmm_xpos_frac_inc
+    SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff
+    movdqa          xmm_xfrac0, xmm_tmp0
+    movdqa          xmm_xfrac0_begin, xmm_xfrac0
+    movdqa          xmm_xfrac1_begin, xmm_xfrac1
+    pcmpeqw         xmm_tmp0, xmm_tmp0
+    pmullw          xmm_tmp0, xmm_xfrac_inc
+    punpcklwd       xmm_tmp0, xmm_xfrac_inc
+    movdqa          xmm_xfrac_inc, xmm_tmp0
+    SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0
+
+.final_row:
+    mov             p_src_row0, i_ypos
+    shr             p_src_row0, 15
+    imul            p_src_row0, i_src_stride
+    add             p_src_row0, p_src
+    mov             i_xpos, 1 << 15
+    mov             i_width_cnt, i_dst_width
+
+.final_row_width:
+    mov             r_tmp0, i_xpos
+    shr             r_tmp0, 16
+    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
+    mov             [p_dst], r_tmp0b
+    add             p_dst, 1
+    add             i_xpos, i_scalex
+    sub             i_width_cnt, 1
+    jg              .final_row_width
+
+.done:
+%ifdef X86_32
+    mov             esp, [esp]
+%endif
+    POP_XMM
+    LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+    pop             rsi
+    pop             rdi
+%endif
+    pop             rbp
+    pop             rbx
+    pop             r13
+    pop             r12
+%endif
+    ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef xmm_0
+%undef xmm_xpos_frac
+%undef xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_int_inc
+%undef xmm_yfrac0
+%undef xmm_yfrac1
+%undef xmm_tmp0
+%undef xmm_tmp1
+%undef xmm_tmp2
+%undef xmm_tmp3
+%undef xmm_tmp4
+%undef xmm_tmp5
+%undef xmm_7fff
 %undef xmm_xpos_int_begin
 %undef xmm_xpos_frac_begin
 %undef xmm_xfrac0
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -345,6 +345,8 @@
                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE2)
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3, GeneralBilinearFastDownsampler_ref, 1,
                                         WELS_CPU_SSSE3)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
+                                        GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
 #endif
 
 #if defined(HAVE_NEON)