ref: b1013095b1525b5c5ed35fcc42764e06704ee278
parent: 1995e03d91646feb33fec49032c2d4dedc6d1ecc
author: Sindre Aamås <saamas@cisco.com>
date: Mon May 23 09:02:21 EDT 2016
[Processing/x86] Add an SSE4.1 implementation of GeneralBilinearAccurateDownsample

Keep track of relative pixel offsets and utilize pshufb to efficiently
extract relevant pixels for horizontal scaling ratios <= 4. Fall back to
a generic approach for ratios > 4. The use of blendps makes this require
SSE4.1. The pshufb path can be backported to SSSE3 and the generic path
to SSE2 for a minor reduction in performance by replacing blendps and
the preceding instructions with an equivalent sequence.

The implementation assumes that data beyond the end of each line, before
the next line begins, can be dirtied, which AFAICT is safe with the
current usage of these routines.

Speedup is ~5.32x/~4.25x (32-bit/64-bit) for horizontal ratios <= 2,
~5.06x/~3.97x for ratios within (2, 4], and ~3.93x/~3.13x for ratios > 4
when not memory-bound on Haswell as compared with the current SSE2
implementation.
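For orientation, the C sketch below shows roughly the per-pixel arithmetic
the routine performs. It is illustrative only, with a hypothetical helper
name, and is not the project's reference implementation: the assembly
gathers pixels with pshufb/pinsrw as described above, uses 15-bit weights,
and folds the final rounding into pavgw, so its constants differ slightly
from the sketch.

    #include <stdint.h>

    /* Illustrative sketch (hypothetical helper, not the project's reference
     * code) of accurate bilinear downsampling, assuming uiScaleX/uiScaleY are
     * 15-bit fixed-point steps as passed by the accurate wrapper. */
    static void BilinearAccurateDownsampleSketch (uint8_t* pDst, int32_t iDstStride,
        int32_t iDstWidth, int32_t iDstHeight, const uint8_t* pSrc, int32_t iSrcStride,
        uint32_t uiScaleX, uint32_t uiScaleY) {
      if (iDstHeight < 1)
        return;
      uint32_t uiYpos = 1 << 14;                  /* start half a source step in */
      for (int32_t j = 0; j < iDstHeight - 1; ++j) {
        const uint8_t* pRow0 = pSrc + (uiYpos >> 15) * iSrcStride;
        const uint8_t* pRow1 = pRow0 + iSrcStride;
        const uint32_t uiYfrac = uiYpos & 0x7fff;
        uint32_t uiXpos = 1 << 14;
        for (int32_t i = 0; i < iDstWidth; ++i) {
          const uint32_t k       = uiXpos >> 15;  /* pRow[k + 1] may read past the
                                                     used width, per the note above */
          const uint32_t uiXfrac = uiXpos & 0x7fff;
          const uint32_t uiTop = pRow0[k] * (0x8000 - uiXfrac) + pRow0[k + 1] * uiXfrac;
          const uint32_t uiBot = pRow1[k] * (0x8000 - uiXfrac) + pRow1[k + 1] * uiXfrac;
          const uint64_t uiSum = (uint64_t) uiTop * (0x8000 - uiYfrac) + (uint64_t) uiBot * uiYfrac;
          pDst[i] = (uint8_t) ((uiSum + (1 << 29)) >> 30);  /* weights sum to 1 << 30 */
          uiXpos += uiScaleX;
        }
        pDst += iDstStride;
        uiYpos += uiScaleY;
      }
      /* Last destination row: copy the nearest source pixel horizontally. */
      const uint8_t* pRowLast = pSrc + (uiYpos >> 15) * iSrcStride;
      uint32_t uiXpos = 1 << 14;
      for (int32_t i = 0; i < iDstWidth; ++i) {
        pDst[i] = pRowLast[uiXpos >> 15];
        uiXpos += uiScaleX;
      }
    }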
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -107,6 +107,7 @@
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
+ sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
}
#endif//X86_ASM
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -102,6 +102,7 @@
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
@@ -116,6 +117,9 @@
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
const uint32_t kuiScaleY);
void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+ int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+ uint32_t uiScaleY);
+void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -283,6 +283,7 @@
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
#endif //X86_ASM
#ifdef HAVE_NEON
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -2766,6 +2766,166 @@
paddw xmm_xfrac1, xmm_xfrac_inc
%endmacro
+; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
+%macro SSE2_BilinearIncXposw 6
+ pxor %6, %6
+ paddw %2, %4
+ pcmpgtw %6, %2
+ paddb %1, %3
+ psubb %1, %6 ; add carry
+ pand %2, %5
+%endmacro
+
+; outl=%1 outh=%2 in=%3 7FFFh=%4
+%macro SSE2_UnpckXFracw 4
+ movdqa %1, %3
+ pxor %1, %4
+ movdqa %2, %1
+ punpcklwd %1, %3
+ punpckhwd %2, %3
+%endmacro
+
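+; Computes, per dword, (data0 * frac0 + data1 * frac1) >> 29 without overflow:
+; pshufd moves the odd dwords into even lanes so pmuludq can form 64-bit
+; products for both halves; the odd-lane sums are shifted left by 3 (placing
+; bits 61:29 in the upper dword of each qword), the even-lane sums are shifted
+; right by 29, and blendps merges the halves back into dword order.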
+; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
+%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
+ pshufd %1, %2, 10110001b
+ pshufd %6, %3, 10110001b
+ pmuludq %1, %4
+ pmuludq %6, %5
+ paddq %1, %6
+ pmuludq %2, %4
+ pmuludq %3, %5
+ paddq %2, %3
+ psllq %1, 3
+ psrlq %2, 29
+ blendps %1, %2, 0101b
+%endmacro
+
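+; Process 8 output pixels per iteration for horizontal ratios <= 2. At these
+; ratios the 8 source pixel pairs fit within a single unaligned 16-byte load,
+; so xmm_xpos_int (rebased to the first pixel of the group via pshufb/psubb)
+; serves directly as a pshufb gather mask for both source rows.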
+%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
+ movdqa xmm_tmp0, xmm_xpos_int
+ pshufb xmm_tmp0, xmm_0
+ psubb xmm_xpos_int, xmm_tmp0
+ SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+ mov r_tmp0, i_xpos
+ lea i_xpos, [i_xpos + 8 * i_scalex]
+ shr r_tmp0, 16
+ movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ pshufb xmm_tmp4, xmm_xpos_int
+ movdqa xmm_tmp5, xmm_tmp4
+ punpcklbw xmm_tmp4, xmm_0
+ punpckhbw xmm_tmp5, xmm_0
+ pmaddwd xmm_tmp4, xmm_tmp0
+ pmaddwd xmm_tmp5, xmm_tmp1
+ movdqu xmm_tmp2, [p_src_row1 + r_tmp0]
+ pshufb xmm_tmp2, xmm_xpos_int
+ movdqa xmm_tmp3, xmm_tmp2
+ punpcklbw xmm_tmp2, xmm_0
+ punpckhbw xmm_tmp3, xmm_0
+ pmaddwd xmm_tmp2, xmm_tmp0
+ pmaddwd xmm_tmp3, xmm_tmp1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+ packssdw xmm_tmp0, xmm_tmp1
+ pavgw xmm_tmp0, xmm_0
+ packuswb xmm_tmp0, xmm_tmp0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
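+; Process 8 output pixels per iteration for horizontal ratios in (2, 4]. The
+; pixels may span more than 16 source bytes, so the gather is split into two
+; 16-byte loads of 4 output pixels each; interleaving the relative offsets
+; with 80h yields pshufb masks that zero-extend the gathered bytes straight
+; to words ready for pmaddwd.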
+%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
+ movdqa xmm_tmp0, xmm_xpos_int
+ pshufb xmm_tmp0, [shufb_0000000088888888]
+ psubb xmm_xpos_int, xmm_tmp0
+ SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movdqa xmm_tmp3, xmm_xpos_int
+ punpcklbw xmm_tmp3, [db80h_128]
+ movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ movdqu xmm_tmp2, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex]
+ lea i_xpos, [i_xpos + 8 * i_scalex]
+ shr r_tmp0, 16
+ pshufb xmm_tmp4, xmm_tmp3
+ pshufb xmm_tmp2, xmm_tmp3
+ pmaddwd xmm_tmp4, xmm_tmp0
+ pmaddwd xmm_tmp2, xmm_tmp0
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+ movdqa xmm_tmp2, xmm_xpos_int
+ punpckhbw xmm_tmp2, [db80h_128]
+ movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ movdqu xmm_tmp3, [p_src_row1 + r_tmp0]
+ pshufb xmm_tmp4, xmm_tmp2
+ pshufb xmm_tmp3, xmm_tmp2
+ pmaddwd xmm_tmp4, xmm_tmp1
+ pmaddwd xmm_tmp3, xmm_tmp1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+ packssdw xmm_tmp0, xmm_tmp1
+ pavgw xmm_tmp0, xmm_0
+ packuswb xmm_tmp0, xmm_tmp0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
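+; Generic path for horizontal ratios > 4: gather each source pixel pair
+; individually with movd/pinsrw and weight it with the precomputed word
+; fractions in xmm_xfrac0/xmm_xfrac1.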
+%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movd xmm_tmp4, [p_src_row0 + r_tmp0]
+ movd xmm_tmp2, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1
+ pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2
+ pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3
+ pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 3
+ punpcklbw xmm_tmp4, xmm_0
+ punpcklbw xmm_tmp2, xmm_0
+ pmaddwd xmm_tmp4, xmm_xfrac0
+ pmaddwd xmm_tmp2, xmm_xfrac0
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movd xmm_tmp4, [p_src_row0 + r_tmp0]
+ movd xmm_tmp3, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1
+ pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2
+ pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3
+ pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 3
+ punpcklbw xmm_tmp4, xmm_0
+ punpcklbw xmm_tmp3, xmm_0
+ pmaddwd xmm_tmp4, xmm_xfrac1
+ pmaddwd xmm_tmp3, xmm_xfrac1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+ packssdw xmm_tmp0, xmm_tmp1
+ pavgw xmm_tmp0, xmm_0
+ packuswb xmm_tmp0, xmm_tmp0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ paddw xmm_xfrac0, xmm_xfrac_inc
+ paddw xmm_xfrac1, xmm_xfrac_inc
+ pand xmm_xfrac0, xmm_7fff
+ pand xmm_xfrac1, xmm_7fff
+%endmacro
+
; downsample_8px_macro=%1 b_fast=%2
%macro SSE2_GeneralBilinearDownsampler_loop 2
%%height:
@@ -3109,6 +3269,293 @@
%undef xmm_tmp3
%undef xmm_tmp4
%undef xmm_tmp5
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xfrac0
+%undef xmm_xfrac1
+%undef xmm_xfrac0_begin
+%undef xmm_xfrac1_begin
+%undef xmm_xfrac_inc
+
+;**************************************************************************************************************
+;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+; uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
+ %assign push_num 0
+%ifndef X86_32
+ push r12
+ push r13
+ push rbx
+ push rbp
+ %assign push_num 4
+%ifdef WIN64
+ push rdi
+ push rsi
+ %assign push_num push_num + 2
+%endif
+%endif
+ LOAD_7_PARA
+ PUSH_XMM 16
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ ZERO_EXTENSION r6d
+ sub r1, r2 ; dst_stride - dst_width
+ add r6, r6 ; 2 * scalex
+%ifdef X86_32
+ movd xmm0, arg8
+ movd xmm1, esp
+ and esp, -16
+ sub esp, 8 * 4 + 8 * 16
+ movd [esp], xmm1
+ %define p_dst r0
+ %define i_dst_stride_less_width [esp + 1 * 4]
+ %define i_dst_width [esp + 2 * 4]
+ %define i_dst_height dword [esp + 3 * 4]
+ %define p_src [esp + 4 * 4]
+ %define i_src_stride [esp + 5 * 4]
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd [esp + 6 * 4]
+ %define i_xpos r2
+ %define i_ypos dword [esp + 7 * 4]
+ %define i_yposd dword [esp + 7 * 4]
+ %define p_src_row0 r3
+ %define p_src_row1 r4
+ %define i_width_cnt r5
+ %define r_tmp0 r1
+ %define r_tmp0b r1b
+ %define xmm_xpos_frac xmm1
+ %define xmm_xpos_frac_inc [esp + 8 * 4]
+ %define xmm_xpos_int xmm3
+ %define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16]
+ %define xmm_yfrac0 [esp + 8 * 4 + 2 * 16]
+ %define xmm_yfrac1 [esp + 8 * 4 + 3 * 16]
+ %define xmm_tmp0 xmm7
+ %define xmm_tmp1 xmm0
+ %define xmm_tmp2 xmm2
+ %define xmm_tmp3 xmm4
+ %define xmm_tmp4 xmm5
+ %define xmm_tmp5 xmm6
+ %define xmm_0 [esp + 8 * 4 + 4 * 16]
+ %define xmm_7fff [esp + 8 * 4 + 5 * 16]
+ %define xmm_xpos_int_begin [esp + 8 * 4 + 6 * 16]
+ %define xmm_xpos_frac_begin [esp + 8 * 4 + 7 * 16]
+ mov i_dst_stride_less_width, r1
+ mov i_dst_width, r2
+ mov i_dst_height, r3
+ mov p_src, r4
+ mov i_src_stride, r5
+ movd i_scaleyd, xmm0
+ pxor xmm_tmp5, xmm_tmp5
+ movdqa xmm_0, xmm_tmp5
+ pcmpeqw xmm_tmp5, xmm_tmp5
+ psrlw xmm_tmp5, 1
+ movdqa xmm_7fff, xmm_tmp5
+%else
+ %define p_dst r0
+ %define i_dst_stride_less_width r1
+ %define i_dst_width r2
+ %define i_dst_height r3
+ %define p_src r4
+ %define i_src_stride r5
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd dword arg8d
+ %define i_xpos r12
+ %define i_ypos r13
+ %define i_yposd r13d
+ %define p_src_row0 rbp
+%ifdef WIN64
+ %define p_src_row1 rsi
+ %define i_width_cnt rdi
+%else
+ %define p_src_row1 r11
+ %define i_width_cnt rax
+%endif
+ %define r_tmp0 rbx
+ %define r_tmp0b bl
+ %define xmm_0 xmm0
+ %define xmm_xpos_frac xmm1
+ %define xmm_xpos_frac_inc xmm8
+ %define xmm_xpos_int xmm3
+ %define xmm_xpos_int_inc xmm10
+ %define xmm_yfrac0 xmm11
+ %define xmm_yfrac1 xmm12
+ %define xmm_tmp0 xmm7
+ %define xmm_tmp1 xmm2
+ %define xmm_tmp2 xmm9
+ %define xmm_tmp3 xmm4
+ %define xmm_tmp4 xmm5
+ %define xmm_tmp5 xmm6
+ %define xmm_7fff xmm13
+ %define xmm_xpos_int_begin xmm14
+ %define xmm_xpos_frac_begin xmm15
+ pxor xmm_0, xmm_0
+ pcmpeqw xmm_7fff, xmm_7fff
+ psrlw xmm_7fff, 1
+%endif
+
+ sub i_dst_height, 1
+ je .final_row
+ jl .done
+
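+ ; Initialize the vertical position (half a source step in) and the x
+ ; positions of the first 8 output pixels in 16.16 fixed point; split the
+ ; latter into byte integer parts stored as adjacent source-pixel pairs
+ ; (xmm_xpos_int) and 15-bit fractions (xmm_xpos_frac), along with the
+ ; per-8-pixel increments of each.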
+ mov i_ypos, 1 << 14
+ movd xmm_xpos_frac, i_scalexd
+ pshufd xmm_xpos_frac, xmm_xpos_frac, 0
+ movdqa xmm_tmp0, xmm_xpos_frac
+ pslld xmm_tmp0, 2
+ pslldq xmm_xpos_frac, 4
+ paddd xmm_tmp0, xmm_xpos_frac
+ movdqa xmm_tmp1, xmm_xpos_frac
+ pslldq xmm_tmp1, 4
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ pslldq xmm_tmp1, 4
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ pcmpeqw xmm_tmp1, xmm_tmp1
+ psrld xmm_tmp1, 31
+ pslld xmm_tmp1, 15
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ movdqa xmm_xpos_int, xmm_xpos_frac
+ movdqa xmm_tmp1, xmm_tmp0
+ psrld xmm_xpos_int, 16
+ psrld xmm_tmp1, 16
+ packssdw xmm_xpos_int, xmm_tmp1
+ packuswb xmm_xpos_int, xmm_xpos_int
+ movdqa xmm_tmp1, xmm_xpos_int
+ pcmpeqw xmm_tmp2, xmm_tmp2
+ psubb xmm_tmp1, xmm_tmp2
+ punpcklbw xmm_xpos_int, xmm_tmp1
+ pslld xmm_xpos_frac, 16
+ pslld xmm_tmp0, 16
+ psrad xmm_xpos_frac, 16
+ psrad xmm_tmp0, 16
+ packssdw xmm_xpos_frac, xmm_tmp0
+ psrlw xmm_xpos_frac, 1
+ movd xmm_tmp0, i_scalexd
+ pslld xmm_tmp0, 3
+ movdqa xmm_tmp1, xmm_tmp0
+ punpcklwd xmm_tmp0, xmm_tmp0
+ pshufd xmm_tmp0, xmm_tmp0, 0
+ psrlw xmm_tmp0, 1
+ movdqa xmm_xpos_frac_inc, xmm_tmp0
+ psrld xmm_tmp1, 16
+ pxor xmm_tmp2, xmm_tmp2
+ pshufb xmm_tmp1, xmm_tmp2
+ movdqa xmm_xpos_int_inc, xmm_tmp1
+ movdqa xmm_xpos_int_begin, xmm_xpos_int
+ movdqa xmm_xpos_frac_begin, xmm_xpos_frac
+
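+ ; i_scalex is the doubled scale, i.e. a 16.16 step, so these compares select
+ ; a path by horizontal ratio: <= 2, (2, 4], or > 4.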
+ cmp i_scalex, 4 << 16
+ ja .scalex_above4
+ cmp i_scalex, 2 << 16
+ ja .scalex_above2_beloweq4
+ SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0
+ jmp .final_row
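+; The remaining paths do not use xmm_tmp5, so on x86-32 keep yfrac0 in that
+; register rather than on the stack.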
+%ifdef X86_32
+ %undef xmm_yfrac0
+ %xdefine xmm_yfrac0 xmm_tmp5
+ %undef xmm_tmp5
+%endif
+.scalex_above2_beloweq4:
+ SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0
+ jmp .final_row
+.scalex_above4:
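+; For ratios > 4 the per-pixel positions are computed with scalar code, so the
+; xpos registers are reused to hold the unpacked x-fraction word pairs.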
+%xdefine xmm_xfrac0 xmm_xpos_frac
+%xdefine xmm_xfrac1 xmm_xpos_int
+%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
+%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
+%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_frac
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xpos_int_inc
+%undef xmm_xpos_frac_inc
+ SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff
+ movdqa xmm_xfrac0, xmm_tmp0
+ movdqa xmm_xfrac0_begin, xmm_xfrac0
+ movdqa xmm_xfrac1_begin, xmm_xfrac1
+ pcmpeqw xmm_tmp0, xmm_tmp0
+ pmullw xmm_tmp0, xmm_xfrac_inc
+ punpcklwd xmm_tmp0, xmm_xfrac_inc
+ movdqa xmm_xfrac_inc, xmm_tmp0
+ SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0
+
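+ ; Last destination row: there is no row below to blend with, so copy the
+ ; nearest source pixel for each destination position.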
+.final_row:
+ mov p_src_row0, i_ypos
+ shr p_src_row0, 15
+ imul p_src_row0, i_src_stride
+ add p_src_row0, p_src
+ mov i_xpos, 1 << 15
+ mov i_width_cnt, i_dst_width
+
+.final_row_width:
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movzx r_tmp0, byte [p_src_row0 + r_tmp0]
+ mov [p_dst], r_tmp0b
+ add p_dst, 1
+ add i_xpos, i_scalex
+ sub i_width_cnt, 1
+ jg .final_row_width
+
+.done:
+%ifdef X86_32
+ mov esp, [esp]
+%endif
+ POP_XMM
+ LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+ pop rsi
+ pop rdi
+%endif
+ pop rbp
+ pop rbx
+ pop r13
+ pop r12
+%endif
+ ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef xmm_0
+%undef xmm_xpos_frac
+%undef xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_int_inc
+%undef xmm_yfrac0
+%undef xmm_yfrac1
+%undef xmm_tmp0
+%undef xmm_tmp1
+%undef xmm_tmp2
+%undef xmm_tmp3
+%undef xmm_tmp4
+%undef xmm_tmp5
+%undef xmm_7fff
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xfrac0
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -345,6 +345,8 @@
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_SSSE3)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
+ GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
#endif
#if defined(HAVE_NEON)