shithub: openh264

--- a/codec/common/x86/asm_inc.asm

+++ b/codec/common/x86/asm_inc.asm

@@ -485,7 +485,7 @@

 %endmacro

 %macro WELS_EXTERN 1

-    ALIGN 16

+    ALIGN 16, nop

     %ifdef PREFIX

         global _%1

         %define %1 _%1

--- a/codec/processing/src/downsample/downsample.cpp

+++ b/codec/processing/src/downsample/downsample.cpp

@@ -102,8 +102,6 @@

     sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsamplerWrap_ssse3;

   if (iCpuFlag & WELS_CPU_SSE41) {

-    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse4;

-    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse4;

     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;

     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;

     sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;

--- a/codec/processing/src/downsample/downsample.h

+++ b/codec/processing/src/downsample/downsample.h

@@ -94,10 +94,6 @@

 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_ssse3;

 // iSrcWidth= x32 pixels

 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_ssse3;

-// iSrcWidth= x16 pixels

-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_sse4;

-// iSrcWidth= x32 pixels

-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_sse4;

 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;

 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;

--- a/codec/processing/src/x86/downsample_bilinear.asm

+++ b/codec/processing/src/x86/downsample_bilinear.asm

@@ -40,6 +40,10 @@

 ;*************************************************************************/

 %include "asm_inc.asm"

+%ifdef __NASM_VER__

+    %use smartalign

+%endif

 ;***********************************************************************

 ; Macros and other preprocessor constants

 ;***********************************************************************

@@ -471,7 +475,6 @@

-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse

 ;***********************************************************************

 ;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,

 ;                   unsigned char* pSrc, const int iSrcStride,

@@ -478,17 +481,6 @@

 ;                   const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3

-    ;push ebx

-    ;push edx

-    ;push esi

-    ;push edi

-    ;push ebp

-    ;mov edi, [esp+24]   ; pDst

-    ;mov edx, [esp+28]   ; iDstStride

-    ;mov esi, [esp+32]   ; pSrc

-    ;mov ecx, [esp+36]   ; iSrcStride

-    ;mov ebp, [esp+44]   ; iSrcHeight

 %ifdef X86_32

     push r6

     %assign push_num 1

@@ -496,7 +488,7 @@

     %assign push_num 0

 %endif

     LOAD_6_PARA

-    PUSH_XMM 8

+    PUSH_XMM 4

     SIGN_EXTENSION r1, r1d

     SIGN_EXTENSION r3, r3d

     SIGN_EXTENSION r4, r4d

@@ -508,15 +500,12 @@

 %endif

     sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low

-    movdqa xmm6, [shufb_mask_high]  ; mask high

+    WELS_DB1 xmm3

+    WELS_Zero xmm2

+    sar r4, $01            ; iSrcWidth >> 1

+    add r0, r4             ; pDst += iSrcWidth >> 1

 .yloops4:

-    ;mov eax, [esp+40]   ; iSrcWidth

-    ;sar eax, $01            ; iSrcWidth >> 1

-    ;mov ebx, eax        ; iDstWidth restored at ebx

-    ;sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    ;neg ebx             ; - (iSrcWidth >> 1)

 %ifdef X86_32

     mov r4, arg5

 %else

@@ -523,81 +512,32 @@

     mov r4, r12

 %endif

     sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

+    neg r4                 ; -(iSrcWidth >> 1)

+    mov r6, r4

+    align 16

     ; each loop = source bandwidth: 32 bytes

 .xloops4:

-    ; 1st part horizonal loop: x16 bytes

-    ;               mem  hi<-       ->lo

-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

-    ;               xmm1: p P o O n N m M l L k K j J i I

-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A

-    ;               xmm3: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: P O N M L K J I H G F E D C B A

-    ;: p o n m l k j i h g f e d c b a

-    ;: P ..                          A

-    ;: p ..                          a

+    movdqa xmm0, [r2+r3]

+    movdqa xmm1, [r2+r3+16]

+    pavgb  xmm0, [r2]          ; avg vertical pixels 0-15

+    pavgb  xmm1, [r2+16]       ; avg vertical pixels 16-31

+    add r2, 32                 ; pSrc += 32

+    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels 0-15

+    pmaddubsw xmm1, xmm3       ; pairwise horizontal sum neighboring pixels 16-31

+    pavgw xmm0, xmm2           ; (sum + 1) >> 1

+    pavgw xmm1, xmm2           ; (sum + 1) >> 1

+    packuswb xmm0, xmm1        ; pack words to bytes

+    movdqa [r0+r4], xmm0       ; store results

+    add r4, 16

+    jl .xloops4

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movdqa xmm0, [r2]          ; 1st_src_line

-    movdqa xmm1, [r2+16]       ; 1st_src_line + 16

-    movdqa xmm2, [r2+r3]      ; 2nd_src_line

-    movdqa xmm3, [r2+r3+16]   ; 2nd_src_line + 16

-    ; packing & avg

-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    ; another implementation for xmm4 high bits

-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm4

-    movdqa xmm5, xmm1

-    pshufb xmm1, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm1

-;   psrlw xmm5, 8

-    pavgb xmm1, xmm5

-    movdqa xmm4, xmm2

-    pshufb xmm2, xmm7

-    pshufb xmm4, xmm6

-;   psubb xmm4, xmm2

-;   psrlw xmm4, 8

-    pavgb xmm2, xmm4

-    movdqa xmm5, xmm3

-    pshufb xmm3, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm3

-;   psrlw xmm5, 8

-    pavgb xmm3, xmm5

-    packuswb xmm0, xmm1

-    packuswb xmm2, xmm3

-    pavgb xmm0, xmm2

-    ; write pDst

-    movdqa [r0], xmm0

-    ; next SMB

-    lea r2, [r2+32]

-    lea r0, [r0+16]

-    dec r4

-    jg near .xloops4

     ; next line

     lea r2, [r2+2*r3]    ; next end of lines

     lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

     lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops4

+    sub r5, 1

+    jg .yloops4

 %ifndef X86_32

     pop r12

@@ -623,7 +563,7 @@

     %assign push_num 0

 %endif

     LOAD_6_PARA

-    PUSH_XMM 6

+    PUSH_XMM 4

     SIGN_EXTENSION r1, r1d

     SIGN_EXTENSION r3, r3d

     SIGN_EXTENSION r4, r4d

@@ -634,8 +574,11 @@

     mov r12, r4

 %endif

     sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm5, [shufb_mask_low]   ; mask low

-    movdqa xmm4, [shufb_mask_high]  ; mask high

+    WELS_DB1 xmm3

+    WELS_Zero xmm2

+    add r2, r4             ; pSrc += iSrcWidth

+    sar r4, $01            ; iSrcWidth >> 1

+    add r0, r4             ; pDst += iSrcWidth >> 1

 .yloops5:

 %ifdef X86_32

@@ -644,279 +587,26 @@

     mov r4, r12

 %endif

     sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

+    neg r4                 ; -(iSrcWidth >> 1)

+    lea r6, [r2+r3]        ; pSrc + iSrcStride

+    align 16

     ; each loop = source bandwidth: 16 bytes

 .xloops5:

-    ; horizonal loop: x16 bytes by source

-    ;               mem  hi<-       ->lo

-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A

-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: H G F E D C B A, P O N M L K J I

-    ;: h g f e d c b a, p o n m l k j i

+    movdqa xmm0, [r2+2*r4]

+    pavgb  xmm0, [r6+2*r4]     ; avg vertical pixels

+    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels

+    pavgw xmm0, xmm2           ; (sum + 1) >> 1

+    packuswb xmm0, xmm0        ; pack words to bytes

+    movlps [r0+r4], xmm0       ; store results

+    add r4, 8

+    jl .xloops5

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movdqa xmm0, [r2]          ; 1st_src_line

-    movdqa xmm1, [r2+r3]      ; 2nd_src_line

-    ; packing & avg

-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    ; another implementation for xmm2 high bits

-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm2

-    movdqa xmm3, xmm1

-    pshufb xmm1, xmm5

-    pshufb xmm3, xmm4

-;   psubb xmm3, xmm1

-;   psrlw xmm3, 8

-    pavgb xmm1, xmm3

-    pavgb xmm0, xmm1

-    packuswb xmm0, xmm1

-    ; write pDst

-    movq [r0], xmm0

-    ; next SMB

-    lea r2, [r2+16]

-    lea r0, [r0+8]

-    dec r4

-    jg near .xloops5

-    lea r2, [r2+2*r3]    ; next end of lines

-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

-    lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops5

-%ifndef X86_32

-    pop r12

-%endif

-    POP_XMM

-    LOAD_6_PARA_POP

-%ifdef X86_32

-    pop r6

-%endif

-    ret

-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse

-;***********************************************************************

-;   void DyadicBilinearDownsamplerWidthx32_sse4(    unsigned char* pDst, const int iDstStride,

-;                   unsigned char* pSrc, const int iSrcStride,

-;                   const int iSrcWidth, const int iSrcHeight );

-;***********************************************************************

-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4

-%ifdef X86_32

-    push r6

-    %assign push_num 1

-%else

-    %assign push_num 0

-%endif

-    LOAD_6_PARA

-    PUSH_XMM 8

-    SIGN_EXTENSION r1, r1d

-    SIGN_EXTENSION r3, r3d

-    SIGN_EXTENSION r4, r4d

-    SIGN_EXTENSION r5, r5d

-%ifndef X86_32

-    push r12

-    mov r12, r4

-%endif

-    sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low

-    movdqa xmm6, [shufb_mask_high]  ; mask high

-.yloops6:

-%ifdef X86_32

-    mov r4, arg5

-%else

-    mov r4, r12

-%endif

-    sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

-    ; each loop = source bandwidth: 32 bytes

-.xloops6:

-    ; 1st part horizonal loop: x16 bytes

-    ;               mem  hi<-       ->lo

-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

-    ;               xmm1: p P o O n N m M l L k K j J i I

-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A

-    ;               xmm3: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: P O N M L K J I H G F E D C B A

-    ;: p o n m l k j i h g f e d c b a

-    ;: P ..                          A

-    ;: p ..                          a

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movntdqa xmm0, [r2]            ; 1st_src_line

-    movntdqa xmm1, [r2+16]     ; 1st_src_line + 16

-    movntdqa xmm2, [r2+r3]        ; 2nd_src_line

-    movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16

-    ; packing & avg

-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm4

-    movdqa xmm5, xmm1

-    pshufb xmm1, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm1

-;   psrlw xmm5, 8

-    pavgb xmm1, xmm5

-    movdqa xmm4, xmm2

-    pshufb xmm2, xmm7

-    pshufb xmm4, xmm6

-;   psubb xmm4, xmm2

-;   psrlw xmm4, 8

-    pavgb xmm2, xmm4

-    movdqa xmm5, xmm3

-    pshufb xmm3, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm3

-;   psrlw xmm5, 8

-    pavgb xmm3, xmm5

-    packuswb xmm0, xmm1

-    packuswb xmm2, xmm3

-    pavgb xmm0, xmm2

-    ; write pDst

-    movdqa [r0], xmm0

-    ; next SMB

-    lea r2, [r2+32]

-    lea r0, [r0+16]

-    dec r4

-    jg near .xloops6

-    lea r2, [r2+2*r3]    ; next end of lines

-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

-    lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops6

-%ifndef X86_32

-    pop r12

-%endif

-    POP_XMM

-    LOAD_6_PARA_POP

-%ifdef X86_32

-    pop r6

-%endif

-    ret

-;***********************************************************************

-;   void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,

-;                     unsigned char* pSrc, const int iSrcStride,

-;                     const int iSrcWidth, const int iSrcHeight );

-;***********************************************************************

-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4

-%ifdef X86_32

-    push r6

-    %assign push_num 1

-%else

-    %assign push_num 0

-%endif

-    LOAD_6_PARA

-    PUSH_XMM 6

-    SIGN_EXTENSION r1, r1d

-    SIGN_EXTENSION r3, r3d

-    SIGN_EXTENSION r4, r4d

-    SIGN_EXTENSION r5, r5d

-%ifndef X86_32

-    push r12

-    mov r12, r4

-%endif

-    sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm5, [shufb_mask_low]   ; mask low

-    movdqa xmm4, [shufb_mask_high]  ; mask high

-.yloops7:

-%ifdef X86_32

-    mov r4, arg5

-%else

-    mov r4, r12

-%endif

-    sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

-    ; each loop = source bandwidth: 16 bytes

-.xloops7:

-    ; horizonal loop: x16 bytes by source

-    ;               mem  hi<-       ->lo

-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A

-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: H G F E D C B A, P O N M L K J I

-    ;: h g f e d c b a, p o n m l k j i

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movntdqa xmm0, [r2]            ; 1st_src_line

-    movntdqa xmm1, [r2+r3]        ; 2nd_src_line

-    ; packing & avg

-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm2

-    movdqa xmm3, xmm1

-    pshufb xmm1, xmm5

-    pshufb xmm3, xmm4

-;   psubb xmm3, xmm1

-;   psrlw xmm3, 8

-    pavgb xmm1, xmm3

-    pavgb xmm0, xmm1

-    packuswb xmm0, xmm1

-    ; write pDst

-    movq [r0], xmm0

-    ; next SMB

-    lea r2, [r2+16]

-    lea r0, [r0+8]

-    dec r4

-    jg near .xloops7

     ; next line

     lea r2, [r2+2*r3]    ; next end of lines

-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

     lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops7

+    sub r5, 1

+    jg .yloops5

 %ifndef X86_32

     pop r12

--- a/test/api/decode_api_test.cpp

+++ b/test/api/decode_api_test.cpp

@@ -759,9 +759,15 @@

 const uint32_t kiFrameRate = 12; //DO NOT CHANGE!

 const uint32_t kiFrameNum = 100; //DO NOT CHANGE!

 const char* pHashStr[] = { //DO NOT CHANGE!

+#ifdef X86_ASM

+  "244eebcb51f4c2a56e83fc5da3373cad9ec0e1e5",

+  "bbad99ef99e37b34bcb4f09a7ec4d144375f6be7",

+  "809f97e836650624d92f0b8e200a6ab25f810d6f"

+#else

   "9c4e6146b29bac5d5d4be3c5bbab9c072dcb3f3f",

   "f350001c333902029800bd291fbed915a4bdf19a",

   "eb9d853b7daec03052c4850027ac94adc84c3a7e"

+#endif

};

 class DecodeParseAPI : public ::testing::TestWithParam<EncodeDecodeFileParamBase>, public EncodeDecodeTestBase {

--- a/test/api/encoder_test.cpp

+++ b/test/api/encoder_test.cpp

@@ -123,7 +123,12 @@

},

     "res/CiscoVT2people_320x192_12fps.yuv",

-    "73156dfc1dc45924349b5b79f8debcac13d7231d", CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false

+#ifdef X86_ASM

+    "a5341d588b769809c1f1d983e5a0fcef7362f3ad",

+#else

+    "73156dfc1dc45924349b5b79f8debcac13d7231d",

+#endif

+    CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false

},

     "res/Cisco_Absolute_Power_1280x720_30fps.yuv",

@@ -131,7 +136,12 @@

},

     "res/Cisco_Absolute_Power_1280x720_30fps.yuv",

-    "3943145545a2bd27a642b2045d4e3dbae55c6870", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false

+#ifdef X86_ASM

+    "ec9d776a7d92cf0f6640065aee8af2450af0e993",

+#else

+    "3943145545a2bd27a642b2045d4e3dbae55c6870",

+#endif

+    CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false

},

   // the following values may be adjusted for times since we start tuning the strategy

--- a/test/processing/ProcessUT_DownSample.cpp

+++ b/test/processing/ProcessUT_DownSample.cpp

@@ -30,6 +30,27 @@

+void DyadicBilinearDownsampler2_ref (uint8_t* pDst, const int32_t kiDstStride, // reference 2:1 downsampler: one output pixel per 2x2 source block

+                                     const uint8_t* pSrc, const int32_t kiSrcStride, // strides are applied as byte offsets between rows

+                                     const int32_t kiSrcWidth, const int32_t kiSrcHeight) { // output is (kiSrcWidth >> 1) x (kiSrcHeight >> 1)

+  uint8_t* pDstLine = pDst;

+  const uint8_t* pSrcLine1 = pSrc; // upper row of the current source row pair

+  const uint8_t* pSrcLine2 = pSrc + kiSrcStride; // lower row of the current source row pair

+  const int32_t kiDstWidth  = kiSrcWidth >> 1;

+  const int32_t kiDstHeight = kiSrcHeight >> 1;

+  for (int32_t j = 0; j < kiDstHeight; j++) {

+    for (int32_t i = 0; i < kiDstWidth; i++) {

+      const int32_t kiTempCol1 = (pSrcLine1[2 * i + 0] + pSrcLine2[2 * i + 0] + 1) >> 1; // rounded vertical average, left column

+      const int32_t kiTempCol2 = (pSrcLine1[2 * i + 1] + pSrcLine2[2 * i + 1] + 1) >> 1; // rounded vertical average, right column

+      pDstLine[i] = (uint8_t) ((kiTempCol1 + kiTempCol2 + 1) >> 1); // rounded average of the two column averages (vertical first, then horizontal)

+    }

+    pDstLine += kiDstStride;

+    pSrcLine1 += 2 * kiSrcStride; // each output row consumes two source rows

+    pSrcLine2 += 2 * kiSrcStride;

+  }

+}

 void GeneralBilinearFastDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

     const int32_t kiDstHeight,

     uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

@@ -162,7 +183,7 @@

-#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \

+#define GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, ref_func) \

 TEST (DownSampleTest, func) { \

   if (ASM) {\

     int32_t iCpuCores = 0; \

@@ -190,7 +211,7 @@

     dst_c[j] = dst_a[j] = rand() % 256; \

     src_c[j] = src_a[j] = rand() % 256; \

} \

-  DyadicBilinearDownsampler_ref (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \

+  ref_func (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \

   func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \

   for (int j = 0; j < (src_height_c >> 1); j++) { \

     for (int m = 0; m < (src_width_c >> 1); m++) { \

@@ -199,6 +220,11 @@

} \

+#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \

+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler_ref) // forwards to the _with_ref generator, comparing func against DyadicBilinearDownsampler_ref

+#define GENERATE_DyadicBilinearDownsampler2_UT(func, ASM, CPUFLAGS) \

+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler2_ref) // same generator, but compares func against DyadicBilinearDownsampler2_ref

 #define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \

 TEST (DownSampleTest, func) { \

   if (ASM) {\

@@ -328,11 +354,8 @@

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse, 1, WELS_CPU_SSE)

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx8_sse, 1, WELS_CPU_SSE)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)

+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)

+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)

 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)

 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)

--

⑨