ref: 249b8a0aa6b9e8fd7d0a1327d9b6dcd35af0900c
parent: c1a24d52f5e4ca96a0775dfe0f83cc22e43f3c8c
author: dongzhang <dongzha@cisco.com>
date: Tue Jul 8 10:17:38 EDT 2014
add arm64 MB copy code and UT
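Implement WelsCopy8x8, WelsCopy8x16, WelsCopy16x16, WelsCopy16x16NotAligned
and WelsCopy16x8NotAligned with AArch64 NEON, enable the previously
commented-out function-pointer assignments in encode_mb_aux.cpp, and add
EncUT_MBCopy.cpp, which checks each routine against its C reference.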
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -24,6 +24,7 @@
F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; };
F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */; };
F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
+ F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */; };
FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
/* End PBXBuildFile section */
@@ -72,6 +73,7 @@
F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = expand_picture_aarch64_neon.S; path = arm64/expand_picture_aarch64_neon.S; sourceTree = "<group>"; };
F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = deblocking_aarch64_neon.S; path = arm64/deblocking_aarch64_neon.S; sourceTree = "<group>"; };
F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
+ F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = copy_mb_aarch64_neon.S; path = arm64/copy_mb_aarch64_neon.S; sourceTree = "<group>"; };
FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
@@ -177,6 +179,7 @@
F556A81D1906669F00E156A8 /* arm64 */ = {
isa = PBXGroup;
children = (
+ F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */,
F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */,
F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
F556A8221906673900E156A8 /* arm_arch64_common_macro.S */,
@@ -245,6 +248,7 @@
5BA8F2C019603F5F00011CE4 /* common_tables.cpp in Sources */,
4C3406D118D96EA600DFA14A /* WelsThreadLib.cpp in Sources */,
4C3406CC18D96EA600DFA14A /* mc_neon.S in Sources */,
+ F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */,
4C3406CB18D96EA600DFA14A /* expand_picture_neon.S in Sources */,
4CC61F0918FF6B4B00E56EAB /* copy_mb_neon.S in Sources */,
53C1C9BC193F0FB000404D8F /* expand_pic.cpp in Sources */,
--- /dev/null
+++ b/codec/common/arm64/copy_mb_aarch64_neon.S
@@ -0,0 +1,274 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+#ifdef __APPLE__
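+// Apple's assembler addresses macro arguments positionally ($0, $1, ...),
+// while GNU as uses named arguments (\arg0, \arg1, ...), so each macro
+// below is defined twice, once per syntax.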
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ ld1 {$0.d}[0], [$4], $5
+ ld1 {$1.d}[0], [$4], $5
+ ld1 {$2.d}[0], [$4], $5
+ ld1 {$3.d}[0], [$4], $5
+// }
+.endm
+
+.macro STORE_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ st1 {$0.d}[0], [$4], $5
+ st1 {$1.d}[0], [$4], $5
+ st1 {$2.d}[0], [$4], $5
+ st1 {$3.d}[0], [$4], $5
+// }
+.endm
+
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ ld1 {$0.8b}, [$4], $5
+ ld1 {$1.8b}, [$4], $5
+ ld1 {$2.8b}, [$4], $5
+ ld1 {$3.8b}, [$4], $5
+// }
+.endm
+
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ st1 {$0.8b}, [$4], $5
+ st1 {$1.8b}, [$4], $5
+ st1 {$2.8b}, [$4], $5
+ st1 {$3.8b}, [$4], $5
+// }
+.endm
+
+.macro LOAD16_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ ld1 {$0.2d}, [$4], $5
+ ld1 {$1.2d}, [$4], $5
+ ld1 {$2.2d}, [$4], $5
+ ld1 {$3.2d}, [$4], $5
+// }
+.endm
+
+.macro STORE16_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ st1 {$0.2d}, [$4], $5
+ st1 {$1.2d}, [$4], $5
+ st1 {$2.2d}, [$4], $5
+ st1 {$3.2d}, [$4], $5
+// }
+.endm
+
+.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ ld1 {$0.16b}, [$4], $5
+ ld1 {$1.16b}, [$4], $5
+ ld1 {$2.16b}, [$4], $5
+ ld1 {$3.16b}, [$4], $5
+// }
+.endm
+
+.macro STORE16_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ st1 {$0.16b}, [$4], $5
+ st1 {$1.16b}, [$4], $5
+ st1 {$2.16b}, [$4], $5
+ st1 {$3.16b}, [$4], $5
+// }
+.endm
+
+#else
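+// GNU as variant: the same macros, using named arguments (\arg0 ... \arg5).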
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ ld1 {\arg0\().d}[0], [\arg4], \arg5
+ ld1 {\arg1\().d}[0], [\arg4], \arg5
+ ld1 {\arg2\().d}[0], [\arg4], \arg5
+ ld1 {\arg3\().d}[0], [\arg4], \arg5
+// }
+.endm
+
+.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ st1 {\arg0\().d}[0], [\arg4], \arg5
+ st1 {\arg1\().d}[0], [\arg4], \arg5
+ st1 {\arg2\().d}[0], [\arg4], \arg5
+ st1 {\arg3\().d}[0], [\arg4], \arg5
+// }
+.endm
+
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ ld1 {\arg0\().8b}, [\arg4], \arg5
+ ld1 {\arg1\().8b}, [\arg4], \arg5
+ ld1 {\arg2\().8b}, [\arg4], \arg5
+ ld1 {\arg3\().8b}, [\arg4], \arg5
+// }
+.endm
+
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ st1 {\arg0\().8b}, [\arg4], \arg5
+ st1 {\arg1\().8b}, [\arg4], \arg5
+ st1 {\arg2\().8b}, [\arg4], \arg5
+ st1 {\arg3\().8b}, [\arg4], \arg5
+// }
+.endm
+
+.macro LOAD16_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ ld1 {\arg0\().2d}, [\arg4], \arg5
+ ld1 {\arg1\().2d}, [\arg4], \arg5
+ ld1 {\arg2\().2d}, [\arg4], \arg5
+ ld1 {\arg3\().2d}, [\arg4], \arg5
+// }
+.endm
+
+.macro STORE16_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ st1 {\arg0\().2d}, [\arg4], \arg5
+ st1 {\arg1\().2d}, [\arg4], \arg5
+ st1 {\arg2\().2d}, [\arg4], \arg5
+ st1 {\arg3\().2d}, [\arg4], \arg5
+// }
+.endm
+
+.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ ld1 {\arg0\().16b}, [\arg4], \arg5
+ ld1 {\arg1\().16b}, [\arg4], \arg5
+ ld1 {\arg2\().16b}, [\arg4], \arg5
+ ld1 {\arg3\().16b}, [\arg4], \arg5
+// }
+.endm
+
+.macro STORE16_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ st1 {\arg0\().16b}, [\arg4], \arg5
+ st1 {\arg1\().16b}, [\arg4], \arg5
+ st1 {\arg2\().16b}, [\arg4], \arg5
+ st1 {\arg3\().16b}, [\arg4], \arg5
+// }
+.endm
+
+#endif
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon
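+// AAPCS64 maps the copy_mb.h prototype onto registers as:
+// x0 = pDst, x1 = iStrideD, x2 = pSrc, x3 = iStrideS.
+// 8x8 copy: two batches of four 8-byte rows.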
+
+ LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x2, x3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x0, x1
+
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x16_AArch64_neon
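+// 16x16 copy in four batches of four 16-byte rows; the second batch of
+// each pair uses v16-v19 so that v8-v15, whose low 64 bits are
+// callee-saved under AAPCS64, stay untouched.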
+
+ LOAD16_ALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE16_ALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD16_ALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+
+ STORE16_ALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x0, x1
+
+ LOAD16_ALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE16_ALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD16_ALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+
+ STORE16_ALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x0, x1
+
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x16NotAligned_AArch64_neon
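+// 16x16 copy for buffers without the 16-byte alignment guarantee
+// (.16b element form instead of .2d).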
+
+ LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+
+ STORE16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x0, x1
+
+ LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+
+ STORE16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x0, x1
+
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x8NotAligned_AArch64_neon
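+// 16x8 copy, unaligned: two batches of four 16-byte rows.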
+
+ LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+
+ STORE16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x0, x1
+
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x16_AArch64_neon
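+// 8x16 copy: four batches of four 8-byte rows.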
+
+ LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x2, x3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x0, x1
+
+ LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x0, x1
+
+ LOAD_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x2, x3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x0, x1
+
+WELS_ASM_AARCH64_FUNC_END
+
+#endif
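
For orientation, a minimal C sketch of the row-by-row copy these routines
implement and that the unit tests below compare against (the real reference,
WelsCopy16x16_c, lives in the common sources; this body is an assumption
based on its call sites):

    #include <stdint.h>
    #include <string.h>

    // Hypothetical stand-in for WelsCopy16x16_c: copy a 16x16 byte block.
    static void Copy16x16_ref (uint8_t* pDst, int32_t iStrideD,
                               uint8_t* pSrc, int32_t iStrideS) {
      for (int32_t i = 0; i < 16; ++i) {
        memcpy (pDst, pSrc, 16); // one 16-byte row
        pDst += iStrideD;        // next destination row
        pSrc += iStrideS;        // next source row
      }
    }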
--- a/codec/common/inc/copy_mb.h
+++ b/codec/common/inc/copy_mb.h
@@ -65,6 +65,14 @@
void WelsCopy8x16_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
#endif
+#if defined (HAVE_NEON_AARCH64)
+void WelsCopy8x8_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x16_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x16NotAligned_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x8NotAligned_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x16_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+#endif
+
#if defined(__cplusplus)
}
#endif//__cplusplus
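
All five declarations share one signature, so the encoder can store any of
them behind a single function-pointer type; a sketch of that type (the name
PCopyFunc follows this codebase's wels_func_ptr_def.h, treated here as an
assumption):

    // Shared signature behind pfCopy8x8Aligned, pfCopy16x16Aligned, etc.
    typedef void (*PCopyFunc) (uint8_t* pDst, int32_t iStrideD,
                               uint8_t* pSrc, int32_t iStrideS);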
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -39,6 +39,7 @@
ifeq ($(ASM_ARCH), arm64)
COMMON_ASM_ARM64_SRCS=\
+ $(COMMON_SRCDIR)/arm64/copy_mb_aarch64_neon.S\
$(COMMON_SRCDIR)/arm64/deblocking_aarch64_neon.S\
$(COMMON_SRCDIR)/arm64/expand_picture_aarch64_neon.S\
$(COMMON_SRCDIR)/arm64/mc_aarch64_neon.S\
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -553,8 +553,8 @@
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_AArch64_neon;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_AArch64_neon;
pFuncList->pfDctT4 = WelsDctT4_AArch64_neon;
- //pFuncList->pfCopy8x8Aligned = WelsCopy8x8_AArch64_neon; // will enable in next update
- //pFuncList->pfCopy8x16Aligned = WelsCopy8x16_AArch64_neon; // will enable in next update
+ pFuncList->pfCopy8x8Aligned = WelsCopy8x8_AArch64_neon;
+ pFuncList->pfCopy8x16Aligned = WelsCopy8x16_AArch64_neon;
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_AArch64_neon;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_AArch64_neon;
@@ -564,9 +564,9 @@
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_AArch64_neon;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_AArch64_neon;
- //pFuncList->pfCopy16x16Aligned = WelsCopy16x16_AArch64_neon; // will enable in next update
- //pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_AArch64_neon; // will enable in next update
- //pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_AArch64_neon; // will enable in next update
+ pFuncList->pfCopy16x16Aligned = WelsCopy16x16_AArch64_neon;
+ pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_AArch64_neon;
+ pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_AArch64_neon;
pFuncList->pfDctFourT4 = WelsDctFourT4_AArch64_neon;
}
#endif
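
Context for the hunks above (a sketch, not part of the diff): inside
WelsInitEncodingFuncs these assignments are reached only when the build
enables AArch64 NEON and the CPU reports it at runtime; the flag and guard
spellings below are assumptions based on cpu_core.h:

    #if defined(HAVE_NEON_AARCH64)
      if (uiCpuFlag & WELS_CPU_NEON) { // runtime CPU-feature check
        pFuncList->pfCopy8x8Aligned  = WelsCopy8x8_AArch64_neon;
        pFuncList->pfCopy8x16Aligned = WelsCopy8x16_AArch64_neon;
        // ... 16x16 / 16x8 assignments as in the diff above ...
      }
    #endif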
--- /dev/null
+++ b/test/encoder/EncUT_MBCopy.cpp
@@ -0,0 +1,140 @@
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "cpu_core.h"
+#include "cpu.h"
+#include "macros.h"
+#include "encode_mb_aux.h"
+#include "wels_func_ptr_def.h"
+#include "copy_mb.h"
+
+using namespace WelsSVCEnc;
+#define MBCOPYTEST_NUM 1000
+static void FillWithRandomData (uint8_t* p, int32_t Len) {
+ for (int32_t i = 0; i < Len; i++) {
+ p[i] = rand() % 256;
+ }
+}
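+
+// Each test below runs MBCOPYTEST_NUM iterations: fill the source with
+// random bytes, copy once with the C reference and once through the
+// pointer installed by WelsInitEncodingFuncs (dst stride 32, src stride
+// 64), then require the two destinations to match byte for byte. The
+// NotAligned cases pass pSrcAlign + 1 to force a misaligned source, and
+// because both destinations are zeroed first, the comparison also flags
+// stray writes outside the copied rows.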
+
+
+TEST (MBCopyFunTest, pfCopy8x8Aligned) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+
+ int32_t iCpuCores = 0;
+ SWelsFuncPtrList sFuncPtrList;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+ WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
+
+ for (int32_t k = 0; k < MBCOPYTEST_NUM; k++) {
+ memset (pDstAlign[0], 0, 16 * 32 + 1);
+ memset (pDstAlign[1], 0, 16 * 32 + 1);
+ FillWithRandomData ((uint8_t*)pSrcAlign, 16 * 64 + 1);
+ WelsCopy8x8_c (pDstAlign[0], 32, pSrcAlign, 64);
+ sFuncPtrList.pfCopy8x8Aligned (pDstAlign[1], 32, pSrcAlign, 64);
+
+ for (int32_t i = 0; i < 16 * 32 + 1; i++) {
+ ASSERT_EQ (pDstAlign[0][i], pDstAlign[1][i]);
+ }
+
+ }
+
+}
+
+TEST (MBCopyFunTest, pfCopy8x16Aligned) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+
+ int32_t iCpuCores = 0;
+ SWelsFuncPtrList sFuncPtrList;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+ WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
+
+ for (int32_t k = 0; k < MBCOPYTEST_NUM; k++) {
+ memset (pDstAlign[0], 0, 16 * 32 + 1);
+ memset (pDstAlign[1], 0, 16 * 32 + 1);
+ FillWithRandomData ((uint8_t*)pSrcAlign, 16 * 64 + 1);
+ WelsCopy8x16_c (pDstAlign[0], 32, pSrcAlign, 64);
+ sFuncPtrList.pfCopy8x16Aligned (pDstAlign[1], 32, pSrcAlign, 64);
+
+ for (int32_t i = 0; i < 16 * 32 + 1; i++) {
+ ASSERT_EQ (pDstAlign[0][i], pDstAlign[1][i]);
+ }
+
+ }
+
+}
+
+TEST (MBCopyFunTest, pfCopy16x16Aligned) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+
+ int32_t iCpuCores = 0;
+ SWelsFuncPtrList sFuncPtrList;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+ WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
+
+ for (int32_t k = 0; k < MBCOPYTEST_NUM; k++) {
+ memset (pDstAlign[0], 0, 16 * 32 + 1);
+ memset (pDstAlign[1], 0, 16 * 32 + 1);
+ FillWithRandomData ((uint8_t*)pSrcAlign, 16 * 64 + 1);
+ WelsCopy16x16_c (pDstAlign[0], 32, pSrcAlign, 64);
+ sFuncPtrList.pfCopy16x16Aligned (pDstAlign[1], 32, pSrcAlign, 64);
+
+ for (int32_t i = 0; i < 16 * 32 + 1; i++) {
+ ASSERT_EQ (pDstAlign[0][i], pDstAlign[1][i]);
+ }
+
+ }
+
+}
+
+TEST (MBCopyFunTest, pfCopy16x8NotAligned) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+
+ int32_t iCpuCores = 0;
+ SWelsFuncPtrList sFuncPtrList;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+ WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
+
+ for (int32_t k = 0; k < MBCOPYTEST_NUM; k++) {
+ memset (pDstAlign[0], 0, 16 * 32 + 1);
+ memset (pDstAlign[1], 0, 16 * 32 + 1);
+ FillWithRandomData ((uint8_t*)pSrcAlign, 16 * 64 + 1);
+ WelsCopy16x8_c (pDstAlign[0], 32, pSrcAlign + 1, 64);
+ sFuncPtrList.pfCopy16x8NotAligned (pDstAlign[1], 32, pSrcAlign + 1, 64);
+
+ for (int32_t i = 0; i < 16 * 32 + 1; i++) {
+ ASSERT_EQ (pDstAlign[0][i], pDstAlign[1][i]);
+ }
+
+ }
+
+}
+
+TEST (MBCopyFunTest, pfCopy16x16NotAligned) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+
+ int32_t iCpuCores = 0;
+ SWelsFuncPtrList sFuncPtrList;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+ WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
+
+ for (int32_t k = 0; k < MBCOPYTEST_NUM; k++) {
+ memset (pDstAlign[0], 0, 16 * 32 + 1);
+ memset (pDstAlign[1], 0, 16 * 32 + 1);
+ FillWithRandomData ((uint8_t*)pSrcAlign, 16 * 64 + 1);
+ WelsCopy16x16_c (pDstAlign[0], 32, pSrcAlign + 1, 64);
+ sFuncPtrList.pfCopy16x16NotAligned (pDstAlign[1], 32, pSrcAlign + 1, 64);
+
+ for (int32_t i = 0; i < 16 * 32 + 1; i++) {
+ ASSERT_EQ (pDstAlign[0][i], pDstAlign[1][i]);
+ }
+
+ }
+
+}
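
After an arm64-enabled build, these cases can be run on their own with
gtest's filter flag, e.g. ./codec_unittest --gtest_filter=MBCopyFunTest.*
(the binary name is this repo's usual unit-test target, treated here as an
assumption).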
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -6,6 +6,7 @@
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMbAux.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_GetIntraPredictor.cpp\
+ $(ENCODER_UNITTEST_SRCDIR)/EncUT_MBCopy.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryAlloc.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryZero.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\