shithub: openh264

Download patch

ref: 439e51bc11cd27569f205cda9809bb30986b6488
parent: ef1098be9ce373840e5c744b7c8fe533bdc5bfca
parent: cff49f5e45c3ccc849a5d7b6dfd6e0048859107b
author: zhilwang <zhilwang@cisco.com>
date: Fri Aug 8 05:19:51 EDT 2014

Merge pull request #1249 from dongzha/addArm32SCCNew

add arm 32/64 code and UT for SVC SCC motion estimation

--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -45,6 +45,8 @@
 		4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446F818BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp */; };
 		4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446FA18BC605C0017DF25 /* wels_preprocess.cpp */; };
 		4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; };
+		6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */; };
+		6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */; };
 		9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; };
 		9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; };
 		F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; };
@@ -154,6 +156,8 @@
 		4CE446FE18BC605C0017DF25 /* welsEncoderExt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = welsEncoderExt.h; sourceTree = "<group>"; };
 		4CE4470418BC605C0017DF25 /* wels_enc_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_enc_export.def; sourceTree = "<group>"; };
 		4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsEncoderExt.cpp; sourceTree = "<group>"; };
+		6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = svc_motion_estimation.S; sourceTree = "<group>"; };
+		6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = svc_motion_estimation_aarch64_neon.S; path = arm64/svc_motion_estimation_aarch64_neon.S; sourceTree = "<group>"; };
 		9AED664819469FAF009A3567 /* welsCodecTrace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = welsCodecTrace.h; path = ../../../common/inc/welsCodecTrace.h; sourceTree = "<group>"; };
 		9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };
 		9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; };
@@ -177,6 +181,7 @@
 		4C34066418C57D0400DFA14A /* arm */ = {
 			isa = PBXGroup;
 			children = (
+				6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */,
 				4C34066618C57D0400DFA14A /* intra_pred_neon.S */,
 				4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,
 				4C34066918C57D0400DFA14A /* memory_neon.S */,
@@ -189,6 +194,7 @@
 		4CB8F2B219235FAC005D6386 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
+				6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */,
 				F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */,
 				F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */,
 				4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,
@@ -423,6 +429,7 @@
 				4CE4471D18BC605C0017DF25 /* property.cpp in Sources */,
 				4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
 				4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
+				6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */,
 				4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
 				4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,
 				9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */,
@@ -455,6 +462,7 @@
 				4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */,
 				4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */,
 				4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */,
+				6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */,
 				4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */,
 				4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */,
 			);
--- /dev/null
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -1,0 +1,168 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef  HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+
+WELS_ASM_FUNC_BEGIN SumOf8x8SingleBlock_neon
+//int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride): sum of the 8x8 bytes starting at pRef
+    vld1.64 {d0}, [r0], r1 // r0 = pRef, r1 = kiRefStride; each load takes one 8-byte row, then steps to the next row
+    vld1.64 {d1}, [r0], r1
+    vld1.64 {d2}, [r0], r1
+    vld1.64 {d3}, [r0], r1
+    vld1.64 {d4}, [r0], r1
+    vld1.64 {d5}, [r0], r1
+    vld1.64 {d6}, [r0], r1
+    vld1.64 {d7}, [r0] // last row, so no post-increment
+    vpaddl.u8 q0, q0 // rows 0-1: adjacent bytes added into eight u16 lanes
+    vpadal.u8 q0, q1 // rows 2-3 accumulated into the same lanes
+    vpadal.u8 q0, q2 // rows 4-5
+    vpadal.u8 q0, q3 // rows 6-7
+
+    vpaddl.u16 q0, q0 // eight u16 -> four u32
+    vpadd.i32 d0, d1 // four u32 -> two u32
+    vpadd.i32 d0, d0 // both lanes of d0 now hold the total
+    vmov    r0, r1, d0 // r0 = total; r1 receives the duplicate lane
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN SumOf16x16SingleBlock_neon
+//int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride): sum of the 16x16 bytes starting at pRef
+    vld1.64 {q0}, [r0], r1 // r0 = pRef, r1 = kiRefStride; first 16-byte row
+    vpaddl.u8 q0, q0 // adjacent bytes added into eight u16 lanes
+.rept 15
+    vld1.64 {q1}, [r0], r1 // next row
+    vpadal.u8 q0, q1 // accumulated into the same eight u16 lanes
+.endr
+    vpaddl.u16 q0, q0 // eight u16 -> four u32
+    vpadd.i32 d0, d1 // four u32 -> two u32
+    vpadd.i32 d0, d0 // both lanes of d0 now hold the total
+    vmov    r0, r1, d0 // r0 = total; r1 receives the duplicate lane
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+    stmdb sp!, {r4-r8} // save the five registers used as locals (20 bytes)
+    ldr	r5, [sp, #24] // r5 = pTimesOfFeatureValue (second stack argument, above the 20 saved bytes)
+    ldr	r4, [sp, #20] // r4 = pFeatureOfBlock (first stack argument)
+// For each of the kiWidth x kiHeight block origins: store the 8x8 byte sum and increment its counter in pTimesOfFeatureValue.
+    mov r8, r0
+    mov r6, r1 // r6 = kiWidth
+    add r8, r6 // r8 = pRefPicture + kiWidth (one past the last column of the current row)
+    add r4, r6, lsl #1 // r4 = one past the last uint16_t entry of the current feature row
+
+_height_loop8x8:
+    mov r7, r6 // r7 = columns still to process in this row
+_width_loop8x8:
+    subs r0, r8, r7 // r0 = top-left byte of the current block
+    vld1.64 {d0}, [r0], r3 // r3 = kiRefStride; one 8-byte row per load
+    vld1.64 {d1}, [r0], r3
+    vld1.64 {d2}, [r0], r3
+    vld1.64 {d3}, [r0], r3
+    vld1.64 {d4}, [r0], r3
+    vld1.64 {d5}, [r0], r3
+    vld1.64 {d6}, [r0], r3
+    vld1.64 {d7}, [r0]
+
+    vpaddl.u8 q0, q0 // rows 0-1 into eight u16 lanes
+    vpadal.u8 q0, q1 // rows 2-7 accumulated into the same lanes
+    vpadal.u8 q0, q2
+    vpadal.u8 q0, q3
+
+    vpaddl.u16 q0, q0 // eight u16 -> four u32
+    vpadd.i32 d0, d1 // four u32 -> two u32
+    vpadd.i32 d0, d0 // both lanes of d0 = block sum
+
+    subs r1, r4, r7, lsl #1 // r1 = &pFeatureOfBlock[row * kiWidth + column]
+    vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
+    vmov    r0, r1, d0 // r0 = block sum
+    add r1, r5, r0, lsl #2 // r1 = &pTimesOfFeatureValue[sum]
+    ldr r0, [r1]
+    add r0, #1
+    str r0, [r1] // pTimesOfFeatureValue[sum]++
+
+    subs r7, #1
+    bne _width_loop8x8 // tested after the body, so kiWidth must be > 0
+
+    add r8, r3 // advance to the next picture row
+    add r4, r6, lsl #1 // advance to the next feature row (kiWidth entries per row)
+    subs r2, #1 // r2 = kiHeight rows remaining; also tested after the body
+    bne _height_loop8x8
+
+    ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+    stmdb sp!, {r4-r8} // save the five registers used as locals (20 bytes)
+    ldr	r5, [sp, #24] // r5 = pTimesOfFeatureValue (second stack argument, above the 20 saved bytes)
+    ldr	r4, [sp, #20] // r4 = pFeatureOfBlock (first stack argument)
+// For each of the kiWidth x kiHeight block origins: store the 16x16 byte sum and increment its counter in pTimesOfFeatureValue.
+    mov r8, r0
+    mov r6, r1 // r6 = kiWidth
+    add r8, r6 // r8 = pRefPicture + kiWidth (one past the last column of the current row)
+    add r4, r6, lsl #1 // r4 = one past the last uint16_t entry of the current feature row
+
+_height_loop16x16:
+    mov r7, r6 // r7 = columns still to process in this row
+_width_loop16x16:
+    subs r0, r8, r7 // r0 = top-left byte of the current block
+    vld1.64 {q0}, [r0], r3 // r3 = kiRefStride; first 16-byte row
+    vpaddl.u8 q0, q0 // adjacent bytes added into eight u16 lanes
+.rept 15
+    vld1.64 {q1}, [r0], r3 // next row
+    vpadal.u8 q0, q1 // accumulated into the same lanes
+.endr
+    vpaddl.u16 q0, q0 // eight u16 -> four u32
+    vpadd.i32 d0, d1 // four u32 -> two u32
+    vpadd.i32 d0, d0 // both lanes of d0 = block sum
+
+    subs r1, r4, r7, lsl #1 // r1 = &pFeatureOfBlock[row * kiWidth + column]
+    vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
+    vmov    r0, r1, d0 // r0 = block sum
+    add r1, r5, r0, lsl #2 // r1 = &pTimesOfFeatureValue[sum]
+    ldr r0, [r1]
+    add r0, #1
+    str r0, [r1] // pTimesOfFeatureValue[sum]++
+
+    subs r7, #1
+    bne _width_loop16x16 // tested after the body, so kiWidth must be > 0
+
+    add r8, r3 // advance to the next picture row
+    add r4, r6, lsl #1 // advance to the next feature row (kiWidth entries per row)
+    subs r2, #1 // r2 = kiHeight rows remaining; also tested after the body
+    bne _height_loop16x16
+
+    ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+#endif
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -1,0 +1,151 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef  HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
+    sxtw x1, w1 // kiRefStride is an int32_t in w1; AAPCS64 leaves bits 63:32 of x1 unspecified, so widen it before it is used as a 64-bit post-index
+    ld1 {v0.d}[0], [x0], x1 // returns the sum of the 8x8 bytes at pRef (x0); rows are kiRefStride bytes apart, two rows per vector
+    ld1 {v0.d}[1], [x0], x1
+    ld1 {v1.d}[0], [x0], x1
+    ld1 {v1.d}[1], [x0], x1
+    ld1 {v2.d}[0], [x0], x1
+    ld1 {v2.d}[1], [x0], x1
+    ld1 {v3.d}[0], [x0], x1
+    ld1 {v3.d}[1], [x0] // last row, so no post-increment
+    uaddlp v0.8h, v0.16b // rows 0-1: adjacent bytes added into eight u16 lanes
+    uadalp v0.8h, v1.16b // rows 2-7 accumulated into the same lanes
+    uadalp v0.8h, v2.16b
+    uadalp v0.8h, v3.16b
+    uaddlv s0, v0.8h // add the eight u16 lanes into one 32-bit total
+    mov    x0, v0.d[0] // return value = total
+WELS_ASM_AARCH64_FUNC_END
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16SingleBlock_AArch64_neon
+    sxtw x1, w1 // kiRefStride is an int32_t in w1; AAPCS64 leaves bits 63:32 of x1 unspecified, so widen it before it is used as a 64-bit post-index
+    ld1 {v0.16b}, [x0], x1 // returns the sum of the 16x16 bytes at pRef (x0); this is the first 16-byte row
+    uaddlp v0.8h, v0.16b // adjacent bytes added into eight u16 lanes
+.rept 15
+    ld1 {v1.16b}, [x0], x1 // next row
+    uadalp v0.8h, v1.16b // accumulated into the same lanes
+.endr
+    uaddlv s0, v0.8h // add the eight u16 lanes into one 32-bit total
+    mov    x0, v0.d[0] // return value = total
+WELS_ASM_AARCH64_FUNC_END
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+// For every block origin (x, y), 0 <= x < kiWidth, 0 <= y < kiHeight: pFeatureOfBlock[y * kiWidth + x] = 8x8 byte sum, pTimesOfFeatureValue[sum]++. kiWidth and kiHeight must be > 0.
+    //x4: pFeatureOfBlock, x5: pTimesOfFeatureValue
+    sxtw x1, w1 // kiWidth, kiHeight and kiRefStride are int32_t; AAPCS64 leaves bits 63:32 of
+    sxtw x2, w2 // their registers unspecified, so widen all three before they are used in
+    sxtw x3, w3 // 64-bit address arithmetic and as 64-bit loop counters
+    mov x8, x0
+    mov x6, x1 // x6 = kiWidth
+    add x8, x8, x6 // x8 = pRefPicture + kiWidth (one past the last column of the current row)
+    add x4, x4, x6, lsl #1 // x4 = one past the last uint16_t entry of the current feature row
+
+_height_loop8x8:
+    mov x7, x6 // x7 = columns still to process in this row
+_width_loop8x8:
+    subs x0, x8, x7 // x0 = top-left byte of the current block
+    ld1 {v0.d}[0], [x0], x3 // one 8-byte row per load, two rows per vector
+    ld1 {v0.d}[1], [x0], x3
+    ld1 {v1.d}[0], [x0], x3
+    ld1 {v1.d}[1], [x0], x3
+    ld1 {v2.d}[0], [x0], x3
+    ld1 {v2.d}[1], [x0], x3
+    ld1 {v3.d}[0], [x0], x3
+    ld1 {v3.d}[1], [x0]
+    uaddlp v0.8h, v0.16b // rows 0-1 into eight u16 lanes
+    uadalp v0.8h, v1.16b // rows 2-7 accumulated into the same lanes
+    uadalp v0.8h, v2.16b
+    uadalp v0.8h, v3.16b
+    uaddlv s0, v0.8h // 32-bit block sum in lane s[0]
+    subs x1, x4, x7, lsl #1 // x1 = &pFeatureOfBlock[row * kiWidth + column]
+    st1 {v0.h}[0], [x1] // sum -> pFeatureOfBlock[i]
+    mov w0, #0
+    ins v0.s[1], w0 // clear lane s[1] so the 64-bit move below yields the zero-extended sum
+    mov    x0, v0.d[0] // x0 = block sum
+    add x1, x5, x0, lsl #2 // x1 = &pTimesOfFeatureValue[sum]
+    ldr w0, [x1]
+    add w0, w0, #1
+    str w0, [x1] // pTimesOfFeatureValue[sum]++
+    subs x7, x7, #1
+    cbnz x7, _width_loop8x8
+
+    add x8, x8, x3 // advance to the next picture row
+    add x4, x4, x6, lsl #1 // advance to the next feature row (kiWidth entries per row)
+    subs x2, x2, #1 // x2 = kiHeight rows remaining
+    cbnz x2, _height_loop8x8
+WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+// For every block origin (x, y), 0 <= x < kiWidth, 0 <= y < kiHeight: pFeatureOfBlock[y * kiWidth + x] = 16x16 byte sum, pTimesOfFeatureValue[sum]++. kiWidth and kiHeight must be > 0.
+    //x4: pFeatureOfBlock, x5: pTimesOfFeatureValue
+    sxtw x1, w1 // kiWidth, kiHeight and kiRefStride are int32_t; AAPCS64 leaves bits 63:32 of
+    sxtw x2, w2 // their registers unspecified, so widen all three before they are used in
+    sxtw x3, w3 // 64-bit address arithmetic and as 64-bit loop counters
+    mov x8, x0
+    mov x6, x1 // x6 = kiWidth
+    add x8, x8, x6 // x8 = pRefPicture + kiWidth (one past the last column of the current row)
+    add x4, x4, x6, lsl #1 // x4 = one past the last uint16_t entry of the current feature row
+
+_height_loop16x16:
+    mov x7, x6 // x7 = columns still to process in this row
+_width_loop16x16:
+    subs x0, x8, x7 // x0 = top-left byte of the current block
+    ld1 {v0.16b}, [x0], x3 // first 16-byte row
+    uaddlp v0.8h, v0.16b // adjacent bytes added into eight u16 lanes
+.rept 15
+    ld1 {v1.16b}, [x0], x3 // next row
+    uadalp v0.8h, v1.16b // accumulated into the same lanes
+.endr
+    uaddlv s0, v0.8h // 32-bit block sum in lane s[0]
+    subs x1, x4, x7, lsl #1 // x1 = &pFeatureOfBlock[row * kiWidth + column]
+    st1 {v0.h}[0], [x1] // sum -> pFeatureOfBlock[i]
+    mov w0, #0
+    ins v0.s[1], w0 // clear lane s[1] so the 64-bit move below yields the zero-extended sum
+    mov    x0, v0.d[0] // x0 = block sum
+    add x1, x5, x0, lsl #2 // x1 = &pTimesOfFeatureValue[sum]
+    ldr w0, [x1]
+    add w0, w0, #1
+    str w0, [x1] // pTimesOfFeatureValue[sum]++
+    subs x7, x7, #1
+    cbnz x7, _width_loop16x16
+    add x8, x8, x3 // advance to the next picture row
+    add x4, x4, x6, lsl #1 // advance to the next feature row (kiWidth entries per row)
+    subs x2, x2, #1 // x2 = kiHeight rows remaining
+    cbnz x2, _height_loop16x16
+WELS_ASM_AARCH64_FUNC_END
+#endif
\ No newline at end of file
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -244,6 +244,33 @@
 void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
                                const int32_t kiRefStride,
                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+#ifdef HAVE_NEON // ARM 32-bit NEON kernels from core/arm/svc_motion_estimation.S
+extern "C" // C linkage so the names match the assembly symbols
+{
+int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride); // sum of one 8x8 byte block
+int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride); // sum of one 16x16 byte block
+void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                                const int32_t kiRefStride,
+                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); // 8x8 sum per position plus a count per sum value
+void SumOf16x16BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                                  const int32_t kiRefStride,
+                                  uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); // 16x16 variant of the above
+}
+#endif // HAVE_NEON
+
+#ifdef HAVE_NEON_AARCH64 // AArch64 NEON kernels from core/arm64/svc_motion_estimation_aarch64_neon.S
+extern "C" // C linkage so the names match the assembly symbols
+{
+int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride); // sum of one 8x8 byte block
+int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride); // sum of one 16x16 byte block
+void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                                const int32_t kiRefStride,
+                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); // 8x8 sum per position plus a count per sum value
+void SumOf16x16BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                                  const int32_t kiRefStride,
+                                  uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); // 16x16 variant of the above
+}
+#endif // HAVE_NEON_AARCH64
 int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth,  const int32_t kiFrameHeight,
     const int32_t iNeedFeatureStorage,
     SScreenBlockFeatureStorage* pScreenBlockFeatureStorage);
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -102,6 +102,23 @@
     //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
     pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
     pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#if defined (HAVE_NEON)
+    //for feature search
+    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
+    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
+    //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
+    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
+#endif
+
+#if defined (HAVE_NEON_AARCH64)
+    //for feature search
+    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
+    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
+    //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
+    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
+    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
+#endif
   }
 }
 
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -53,6 +53,7 @@
 	$(ENCODER_SRCDIR)/core/arm/memory_neon.S\
 	$(ENCODER_SRCDIR)/core/arm/pixel_neon.S\
 	$(ENCODER_SRCDIR)/core/arm/reconstruct_neon.S\
+	$(ENCODER_SRCDIR)/core/arm/svc_motion_estimation.S\
 
 ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ))
 endif
@@ -64,6 +65,7 @@
 	$(ENCODER_SRCDIR)/core/arm64/memory_aarch64_neon.S\
 	$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
 	$(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\
+	$(ENCODER_SRCDIR)/core/arm64/svc_motion_estimation_aarch64_neon.S\
 
 ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
 endif
--- /dev/null
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -1,0 +1,157 @@
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "cpu_core.h"
+#include "cpu.h"
+#include "macros.h"
+#include "svc_motion_estimate.h"
+
+using namespace WelsSVCEnc;
+#define SVC_ME_TEST_NUM 10
+static void FillWithRandomData (uint8_t* p, int32_t Len) {
+  for (int32_t iLeft = Len; iLeft > 0; iLeft--) {
+    *p++ = rand() % 256; // one pseudo-random byte per element, written front to back
+  }
+}
+
+//preprocess related
+int32_t SumOf8x8SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0; // running sum of the 8x8 bytes whose top-left corner is pRef
+  for (int32_t iRow = 0; iRow < 8; iRow++, pRef += kiRefStride) {
+    for (int32_t iCol = 0; iCol < 8; iCol++) {
+      iTotal += pRef[iCol];
+    }
+  }
+  return iTotal;
+}
+int32_t SumOf16x16SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0; // running sum of the 16x16 bytes whose top-left corner is pRef
+  const uint8_t* pRow = pRef;
+  for (int32_t iRowsLeft = 16; iRowsLeft > 0; iRowsLeft--) {
+    for (int32_t iCol = 0; iCol < 16; iCol++) {
+      iTotal += pRow[iCol];
+    }
+    pRow += kiRefStride; // step down to the next picture row
+  }
+  return iTotal;
+}
+
+void SumOf8x8BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                               const int32_t kiRefStride,
+                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
+  // Reference model for the block-feature kernels: for every block origin (x, y) with
+  // 0 <= x < kiWidth and 0 <= y < kiHeight, store the 8x8 byte sum in
+  // pFeatureOfBlock[y * kiWidth + x] and count it in pTimesOfFeatureValue[sum].
+  for (int32_t y = 0; y < kiHeight; y++) {
+    uint8_t* pRef = pRefPicture + kiRefStride * y;
+    uint16_t* pBuffer = pFeatureOfBlock + kiWidth * y;
+    for (int32_t x = 0; x < kiWidth; x++) {
+      // Sum with the local _ref kernel so the anchor never shares code with the functions under test.
+      const int32_t iSum = SumOf8x8SingleBlock_ref (pRef + x, kiRefStride);
+
+      pBuffer[x] = iSum;
+      pTimesOfFeatureValue[iSum]++;
+    }
+  }
+}
+
+void SumOf16x16BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                                 const int32_t kiRefStride,
+                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
+  // Reference model for the block-feature kernels: for every block origin (x, y) with
+  // 0 <= x < kiWidth and 0 <= y < kiHeight, store the 16x16 byte sum in
+  // pFeatureOfBlock[y * kiWidth + x] and count it in pTimesOfFeatureValue[sum].
+  //TODO: this is the same as SumOf8x8BlockOfFrame_ref except for the single block func it calls, refactor-able?
+  for (int32_t y = 0; y < kiHeight; y++) {
+    uint8_t* pRef = pRefPicture + kiRefStride * y;
+    uint16_t* pBuffer = pFeatureOfBlock + kiWidth * y;
+    for (int32_t x = 0; x < kiWidth; x++) {
+      // Sum with the local _ref kernel so the anchor never shares code with the functions under test.
+      const int32_t iSum = SumOf16x16SingleBlock_ref (pRef + x, kiRefStride);
+
+      pBuffer[x] = iSum;
+      pTimesOfFeatureValue[iSum]++;
+    }
+  }
+}
+
+#define GENERATE_SumOfSingleBlock(anchor, method) /* defines a test named after method that compares it with anchor */ \
+TEST (SVC_ME_FunTest, method) {\
+  ENFORCE_STACK_ALIGN_1D (uint8_t,  uiRefBuf,   16*320, 16); /* 16 rows at the 320-byte stride passed below */\
+  int32_t iRes[2];\
+  for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { /* fresh random picture data on every round */\
+    FillWithRandomData (uiRefBuf,16*320);\
+    iRes[0] = anchor (uiRefBuf,320);\
+    iRes[1] = method (uiRefBuf,320);\
+    ASSERT_EQ (iRes[0], iRes[1]); /* both kernels must return the same block sum */\
+  }\
+}
+
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c) // portable C kernels against the local reference
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)
+
+#ifdef HAVE_NEON // ARM 32-bit NEON kernels
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)
+#endif // HAVE_NEON
+
+#ifdef HAVE_NEON_AARCH64 // AArch64 NEON kernels
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_AArch64_neon)
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_AArch64_neon)
+#endif // HAVE_NEON_AARCH64
+
+
+#define ENFORCE_NEW_ALIGN_1D(_tp, _nm, _nbuff, _sz, _al) /* declares _nbuff (raw new[] block, release it with delete[]) and _nm (aligned pointer into it with room for _sz elements) */ \
+_tp *_nbuff = new _tp[(_sz)+(_al)-1]; /* (_al)-1 spare elements leave room to shift the start */ \
+_tp *_nm = _nbuff + ((_al)-1) - (((uintptr_t)(_nbuff + ((_al)-1)) & ((_al)-1))/sizeof(_tp)); /* steps back by the address remainder modulo _al; the mask assumes _al is a power of two */
+
+#define GENERATE_SumOfFrame(anchor, method, kiWidth, kiHeight) /* defines a test comparing method with anchor on a kiWidth x kiHeight grid of block origins */ \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock1, pFeatureOfBlockBuff1, (kiWidth*kiHeight), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock2, pFeatureOfBlockBuff2, (kiWidth*kiHeight), 16) \
+uint32_t (*pTimesOfFeatureValue)[65536] = new uint32_t[2][65536]; /* two 256 KiB histograms: kept on the heap because 512 KiB of locals can overflow a small thread stack */ \
+for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) {\
+  FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16));\
+  memset(pTimesOfFeatureValue, 0, 2*65536*sizeof(uint32_t)); /* clears both histograms, which are contiguous */ \
+  anchor (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock1,pTimesOfFeatureValue[0]); \
+  method (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock2,pTimesOfFeatureValue[1]); \
+  for(int32_t j=0;j<kiWidth*kiHeight;j++){\
+      ASSERT_EQ (pFeatureOfBlock1[j], pFeatureOfBlock2[j]);\
+  }\
+  for(int32_t  j=0;j<65536;j++){\
+      ASSERT_EQ (pTimesOfFeatureValue[0][j], pTimesOfFeatureValue[1][j]);\
+  }\
+}\
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff1; \
+delete[] pFeatureOfBlockBuff2; \
+delete[] pTimesOfFeatureValue; \
+}
+
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1) // last two arguments are kiWidth, kiHeight: a single position
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 320) // one column, 320 rows
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320) // 640 columns, 320 rows
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)
+
+#ifdef HAVE_NEON // ARM 32-bit NEON kernels, same three sizes
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
+#endif // HAVE_NEON
+
+#ifdef HAVE_NEON_AARCH64 // AArch64 NEON kernels, same three sizes
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
+#endif // HAVE_NEON_AARCH64
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -1,7 +1,7 @@
 ENCODER_UNITTEST_SRCDIR=test/encoder
 ENCODER_UNITTEST_CPP_SRCS=\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\
-	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
+    $(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMbAux.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\
@@ -13,6 +13,7 @@
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\
+	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\
 
 ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))